**Required Imports**

In [None]:
import itertools
import matplotlib.pyplot as plt 
import numpy as np 
import pandas as pd
import scipy

**Loading Datasets**

In [None]:
# Locations of the three input CSV files (resolved relative to the working directory).
trainfile, testfile, labelfile = (
    'data_set_ALL_AML_train.csv',
    'data_set_ALL_AML_independent.csv',
    'actual.csv',
)

# Read the tables in the order train -> test -> labels.
# `patient_cancer` is the label table; later cells use its `patient` and `cancer` columns.
train, test, patient_cancer = map(pd.read_csv, (trainfile, testfile, labelfile))

In [None]:
def _genes_to_features(raw):
    """Reshape a raw expression table so rows are samples and columns are genes.

    Parameters
    ----------
    raw : pandas.DataFrame
        Table as read from the CSV. It must contain the "Gene Description"
        and "Gene Accession Number" columns. Every column whose name contains
        "call" is discarded.

    Returns
    -------
    pandas.DataFrame
        One row per remaining column of ``raw`` (the row index keeps those
        original column labels) and one column per gene accession number,
        with every column passed through ``pd.to_numeric``.

    Raises
    ------
    KeyError
        If either gene-identifier column is missing from ``raw``.
    """
    # Remove the "call" columns from the dataframe
    keepers = [col for col in raw.columns if "call" not in col]

    # Look the gene identifier up by label rather than by row position, so the
    # result does not depend on the order of the two identifier columns.
    by_gene = raw[keepers].set_index("Gene Accession Number").drop(columns="Gene Description")

    # Transpose so that genes become features (columns) and samples become rows
    return by_gene.T.apply(pd.to_numeric)


# Apply the same clean-up to the training and testing dataframes
train = _genes_to_features(train)
test = _genes_to_features(test)

train.head()

In [6]:
def _attach_labels(expression, labels):
    """Join expression rows to their cancer labels by patient id.

    Matching on the id (instead of resetting both indexes and concatenating
    side by side) keeps every label on the right row even when the samples in
    the expression CSV are not stored in ascending patient order.

    Parameters
    ----------
    expression : pandas.DataFrame
        Gene-expression values whose row index holds the patient id (the ids
        may still be strings; they are converted with ``pd.to_numeric``).
    labels : pandas.DataFrame
        Label table with a ``patient`` column to match against.

    Returns
    -------
    pandas.DataFrame
        The ``labels`` columns followed by the gene columns, one row per
        patient, in the row order of ``labels``, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If a patient id is not numeric, or if some expression rows have no
        matching label.
    pandas.errors.MergeError
        If a patient id occurs more than once on either side.
    """
    # Move the patient id out of the index into a real, numeric column
    keyed = expression.rename_axis("patient").reset_index()
    keyed["patient"] = pd.to_numeric(keyed["patient"])

    combined = labels.merge(keyed, on="patient", how="inner", validate="one_to_one")

    # An inner merge drops unmatched rows silently; make that failure loud instead
    if len(combined) != len(expression):
        raise ValueError(
            f"{len(expression) - len(combined)} expression row(s) have no matching patient label"
        )
    return combined


# Subset the first 38 patients' cancer types
pc_train = patient_cancer[patient_cancer.patient <= 38].reset_index(drop=True)

# Combine dataframes for first 38 patients: Patient number + cancer type + gene expression values
train = _attach_labels(train, pc_train)


# Handle the test data for patients 39 through 72
# Subset the remaining patients' cancer types to test
pc_test = patient_cancer[patient_cancer.patient > 38].reset_index(drop=True)

# Combine dataframes for the remaining patients: Patient number + cancer type + gene expression values
test = _attach_labels(test, pc_test)

train

Unnamed: 0,patient,cancer,AFFX-BioB-5_at,AFFX-BioB-M_at,AFFX-BioB-3_at,AFFX-BioC-5_at,AFFX-BioC-3_at,AFFX-BioDn-5_at,AFFX-BioDn-3_at,AFFX-CreX-5_at,...,U48730_at,U58516_at,U73738_at,X06956_at,X16699_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at
0,1,ALL,-214,-153,-58,88,-295,-558,199,-176,...,185,511,-125,389,-37,793,329,36,191,-37
1,2,ALL,-139,-73,-1,283,-264,-400,-330,-168,...,169,837,-36,442,-17,782,295,11,76,-14
2,3,ALL,-76,-49,-307,309,-376,-650,33,-367,...,315,1199,33,168,52,1138,777,41,228,-41
3,4,ALL,-135,-114,265,12,-419,-585,158,-253,...,240,835,218,174,-110,627,170,-50,126,-91
4,5,ALL,-106,-125,-76,168,-230,-284,4,-122,...,156,649,57,504,-26,250,314,14,56,-25
5,6,ALL,-138,-85,215,71,-272,-558,67,-186,...,115,1221,-76,172,-74,645,341,26,193,-53
6,7,ALL,-72,-144,238,55,-399,-551,131,-179,...,30,819,-178,151,-18,1140,482,10,369,-42
7,8,ALL,-413,-260,7,-2,-541,-790,-275,-463,...,289,629,-86,302,23,1799,446,59,781,20
8,9,ALL,5,-127,106,268,-210,-535,0,-174,...,356,980,6,177,-12,758,385,115,244,-39
9,10,ALL,-88,-105,42,219,-178,-246,328,-148,...,42,986,26,101,21,570,359,9,171,7


In [8]:
# Summary statistics for the numeric columns of the training frame,
# rounded to whole numbers to keep the wide table readable.
train.describe().round(decimals=0)

Unnamed: 0,patient,AFFX-BioB-5_at,AFFX-BioB-M_at,AFFX-BioB-3_at,AFFX-BioC-5_at,AFFX-BioC-3_at,AFFX-BioDn-5_at,AFFX-BioDn-3_at,AFFX-CreX-5_at,AFFX-CreX-3_at,...,U48730_at,U58516_at,U73738_at,X06956_at,X16699_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at
count,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,...,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0
mean,20.0,-121.0,-151.0,-17.0,181.0,-277.0,-439.0,-44.0,-201.0,99.0,...,179.0,751.0,9.0,399.0,-20.0,869.0,336.0,19.0,504.0,-29.0
std,11.0,110.0,76.0,118.0,117.0,111.0,135.0,219.0,91.0,83.0,...,85.0,298.0,77.0,470.0,42.0,482.0,210.0,31.0,729.0,31.0
min,1.0,-476.0,-327.0,-307.0,-36.0,-541.0,-790.0,-479.0,-463.0,-82.0,...,30.0,224.0,-178.0,36.0,-112.0,195.0,41.0,-50.0,-2.0,-94.0
25%,10.0,-139.0,-205.0,-83.0,81.0,-374.0,-547.0,-169.0,-239.0,36.0,...,120.0,576.0,-43.0,174.0,-48.0,595.0,233.0,8.0,136.0,-43.0
50%,20.0,-106.0,-142.0,-44.0,200.0,-263.0,-426.0,-34.0,-186.0,100.0,...,174.0,700.0,10.0,266.0,-18.0,744.0,308.0,20.0,244.0,-26.0
75%,29.0,-68.0,-95.0,47.0,279.0,-189.0,-345.0,79.0,-145.0,152.0,...,232.0,970.0,57.0,452.0,9.0,1112.0,390.0,30.0,487.0,-12.0
max,38.0,17.0,-20.0,265.0,392.0,-51.0,-155.0,419.0,-24.0,283.0,...,356.0,1653.0,218.0,2527.0,52.0,2315.0,1109.0,115.0,3193.0,36.0


In [None]:
# Drop the class label so only numeric columns remain for exploration
trainEDA = train.drop("cancer", axis=1)

# `patient` is an identifier, not an expression level: leave it out of the
# distribution plots so it is not drawn as if it were a gene.
expression_values = trainEDA.drop(columns="patient")

# Overlaid histograms, one per gene column
hist_ax = expression_values.plot(kind="hist", legend=None, bins=20, color='k')
hist_ax.set(
    title="Training set: histogram of gene expression values",
    xlabel="Expression value",
    ylabel="Frequency",
)

# Overlaid kernel density estimates, one per gene column
kde_ax = expression_values.plot(kind="kde", legend=None)
kde_ax.set(
    title="Training set: density of gene expression values",
    xlabel="Expression value",
    ylabel="Density",
);