In [6]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import normalize
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn import tree
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from graphviz import Source
import xgboost as xgb
from xgboost import plot_tree
import seaborn as sns

In [7]:
# Read the label table, using its 'patient' column as the index.
patient_data = pd.read_csv('../data/archive/actual.csv', index_col='patient')

# Read the two expression tables (training split, then independent/test split).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/archive/data_set_ALL_AML_train.csv',
        '../data/archive/data_set_ALL_AML_independent.csv',
    )
)

In [8]:
# Collect every column whose name contains 'call', separately for each table.
call_cols_train = list(filter(lambda name: 'call' in name, train_data.columns))
call_cols_test = list(filter(lambda name: 'call' in name, test_data.columns))

# Gene annotation columns that are removed alongside the call columns.
cols_to_drop = ['Gene Description', 'Gene Accession Number']

# Drop the call columns, drop the annotation columns, then transpose so that
# the remaining columns become the rows
# (presumably one row per patient sample -- confirm against the CSV header).
train_data = train_data.drop(columns=call_cols_train).drop(columns=cols_to_drop).T
test_data = test_data.drop(columns=call_cols_test).drop(columns=cols_to_drop).T

In [9]:
# Clean both expression tables, order their rows, then stack them into one frame.

# Treat infinities of either sign as missing values.
train_data = train_data.replace([np.inf, -np.inf], np.nan)
test_data = test_data.replace([np.inf, -np.inf], np.nan)

# Impute missing entries with the overall mean of the training values.
# np.nanmean skips NaNs; a plain .mean() would itself be NaN as soon as a single
# value is missing, which would silently turn fillna into a no-op.
# The training mean is deliberately reused for the test table as well.
train_fill_value = np.nanmean(train_data.to_numpy(dtype=float))
train_data = train_data.fillna(value=train_fill_value)
test_data = test_data.fillna(value=train_fill_value)

# Convert the index labels to numbers so the rows sort in numeric order
# (the labels presumably arrive as strings from the CSV header).
train_data.index = pd.to_numeric(train_data.index)
test_data.index = pd.to_numeric(test_data.index)
train_data = train_data.sort_index()
test_data = test_data.sort_index()

# Stack training rows on top of test rows. DataFrame.append was removed in
# pandas 2.0; pd.concat keeps the original index labels, as append did.
complete_data = pd.concat([train_data, test_data])
complete_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7119,7120,7121,7122,7123,7124,7125,7126,7127,7128
1,-214,-153,-58,88,-295,-558,199,-176,252,206,...,185,511,-125,389,-37,793,329,36,191,-37
2,-139,-73,-1,283,-264,-400,-330,-168,101,74,...,169,837,-36,442,-17,782,295,11,76,-14
3,-76,-49,-307,309,-376,-650,33,-367,206,-215,...,315,1199,33,168,52,1138,777,41,228,-41
4,-135,-114,265,12,-419,-585,158,-253,49,31,...,240,835,218,174,-110,627,170,-50,126,-91
5,-106,-125,-76,168,-230,-284,4,-122,70,252,...,156,649,57,504,-26,250,314,14,56,-25


In [10]:
# Show the combined train + test frame as this cell's output.
complete_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7119,7120,7121,7122,7123,7124,7125,7126,7127,7128
1,-214,-153,-58,88,-295,-558,199,-176,252,206,...,185,511,-125,389,-37,793,329,36,191,-37
2,-139,-73,-1,283,-264,-400,-330,-168,101,74,...,169,837,-36,442,-17,782,295,11,76,-14
3,-76,-49,-307,309,-376,-650,33,-367,206,-215,...,315,1199,33,168,52,1138,777,41,228,-41
4,-135,-114,265,12,-419,-585,158,-253,49,31,...,240,835,218,174,-110,627,170,-50,126,-91
5,-106,-125,-76,168,-230,-284,4,-122,70,252,...,156,649,57,504,-26,250,314,14,56,-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,-154,-136,49,180,-257,-273,141,-123,52,878,...,214,540,13,1075,-45,524,249,40,-68,-1
69,-79,-118,-30,68,-110,-264,-28,-61,40,-217,...,409,617,-34,738,11,742,234,72,109,-30
70,-55,-44,12,129,-108,-301,-222,-133,136,320,...,131,318,35,241,-66,320,174,-4,176,40
71,-59,-114,23,146,-171,-227,-73,-126,-6,149,...,214,760,-38,201,-55,348,208,0,74,-12
