In [3]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

#used to display all the results, not only the last ones
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

#Load the datasets
X_train = pd.read_csv("../Desktop/data_set_train.csv")
X_test = pd.read_csv("../Desktop/data_set_test.csv")
y = pd.read_csv("../Desktop/actual.csv", index_col = 'patient', converters={'cancer': lambda x: int(x == 'AML')})

# Drop string columns
# Select all the datatypes except numeric
non_numeric_cols = X_train.select_dtypes(exclude='number')
non_numeric_test_cols = X_test.select_dtypes(exclude='number')

#drop the non-numeric columns
X_train.drop(non_numeric_cols, axis='columns', inplace=True)
X_test.drop(non_numeric_test_cols, axis='columns', inplace=True)

# It seems like this dataset is a mess
# We have the features in rows
# The labels in columns
# And the columns are not ordered
# So we have to do something about this
# I decided to transpose the features
X_train = X_train[ sorted([column for column in X_train.columns[0:]], key = lambda colname: int(colname) ) ]
X_test = X_test[ sorted([column for column in X_test.columns[0:]], key = lambda colname: int(colname) ) ]

# performing preprocessing part
scaler = StandardScaler()

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.fit_transform(X_test)

y_train = y.head(38)
y_test = y.tail(34)

# Components number
N = 25

# Define the Principal Component Analysis model
model = PCA(n_components=N)
# Get the optimal train components
X_train_reduced = model.fit_transform(scaled_X_train)

# Show the train components
df_train_comp = pd.DataFrame(model.components_,index=['Train: for component with number ' + str(i) + ' obtains: ' for i in range(N)], columns=X_train.columns).T
df_train_comp

# Get the optimal test components
X_test_reduced = model.fit_transform(scaled_X_test)

# Show the test components
df_test_comp = pd.DataFrame(model.components_,index=['Test: for component with number ' + str(i) + ' obtains: ' for i in range(N)],columns=X_test.columns).T
df_test_comp


# fitting Logistic Regression To the training set
LR = LogisticRegression().fit(df_train_comp, y_train)

# predicting the test set result using
# predict function under LogisticRegression
pred1 = LR.predict(df_test_comp)
LR_score = LR.score(df_test_comp, y_test)

print("Accuracy Logistic Regression:", metrics.accuracy_score(y_test, pred1))


Unnamed: 0,Train: for component with number 0 obtains:,Train: for component with number 1 obtains:,Train: for component with number 2 obtains:,Train: for component with number 3 obtains:,Train: for component with number 4 obtains:,Train: for component with number 5 obtains:,Train: for component with number 6 obtains:,Train: for component with number 7 obtains:,Train: for component with number 8 obtains:,Train: for component with number 9 obtains:,...,Train: for component with number 15 obtains:,Train: for component with number 16 obtains:,Train: for component with number 17 obtains:,Train: for component with number 18 obtains:,Train: for component with number 19 obtains:,Train: for component with number 20 obtains:,Train: for component with number 21 obtains:,Train: for component with number 22 obtains:,Train: for component with number 23 obtains:,Train: for component with number 24 obtains:
1,0.165652,0.018675,-0.071234,-0.104277,0.250271,-0.088801,-0.043253,-0.129033,-0.029217,-0.064309,...,0.125428,0.082112,0.203281,0.207383,0.0429,-0.092378,0.058163,-0.122635,0.506497,-0.110918
2,0.164671,-0.085388,-0.09213,0.138985,0.037049,0.118546,-0.159783,-0.010365,-0.269516,0.252988,...,0.07706,0.068012,-0.261534,0.023931,0.215523,0.086311,-0.080906,-0.039127,-0.152719,-0.07443
3,0.16134,0.155464,-0.027659,0.297072,0.151235,0.217442,-0.035505,-0.115257,-0.126786,-0.113258,...,-0.099865,-0.134195,0.107677,0.230236,0.03194,-0.09948,-0.260778,0.292002,-0.13278,-0.157557
4,0.16774,0.0661,-0.030515,-0.078863,0.131024,0.02431,0.005855,0.143078,0.105173,-0.113683,...,-0.138534,-0.090506,0.031557,-0.031402,-0.009032,-0.081675,-0.129766,0.044532,-0.014509,-0.193312
5,0.166659,-0.02948,-0.152563,-0.003893,-0.077868,-0.167399,0.147642,0.118774,0.000294,-0.108954,...,-0.16418,-0.266981,-0.124826,-0.223275,0.074649,-0.044985,-0.14907,-0.034277,-0.088002,0.062888
6,0.157367,0.269438,0.145993,0.364372,0.049855,-0.18883,0.010487,-0.173172,0.162024,0.128292,...,0.065644,0.080737,-0.021922,-0.055333,0.112133,-0.130173,-0.069935,0.109279,0.137224,0.010328
7,0.160136,0.228787,0.180562,-0.123815,0.275904,0.149953,-0.02235,-0.014926,0.088756,-0.256084,...,-0.241357,-0.111172,0.141571,0.048274,0.125118,0.188198,0.336463,0.180789,-0.086384,0.22888
8,0.162563,0.057912,0.047411,-0.150753,0.240853,-0.028811,-0.311077,-0.163973,0.251338,0.139208,...,0.150194,0.074182,-0.05617,-0.014023,-0.114762,0.143002,-0.247009,0.126004,-0.050622,0.290143
9,0.15953,0.004517,-0.210713,0.384092,-0.024885,0.053683,-0.038852,-0.024488,-0.118105,-0.194657,...,0.028405,-0.162852,0.111379,0.159985,-0.215242,0.142996,0.150899,0.003981,-0.092316,0.058149
10,0.162921,0.10981,-0.020388,0.299589,-0.183171,-0.06863,0.057264,-0.047546,-0.146249,0.016416,...,-0.17544,0.133128,0.027076,-0.026138,-0.027799,0.083126,0.005545,-0.000697,-0.070885,0.08588


Unnamed: 0,Test: for component with number 0 obtains:,Test: for component with number 1 obtains:,Test: for component with number 2 obtains:,Test: for component with number 3 obtains:,Test: for component with number 4 obtains:,Test: for component with number 5 obtains:,Test: for component with number 6 obtains:,Test: for component with number 7 obtains:,Test: for component with number 8 obtains:,Test: for component with number 9 obtains:,...,Test: for component with number 15 obtains:,Test: for component with number 16 obtains:,Test: for component with number 17 obtains:,Test: for component with number 18 obtains:,Test: for component with number 19 obtains:,Test: for component with number 20 obtains:,Test: for component with number 21 obtains:,Test: for component with number 22 obtains:,Test: for component with number 23 obtains:,Test: for component with number 24 obtains:
39,0.169684,0.025589,-0.166904,0.272765,-0.118741,-0.043765,0.443541,0.193511,0.047569,-0.239187,...,-0.468836,0.098605,0.136219,0.028455,-0.128695,-0.02656,-0.330075,-0.105942,-0.021231,-0.229019
40,0.174814,-0.078532,-0.023739,0.122035,-0.20757,-0.008727,-0.053406,-0.022583,0.117233,0.130419,...,0.110614,-0.117176,0.007264,0.073846,0.153309,0.362643,-0.172489,-0.131074,-0.072404,-0.199242
41,0.177146,-0.033012,-0.037502,-0.153554,-0.278918,-0.020684,-0.037717,0.062928,-0.082302,0.035176,...,0.010221,-0.278057,0.084229,0.222206,0.175972,0.182232,0.087518,0.040962,-0.02625,-0.031716
42,0.17343,-0.098597,-0.15013,0.089203,-0.003687,0.154866,0.137444,-0.222388,-0.05289,-0.396273,...,0.147202,0.123918,-0.351341,-0.063636,0.255546,-0.25759,0.347183,-0.226177,0.003736,-0.049974
43,0.169038,0.188195,0.029009,-0.083596,-0.180177,0.381456,-0.010397,-0.178139,0.200908,-0.434702,...,0.080326,-0.048186,0.132769,0.217654,-0.053979,0.220051,-0.112002,0.079844,0.197013,0.117528
44,0.174595,0.051614,-0.219967,-0.02658,-0.032888,0.052363,0.184129,0.306414,0.052148,0.125816,...,0.16353,-0.240351,0.20127,0.159822,0.255962,-0.256218,-0.022932,0.142991,0.121413,0.080522
45,0.174813,-0.049951,-0.240621,-0.140421,0.000568,-0.065144,0.015244,0.224447,0.072509,0.132544,...,0.061803,0.312932,-0.132978,-0.074607,-0.21976,0.400771,0.085445,-0.088777,0.058192,-0.048714
46,0.174459,0.105726,-0.254314,0.028714,0.040441,-0.057964,0.026162,0.218641,0.161763,0.066703,...,0.440584,0.065106,0.087258,-0.119361,-0.112793,0.043858,0.104241,0.055075,0.152945,-0.047191
47,0.172397,0.196967,-0.013198,0.017484,-0.033739,-0.023985,-0.290964,-0.023231,-0.404459,0.118549,...,-0.190685,0.121874,-0.173352,0.176889,-0.161473,0.00933,-0.167425,-0.101618,0.211029,0.318904
48,0.170481,-0.227757,-0.224544,-0.047748,-0.036514,0.197768,-0.090961,-0.230511,-0.06502,0.256008,...,-0.132633,0.196698,0.262828,-0.021826,-0.045811,-0.26737,-0.070498,0.412109,-0.11104,-0.236384


Accuracy Logistic Regression: 0.6176470588235294
