In [None]:
#Required libraries
"""
!pip install imblearn
!pip install pyQt5
!pip install numpy
!pip install pandas
!pip install sklearn
!pip install matplotlib
"""

In [1]:

import PyQt5
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

%matplotlib qt 

import numpy as np

from dataloader import *
import preprocessing


from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn import tree

#Using GridSearch to find the optimal value of K number of nearest neighbors
from sklearn.model_selection import GridSearchCV

#metrics for analysing our model
from sklearn.metrics import precision_score,accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# oversampling technique
from imblearn.over_sampling import SMOTE

#### Load the data

In [2]:
handler = Dataloader()

#### Scale data

scale all variables to have a mean of 0 and standard deviation of 1

In [3]:
data = preprocessing.scale(handler.data)

## Data Analysis and Visualization

In [4]:
unique, counts = np.unique(handler.labels, return_counts=True)
print("Classes and number of instances for each class:")
print(dict(zip(unique, counts)))
plt.bar(unique,counts)
plt.title("Class Distribution")
plt.ylabel("Frequency")
plt.xlabel("Different types of tumour")
plt.show()

Classes and number of instances for each class:
{'BRCA': 300, 'COAD': 78, 'KIRC': 146, 'LUAD': 141, 'PRAD': 136}


We notice that there is class imbalance and we can use *SMOTE(Synthetic Minority Oversampling Technique)* for increasing instances of the minority class
We also can make use of downsampling to make sure each class is represented equally

In [5]:
print("Number of samples:",len(data))
print("Number of genes:",len(data[0]))

Number of samples: 801
Number of genes: 20531


In [6]:
#creating labelEncoder
le = preprocessing.LabelEncoder()
labels = le.fit_transform(handler.labels)

In [7]:
X_embedded = TSNE(n_components=2,init='random').fit_transform(handler.data)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(4, figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded.T[0], X_embedded.T[1], color=color[labels])
plt.show()

#### Split original dataset into train and test

In [8]:
#split  in test and train
X_train, X_test, y_train, y_test = train_test_split(handler.data, labels, test_size=0.20)

## Feature selection and Dimensionality reduction

In [9]:
#with pca extract eigen pairs that explains 95% of the variance in the data.
pca = PCA(n_components=0.95)

#SVD dimension reduction
svd = TruncatedSVD(n_components=531, random_state=0)

#simultanously calculate eigen pairs and transform our data into the new coordinate frame
principalComponents = pca.fit_transform(data)
svd_reduced_data = svd.fit_transform(data)

#check the amount of dimensions left after pca
print(principalComponents.shape)
print("Number of genes after dimension reduction using PCA:",principalComponents.shape[1])
print("Explained variance of new dataset using PCA:",pca.explained_variance_ratio_.sum())
print("")
print(svd_reduced_data.shape)
print("Number of genes after dimension reduction using SVD:",svd_reduced_data.shape[1])
print("Explained variance of new dataset using SVD:",svd.explained_variance_ratio_.sum())



(801, 530)
Number of genes after dimension reduction using PCA: 530
Explained variance of new dataset using PCA: 0.9501213818001684

(801, 531)
Number of genes after dimension reduction using SVD: 531
Explained variance of new dataset using SVD: 0.9492336720567962


In [10]:
# Visualize
#plot 3 most important principle components (3D plot)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(2,figsize=(12, 12))
ax = fig.add_subplot(projection='3d')
ax.set_title("Visualise data after applying PCA")
ax.scatter(principalComponents.T[0], principalComponents.T[1], principalComponents.T[2], color=color[labels])
plt.show()

In [11]:
# Visualize
#plot 3 most important principle components (3D plot)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(2,figsize=(12, 12))
ax = fig.add_subplot(projection='3d')
ax.set_title("Visualise data after applying SVD")
ax.scatter(svd_reduced_data.T[0], svd_reduced_data.T[1], svd_reduced_data.T[2], color=color[labels])
plt.show()

In [12]:
#split  in test and train
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(principalComponents, labels, test_size=0.20)

#split  in test and train
X_train_svd, X_test_svd, y_train_svd, y_test_svd = train_test_split(svd_reduced_data, labels, test_size=0.20)

In [None]:
plt.figure(3, figsize=(12, 12))
ax = sn.heatmap(X_train)

## Augmentation (SMOTE)

In [None]:
- SMOTE is applied after PCA is done
- refer (https://arxiv.org/ftp/arxiv/papers/1403/1403.1949.pdf#:~:text=After%20running%20PCA%2C%20SMOTE%20resampling,after%20the%20running%20of%20PCA.)

In [19]:
oversample_pca = SMOTE(k_neighbors=5)
resampled_data_pca, resampled_labels_pca = oversample_pca.fit_resample(principalComponents, labels)

In [20]:
oversample_svd = SMOTE(k_neighbors=5)
resampled_data_svd, resampled_labels_svd = oversample_svd.fit_resample(svd_reduced_data, labels)

In [21]:
oversample = SMOTE(k_neighbors=5)
resampled_data, resampled_labels = oversample.fit_resample(handler.data, labels)

In [16]:
unique, counts = np.unique(resampled_labels_pca, return_counts=True)
print("Classes and number of instances for each class:")
print(dict(zip(unique, counts)))

Classes and number of instances for each class:
{'BRCA': 300, 'COAD': 300, 'KIRC': 300, 'LUAD': 300, 'PRAD': 300}


In [23]:
X_embedded_pca = TSNE(n_components=2,init='random').fit_transform(resampled_data_pca)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(9, figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded_pca.T[0], X_embedded_pca.T[1], color=color[resampled_labels_pca])
plt.show()

X_embedded_svd = TSNE(n_components=2,init='random').fit_transform(resampled_data_svd)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(10, figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded_svd.T[0], X_embedded_svd.T[1], color=color[resampled_labels_svd])
plt.show()

X_embedded_original = TSNE(n_components=2,init='random').fit_transform(resampled_data)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(11, figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded_original.T[0], X_embedded_original.T[1], color=color[resampled_labels])
plt.show()

In [25]:
#split in test and train
X_train_resampled_pca, X_test_resampled_pca, y_train_resampled_pca, y_test_resampled_pca = train_test_split(resampled_data_pca, resampled_labels_pca, test_size=0.20)

X_train_resampled_svd, X_test_resampled_svd, y_train_resampled_svd, y_test_resampled_svd = train_test_split(resampled_data_svd, resampled_labels_svd, test_size=0.20)

X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(resampled_data, resampled_labels, test_size=0.20)

## Classification

##### - first knn on all different datasets
##### - second decision trees on all different datasets
##### - third random forest on all different datasets

In [28]:
#knn results on original data
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy(original data): ", acc)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#knn results on PCA+SMOTE
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train_resampled_pca,y_train_resampled_pca)
y_pred = model.predict(X_test_pca)
acc = accuracy_score(y_test_pca, y_pred)
print("KNN Accuracy after PCA+SMOTE:", acc)
print(confusion_matrix(y_test_pca, y_pred))
print("Classification report(PCA+SMOTE)")
print(classification_report(y_test_pca, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#knn results on SVD+SMOTE
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train_resampled_svd,y_train_resampled_svd)
y_pred = model.predict(X_test_svd)
acc = accuracy_score(y_test_svd, y_pred)
print("KNN Accuracy after SVD+SMOTE:", acc)
print(confusion_matrix(y_test_svd, y_pred))
print("Classification report(SVD+SMOTE)")
print(classification_report(y_test_svd, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#knn results on ORIGINAL+SMOTE
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train_resampled,y_train_resampled)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy after ORIGINAL+SMOTE:", acc)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))


KNN Accuracy(original data):  1.0
[[59  0  0  0  0]
 [ 0 14  0  0  0]
 [ 0  0 31  0  0]
 [ 0  0  0 32  0]
 [ 0  0  0  0 25]]
Classification report(ORIGINAL)
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        59
        COAD       1.00      1.00      1.00        14
        KIRC       1.00      1.00      1.00        31
        LUAD       1.00      1.00      1.00        32
        PRAD       1.00      1.00      1.00        25

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161

KNN Accuracy after PCA+SMOTE: 0.9875776397515528
[[59  0  0  2  0]
 [ 0 16  0  0  0]
 [ 0  0 24  0  0]
 [ 0  0  0 35  0]
 [ 0  0  0  0 25]]
Classification report(PCA+SMOTE)
              precision    recall  f1-score   support

        BRCA       1.00      0.97      0.98        61
        COAD       1.00      1.00      1.00        16
        KIRC      

In [29]:
steps_SVD = [('svd', TruncatedSVD(n_components=530)), ('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_SVD)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train , y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

model.fit(X_train_svd, y_train_svd)
y_pred = model.predict(X_test_svd)
print(confusion_matrix(y_test_svd, y_pred))
print("Classification report(SVD)")
print(classification_report(y_test_svd, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

steps_PCA = [('pca', PCA(n_components=0.95)), ('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_PCA)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train , y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

model.fit(X_train_pca, y_train_pca)
y_pred = model.predict(X_test_pca)
print(confusion_matrix(y_test_pca, y_pred))
print("Classification report(PCA)")
print(classification_report(y_test_pca, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

steps_ORIGINAL = [('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_ORIGINAL)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train , y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

Accuracy: 0.972 (0.009)
[[52  0  1  1  1]
 [ 0 12  0  1  0]
 [ 4  0 31  0  0]
 [ 3  2  0 26  0]
 [ 0  0  1  0 26]]
Classification report(SVD)
              precision    recall  f1-score   support

        BRCA       0.88      0.95      0.91        55
        COAD       0.86      0.92      0.89        13
        KIRC       0.94      0.89      0.91        35
        LUAD       0.93      0.84      0.88        31
        PRAD       0.96      0.96      0.96        27

    accuracy                           0.91       161
   macro avg       0.91      0.91      0.91       161
weighted avg       0.91      0.91      0.91       161

Accuracy: 0.972 (0.008)
[[57  0  1  1  2]
 [ 0 16  0  0  0]
 [ 1  1 22  0  0]
 [ 1  1  1 32  0]
 [ 0  1  0  0 24]]
Classification report(PCA)
              precision    recall  f1-score   support

        BRCA       0.97      0.93      0.95        61
        COAD       0.84      1.00      0.91        16
        KIRC       0.92      0.92      0.92        24
        LU

In [30]:
#SVD + SMOTE
steps_SVD = [('svd', TruncatedSVD(n_components=530)), ('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_SVD)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train_resampled_svd , y_train_resampled_svd, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
model.fit(X_train_resampled_svd, y_train_resampled_svd)
y_pred = model.predict(X_test_svd)
print(confusion_matrix(y_test_svd, y_pred))
print("Classification report(SVD)")
print(classification_report(y_test_svd, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#PCA + SMOTE
steps_PCA = [('pca', PCA(n_components=0.95)), ('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_PCA)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train_resampled_pca , y_train_resampled_pca, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
model.fit(X_train_resampled_pca, y_train_resampled_pca)
y_pred = model.predict(X_test_pca)
print(confusion_matrix(y_test_pca, y_pred))
print("Classification report(PCA)")
print(classification_report(y_test_pca, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#ORIGINAL + SMOTE
steps_ORIGINAL = [('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_ORIGINAL)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train_resampled , y_train_resampled, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

Accuracy: 0.953 (0.024)
[[54  0  0  0  1]
 [ 0 13  0  0  0]
 [ 0  0 35  0  0]
 [ 0  0  1 30  0]
 [ 0  0  0  1 26]]
Classification report(SVD)
              precision    recall  f1-score   support

        BRCA       1.00      0.98      0.99        55
        COAD       1.00      1.00      1.00        13
        KIRC       0.97      1.00      0.99        35
        LUAD       0.97      0.97      0.97        31
        PRAD       0.96      0.96      0.96        27

    accuracy                           0.98       161
   macro avg       0.98      0.98      0.98       161
weighted avg       0.98      0.98      0.98       161

Accuracy: 0.968 (0.013)
[[60  1  0  0  0]
 [ 0 16  0  0  0]
 [ 1  0 23  0  0]
 [ 0  0  0 35  0]
 [ 0  0  0  0 25]]
Classification report(PCA)
              precision    recall  f1-score   support

        BRCA       0.98      0.98      0.98        61
        COAD       0.94      1.00      0.97        16
        KIRC       1.00      0.96      0.98        24
        LU

### Random Forest

In [31]:
steps_SVD = [('svd', TruncatedSVD(n_components=530)), ('tree', RandomForestClassifier())]
model = Pipeline(steps=steps_SVD)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train , y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

model.fit(X_train_svd, y_train_svd)
y_pred = model.predict(X_test_svd)
print(confusion_matrix(y_test_svd, y_pred))
print("Classification report(SVD)")
print(classification_report(y_test_svd, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

steps_PCA = [('pca', PCA(n_components=0.95)), ('tree', RandomForestClassifier())]
model = Pipeline(steps=steps_PCA)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train , y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

model.fit(X_train_pca, y_train_pca)
y_pred = model.predict(X_test_pca)
print(confusion_matrix(y_test_pca, y_pred))
print("Classification report(PCA)")
print(classification_report(y_test_pca, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

steps_ORIGINAL = [('tree', RandomForestClassifier())]
model = Pipeline(steps=steps_ORIGINAL)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train , y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

Accuracy: 0.975 (0.014)
[[54  0  0  0  1]
 [ 0 13  0  0  0]
 [ 0  0 35  0  0]
 [ 5  0  1 25  0]
 [ 0  0  0  0 27]]
Classification report(SVD)
              precision    recall  f1-score   support

        BRCA       0.92      0.98      0.95        55
        COAD       1.00      1.00      1.00        13
        KIRC       0.97      1.00      0.99        35
        LUAD       1.00      0.81      0.89        31
        PRAD       0.96      1.00      0.98        27

    accuracy                           0.96       161
   macro avg       0.97      0.96      0.96       161
weighted avg       0.96      0.96      0.96       161

Accuracy: 0.988 (0.009)
[[61  0  0  0  0]
 [ 0 16  0  0  0]
 [ 2  0 22  0  0]
 [ 3  0  0 32  0]
 [ 1  0  0  0 24]]
Classification report(PCA)
              precision    recall  f1-score   support

        BRCA       0.91      1.00      0.95        61
        COAD       1.00      1.00      1.00        16
        KIRC       1.00      0.92      0.96        24
        LU

In [32]:
#SVD + SMOTE
steps_SVD = [('svd', TruncatedSVD(n_components=530)), ('tree', RandomForestClassifier())]
model = Pipeline(steps=steps_SVD)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train_resampled_svd , y_train_resampled_svd, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
model.fit(X_train_resampled_svd, y_train_resampled_svd)
y_pred = model.predict(X_test_svd)
print(confusion_matrix(y_test_svd, y_pred))
print("Classification report(SVD)")
print(classification_report(y_test_svd, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#PCA + SMOTE
steps_PCA = [('pca', PCA(n_components=0.95)), ('tree', RandomForestClassifier())]
model = Pipeline(steps=steps_PCA)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train_resampled_pca , y_train_resampled_pca, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
model.fit(X_train_resampled_pca, y_train_resampled_pca)
y_pred = model.predict(X_test_pca)
print(confusion_matrix(y_test_pca, y_pred))
print("Classification report(PCA)")
print(classification_report(y_test_pca, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#ORIGINAL + SMOTE
steps_ORIGINAL = [('tree', RandomForestClassifier())]
model = Pipeline(steps=steps_ORIGINAL)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train_resampled , y_train_resampled, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

Accuracy: 0.983 (0.005)
[[54  0  0  0  1]
 [ 0 13  0  0  0]
 [ 0  0 35  0  0]
 [ 1  0  0 30  0]
 [ 0  0  0  0 27]]
Classification report(SVD)
              precision    recall  f1-score   support

        BRCA       0.98      0.98      0.98        55
        COAD       1.00      1.00      1.00        13
        KIRC       1.00      1.00      1.00        35
        LUAD       1.00      0.97      0.98        31
        PRAD       0.96      1.00      0.98        27

    accuracy                           0.99       161
   macro avg       0.99      0.99      0.99       161
weighted avg       0.99      0.99      0.99       161

Accuracy: 0.993 (0.005)
[[60  0  0  1  0]
 [ 0 16  0  0  0]
 [ 1  0 23  0  0]
 [ 0  0  0 35  0]
 [ 0  0  0  0 25]]
Classification report(PCA)
              precision    recall  f1-score   support

        BRCA       0.98      0.98      0.98        61
        COAD       1.00      1.00      1.00        16
        KIRC       1.00      0.96      0.98        24
        LU

## Clustering

In [37]:
#kmeans results on original data
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test)
print("K_means clustering(original data) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

#kmeans results on PCA
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train_pca)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_pca)
print("K_means clustering(PCA+SMOTE) Normalised mutual info score: ", normalized_mutual_info_score(y_test_pca, y_pred))

#kmeans results on SVD
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train_svd)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_svd)
print("K_means clustering(SVD+SMOTE) Normalised mutual info score: ", normalized_mutual_info_score(y_test_svd, y_pred))

K_means clustering(original data) Normalised mutual info score:  1.0000000000000002
K_means clustering(PCA+SMOTE) Normalised mutual info score:  0.8973934492765573
K_means clustering(SVD+SMOTE) Normalised mutual info score:  0.9204556672160968


In [36]:
#kmeans results on original data + SMOTE
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train_resampled)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test)
print("K_means clustering(original data) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

#kmeans results on PCA+SMOTE
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train_resampled_pca)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_pca)
print("K_means clustering(PCA+SMOTE) Normalised mutual info score: ", normalized_mutual_info_score(y_test_pca, y_pred))

#kmeans results on SVD+SMOTE
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train_resampled_svd)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_svd)
print("K_means clustering(SVD+SMOTE) Normalised mutual info score: ", normalized_mutual_info_score(y_test_svd, y_pred))

K_means clustering(original data) Normalised mutual info score:  1.0000000000000002
K_means clustering(PCA+SMOTE) Normalised mutual info score:  0.9484269175156541
K_means clustering(SVD+SMOTE) Normalised mutual info score:  0.9617647314480207


#### we see good result with PCA + SMOTE for classification and for clustering SVD+SMOTE

## Grid search


In [None]:
steps_KNN_pca = [('pca', PCA(n_components=0.95)), ('KNN', KNeighborsClassifier())]
model_KNN_gs_pca = Pipeline(steps=steps_KNN_pca)
parameters = {'KNN__n_neighbors':[1,3,5,7,9,11,13,15,17,19,21]}
KNN_gs_pca = GridSearchCV(model_KNN_gs_pca, parameters,cv=10, verbose = 3)
KNN_gs_pca.fit(X_train_resampled_pca,y_train_resampled_pca)

In [None]:
print(KNN_gs_pca.best_params_)

In [41]:
#gridsearch for decision tree classifier
steps_PCA = [('pca', PCA(n_components=0.95)), ('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_PCA)
parameters = {'tree__criterion':('entropy', 'gini'), 'tree__max_depth':[2,4,6,8,10,12,15,18,20], 'max_features': ('auto', 'sqrt', 'log2', 'None')}
pca_pipeline = GridSearchCV(model, parameters, verbose = 3)
pca_pipeline.fit(X_train_resampled_pca, y_train_resampled_pca)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END ...........tree__criterion=entropy;, score=0.971 total time=   0.2s
[CV 2/5] END ...........tree__criterion=entropy;, score=0.946 total time=   0.2s
[CV 3/5] END ...........tree__criterion=entropy;, score=0.963 total time=   0.2s
[CV 4/5] END ...........tree__criterion=entropy;, score=0.996 total time=   0.2s
[CV 5/5] END ...........tree__criterion=entropy;, score=0.958 total time=   0.2s
[CV 1/5] END ..............tree__criterion=gini;, score=0.979 total time=   0.1s
[CV 2/5] END ..............tree__criterion=gini;, score=0.950 total time=   0.1s
[CV 3/5] END ..............tree__criterion=gini;, score=0.971 total time=   0.1s
[CV 4/5] END ..............tree__criterion=gini;, score=0.975 total time=   0.1s
[CV 5/5] END ..............tree__criterion=gini;, score=0.971 total time=   0.1s


GridSearchCV(estimator=Pipeline(steps=[('pca', PCA(n_components=0.95)),
                                       ('tree', DecisionTreeClassifier())]),
             param_grid={'tree__criterion': ('entropy', 'gini')}, verbose=3)

In [48]:
tree.plot_tree(pca_pipeline.best_estimator_['tree'],filled=True, fontsize=5)

[Text(310.5166666666667, 354.2, 'X[1] <= 43.823\ngini = 0.8\nsamples = 1200\nvalue = [236, 238, 243, 243, 240]'),
 Text(246.9666666666667, 323.4, 'X[0] <= -44.805\ngini = 0.751\nsamples = 962\nvalue = [235, 238, 243, 243, 3]'),
 Text(169.4666666666667, 292.59999999999997, 'X[2] <= -31.064\ngini = 0.043\nsamples = 229\nvalue = [5, 0, 224, 0, 0]'),
 Text(152.93333333333334, 261.79999999999995, 'gini = 0.0\nsamples = 5\nvalue = [5, 0, 0, 0, 0]'),
 Text(186.00000000000003, 261.79999999999995, 'gini = 0.0\nsamples = 224\nvalue = [0, 0, 224, 0, 0]'),
 Text(324.4666666666667, 292.59999999999997, 'X[0] <= 41.655\ngini = 0.686\nsamples = 733\nvalue = [230, 238, 19, 243, 3]'),
 Text(219.0666666666667, 261.79999999999995, 'X[4] <= -0.378\ngini = 0.577\nsamples = 501\nvalue = [222, 20, 19, 237, 3]'),
 Text(124.00000000000001, 230.99999999999997, 'X[6] <= 37.392\ngini = 0.224\nsamples = 243\nvalue = [6, 20, 1, 213, 3]'),
 Text(82.66666666666667, 200.2, 'X[3] <= -0.938\ngini = 0.095\nsamples = 224\n

In [42]:
print(pca_pipeline.best_params_)
print(pca_pipeline.score(X_test_pca, y_test_pca))

{'tree__criterion': 'gini'}
0.9875776397515528


## TODO: plot mean accuracies of all models from gridsearch

In [None]:
#print(pca_pipeline.cv_results_['mean_test_score'])

In [None]:
#gridsearch for forest classifier
steps_PCA = [('pca', PCA(n_components=0.95)), ('forest', RandomForestClassifier(n_jobs=-1))]
model = Pipeline(steps=steps_PCA)
parameters = {'forest__criterion':('entropy', 'gini'), 'forest__max_depth':[2,4,6,8,10,12,15,18,20], 'forest__max_features': ('auto', 'sqrt', 'log2', 'None')}
pca_pipeline_forest = GridSearchCV(model, parameters, verbose = 3)
pca_pipeline_forest.fit(X_train_resampled_pca, y_train_resampled_pca)

In [None]:
print(pca_pipeline_forest.best_params_)
print(pca_pipeline_forest.score(X_test, y_test))

### Ensemble

In [None]:
knn_tree_ensemble = VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=1)), ('tree', DecisionTreeClassifier())], voting='hard')
knn_tree_ensemble.fit(X_train_pca, y_train_pca)

y_pred = knn_tree_ensemble.predict(X_test_pca)
print(confusion_matrix(y_test_pca, y_pred))
print("Classification report(PCA)")
print(classification_report(y_test_pca, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))