In [1]:
#Required libraries
"""
!pip install imblearn
!pip install pyQt5
!pip install numpy
!pip install pandas
!pip install sklearn
!pip install matplotlib
!pip install seeaborn
"""

'\n!pip install imblearn\n!pip install pyQt5\n!pip install numpy\n!pip install pandas\n!pip install sklearn\n!pip install matplotlib\n!pip install seeaborn\n'

In [2]:

import PyQt5
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

%matplotlib qt 

import numpy as np

from dataloader import *
import preprocessing


from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn import tree

#Using GridSearch to find the optimal value of K number of nearest neighbors
from sklearn.model_selection import GridSearchCV

#metrics for analysing our model
from sklearn.metrics import precision_score,accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# oversampling technique
from imblearn.over_sampling import SMOTE


np.random.seed(1234)

#### Load the data

In [3]:
handler = Dataloader()

#### Scale data

scale all variables to have a mean of 0 and standard deviation of 1

In [4]:
data = preprocessing.scale(handler.data)

## Data Analysis and Visualization

In [5]:
print("Total number of samples/records : ",len(handler.data))
print("Maximum value in the features data",np.amax(handler.data))
print("Minimum value in the features data",np.amin(handler.data))

Total number of samples/records :  801
Maximum value in the features data 20.7788287118
Minimum value in the features data 0.0


In [6]:
unique, counts = np.unique(handler.labels, return_counts=True)
print("Classes and number of instances for each class:")
print(dict(zip(unique, counts)))
plt.bar(unique,counts)
plt.title("Class Distribution")
plt.ylabel("Frequency")
plt.xlabel("Different types of tumour")
plt.show()

Classes and number of instances for each class:
{'BRCA': 300, 'COAD': 78, 'KIRC': 146, 'LUAD': 141, 'PRAD': 136}


We notice that there is class imbalance and we can use *SMOTE(Synthetic Minority Oversampling Technique)* for increasing instances of the minority class

In [7]:
print("Number of samples:",len(data))
print("Number of genes:",len(data[0]))

Number of samples: 801
Number of genes: 20531


In [8]:
#creating labelEncoder
le = preprocessing.LabelEncoder()
labels = le.fit_transform(handler.labels)

In [9]:
X_embedded = TSNE(n_components=2,init='random').fit_transform(handler.data)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(2, figsize=(12, 12))
ax = fig.add_subplot()
ax.set_title("TSNE plot on original data")
ax.scatter(X_embedded.T[0], X_embedded.T[1], color=color[labels])
plt.show()



#### Split original dataset into train and test

In [10]:
#split  in test and train
X_train, X_test, y_train, y_test = train_test_split(handler.data, labels, test_size=0.20)

## Feature selection and Dimensionality reduction

In [11]:
#with pca extract eigen pairs that explains 95% of the variance in the data.
pca = PCA(n_components=0.95)

#SVD dimension reduction
svd = TruncatedSVD(n_components=403, random_state=0)

#simultanously calculate eigen pairs and transform our data into the new coordinate frame
principalComponents = pca.fit_transform(X_train)    #X_train_pca    #y_train
svd_reduced_data = svd.fit_transform(X_train)    #X_train_svd       #y_train

#check the amount of dimensions left after pca
print(principalComponents.shape)
print("Number of genes after dimension reduction using PCA:",principalComponents.shape[1])
print("Explained variance of new dataset using PCA:",pca.explained_variance_ratio_.sum())
print("")
print(svd_reduced_data.shape)
print("Number of genes after dimension reduction using SVD:",svd_reduced_data.shape[1])
print("Explained variance of new dataset using SVD:",svd.explained_variance_ratio_.sum())

(640, 396)
Number of genes after dimension reduction using PCA: 396
Explained variance of new dataset using PCA: 0.9502450352369812

(640, 403)
Number of genes after dimension reduction using SVD: 403
Explained variance of new dataset using SVD: 0.9511290020132728


In [12]:
# Visualize
#plot 3 most important principle components (3D plot)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(3,figsize=(12, 12))
ax = fig.add_subplot(projection='3d')
ax.set_title("Visualise data after applying PCA")
ax.scatter(principalComponents.T[0], principalComponents.T[1], principalComponents.T[2], color=color[y_train])
plt.show()

In [13]:
# Visualize
#plot 3 most important principle components (3D plot)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(4,figsize=(12, 12))
ax = fig.add_subplot(projection='3d')
ax.set_title("Visualise data after applying SVD")
ax.scatter(svd_reduced_data.T[0], svd_reduced_data.T[1], svd_reduced_data.T[2], color=color[y_train])
plt.show()

In [14]:
#split  in test and train
#X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_train, y_train, test_size=0.20)

#split  in test and train
#X_train_svd, X_test_svd, y_train_svd, y_test_svd = train_test_split(svd_reduced_data, labels, test_size=0.20)

In [15]:
plt.figure(5, figsize=(12, 12))
ax = sn.heatmap(handler.data)
ax.set_title("Correlation heatmap of all dimensions on the original data")

#plt.figure(6, figsize=(12, 12))
#ax = sn.heatmap(principalComponents)
#ax.set_title("Correlation heatmap of all dimensions after PCA")

#plt.figure(7, figsize=(12, 12))
#ax = sn.heatmap(svd_reduced_data)
#ax.set_title("Correlation heatmap of all dimensions after SVD")


Text(0.5, 1.0, 'Correlation heatmap of all dimensions on the original data')

## Augmentation (SMOTE)

- As noticed from the classs distribution, there is class imbalance since there are very few samples in the 'COAD' class.
- We apply Synthetic Minority Oversampling Technique to oversample the classes such that each class has equal number of samples
- SMOTE is applied after feature extraction/reduction is done
- refer (https://arxiv.org/ftp/arxiv/papers/1403/1403.1949.pdf#:~:text=After%20running%20PCA%2C%20SMOTE%20resampling,after%20the%20running%20of%20PCA.)

In [16]:
oversample_pca = SMOTE(k_neighbors=5)
resampled_data_pca, resampled_labels_pca = oversample_pca.fit_resample(principalComponents, y_train) #X_resampled_pca_train

In [17]:
oversample_svd = SMOTE(k_neighbors=5)
resampled_data_svd, resampled_labels_svd = oversample_svd.fit_resample(svd_reduced_data, y_train)

In [18]:
oversample = SMOTE(k_neighbors=5)
resampled_data, resampled_labels = oversample.fit_resample(X_train, y_train)

In [19]:
unique, counts = np.unique(resampled_labels_pca, return_counts=True)
print("Classes and number of instances for each class:")
print(dict(zip(unique, counts)))

Classes and number of instances for each class:
{0: 243, 1: 243, 2: 243, 3: 243, 4: 243}


In [20]:
X_embedded_pca = TSNE(n_components=2,init='random').fit_transform(resampled_data_pca)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(8, figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded_pca.T[0], X_embedded_pca.T[1], color=color[resampled_labels_pca])
ax.set_title("TSNE visualisation after PCA+SMOTE")
plt.show()

X_embedded_svd = TSNE(n_components=2,init='random').fit_transform(resampled_data_svd)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(9, figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded_svd.T[0], X_embedded_svd.T[1], color=color[resampled_labels_svd])
ax.set_title("TSNE visualisation after SVD+SMOTE")
plt.show()

X_embedded_original = TSNE(n_components=2,init='random').fit_transform(resampled_data)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(10, figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded_original.T[0], X_embedded_original.T[1], color=color[resampled_labels])
ax.set_title("TSNE visualisation after SMOTE on original data")
plt.show()



In [21]:
# REMOVE
#split in test and train
#X_train_resampled_pca, X_test_resampled_pca, y_train_resampled_pca, y_test_resampled_pca = train_test_split(resampled_data_pca, resampled_labels_pca, test_size=0.20)

#X_train_resampled_svd, X_test_resampled_svd, y_train_resampled_svd, y_test_resampled_svd = train_test_split(resampled_data_svd, resampled_labels_svd, test_size=0.20)

#X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(resampled_data, resampled_labels, test_size=0.20)

## Classification

After the data reduction and augmentation step we have 4 datasets:

- Original data
- Original data + SMOTE
- Original data + PCA + SMOTE
- Original data + SVD + SMOTE

 
Next, we perform classification on these data on the following models 
- KNN: This is a baseline model
- Decision tree 
- Random forest

In [22]:
#knn results on original data
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy(original data): ", acc)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))
print("----------------")
print("----------------")

#knn results on PCA
model = KNeighborsClassifier(n_neighbors=10)
model.fit(principalComponents,y_train)
X_test_projected_pca = pca.transform(X_test)
y_pred = model.predict(X_test_projected_pca)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy after PCA:", acc)
print(confusion_matrix(y_test, y_pred))
print("Classification report(PCA)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))
print("----------------")
print("----------------")

#knn results on SVD
model = KNeighborsClassifier(n_neighbors=10)
model.fit(svd_reduced_data, y_train)
X_test_projected_svd = svd.transform(X_test)
y_pred = model.predict(X_test_projected_svd)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy after SVD:", acc)
print(confusion_matrix(y_test, y_pred))
print("Classification report(SVD+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))
print("----------------")
print("----------------")

#knn results on ORIGINAL+SMOTE
model = KNeighborsClassifier(n_neighbors=10)
model.fit(resampled_data, resampled_labels)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy after ORIGINAL+SMOTE:", acc)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))
print("----------------")
print("----------------")

#knn results on PCA+SMOTE
model = KNeighborsClassifier(n_neighbors=10)
model.fit(resampled_data_pca,resampled_labels_pca)
X_test_projected_pca = pca.transform(X_test)
y_pred = model.predict(X_test_projected_pca)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy after PCA+SMOTE:", acc)
print(confusion_matrix(y_test, y_pred))
print("Classification report(PCA+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))
print("----------------")
print("----------------")

#knn results on SVD+SMOTE
model = KNeighborsClassifier(n_neighbors=10)
model.fit(resampled_data_svd, resampled_labels_svd)
X_test_projected_svd = svd.transform(X_test)
y_pred = model.predict(X_test_projected_svd)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy after SVD+SMOTE:", acc)
print(confusion_matrix(y_test, y_pred))
print("Classification report(SVD+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))
print("----------------")
print("----------------")

KNN Accuracy(original data):  1.0
[[57  0  0  0  0]
 [ 0 15  0  0  0]
 [ 0  0 25  0  0]
 [ 0  0  0 33  0]
 [ 0  0  0  0 31]]
Classification report(ORIGINAL)
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        57
        COAD       1.00      1.00      1.00        15
        KIRC       1.00      1.00      1.00        25
        LUAD       1.00      1.00      1.00        33
        PRAD       1.00      1.00      1.00        31

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161

----------------
----------------
KNN Accuracy after PCA: 1.0
[[57  0  0  0  0]
 [ 0 15  0  0  0]
 [ 0  0 25  0  0]
 [ 0  0  0 33  0]
 [ 0  0  0  0 31]]
Classification report(PCA)
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        57
        COAD       1.00      1.00      1.00        15
        KIR

In [23]:
# Decision tree results on original data
tree_original = DecisionTreeClassifier(random_state=0)
tree_original.fit(X_train, y_train)
y_pred = tree_original.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

# Decision tree results on PCA
tree_pca = DecisionTreeClassifier(random_state=0)
tree_pca.fit(principalComponents, y_train)
y_pred = tree_pca.predict(X_test_projected_pca)
print(confusion_matrix(y_test, y_pred))
print("Classification report(PCA)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))


#Decision tree results on svd
tree_svd = DecisionTreeClassifier(random_state=0)
tree_svd.fit(svd_reduced_data, y_train)
y_pred = tree_svd.predict(X_test_projected_svd)
print(confusion_matrix(y_test, y_pred))
print("Classification report(SVD)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

[[57  0  0  0  0]
 [ 0 15  0  0  0]
 [ 0  0 25  0  0]
 [ 0  0  0 33  0]
 [ 0  0  0  0 31]]
Classification report(ORIGINAL)
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        57
        COAD       1.00      1.00      1.00        15
        KIRC       1.00      1.00      1.00        25
        LUAD       1.00      1.00      1.00        33
        PRAD       1.00      1.00      1.00        31

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161

[[57  0  0  0  0]
 [ 0 15  0  0  0]
 [ 0  0 25  0  0]
 [ 0  2  0 31  0]
 [ 0  0  0  0 31]]
Classification report(PCA)
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        57
        COAD       0.88      1.00      0.94        15
        KIRC       1.00      1.00      1.00        25
        LUAD       1.00      0.94      0.97        33

In [24]:
#Decision tree orgiinaldata+smote
tree_original = DecisionTreeClassifier(random_state=0)
tree_original.fit(resampled_data, resampled_labels)
y_pred = tree_original.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#Decision tree pca+smote
tree_pca = DecisionTreeClassifier(random_state=0)
tree_pca.fit(resampled_data_pca, resampled_labels_pca)
y_pred = tree_pca.predict(X_test_projected_pca)
print(confusion_matrix(y_test, y_pred))
print("Classification report(PCA+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#Decision tree svd+smote
tree_svd = DecisionTreeClassifier(random_state=0)
tree_svd.fit(resampled_data_svd, resampled_labels_svd)
y_pred = tree_svd.predict(X_test_projected_svd)
print(confusion_matrix(y_test, y_pred))
print("Classification report(SVD+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))


[[56  0  0  1  0]
 [ 0 15  0  0  0]
 [ 0  0 25  0  0]
 [ 3  0  1 29  0]
 [ 0  0  0  0 31]]
Classification report(ORIGINAL+SMOTE)
              precision    recall  f1-score   support

        BRCA       0.95      0.98      0.97        57
        COAD       1.00      1.00      1.00        15
        KIRC       0.96      1.00      0.98        25
        LUAD       0.97      0.88      0.92        33
        PRAD       1.00      1.00      1.00        31

    accuracy                           0.97       161
   macro avg       0.98      0.97      0.97       161
weighted avg       0.97      0.97      0.97       161

[[57  0  0  0  0]
 [ 0 15  0  0  0]
 [ 0  0 25  0  0]
 [ 0  0  0 33  0]
 [ 0  1  0  0 30]]
Classification report(PCA+SMOTE)
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        57
        COAD       0.94      1.00      0.97        15
        KIRC       1.00      1.00      1.00        25
        LUAD       1.00      1.00      1.

### Random Forest

In [25]:
#Random forest results on original data
forest_original = RandomForestClassifier(random_state=0)
forest_original.fit(X_train, y_train)
y_pred = forest_original.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#Random forest results on PCA
forest_pca = RandomForestClassifier(random_state=0)
forest_pca.fit(principalComponents, y_train)
y_pred = tree_pca.predict(X_test_projected_pca)
print(confusion_matrix(y_test, y_pred))
print("Classification report(PCA)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#Random forest results on SVD 
forest_svd = RandomForestClassifier(random_state=0)
forest_svd.fit(svd_reduced_data, y_train)
y_pred = tree_svd.predict(X_test_projected_svd)
print(confusion_matrix(y_test, y_pred))
print("Classification report(SVD)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))


[[57  0  0  0  0]
 [ 0 15  0  0  0]
 [ 0  0 25  0  0]
 [ 0  0  0 33  0]
 [ 0  0  0  0 31]]
Classification report(ORIGINAL)
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        57
        COAD       1.00      1.00      1.00        15
        KIRC       1.00      1.00      1.00        25
        LUAD       1.00      1.00      1.00        33
        PRAD       1.00      1.00      1.00        31

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161

[[57  0  0  0  0]
 [ 0 15  0  0  0]
 [ 0  0 25  0  0]
 [ 0  0  0 33  0]
 [ 0  1  0  0 30]]
Classification report(PCA)
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        57
        COAD       0.94      1.00      0.97        15
        KIRC       1.00      1.00      1.00        25
        LUAD       1.00      1.00      1.00        33

In [27]:
# Random forest results on original data+SMOTE
forest_original = RandomForestClassifier(random_state=0)
forest_original.fit(resampled_data, resampled_labels)
y_pred = forest_original.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

# Random forest results on PCA+SMOTE
forest_pca = RandomForestClassifier(random_state=0)
forest_pca.fit(resampled_data_pca, resampled_labels_pca)
y_pred = forest_pca.predict(X_test_projected_pca)
print(confusion_matrix(y_test, y_pred))
print("Classification report(PCA+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

# Random forest results on SVD+SMOTE
forest_svd = RandomForestClassifier(random_state=0)
forest_svd.fit(resampled_data_svd, resampled_labels_svd)
y_pred = forest_svd.predict(X_test_projected_svd)
print(confusion_matrix(y_test, y_pred))
print("Classification report(SVD+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

[[57  0  0  0  0]
 [ 0 15  0  0  0]
 [ 0  0 25  0  0]
 [ 0  0  0 33  0]
 [ 0  0  0  0 31]]
Classification report(ORIGINAL+SMOTE)
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        57
        COAD       1.00      1.00      1.00        15
        KIRC       1.00      1.00      1.00        25
        LUAD       1.00      1.00      1.00        33
        PRAD       1.00      1.00      1.00        31

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161

[[57  0  0  0  0]
 [ 0 15  0  0  0]
 [ 0  0 25  0  0]
 [ 0  0  0 33  0]
 [ 1  0  0  0 30]]
Classification report(PCA+SMOTE)
              precision    recall  f1-score   support

        BRCA       0.98      1.00      0.99        57
        COAD       1.00      1.00      1.00        15
        KIRC       1.00      1.00      1.00        25
        LUAD       1.00      1.00      1.

## Clustering

In [28]:
#kmeans results on original data
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test)
print("K_means clustering(original data) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

#kmeans results on PCA
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(principalComponents)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_projected_pca)
print("K_means clustering(PCA) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

#kmeans results on SVD
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(svd_reduced_data)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_projected_svd)
print("K_means clustering(SVD) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

K_means clustering(original data) Normalised mutual info score:  1.0
K_means clustering(PCA) Normalised mutual info score:  1.0
K_means clustering(SVD) Normalised mutual info score:  0.9999999999999999


In [29]:
#kmeans results on original data
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(resampled_data)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test)
print("K_means clustering(SMOTE+ORIGINAL) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

#kmeans results on PCA
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(resampled_data_pca)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_projected_pca)
print("K_means clustering(PCA+SMOTE) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

#kmeans results on SVD
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(resampled_data_svd)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_projected_svd)
print("K_means clustering(SVD+SMOTE) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

K_means clustering(SMOTE+ORIGINAL) Normalised mutual info score:  1.0
K_means clustering(PCA+SMOTE) Normalised mutual info score:  1.0
K_means clustering(SVD+SMOTE) Normalised mutual info score:  1.0


### Results
- Classification : we see good result with PCA + SMOTE 
- Clustering: we see good result with PCA+SMOTE
Intuition for choosing PCA over SVD even if we see similar results in few models:
- PCA has lower number of features which explain 95% of variance and achieves similar results
- So a model which considers only 398 dimensions is preferred over 403 dimensions 

### Grid Search
- perform grid search using the preferred data(PCA+SMOTE)
- define the pipeline and use grid search with 5-fold cross validation

In [34]:
steps_KNN_pca = [('pca', PCA(n_components=0.95)), ('KNN', KNeighborsClassifier())]
model_KNN_gs_pca = Pipeline(steps=steps_KNN_pca)
parameters = {'KNN__n_neighbors':[1,3,5,7,9,11,13,15,17,19,21]}
KNN_gs_pca = GridSearchCV(model_KNN_gs_pca, parameters,cv=5, verbose = 3)
KNN_gs_pca.fit(resampled_data_pca,resampled_labels_pca)
print(KNN_gs_pca.score(X_test_projected_pca, y_test))
print(KNN_gs_pca.best_params_)

#gridsearch for decision tree classifier
steps_PCA = [('pca', PCA(n_components=0.95)), ('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_PCA)
parameters = {'tree__criterion':('entropy', 'gini'), 'tree__max_depth':[2,4,6,8,10,12,15,18,20], 'tree__max_features': ('sqrt', 'log2', None)}
dt_gs_pca = GridSearchCV(model, parameters,cv=5, verbose = 3)
dt_gs_pca.fit(resampled_data_pca, resampled_labels_pca)

plt.figure(11)
tree.plot_tree(dt_gs_pca.best_estimator_['tree'],filled=True, fontsize=5)
print(dt_gs_pca.best_params_)
X_test_projected_pca = pca.transform(X_test)
print(dt_gs_pca.score(X_test_projected_pca, y_test))

#gridsearch for forest classifier
steps_PCA = [('pca', PCA(n_components=0.95)), ('forest', RandomForestClassifier(n_jobs=-1))]
model = Pipeline(steps=steps_PCA)
parameters = {'forest__criterion':('entropy', 'gini'), 'forest__max_depth':[2,4,6,8,10,12,15,18,20], 'forest__max_features': ('auto', 'sqrt', 'log2', None)}
rf_gs_pca = GridSearchCV(model, parameters,cv=5, verbose = 3)
rf_gs_pca.fit(resampled_data_pca, resampled_labels_pca)
X_test_projected_pca = pca.transform(X_test)
print(rf_gs_pca.best_params_)
print(rf_gs_pca.score(X_test_projected_pca, y_test))

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV 1/5] END ................KNN__n_neighbors=1;, score=1.000 total time=   0.1s
[CV 2/5] END ................KNN__n_neighbors=1;, score=1.000 total time=   0.1s
[CV 3/5] END ................KNN__n_neighbors=1;, score=1.000 total time=   0.1s
[CV 4/5] END ................KNN__n_neighbors=1;, score=1.000 total time=   0.1s
[CV 5/5] END ................KNN__n_neighbors=1;, score=1.000 total time=   0.1s
[CV 1/5] END ................KNN__n_neighbors=3;, score=1.000 total time=   0.1s
[CV 2/5] END ................KNN__n_neighbors=3;, score=1.000 total time=   0.1s
[CV 3/5] END ................KNN__n_neighbors=3;, score=1.000 total time=   0.1s
[CV 4/5] END ................KNN__n_neighbors=3;, score=1.000 total time=   0.1s
[CV 5/5] END ................KNN__n_neighbors=3;, score=1.000 total time=   0.1s
[CV 1/5] END ................KNN__n_neighbors=5;, score=1.000 total time=   0.1s
[CV 2/5] END ................KNN__n_neighbors=5;

[CV 4/5] END tree__criterion=entropy, tree__max_depth=6, tree__max_features=sqrt;, score=0.527 total time=   0.1s
[CV 5/5] END tree__criterion=entropy, tree__max_depth=6, tree__max_features=sqrt;, score=0.621 total time=   0.1s
[CV 1/5] END tree__criterion=entropy, tree__max_depth=6, tree__max_features=log2;, score=0.568 total time=   0.1s
[CV 2/5] END tree__criterion=entropy, tree__max_depth=6, tree__max_features=log2;, score=0.774 total time=   0.1s
[CV 3/5] END tree__criterion=entropy, tree__max_depth=6, tree__max_features=log2;, score=0.346 total time=   0.1s
[CV 4/5] END tree__criterion=entropy, tree__max_depth=6, tree__max_features=log2;, score=0.391 total time=   0.1s
[CV 5/5] END tree__criterion=entropy, tree__max_depth=6, tree__max_features=log2;, score=0.798 total time=   0.1s
[CV 1/5] END tree__criterion=entropy, tree__max_depth=6, tree__max_features=None;, score=0.988 total time=   0.2s
[CV 2/5] END tree__criterion=entropy, tree__max_depth=6, tree__max_features=None;, score

[CV 2/5] END tree__criterion=entropy, tree__max_depth=18, tree__max_features=sqrt;, score=0.790 total time=   0.1s
[CV 3/5] END tree__criterion=entropy, tree__max_depth=18, tree__max_features=sqrt;, score=0.881 total time=   0.1s
[CV 4/5] END tree__criterion=entropy, tree__max_depth=18, tree__max_features=sqrt;, score=0.802 total time=   0.1s
[CV 5/5] END tree__criterion=entropy, tree__max_depth=18, tree__max_features=sqrt;, score=0.934 total time=   0.1s
[CV 1/5] END tree__criterion=entropy, tree__max_depth=18, tree__max_features=log2;, score=0.753 total time=   0.1s
[CV 2/5] END tree__criterion=entropy, tree__max_depth=18, tree__max_features=log2;, score=0.827 total time=   0.1s
[CV 3/5] END tree__criterion=entropy, tree__max_depth=18, tree__max_features=log2;, score=0.823 total time=   0.1s
[CV 4/5] END tree__criterion=entropy, tree__max_depth=18, tree__max_features=log2;, score=0.807 total time=   0.1s
[CV 5/5] END tree__criterion=entropy, tree__max_depth=18, tree__max_features=log

[CV 5/5] END tree__criterion=gini, tree__max_depth=6, tree__max_features=None;, score=0.996 total time=   0.1s
[CV 1/5] END tree__criterion=gini, tree__max_depth=8, tree__max_features=sqrt;, score=0.877 total time=   0.1s
[CV 2/5] END tree__criterion=gini, tree__max_depth=8, tree__max_features=sqrt;, score=0.852 total time=   0.1s
[CV 3/5] END tree__criterion=gini, tree__max_depth=8, tree__max_features=sqrt;, score=0.897 total time=   0.1s
[CV 4/5] END tree__criterion=gini, tree__max_depth=8, tree__max_features=sqrt;, score=0.420 total time=   0.1s
[CV 5/5] END tree__criterion=gini, tree__max_depth=8, tree__max_features=sqrt;, score=0.988 total time=   0.1s
[CV 1/5] END tree__criterion=gini, tree__max_depth=8, tree__max_features=log2;, score=0.720 total time=   0.1s
[CV 2/5] END tree__criterion=gini, tree__max_depth=8, tree__max_features=log2;, score=0.494 total time=   0.1s
[CV 3/5] END tree__criterion=gini, tree__max_depth=8, tree__max_features=log2;, score=0.457 total time=   0.1s
[

[CV 4/5] END tree__criterion=gini, tree__max_depth=18, tree__max_features=None;, score=1.000 total time=   0.1s
[CV 5/5] END tree__criterion=gini, tree__max_depth=18, tree__max_features=None;, score=0.996 total time=   0.1s
[CV 1/5] END tree__criterion=gini, tree__max_depth=20, tree__max_features=sqrt;, score=0.885 total time=   0.1s
[CV 2/5] END tree__criterion=gini, tree__max_depth=20, tree__max_features=sqrt;, score=0.901 total time=   0.1s
[CV 3/5] END tree__criterion=gini, tree__max_depth=20, tree__max_features=sqrt;, score=0.959 total time=   0.1s
[CV 4/5] END tree__criterion=gini, tree__max_depth=20, tree__max_features=sqrt;, score=0.971 total time=   0.1s
[CV 5/5] END tree__criterion=gini, tree__max_depth=20, tree__max_features=sqrt;, score=0.938 total time=   0.1s
[CV 1/5] END tree__criterion=gini, tree__max_depth=20, tree__max_features=log2;, score=0.798 total time=   0.1s
[CV 2/5] END tree__criterion=gini, tree__max_depth=20, tree__max_features=log2;, score=0.737 total time=

[CV 3/5] END forest__criterion=entropy, forest__max_depth=6, forest__max_features=log2;, score=1.000 total time=   0.4s
[CV 4/5] END forest__criterion=entropy, forest__max_depth=6, forest__max_features=log2;, score=1.000 total time=   0.4s
[CV 5/5] END forest__criterion=entropy, forest__max_depth=6, forest__max_features=log2;, score=1.000 total time=   0.4s
[CV 1/5] END forest__criterion=entropy, forest__max_depth=6, forest__max_features=None;, score=0.988 total time=   1.5s
[CV 2/5] END forest__criterion=entropy, forest__max_depth=6, forest__max_features=None;, score=0.988 total time=   1.6s
[CV 3/5] END forest__criterion=entropy, forest__max_depth=6, forest__max_features=None;, score=1.000 total time=   1.7s
[CV 4/5] END forest__criterion=entropy, forest__max_depth=6, forest__max_features=None;, score=0.992 total time=   1.7s
[CV 5/5] END forest__criterion=entropy, forest__max_depth=6, forest__max_features=None;, score=1.000 total time=   1.7s
[CV 1/5] END forest__criterion=entropy, 

[CV 1/5] END forest__criterion=entropy, forest__max_depth=15, forest__max_features=auto;, score=0.988 total time=   0.5s
[CV 2/5] END forest__criterion=entropy, forest__max_depth=15, forest__max_features=auto;, score=0.996 total time=   0.5s
[CV 3/5] END forest__criterion=entropy, forest__max_depth=15, forest__max_features=auto;, score=1.000 total time=   0.5s
[CV 4/5] END forest__criterion=entropy, forest__max_depth=15, forest__max_features=auto;, score=1.000 total time=   0.6s
[CV 5/5] END forest__criterion=entropy, forest__max_depth=15, forest__max_features=auto;, score=0.996 total time=   0.5s
[CV 1/5] END forest__criterion=entropy, forest__max_depth=15, forest__max_features=sqrt;, score=0.988 total time=   0.5s
[CV 2/5] END forest__criterion=entropy, forest__max_depth=15, forest__max_features=sqrt;, score=0.996 total time=   0.5s
[CV 3/5] END forest__criterion=entropy, forest__max_depth=15, forest__max_features=sqrt;, score=1.000 total time=   0.6s
[CV 4/5] END forest__criterion=e

[CV 4/5] END forest__criterion=gini, forest__max_depth=2, forest__max_features=sqrt;, score=0.984 total time=   0.4s
[CV 5/5] END forest__criterion=gini, forest__max_depth=2, forest__max_features=sqrt;, score=0.984 total time=   0.4s
[CV 1/5] END forest__criterion=gini, forest__max_depth=2, forest__max_features=log2;, score=0.926 total time=   0.4s
[CV 2/5] END forest__criterion=gini, forest__max_depth=2, forest__max_features=log2;, score=0.947 total time=   0.4s
[CV 3/5] END forest__criterion=gini, forest__max_depth=2, forest__max_features=log2;, score=0.988 total time=   0.4s
[CV 4/5] END forest__criterion=gini, forest__max_depth=2, forest__max_features=log2;, score=0.996 total time=   0.4s
[CV 5/5] END forest__criterion=gini, forest__max_depth=2, forest__max_features=log2;, score=0.992 total time=   0.4s
[CV 1/5] END forest__criterion=gini, forest__max_depth=2, forest__max_features=None;, score=0.975 total time=   0.6s
[CV 2/5] END forest__criterion=gini, forest__max_depth=2, forest

[CV 5/5] END forest__criterion=gini, forest__max_depth=8, forest__max_features=None;, score=1.000 total time=   1.0s
[CV 1/5] END forest__criterion=gini, forest__max_depth=10, forest__max_features=auto;, score=0.984 total time=   0.4s
[CV 2/5] END forest__criterion=gini, forest__max_depth=10, forest__max_features=auto;, score=0.996 total time=   0.4s
[CV 3/5] END forest__criterion=gini, forest__max_depth=10, forest__max_features=auto;, score=1.000 total time=   0.4s
[CV 4/5] END forest__criterion=gini, forest__max_depth=10, forest__max_features=auto;, score=1.000 total time=   0.4s
[CV 5/5] END forest__criterion=gini, forest__max_depth=10, forest__max_features=auto;, score=1.000 total time=   0.4s
[CV 1/5] END forest__criterion=gini, forest__max_depth=10, forest__max_features=sqrt;, score=0.984 total time=   0.4s
[CV 2/5] END forest__criterion=gini, forest__max_depth=10, forest__max_features=sqrt;, score=0.988 total time=   0.4s
[CV 3/5] END forest__criterion=gini, forest__max_depth=10

[CV 5/5] END forest__criterion=gini, forest__max_depth=18, forest__max_features=sqrt;, score=1.000 total time=   0.4s
[CV 1/5] END forest__criterion=gini, forest__max_depth=18, forest__max_features=log2;, score=0.992 total time=   0.4s
[CV 2/5] END forest__criterion=gini, forest__max_depth=18, forest__max_features=log2;, score=0.996 total time=   0.4s
[CV 3/5] END forest__criterion=gini, forest__max_depth=18, forest__max_features=log2;, score=1.000 total time=   0.4s
[CV 4/5] END forest__criterion=gini, forest__max_depth=18, forest__max_features=log2;, score=1.000 total time=   0.4s
[CV 5/5] END forest__criterion=gini, forest__max_depth=18, forest__max_features=log2;, score=1.000 total time=   0.4s
[CV 1/5] END forest__criterion=gini, forest__max_depth=18, forest__max_features=None;, score=0.984 total time=   0.9s
[CV 2/5] END forest__criterion=gini, forest__max_depth=18, forest__max_features=None;, score=0.979 total time=   0.9s
[CV 3/5] END forest__criterion=gini, forest__max_depth=1

#### Report the best hyperparameters chosen by gridsearch
- The gridsearch models are trained with 5-fold cross validation

In [33]:
print("After gridsearch with a cross validation of 5-fold")
print("Best hyperparameters for KNN model are")
print(KNN_gs_pca.best_params_)

print("-----------------------------")
print("Best hyperparameters for Decision tree model are")
print(dt_gs_pca.best_params_)

print("-----------------------------")
print("Best hyperparameters for Random forest model are")
print(rf_gs_pca.best_params_)


After gridsearch with a cross validation of 5-fold
Best hyperparameters for KNN model are
{'KNN__n_neighbors': 1}
-----------------------------
Best hyperparameters for Decision tree model are
{'tree__criterion': 'entropy', 'tree__max_depth': 12, 'tree__max_features': None}
-----------------------------
Best hyperparameters for Random forest model are
{'forest__criterion': 'entropy', 'forest__max_depth': 8, 'forest__max_features': 'log2'}


#### compare the best models with and without cross validation
- compare cross validation models and the models with the same hyperparameters which did not train using cross validation

In [44]:
# 
# KNN with the best hyperparameter 
print("KNN model with the best chosen hyperparameters")
model = KNeighborsClassifier(n_neighbors=1) # the best hyperparameter we got from gridsearch: n_neighbours=1
model.fit(resampled_data_pca,resampled_labels_pca)
X_test_projected_pca = pca.transform(X_test)
y_pred = model.predict(X_test_projected_pca)
acc = accuracy_score(y_test, y_pred)
print(confusion_matrix(y_test, y_pred))
print("Classification report without CV:")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

y_pred = KNN_gs_pca.predict(X_test_projected_pca)
acc = accuracy_score(y_test, y_pred)
print(confusion_matrix(y_test, y_pred))
print("Classification report KNN with CV")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

print("----------------------------------------------------------")
print("----------------------------------------------------------")
print("----------------------------------------------------------")

# Decision tree with best hyperparameters
print("Decision Tree model with the best chosen hyperparameters")
tree_pca = DecisionTreeClassifier(random_state=0,criterion='entropy', max_depth=12, max_features=None)
tree_pca.fit(resampled_data_pca, resampled_labels_pca)
y_pred = tree_pca.predict(X_test_projected_pca)
print(confusion_matrix(y_test, y_pred))
print("Classification report without CV")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

y_pred = dt_gs_pca.predict(X_test_projected_pca)
print(confusion_matrix(y_test, y_pred))
print("Classification report with CV")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

print("----------------------------------------------------------")
print("----------------------------------------------------------")
print("----------------------------------------------------------")


# Random forest with best hyperparameters
print("Random forest model with the best chosen hyperparameters")
forest_pca = RandomForestClassifier(random_state=0,criterion = "entropy", max_depth=8, max_features = 'log2')
forest_pca.fit(resampled_data_pca, resampled_labels_pca)
y_pred = forest_pca.predict(X_test_projected_pca)
print(confusion_matrix(y_test, y_pred))
print("Classification report without CV")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

y_pred = rf_gs_pca.predict(X_test_projected_pca)
print(confusion_matrix(y_test, y_pred))
print("Classification report with CV")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

KNN model with the best chosen hyperparameters
[[57  0  0  0  0]
 [ 0 15  0  0  0]
 [ 0  0 25  0  0]
 [ 0  0  0 33  0]
 [ 0  0  0  0 31]]
Classification report without CV:
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        57
        COAD       1.00      1.00      1.00        15
        KIRC       1.00      1.00      1.00        25
        LUAD       1.00      1.00      1.00        33
        PRAD       1.00      1.00      1.00        31

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161

[[57  0  0  0  0]
 [ 0 15  0  0  0]
 [ 0  0 25  0  0]
 [ 0  0  0 33  0]
 [ 0  0  0  0 31]]
Classification report KNN with CV
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        57
        COAD       1.00      1.00      1.00        15
        KIRC       1.00      1.00      1.00        

- Notice how Cross validation improves the f1-score,precision, recall and accuracy of the model significantly.

### Ensemble

In [48]:
knn_tree_ensemble = VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=1)), ('forest', RandomForestClassifier())], voting='hard')
knn_tree_ensemble.fit(resampled_data_pca, resampled_labels_pca)

y_pred = knn_tree_ensemble.predict(X_test_projected_pca)
print(confusion_matrix(y_test, y_pred))
print("Accuracy",)
print("Classification report(PCA)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

[[57  0  0  0  0]
 [ 0 15  0  0  0]
 [ 0  0 25  0  0]
 [ 0  0  0 33  0]
 [ 0  0  0  0 31]]
Accuracy
Classification report(PCA)
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        57
        COAD       1.00      1.00      1.00        15
        KIRC       1.00      1.00      1.00        25
        LUAD       1.00      1.00      1.00        33
        PRAD       1.00      1.00      1.00        31

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161

