In [None]:
#Required libraries
"""
!pip install imblearn
!pip install pyQt5
!pip install numpy
!pip install pandas
!pip install sklearn
!pip install matplotlib
!pip install seeaborn
"""

In [1]:

import PyQt5
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

%matplotlib qt 

import numpy as np

from dataloader import *
import preprocessing


from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn import tree

#Using GridSearch to find the optimal value of K number of nearest neighbors
from sklearn.model_selection import GridSearchCV

#metrics for analysing our model
from sklearn.metrics import precision_score,accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# oversampling technique
from imblearn.over_sampling import SMOTE

#### Load the data

In [2]:
handler = Dataloader()

#### Scale data

scale all variables to have a mean of 0 and standard deviation of 1

In [3]:
data = preprocessing.scale(handler.data)

## Data Analysis and Visualization

In [4]:
print("Total number of samples/records : ",len(handler.data))
print("Maximum value in the features data",np.amax(handler.data))
print("Minimum value in the features data",np.amin(handler.data))

Total number of samples/records :  801
Maximum value in the features data 20.7788287118
Minimum value in the features data 0.0


In [5]:
unique, counts = np.unique(handler.labels, return_counts=True)
print("Classes and number of instances for each class:")
print(dict(zip(unique, counts)))
plt.bar(unique,counts)
plt.title("Class Distribution")
plt.ylabel("Frequency")
plt.xlabel("Different types of tumour")
plt.show()

Classes and number of instances for each class:
{'BRCA': 300, 'COAD': 78, 'KIRC': 146, 'LUAD': 141, 'PRAD': 136}


We notice that there is class imbalance and we can use *SMOTE(Synthetic Minority Oversampling Technique)* for increasing instances of the minority class

In [6]:
print("Number of samples:",len(data))
print("Number of genes:",len(data[0]))

Number of samples: 801
Number of genes: 20531


In [7]:
#creating labelEncoder
le = preprocessing.LabelEncoder()
labels = le.fit_transform(handler.labels)

In [8]:
X_embedded = TSNE(n_components=2,init='random').fit_transform(handler.data)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(2, figsize=(12, 12))
ax = fig.add_subplot()
ax.set_title("TSNE plot on original data")
ax.scatter(X_embedded.T[0], X_embedded.T[1], color=color[labels])
plt.show()

#### Split original dataset into train and test

In [9]:
#split  in test and train
X_train, X_test, y_train, y_test = train_test_split(handler.data, labels, test_size=0.20)

## Feature selection and Dimensionality reduction

In [10]:
#with pca extract eigen pairs that explains 95% of the variance in the data.
pca = PCA(n_components=0.95)

#SVD dimension reduction
svd = TruncatedSVD(n_components=531, random_state=0)

#simultanously calculate eigen pairs and transform our data into the new coordinate frame
principalComponents = pca.fit_transform(X_train)    #X_train_pca    #y_train
svd_reduced_data = svd.fit_transform(X_train)    #X_train_svd       #y_train

#check the amount of dimensions left after pca
print(principalComponents.shape)
print("Number of genes after dimension reduction using PCA:",principalComponents.shape[1])
print("Explained variance of new dataset using PCA:",pca.explained_variance_ratio_.sum())
print("")
print(svd_reduced_data.shape)
print("Number of genes after dimension reduction using SVD:",svd_reduced_data.shape[1])
print("Explained variance of new dataset using SVD:",svd.explained_variance_ratio_.sum())



(640, 397)
Number of genes after dimension reduction using PCA: 397
Explained variance of new dataset using PCA: 0.9501489407078344

(640, 531)
Number of genes after dimension reduction using SVD: 531
Explained variance of new dataset using SVD: 0.9826082141870891


In [None]:
# Visualize
#plot 3 most important principle components (3D plot)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(3,figsize=(12, 12))
ax = fig.add_subplot(projection='3d')
ax.set_title("Visualise data after applying PCA")
ax.scatter(principalComponents.T[0], principalComponents.T[1], principalComponents.T[2], color=color[labels])
plt.show()

In [None]:
# Visualize
#plot 3 most important principle components (3D plot)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(4,figsize=(12, 12))
ax = fig.add_subplot(projection='3d')
ax.set_title("Visualise data after applying SVD")
ax.scatter(svd_reduced_data.T[0], svd_reduced_data.T[1], svd_reduced_data.T[2], color=color[labels])
plt.show()

In [None]:
#split  in test and train
#X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_train, y_train, test_size=0.20)

#split  in test and train
#X_train_svd, X_test_svd, y_train_svd, y_test_svd = train_test_split(svd_reduced_data, labels, test_size=0.20)

In [None]:
plt.figure(5, figsize=(12, 12))
ax = sn.heatmap(handler.data)
ax.set_title("Correlation heatmap of all dimensions on the original data")

#plt.figure(6, figsize=(12, 12))
#ax = sn.heatmap(principalComponents)
#ax.set_title("Correlation heatmap of all dimensions after PCA")

#plt.figure(7, figsize=(12, 12))
#ax = sn.heatmap(svd_reduced_data)
#ax.set_title("Correlation heatmap of all dimensions after SVD")


## Augmentation (SMOTE)

- As noticed from the classs distribution, there is class imbalance since there are very few samples in the 'COAD' class.
- We apply Synthetic Minority Oversampling Technique to oversample the classes such that each class has equal number of samples
- SMOTE is applied after feature extraction/reduction is done
- refer (https://arxiv.org/ftp/arxiv/papers/1403/1403.1949.pdf#:~:text=After%20running%20PCA%2C%20SMOTE%20resampling,after%20the%20running%20of%20PCA.)

In [11]:
oversample_pca = SMOTE(k_neighbors=5)
resampled_data_pca, resampled_labels_pca = oversample_pca.fit_resample(principalComponents, y_train) #X_resampled_pca_train

In [12]:
oversample_svd = SMOTE(k_neighbors=5)
resampled_data_svd, resampled_labels_svd = oversample_svd.fit_resample(svd_reduced_data, y_train)

In [13]:
oversample = SMOTE(k_neighbors=5)
resampled_data, resampled_labels = oversample.fit_resample(X_train, y_train)

In [14]:
unique, counts = np.unique(resampled_labels_pca, return_counts=True)
print("Classes and number of instances for each class:")
print(dict(zip(unique, counts)))

Classes and number of instances for each class:
{0: 244, 1: 244, 2: 244, 3: 244, 4: 244}


In [None]:
X_embedded_pca = TSNE(n_components=2,init='random').fit_transform(resampled_data_pca)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(8, figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded_pca.T[0], X_embedded_pca.T[1], color=color[resampled_labels_pca])
ax.set_title("TSNE visualisation after PCA+SMOTE")
plt.show()

X_embedded_svd = TSNE(n_components=2,init='random').fit_transform(resampled_data_svd)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(9, figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded_svd.T[0], X_embedded_svd.T[1], color=color[resampled_labels_svd])
ax.set_title("TSNE visualisation after SVD+SMOTE")
plt.show()

X_embedded_original = TSNE(n_components=2,init='random').fit_transform(resampled_data)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(10, figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded_original.T[0], X_embedded_original.T[1], color=color[resampled_labels])
ax.set_title("TSNE visualisation after SMOTE on original data")
plt.show()

In [None]:
#split in test and train
#X_train_resampled_pca, X_test_resampled_pca, y_train_resampled_pca, y_test_resampled_pca = train_test_split(resampled_data_pca, resampled_labels_pca, test_size=0.20)

#X_train_resampled_svd, X_test_resampled_svd, y_train_resampled_svd, y_test_resampled_svd = train_test_split(resampled_data_svd, resampled_labels_svd, test_size=0.20)

#X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(resampled_data, resampled_labels, test_size=0.20)

## Grid search

In [None]:
steps_KNN_pca = [('pca', PCA(n_components=0.95)), ('KNN', KNeighborsClassifier())]
model_KNN_gs_pca = Pipeline(steps=steps_KNN_pca)
parameters = {'KNN__n_neighbors':[1,3,5,7,9,11,13,15,17,19,21]}
KNN_gs_pca = GridSearchCV(model_KNN_gs_pca, parameters,cv=10, verbose = 3)
KNN_gs_pca.fit(X_train_resampled_pca,y_train_resampled_pca)

print(KNN_gs_pca.best_params_)

#gridsearch for decision tree classifier
steps_PCA = [('pca', PCA(n_components=0.95)), ('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_PCA)
parameters = {'tree__criterion':('entropy', 'gini'), 'tree__max_depth':[2,4,6,8,10,12,15,18,20], 'tree__max_features': ('sqrt', 'log2', None)}
pca_pipeline = GridSearchCV(model, parameters, verbose = 3)
pca_pipeline.fit(X_train_resampled_pca, y_train_resampled_pca)

plt.figure(11)
tree.plot_tree(pca_pipeline.best_estimator_['tree'],filled=True, fontsize=5)

print(pca_pipeline.best_params_)
print(pca_pipeline.score(X_test_pca, y_test_pca))

#gridsearch for forest classifier
steps_PCA = [('pca', PCA(n_components=0.95)), ('forest', RandomForestClassifier(n_jobs=-1))]
model = Pipeline(steps=steps_PCA)
parameters = {'forest__criterion':('entropy', 'gini'), 'forest__max_depth':[2,4,6,8,10,12,15,18,20], 'forest__max_features': ('auto', 'sqrt', 'log2', None)}
pca_pipeline_forest = GridSearchCV(model, parameters, verbose = 3)
pca_pipeline_forest.fit(X_train_resampled_pca, y_train_resampled_pca)

print(pca_pipeline_forest.best_params_)
print(pca_pipeline_forest.score(X_test_pca, y_test_pca))

## Classification

After the data reduction and augmentation step we have 4 datasets:

- Original data
- Original data + SMOTE
- Original data + PCA + SMOTE
- Original data + SVD + SMOTE

 
Next, we perform classification on these data on the following models 
- KNN: This is a baseline model
- Decision tree 
- Random forest

In [17]:
#knn results on original data
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy(original data): ", acc)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))
print("----------------")
print("----------------")

#knn results on PCA+SMOTE
model = KNeighborsClassifier(n_neighbors=10)
model.fit(resampled_data_pca,resampled_labels_pca)
X_test_projected_pca = pca.transform(X_test)
y_pred = model.predict(X_test_projected_pca)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy after PCA+SMOTE:", acc)
print(confusion_matrix(y_test, y_pred))
print("Classification report(PCA+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))
print("----------------")
print("----------------")

#knn results on SVD+SMOTE
model = KNeighborsClassifier(n_neighbors=10)
model.fit(resampled_data_svd, resampled_labels_svd)
X_test_projected_svd = svd.transform(X_test)
y_pred = model.predict(X_test_projected_svd)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy after SVD+SMOTE:", acc)
print(confusion_matrix(y_test, y_pred))
print("Classification report(SVD+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))
print("----------------")
print("----------------")

#knn results on ORIGINAL+SMOTE
model = KNeighborsClassifier(n_neighbors=10)
model.fit(resampled_data, resampled_labels)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy after ORIGINAL+SMOTE:", acc)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))
print("----------------")
print("----------------")


KNN Accuracy(original data):  0.9937888198757764
[[56  0  0  0  0]
 [ 0 18  0  0  0]
 [ 0  0 33  0  0]
 [ 1  0  0 28  0]
 [ 0  0  0  0 25]]
Classification report(ORIGINAL)
              precision    recall  f1-score   support

        BRCA       0.98      1.00      0.99        56
        COAD       1.00      1.00      1.00        18
        KIRC       1.00      1.00      1.00        33
        LUAD       1.00      0.97      0.98        29
        PRAD       1.00      1.00      1.00        25

    accuracy                           0.99       161
   macro avg       1.00      0.99      0.99       161
weighted avg       0.99      0.99      0.99       161

----------------
----------------
KNN Accuracy after PCA+SMOTE: 1.0
[[56  0  0  0  0]
 [ 0 18  0  0  0]
 [ 0  0 33  0  0]
 [ 0  0  0 29  0]
 [ 0  0  0  0 25]]
Classification report(PCA+SMOTE)
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        56
        COAD       1.00      1.00     

In [29]:
#svd on non-smote
tree_svd = DecisionTreeClassifier(random_state=0)
tree_svd.fit(svd_reduced_data, y_train)
y_pred = tree_svd.predict(X_test_projected_svd)
print(confusion_matrix(y_test, y_pred))
print("Classification report(SVD)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

tree_pca = DecisionTreeClassifier(random_state=0)
tree_pca.fit(principalComponents, y_train)
y_pred = tree_pca.predict(X_test_projected_pca)
print(confusion_matrix(y_test, y_pred))
print("Classification report(PCA)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))


tree_original = DecisionTreeClassifier(random_state=0)
tree_original.fit(X_train, y_train)
y_pred = tree_original.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

[[56  0  0  0  0]
 [ 0 18  0  0  0]
 [ 1  0 32  0  0]
 [ 1  0  1 27  0]
 [ 0  0  0  0 25]]
Classification report(SVD)
              precision    recall  f1-score   support

        BRCA       0.97      1.00      0.98        56
        COAD       1.00      1.00      1.00        18
        KIRC       0.97      0.97      0.97        33
        LUAD       1.00      0.93      0.96        29
        PRAD       1.00      1.00      1.00        25

    accuracy                           0.98       161
   macro avg       0.99      0.98      0.98       161
weighted avg       0.98      0.98      0.98       161

[[55  0  0  1  0]
 [ 0 17  0  1  0]
 [ 0  0 33  0  0]
 [ 1  0  1 27  0]
 [ 0  0  0  0 25]]
Classification report(PCA)
              precision    recall  f1-score   support

        BRCA       0.98      0.98      0.98        56
        COAD       1.00      0.94      0.97        18
        KIRC       0.97      1.00      0.99        33
        LUAD       0.93      0.93      0.93        29
    

In [30]:
#svd on smote
tree_svd = DecisionTreeClassifier(random_state=0)
tree_svd.fit(resampled_data_svd, resampled_labels_svd)
y_pred = tree_svd.predict(X_test_projected_svd)
print(confusion_matrix(y_test, y_pred))
print("Classification report(SVD+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

tree_pca = DecisionTreeClassifier(random_state=0)
tree_pca.fit(resampled_data_pca, resampled_labels_pca)
y_pred = tree_pca.predict(X_test_projected_pca)
print(confusion_matrix(y_test, y_pred))
print("Classification report(PCA+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))


tree_original = DecisionTreeClassifier(random_state=0)
tree_original.fit(resampled_data, resampled_labels)
y_pred = tree_original.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

[[54  0  0  1  1]
 [ 0 18  0  0  0]
 [ 0  0 33  0  0]
 [ 3  0  1 25  0]
 [ 0  0  0  0 25]]
Classification report(SVD)
              precision    recall  f1-score   support

        BRCA       0.95      0.96      0.96        56
        COAD       1.00      1.00      1.00        18
        KIRC       0.97      1.00      0.99        33
        LUAD       0.96      0.86      0.91        29
        PRAD       0.96      1.00      0.98        25

    accuracy                           0.96       161
   macro avg       0.97      0.97      0.97       161
weighted avg       0.96      0.96      0.96       161

[[53  0  0  1  2]
 [ 0 17  0  1  0]
 [ 0  0 33  0  0]
 [ 0  0  1 28  0]
 [ 0  0  0  0 25]]
Classification report(PCA)
              precision    recall  f1-score   support

        BRCA       1.00      0.95      0.97        56
        COAD       1.00      0.94      0.97        18
        KIRC       0.97      1.00      0.99        33
        LUAD       0.93      0.97      0.95        29
    

### Random Forest

In [None]:
#svd on non-smote
forest_svd = RandomForestClassifier(random_state=0)
forest_svd.fit(svd_reduced_data, y_train)
y_pred = tree_svd.predict(X_test_projected_svd)
print(confusion_matrix(y_test, y_pred))
print("Classification report(SVD)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

forest_pca = RandomForestClassifier(random_state=0)
forest_pca.fit(principalComponents, y_train)
y_pred = tree_pca.predict(X_test_projected_pca)
print(confusion_matrix(y_test, y_pred))
print("Classification report(PCA)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))


forest_original = RandomForestClassifier(random_state=0)
forest_original.fit(X_train, y_train)
y_pred = forest_original.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

In [31]:
#svd on smote
forest_svd = RandomForestClassifier(random_state=0)
forest_svd.fit(resampled_data_svd, resampled_labels_svd)
y_pred = forest_svd.predict(X_test_projected_svd)
print(confusion_matrix(y_test, y_pred))
print("Classification report(SVD+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

forest_pca = RandomForestClassifier(random_state=0)
forest_pca.fit(resampled_data_pca, resampled_labels_pca)
y_pred = forest_pca.predict(X_test_projected_pca)
print(confusion_matrix(y_test, y_pred))
print("Classification report(PCA+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))


forest_original = RandomForestClassifier(random_state=0)
forest_original.fit(resampled_data, resampled_labels)
y_pred = forest_original.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

[[54  0  0  1  1]
 [ 0 18  0  0  0]
 [ 0  0 33  0  0]
 [ 3  0  1 25  0]
 [ 0  0  0  0 25]]
Classification report(SVD)
              precision    recall  f1-score   support

        BRCA       0.95      0.96      0.96        56
        COAD       1.00      1.00      1.00        18
        KIRC       0.97      1.00      0.99        33
        LUAD       0.96      0.86      0.91        29
        PRAD       0.96      1.00      0.98        25

    accuracy                           0.96       161
   macro avg       0.97      0.97      0.97       161
weighted avg       0.96      0.96      0.96       161

[[53  0  0  1  2]
 [ 0 17  0  1  0]
 [ 0  0 33  0  0]
 [ 0  0  1 28  0]
 [ 0  0  0  0 25]]
Classification report(PCA)
              precision    recall  f1-score   support

        BRCA       1.00      0.95      0.97        56
        COAD       1.00      0.94      0.97        18
        KIRC       0.97      1.00      0.99        33
        LUAD       0.93      0.97      0.95        29
    

## Clustering

In [36]:
#kmeans results on original data
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test)
print("K_means clustering(original data) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

#kmeans results on PCA
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(principalComponents)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_projected_pca)
print("K_means clustering(PCA) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

#kmeans results on SVD
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(svd_reduced_data)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_projected_svd)
print("K_means clustering(SVD) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

K_means clustering(original data) Normalised mutual info score:  0.9643177891634916
K_means clustering(PCA) Normalised mutual info score:  0.9643177891634916
K_means clustering(SVD) Normalised mutual info score:  0.9643177891634916


In [37]:
#kmeans results on original data
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(resampled_data)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test)
print("K_means clustering(SMOTE+ORIGINAL) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

#kmeans results on PCA
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(resampled_data_pca)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_projected_pca)
print("K_means clustering(PCA+SMOTE) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

#kmeans results on SVD
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(resampled_data_svd)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_projected_svd)
print("K_means clustering(SVD+SMOTE) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

K_means clustering(SMOTE+ORIGINAL) Normalised mutual info score:  0.9643177891634916
K_means clustering(PCA+SMOTE) Normalised mutual info score:  0.9809892086990412
K_means clustering(SVD+SMOTE) Normalised mutual info score:  0.9643177891634916


### Results
- Classification : we see good result with PCA + SMOTE 
- Clustering: we see good result with PCA+SMOTE

### Ensemble

In [None]:
knn_tree_ensemble = VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=1)), ('forest', RandomForestClassifier())], voting='hard')
knn_tree_ensemble.fit(X_train_pca, y_train_pca)

y_pred = knn_tree_ensemble.predict(X_test_pca)
print(confusion_matrix(y_test_pca, y_pred))
print("Accuracy",)
print("Classification report(PCA)")
print(classification_report(y_test_pca, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))