In [1]:
#Required libraries
"""
!pip install imblearn
!pip install pyQt5
!pip install numpy
!pip install pandas
!pip install sklearn
!pip install matplotlib
"""

'\n!pip install imblearn\n!pip install pyQt5\n!pip install numpy\n!pip install pandas\n!pip install sklearn\n!pip install matplotlib\n'

In [2]:

import PyQt5
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

%matplotlib qt 

import numpy as np

from dataloader import *
import preprocessing


from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

#Using GridSearch to find the optimal value of K number of nearest neighbors
from sklearn.model_selection import GridSearchCV

#metrics for analysing our model
from sklearn.metrics import precision_score,accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.cluster import normalized_mutual_info_score

# oversampling technique
from imblearn.over_sampling import SMOTE

#### Load the data

In [3]:
handler = Dataloader()

#### Scale data

scale all variables to have a mean of 0 and standard deviation of 1

In [4]:
data = preprocessing.scale(handler.data)

#### Data Analysis and Visualization

In [5]:
unique, counts = np.unique(handler.labels, return_counts=True)
print("Classes and number of instances for each class:")
print(dict(zip(unique, counts)))
plt.bar(unique,counts)
plt.title("Class Distribution")
plt.ylabel("Frequency")
plt.xlabel("Different types of tumour")
plt.show()

Classes and number of instances for each class:
{'BRCA': 300, 'COAD': 78, 'KIRC': 146, 'LUAD': 141, 'PRAD': 136}


We notice that there is class imbalance and we can use *SMOTE(Synthetic Minority Oversampling Technique)* for increasing instances of the minority class
We also can make use of downsampling to make sure each class is represented equally

In [6]:
print("Number of samples:",len(data))
print("Number of genes:",len(data[0]))

Number of samples: 801
Number of genes: 20531


#### Split original dataset into train and test

In [7]:
#split  in test and train
X_train, X_test, y_train, y_test = train_test_split(handler.data, handler.labels, test_size=0.20)

#### Feature selection and Dimensionality reduction

In [8]:
#with pca extract eigen pairs that explains 95% of the variance in the data.
pca = PCA(n_components=0.95)

#simultanously calculate eigen pairs and transform our data into the new coordinate frame
principalComponents = pca.fit_transform(data)

#check the amount of dimensions left after pca
print(principalComponents.shape)
print("Number of genes after dimension reduction using PCA:",principalComponents.shape[1])

#creating labelEncoder
le = preprocessing.LabelEncoder()
labels = le.fit_transform(handler.labels)

(801, 530)
Number of genes after dimension reduction using PCA: 530


In [9]:
# Visualize
#plot 3 most important principle components (3D plot)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(2,figsize=(12, 12))
ax = fig.add_subplot(projection='3d')
ax.set_title("Visualise data after applying PCA")
ax.scatter(principalComponents.T[0], principalComponents.T[1], principalComponents.T[2], color=color[labels])
plt.show()

In [10]:
#split  in test and train
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(principalComponents, labels, test_size=0.20)

In [11]:
"""X_embedded = TSNE(n_components=2, learning_rate='auto',init='random').fit_transform(data)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded.T[0], X_embedded.T[1], color=color[labels])
plt.show()"""

"X_embedded = TSNE(n_components=2, learning_rate='auto',init='random').fit_transform(data)\ncolor = np.array(['r', 'g', 'b', 'c', 'm'])\nfig = plt.figure(figsize=(12, 12))\nax = fig.add_subplot()\nax.scatter(X_embedded.T[0], X_embedded.T[1], color=color[labels])\nplt.show()"

### Checking Accuracy after PCA

In [12]:
"""#knn results on original data
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy(original data): ", acc)


#knn results after PCA
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train_pca,y_train_pca)
y_pred_pca = model.predict(X_test_pca)
acc = accuracy_score(y_test_pca, y_pred_pca)
print("KNN Accuracy after PCA: ", acc)

# kmeans original dataset
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test)
print("K_means clustering(original data) Normalised mutual info score:",normalized_mutual_info_score(y_test, y_pred))

# kmeans after PCA
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train_pca)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred_pca = kmeans.predict(X_test_pca)
print("K_means clustering(PCA data) Normalised mutual info score:",normalized_mutual_info_score(y_test_pca, y_pred_pca))


#decision tree original
# Classifier performance on the original dataset
tree_original = DecisionTreeClassifier(random_state=0)
tree_original.fit(X_train, y_train)
y_pred = tree_original.predict(X_test)

#visualise decision tree
fig = plt.figure(5,figsize=(14,8))
plt.title("Visualize Decision Tree")
tree.plot_tree(tree_original,filled=True, fontsize=5)

#check accuracy
acc_tree = accuracy_score(y_test, y_pred)
print("Accuracy of decision tree trained on original data",acc_tree)
confusion_matrix(y_test, y_pred)


# Classifier performance on PCA dataset
tree_resampled = DecisionTreeClassifier(random_state=0)
tree_resampled.fit(X_train_pca, y_train_pca)
y_pred_pca = tree_resampled.predict(X_test_pca)
#visualise tree after training on the resampled data
fig = plt.figure(6,figsize=(14,8))
plt.title("Visualize Decision Tree after training on resampled data")
tree.plot_tree(tree_resampled,filled=True, fontsize=5)
# check accuracy
acc_tree_resampled = accuracy_score(y_test, y_pred)
print("Accuracy of the decision tree trained on PCA+SMOTE",acc_tree_resampled)
confusion_matrix(y_test_pca, y_pred_pca)"""

KNN Accuracy(original data):  1.0
KNN Accuracy after PCA:  0.9937888198757764
K_means clustering(original data) Normalised mutual info score: 1.0
K_means clustering(PCA data) Normalised mutual info score: 0.9245056869779346
Accuracy of decision tree trained on original data 0.9503105590062112
Accuracy of the decision tree trained on PCA+SMOTE 0.9503105590062112


array([[44,  0,  2,  6,  0],
       [ 0, 18,  0,  1,  1],
       [ 0,  1, 33,  0,  0],
       [ 2,  2,  0, 24,  0],
       [ 0,  1,  0,  0, 26]], dtype=int64)

### Applying SMOTE

- SMOTE is applied after PCA is done
- refer (https://arxiv.org/ftp/arxiv/papers/1403/1403.1949.pdf#:~:text=After%20running%20PCA%2C%20SMOTE%20resampling,after%20the%20running%20of%20PCA.)

In [13]:
oversample = SMOTE(k_neighbors=5)
resampled_data, resampled_labels = oversample.fit_resample(principalComponents, handler.labels)

In [14]:
unique, counts = np.unique(resampled_labels, return_counts=True)
print("Classes and number of instances for each class:")
print(dict(zip(unique, counts)))
plt.figure(3)
plt.bar(unique,counts)
plt.title("Class Distribution after SMOTE")
plt.ylabel("Frequency")
plt.xlabel("Different types of tumour")
plt.show()

Classes and number of instances for each class:
{'BRCA': 300, 'COAD': 300, 'KIRC': 300, 'LUAD': 300, 'PRAD': 300}


### Visualize the data after applying SMOTE

In [15]:
#creating labelEncoder
le = preprocessing.LabelEncoder()
labels = le.fit_transform(resampled_labels)

# Visualize
#plot 3 most important principle components after SMOTE(3D plot)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(4,figsize=(12, 10))
ax = fig.add_subplot(projection='3d')
ax.scatter(resampled_data.T[0], resampled_data.T[1], resampled_data.T[2], color=color[labels])
ax.set_title("Visualise data after PCA+SMOTE")
plt.show()

#### Split SMOTE resampled data into train test splits

In [16]:
#split in test and train
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(resampled_data, labels, test_size=0.20)


### Check model accuracy after SMOTE

In [19]:
#knn results on original data
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy(original data): ", acc)

#knn results on PCA+SMOTE
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train_resampled,y_train_resampled)
y_pred = model.predict(X_test_pca)
acc = accuracy_score(y_test_pca, y_pred)
print("KNN Accuracy after PCA+SMOTE:", acc)

#kmeans results on original data
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test)
print("K_means clustering(original data) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

#kmeans results on PCA+SMOTE
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train_resampled)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_pca)
print("K_means clustering(PCA+SMOTE) Normalised mutual info score: ", normalized_mutual_info_score(y_test_pca, y_pred))

KNN Accuracy(original data):  1.0
KNN Accuracy after PCA+SMOTE: 0.9875776397515528
K_means clustering(original data) Normalised mutual info score:  1.0
K_means clustering(PCA+SMOTE) Normalised mutual info score:  0.9814791744846576


### Requirements
###### Task Classification
 - Do classification on those samples
 - Classify the samples into 5 different tumour classes:BRCA,KIRC,LUAD,PRAD,COAD
 - Classifiers we choose are: 1) Decision Tree and 2) Random Forest

###### Task Clustering
 - Cluster the samples into 5 different clusters
 - Clustering algorithm we choose is K-means

### Classification - Decision Tree

In [23]:
# Classifier performance on the original dataset
tree_original = DecisionTreeClassifier(random_state=0)
tree_original.fit(X_train, y_train)
y_pred = tree_original.predict(X_test)

#visualise decision tree
fig = plt.figure(5,figsize=(14,8))
plt.title("Visualize Decision Tree")
tree.plot_tree(tree_original,filled=True, fontsize=5)

#check accuracy
acc_tree = accuracy_score(y_test, y_pred)
print("Accuracy of decision tree trained on original data",acc_tree)
print(confusion_matrix(y_test, y_pred))



# Classifier performance on PCA+SMOTE dataset
tree_resampled = DecisionTreeClassifier(random_state=0)
tree_resampled.fit(X_train_resampled, y_train_resampled)
y_pred = tree_resampled.predict(X_test_resampled)

#visualise tree after training on the resampled data
fig = plt.figure(6,figsize=(14,8))
plt.title("Visualize Decision Tree after training on resampled data")
tree.plot_tree(tree_resampled,filled=True, fontsize=5)

# check accuracy
acc_tree_resampled = accuracy_score(y_test_resampled, y_pred)
print("Accuracy of the decision tree trained on PCA+SMOTE",acc_tree_resampled)
print(confusion_matrix(y_test_resampled, y_pred))

Accuracy of decision tree trained on original data 0.9503105590062112
[[65  1  0  0  0]
 [ 0 18  0  0  0]
 [ 1  0 24  0  0]
 [ 0  3  0 24  0]
 [ 3  0  0  0 22]]
Accuracy of the decision tree trained on PCA+SMOTE 0.9233333333333333
[[56  3  3  5  0]
 [ 1 54  0  1  1]
 [ 1  0 55  0  0]
 [ 3  0  2 57  2]
 [ 1  0  0  0 55]]


### Classification - Random Forest

In [24]:
# Classifier performance on original dataset
forest = RandomForestClassifier(random_state=0, criterion = 'gini')
forest.fit(X_train, y_train)
y_pred_forest = forest.predict(X_test)
acc_forest = accuracy_score(y_test, y_pred_forest)
print(acc_forest)
print(confusion_matrix(y_test, y_pred_forest))


# Classifier performance after PCA+SMOTE
forest = RandomForestClassifier(random_state=0, criterion = 'gini')
forest.fit(X_train_resampled, y_train_resampled)
y_pred_forest = forest.predict(X_test_resampled)
acc_forest = accuracy_score(y_test_resampled, y_pred_forest)
print(acc_forest)
confusion_matrix(y_test_resampled, y_pred_forest)

1.0
[[66  0  0  0  0]
 [ 0 18  0  0  0]
 [ 0  0 25  0  0]
 [ 0  0  0 27  0]
 [ 0  0  0  0 25]]
0.99


array([[66,  0,  1,  0,  0],
       [ 0, 57,  0,  0,  0],
       [ 1,  0, 55,  0,  0],
       [ 0,  0,  0, 64,  0],
       [ 1,  0,  0,  0, 55]], dtype=int64)

### Clustering - Kmeans

### Trying Grid search and cross validation


In [None]:
KNN_gs = KNeighborsClassifier()
parameters = {'n_neighbors':[1,3,5,7,9,11,13,15,17,19,21]}
KNN_gs = GridSearchCV(KNN_gs, parameters,cv=10)
KNN_gs.fit(X_train,y_train)