In [1]:
#Required libraries
"""
!pip install imblearn
!pip install pyQt5
!pip install numpy
!pip install pandas
!pip install sklearn
!pip install matplotlib
"""

'\n!pip install imblearn\n!pip install pyQt5\n!pip install numpy\n!pip install pandas\n!pip install sklearn\n!pip install matplotlib\n'

In [23]:
import PyQt5
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib qt 

import numpy as np

from dataloader import *
import preprocessing


from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE

#Using GridSearch to find the optimal value of K number of nearest neighbors
from sklearn.model_selection import GridSearchCV

#metrics for analysing our model
from sklearn.metrics import precision_score,accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report


# oversampling technique
from imblearn.over_sampling import SMOTE

#### Load the data

In [3]:
handler = Dataloader()

#### Scale data

scale all variables to have a mean of 0 and standard deviation of 1

In [4]:
data = preprocessing.scale(handler.data)

#### Data Analysis and Visualization

In [5]:
unique, counts = np.unique(handler.labels, return_counts=True)
print("Classes and number of instances for each class:")
print(dict(zip(unique, counts)))
plt.bar(unique,counts)
plt.title("Class Distribution")
plt.ylabel("Frequency")
plt.xlabel("Different types of tumour")
plt.show()

Classes and number of instances for each class:
{'BRCA': 300, 'COAD': 78, 'KIRC': 146, 'LUAD': 141, 'PRAD': 136}


We notice that there is class imbalance and we can use *SMOTE(Synthetic Minority Oversampling Technique)* for increasing instances of the minority class
We also can make use of downsampling to make sure each class is represented equally

In [6]:
print("Number of samples:",len(data))
print("Number of genes:",len(data[0]))

Number of samples: 801
Number of genes: 20531


#### Feature selection and Dimensionality reduction

In [7]:
#with pca extract eigen pairs that explains 95% of the variance in the data.
pca = PCA(n_components=0.95)

#simultanously calculate eigen pairs and transform our data into the new coordinate frame
principalComponents = pca.fit_transform(data)

#check the amount of dimensions left after pca
print(principalComponents.shape)


#creating labelEncoder
le = preprocessing.LabelEncoder()
labels = le.fit_transform(handler.labels)

(801, 530)


In [8]:
# Visualize
#plot 3 most important principle components (3D plot)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(2,figsize=(12, 12))
ax = fig.add_subplot(projection='3d')
ax.set_title("Visualise data after applying PCA")
ax.scatter(principalComponents.T[0], principalComponents.T[1], principalComponents.T[2], color=color[labels])
plt.show()

In [25]:
### TO-DO
### TRY CORRELATION MATRIX

In [9]:
#split in test and train
X_train, X_test, y_train, y_test = train_test_split(principalComponents, labels, test_size=0.20)


In [10]:
"""X_embedded = TSNE(n_components=2, learning_rate='auto',init='random').fit_transform(data)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded.T[0], X_embedded.T[1], color=color[labels])
plt.show()"""

"X_embedded = TSNE(n_components=2, learning_rate='auto',init='random').fit_transform(data)\ncolor = np.array(['r', 'g', 'b', 'c', 'm'])\nfig = plt.figure(figsize=(12, 12))\nax = fig.add_subplot()\nax.scatter(X_embedded.T[0], X_embedded.T[1], color=color[labels])\nplt.show()"

### Requirements
###### Task Classification
 - Do classification on those samples
 - Classify the samples into 5 different tumour classes:BRCA,KIRC,LUAD,PRAD,COAD

###### Task Clustering
 - Cluster the samples into 5 different clusters

### Checking Accuracy without SMOTE

In [11]:

#knn results
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy over the dimension reduced data: ", acc)


KNN Accuracy over the dimension reduced data:  1.0


In [12]:
#kmeans
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Kmeans Accuracy over the dimension reduced data: ", acc)

### Applying SMOTE

- SMOTE is applied after PCA is done
- refer (https://arxiv.org/ftp/arxiv/papers/1403/1403.1949.pdf#:~:text=After%20running%20PCA%2C%20SMOTE%20resampling,after%20the%20running%20of%20PCA.)

In [14]:
oversample = SMOTE(k_neighbors=5)
resampled_data, resampled_labels = oversample.fit_resample(principalComponents, handler.labels)

In [15]:
unique, counts = np.unique(resampled_labels, return_counts=True)
print("Classes and number of instances for each class:")
print(dict(zip(unique, counts)))
plt.figure(3)
plt.bar(unique,counts)
plt.title("Class Distribution after SMOTE")
plt.ylabel("Frequency")
plt.xlabel("Different types of tumour")
plt.show()

Classes and number of instances for each class:
{'BRCA': 300, 'COAD': 300, 'KIRC': 300, 'LUAD': 300, 'PRAD': 300}


### Visualize the data after applying SMOTE

In [16]:
#creating labelEncoder
le = preprocessing.LabelEncoder()
labels = le.fit_transform(resampled_labels)

# Visualize
#plot 3 most important principle components after SMOTE(3D plot)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(4,figsize=(12, 10))
ax = fig.add_subplot(projection='3d')
ax.scatter(resampled_data.T[0], resampled_data.T[1], resampled_data.T[2], color=color[labels])
ax.set_title("Visualise data after PCA+SMOTE")
plt.show()

#### Split SMOTE resampled data into train test splits

In [17]:
#split in test and train
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(resampled_data, labels, test_size=0.20)


### Check model accuracy after SMOTE

In [22]:
#knn results
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train_resampled,y_train_resampled)
y_pred = model.predict(X_test_resampled)
acc = accuracy_score(y_test_resampled, y_pred)
print("KNN Accuracy over the dimension reduced data: ", acc)


#kmeans
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train_resampled)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_resampled)
acc = accuracy_score(y_test_resampled, y_pred)
print("Kmeans Accuracy over the dimension reduced data: ", acc)

KNN Accuracy over the dimension reduced data:  0.98
Kmeans Accuracy over the dimension reduced data:  0.42


### Trying Grid search and cross validation

In [None]:
KNN_gs = KNeighborsClassifier()

KNN_gs = GridSearchCV(pipe, parameters,cv=10)
Dtclf_gs.fit(X_train,y_train)