In [1]:
#Required libraries
"""
!pip install imblearn
!pip install pyQt5
!pip install numpy
!pip install pandas
!pip install sklearn
!pip install matplotlib
!pip install seeaborn
"""

'\n!pip install imblearn\n!pip install pyQt5\n!pip install numpy\n!pip install pandas\n!pip install sklearn\n!pip install matplotlib\n!pip install seeaborn\n'

In [2]:

import PyQt5
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

%matplotlib qt 

import numpy as np

from dataloader import *
import preprocessing


from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn import tree

#Using GridSearch to find the optimal value of K number of nearest neighbors
from sklearn.model_selection import GridSearchCV

#metrics for analysing our model
from sklearn.metrics import precision_score,accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# oversampling technique
from imblearn.over_sampling import SMOTE

#### Load the data

In [3]:
handler = Dataloader()

#### Scale data

scale all variables to have a mean of 0 and standard deviation of 1

In [4]:
data = preprocessing.scale(handler.data)

## Data Analysis and Visualization

In [5]:
print("Total number of samples/records : ",len(handler.data))
print("Maximum value in the features data",np.amax(handler.data))
print("Minimum value in the features data",np.amin(handler.data))

Total number of samples/records :  801
Maximum value in the features data 20.7788287118
Minimum value in the features data 0.0


In [6]:
unique, counts = np.unique(handler.labels, return_counts=True)
print("Classes and number of instances for each class:")
print(dict(zip(unique, counts)))
plt.bar(unique,counts)
plt.title("Class Distribution")
plt.ylabel("Frequency")
plt.xlabel("Different types of tumour")
plt.show()

Classes and number of instances for each class:
{'BRCA': 300, 'COAD': 78, 'KIRC': 146, 'LUAD': 141, 'PRAD': 136}


We notice that there is class imbalance and we can use *SMOTE(Synthetic Minority Oversampling Technique)* for increasing instances of the minority class

In [7]:
print("Number of samples:",len(data))
print("Number of genes:",len(data[0]))

Number of samples: 801
Number of genes: 20531


In [8]:
#creating labelEncoder
le = preprocessing.LabelEncoder()
labels = le.fit_transform(handler.labels)

In [9]:
X_embedded = TSNE(n_components=2,init='random').fit_transform(handler.data)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(2, figsize=(12, 12))
ax = fig.add_subplot()
ax.set_title("TSNE plot on original data")
ax.scatter(X_embedded.T[0], X_embedded.T[1], color=color[labels])
plt.show()



#### Split original dataset into train and test

In [10]:
#split  in test and train
X_train, X_test, y_train, y_test = train_test_split(handler.data, labels, test_size=0.20)

## Feature selection and Dimensionality reduction

In [11]:
#with pca extract eigen pairs that explains 95% of the variance in the data.
pca = PCA(n_components=0.95)

#SVD dimension reduction
svd = TruncatedSVD(n_components=531, random_state=0)

#simultanously calculate eigen pairs and transform our data into the new coordinate frame
principalComponents = pca.fit_transform(data)
svd_reduced_data = svd.fit_transform(data)

#check the amount of dimensions left after pca
print(principalComponents.shape)
print("Number of genes after dimension reduction using PCA:",principalComponents.shape[1])
print("Explained variance of new dataset using PCA:",pca.explained_variance_ratio_.sum())
print("")
print(svd_reduced_data.shape)
print("Number of genes after dimension reduction using SVD:",svd_reduced_data.shape[1])
print("Explained variance of new dataset using SVD:",svd.explained_variance_ratio_.sum())



(801, 530)
Number of genes after dimension reduction using PCA: 530
Explained variance of new dataset using PCA: 0.9501213818001684

(801, 531)
Number of genes after dimension reduction using SVD: 531
Explained variance of new dataset using SVD: 0.9492336720567969


In [12]:
# Visualize
#plot 3 most important principle components (3D plot)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(3,figsize=(12, 12))
ax = fig.add_subplot(projection='3d')
ax.set_title("Visualise data after applying PCA")
ax.scatter(principalComponents.T[0], principalComponents.T[1], principalComponents.T[2], color=color[labels])
plt.show()

In [13]:
# Visualize
#plot 3 most important principle components (3D plot)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(4,figsize=(12, 12))
ax = fig.add_subplot(projection='3d')
ax.set_title("Visualise data after applying SVD")
ax.scatter(svd_reduced_data.T[0], svd_reduced_data.T[1], svd_reduced_data.T[2], color=color[labels])
plt.show()

In [14]:
#split  in test and train
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(principalComponents, labels, test_size=0.20)

#split  in test and train
X_train_svd, X_test_svd, y_train_svd, y_test_svd = train_test_split(svd_reduced_data, labels, test_size=0.20)

In [15]:
plt.figure(5, figsize=(12, 12))
ax = sn.heatmap(handler.data)
ax.set_title("Correlation heatmap of all dimensions on the original data")

#plt.figure(6, figsize=(12, 12))
#ax = sn.heatmap(principalComponents)
#ax.set_title("Correlation heatmap of all dimensions after PCA")

#plt.figure(7, figsize=(12, 12))
#ax = sn.heatmap(svd_reduced_data)
#ax.set_title("Correlation heatmap of all dimensions after SVD")


Text(0.5, 1.0, 'Correlation heatmap of all dimensions on the original data')

## Augmentation (SMOTE)

- As noticed from the classs distribution, there is class imbalance since there are very few samples in the 'COAD' class.
- We apply Synthetic Minority Oversampling Technique to oversample the classes such that each class has equal number of samples
- SMOTE is applied after feature extraction/reduction is done
- refer (https://arxiv.org/ftp/arxiv/papers/1403/1403.1949.pdf#:~:text=After%20running%20PCA%2C%20SMOTE%20resampling,after%20the%20running%20of%20PCA.)

In [16]:
oversample_pca = SMOTE(k_neighbors=5)
resampled_data_pca, resampled_labels_pca = oversample_pca.fit_resample(principalComponents, labels)

In [17]:
oversample_svd = SMOTE(k_neighbors=5)
resampled_data_svd, resampled_labels_svd = oversample_svd.fit_resample(svd_reduced_data, labels)

In [18]:
oversample = SMOTE(k_neighbors=5)
resampled_data, resampled_labels = oversample.fit_resample(handler.data, labels)

In [19]:
unique, counts = np.unique(resampled_labels_pca, return_counts=True)
print("Classes and number of instances for each class:")
print(dict(zip(unique, counts)))

Classes and number of instances for each class:
{0: 300, 1: 300, 2: 300, 3: 300, 4: 300}


In [20]:
X_embedded_pca = TSNE(n_components=2,init='random').fit_transform(resampled_data_pca)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(8, figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded_pca.T[0], X_embedded_pca.T[1], color=color[resampled_labels_pca])
ax.set_title("TSNE visualisation after PCA+SMOTE")
plt.show()

X_embedded_svd = TSNE(n_components=2,init='random').fit_transform(resampled_data_svd)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(9, figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded_svd.T[0], X_embedded_svd.T[1], color=color[resampled_labels_svd])
ax.set_title("TSNE visualisation after SVD+SMOTE")
plt.show()

X_embedded_original = TSNE(n_components=2,init='random').fit_transform(resampled_data)
color = np.array(['r', 'g', 'b', 'c', 'm'])
fig = plt.figure(10, figsize=(12, 12))
ax = fig.add_subplot()
ax.scatter(X_embedded_original.T[0], X_embedded_original.T[1], color=color[resampled_labels])
ax.set_title("TSNE visualisation after SMOTE on original data")
plt.show()



In [21]:
#split in test and train
X_train_resampled_pca, X_test_resampled_pca, y_train_resampled_pca, y_test_resampled_pca = train_test_split(resampled_data_pca, resampled_labels_pca, test_size=0.20)

X_train_resampled_svd, X_test_resampled_svd, y_train_resampled_svd, y_test_resampled_svd = train_test_split(resampled_data_svd, resampled_labels_svd, test_size=0.20)

X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(resampled_data, resampled_labels, test_size=0.20)

## Classification

After the data reduction and augmentation step we have 4 datasets:

- Original data
- Original data + SMOTE
- Original data + PCA + SMOTE
- Original data + SVD + SMOTE

 
Next, we perform classification on these data on the following models 
- KNN: This is a baseline model
- Decision tree 
- Random forest

In [22]:
#knn results on original data
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy(original data): ", acc)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))
print("----------------")
print("----------------")

#knn results on PCA+SMOTE
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train_resampled_pca,y_train_resampled_pca)
y_pred = model.predict(X_test_pca)
acc = accuracy_score(y_test_pca, y_pred)
print("KNN Accuracy after PCA+SMOTE:", acc)
print(confusion_matrix(y_test_pca, y_pred))
print("Classification report(PCA+SMOTE)")
print(classification_report(y_test_pca, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))
print("----------------")
print("----------------")

#knn results on SVD+SMOTE
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train_resampled_svd,y_train_resampled_svd)
y_pred = model.predict(X_test_svd)
acc = accuracy_score(y_test_svd, y_pred)
print("KNN Accuracy after SVD+SMOTE:", acc)
print(confusion_matrix(y_test_svd, y_pred))
print("Classification report(SVD+SMOTE)")
print(classification_report(y_test_svd, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))
print("----------------")
print("----------------")

#knn results on ORIGINAL+SMOTE
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train_resampled,y_train_resampled)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("KNN Accuracy after ORIGINAL+SMOTE:", acc)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))
print("----------------")
print("----------------")

KNN Accuracy(original data):  0.9875776397515528
[[56  0  0  0  0]
 [ 0 23  0  1  0]
 [ 0  0 33  0  0]
 [ 1  0  0 25  0]
 [ 0  0  0  0 22]]
Classification report(ORIGINAL)
              precision    recall  f1-score   support

        BRCA       0.98      1.00      0.99        56
        COAD       1.00      0.96      0.98        24
        KIRC       1.00      1.00      1.00        33
        LUAD       0.96      0.96      0.96        26
        PRAD       1.00      1.00      1.00        22

    accuracy                           0.99       161
   macro avg       0.99      0.98      0.99       161
weighted avg       0.99      0.99      0.99       161

----------------
----------------
KNN Accuracy after PCA+SMOTE: 0.9813664596273292
[[58  0  0  3  0]
 [ 0 23  0  0  0]
 [ 0  0 28  0  0]
 [ 0  0  0 25  0]
 [ 0  0  0  0 24]]
Classification report(PCA+SMOTE)
              precision    recall  f1-score   support

        BRCA       1.00      0.95      0.97        61
        COAD       1.00

In [23]:
steps_SVD = [('svd', TruncatedSVD(n_components=530)), ('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_SVD)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train , y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

model.fit(X_train_svd, y_train_svd)
y_pred = model.predict(X_test_svd)
print(confusion_matrix(y_test_svd, y_pred))
print("Classification report(SVD)")
print(classification_report(y_test_svd, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))


steps_PCA = [('pca', PCA(n_components=0.95)), ('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_PCA)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train , y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

model.fit(X_train_pca, y_train_pca)
y_pred = model.predict(X_test_pca)
print(confusion_matrix(y_test_pca, y_pred))
print("Classification report(PCA)")
print(classification_report(y_test_pca, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

steps_ORIGINAL = [('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_ORIGINAL)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train , y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

Accuracy: 0.970 (0.018)
[[62  0  1  0  2]
 [ 0 18  0  2  0]
 [ 1  1 29  0  0]
 [ 4  1  0 18  0]
 [ 0  1  0  0 21]]
Classification report(SVD)
              precision    recall  f1-score   support

        BRCA       0.93      0.95      0.94        65
        COAD       0.86      0.90      0.88        20
        KIRC       0.97      0.94      0.95        31
        LUAD       0.90      0.78      0.84        23
        PRAD       0.91      0.95      0.93        22

    accuracy                           0.92       161
   macro avg       0.91      0.91      0.91       161
weighted avg       0.92      0.92      0.92       161

Accuracy: 0.972 (0.012)
[[57  0  1  3  0]
 [ 0 23  0  0  0]
 [ 1  0 27  0  0]
 [ 1  0  0 23  1]
 [ 1  0  0  0 23]]
Classification report(PCA)
              precision    recall  f1-score   support

        BRCA       0.95      0.93      0.94        61
        COAD       1.00      1.00      1.00        23
        KIRC       0.96      0.96      0.96        28
        LU

In [24]:
#SVD + SMOTE
steps_SVD = [('svd', TruncatedSVD(n_components=530)), ('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_SVD)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train_resampled_svd , y_train_resampled_svd, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
model.fit(X_train_resampled_svd, y_train_resampled_svd)
y_pred = model.predict(X_test_svd)
print(confusion_matrix(y_test_svd, y_pred))
print("Classification report(SVD+SMOTE)")
print(classification_report(y_test_svd, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#PCA + SMOTE
steps_PCA = [('pca', PCA(n_components=0.95)), ('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_PCA)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train_resampled_pca , y_train_resampled_pca, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
model.fit(X_train_resampled_pca, y_train_resampled_pca)
y_pred = model.predict(X_test_pca)
print(confusion_matrix(y_test_pca, y_pred))
print("Classification report(PCA+SMOTE)")
print(classification_report(y_test_pca, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#ORIGINAL + SMOTE
steps_ORIGINAL = [('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_ORIGINAL)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train_resampled , y_train_resampled, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

Accuracy: 0.943 (0.027)
[[63  0  1  1  0]
 [ 0 20  0  0  0]
 [ 0  0 30  1  0]
 [ 0  1  0 22  0]
 [ 0  0  0  0 22]]
Classification report(SVD+SMOTE)
              precision    recall  f1-score   support

        BRCA       1.00      0.97      0.98        65
        COAD       0.95      1.00      0.98        20
        KIRC       0.97      0.97      0.97        31
        LUAD       0.92      0.96      0.94        23
        PRAD       1.00      1.00      1.00        22

    accuracy                           0.98       161
   macro avg       0.97      0.98      0.97       161
weighted avg       0.98      0.98      0.98       161

Accuracy: 0.968 (0.010)
[[57  0  2  1  1]
 [ 0 22  0  1  0]
 [ 0  0 28  0  0]
 [ 0  0  0 25  0]
 [ 0  0  0  0 24]]
Classification report(PCA+SMOTE)
              precision    recall  f1-score   support

        BRCA       1.00      0.93      0.97        61
        COAD       1.00      0.96      0.98        23
        KIRC       0.93      1.00      0.97        2

### Random Forest

In [25]:
steps_SVD = [('svd', TruncatedSVD(n_components=530)), ('tree', RandomForestClassifier())]
model = Pipeline(steps=steps_SVD)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train , y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

model.fit(X_train_svd, y_train_svd)
y_pred = model.predict(X_test_svd)
print(confusion_matrix(y_test_svd, y_pred))
print("Classification report(SVD)")
print(classification_report(y_test_svd, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

steps_PCA = [('pca', PCA(n_components=0.95)), ('tree', RandomForestClassifier())]
model = Pipeline(steps=steps_PCA)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train , y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

model.fit(X_train_pca, y_train_pca)
y_pred = model.predict(X_test_pca)
print(confusion_matrix(y_test_pca, y_pred))
print("Classification report(PCA)")
print(classification_report(y_test_pca, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

steps_ORIGINAL = [('tree', RandomForestClassifier())]
model = Pipeline(steps=steps_ORIGINAL)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train , y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

Accuracy: 0.977 (0.014)
[[65  0  0  0  0]
 [ 2 18  0  0  0]
 [ 3  0 28  0  0]
 [ 4  0  0 18  1]
 [ 1  0  0  0 21]]
Classification report(SVD)
              precision    recall  f1-score   support

        BRCA       0.87      1.00      0.93        65
        COAD       1.00      0.90      0.95        20
        KIRC       1.00      0.90      0.95        31
        LUAD       1.00      0.78      0.88        23
        PRAD       0.95      0.95      0.95        22

    accuracy                           0.93       161
   macro avg       0.96      0.91      0.93       161
weighted avg       0.94      0.93      0.93       161

Accuracy: 0.989 (0.004)
[[61  0  0  0  0]
 [ 0 23  0  0  0]
 [ 1  0 27  0  0]
 [ 1  0  0 24  0]
 [ 0  0  0  0 24]]
Classification report(PCA)
              precision    recall  f1-score   support

        BRCA       0.97      1.00      0.98        61
        COAD       1.00      1.00      1.00        23
        KIRC       1.00      0.96      0.98        28
        LU

In [26]:
#SVD + SMOTE
steps_SVD = [('svd', TruncatedSVD(n_components=530)), ('tree', RandomForestClassifier())]
model = Pipeline(steps=steps_SVD)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train_resampled_svd , y_train_resampled_svd, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
model.fit(X_train_resampled_svd, y_train_resampled_svd)
y_pred = model.predict(X_test_svd)
print(confusion_matrix(y_test_svd, y_pred))
print("Classification report(SVD+SMOTE)")
print(classification_report(y_test_svd, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#PCA + SMOTE
steps_PCA = [('pca', PCA(n_components=0.95)), ('tree', RandomForestClassifier())]
model = Pipeline(steps=steps_PCA)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train_resampled_pca , y_train_resampled_pca, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
model.fit(X_train_resampled_pca, y_train_resampled_pca)
y_pred = model.predict(X_test_pca)
print(confusion_matrix(y_test_pca, y_pred))
print("Classification report(PCA+SMOTE)")
print(classification_report(y_test_pca, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

#ORIGINAL + SMOTE
steps_ORIGINAL = [('tree', RandomForestClassifier())]
model = Pipeline(steps=steps_ORIGINAL)
cv = KFold(n_splits=5, shuffle=True)
n_scores = cross_val_score(model, X_train_resampled , y_train_resampled, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Classification report(ORIGINAL+SMOTE)")
print(classification_report(y_test, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

Accuracy: 0.988 (0.007)
[[65  0  0  0  0]
 [ 0 20  0  0  0]
 [ 0  0 31  0  0]
 [ 0  0  0 23  0]
 [ 0  0  0  0 22]]
Classification report(SVD+SMOTE)
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        65
        COAD       1.00      1.00      1.00        20
        KIRC       1.00      1.00      1.00        31
        LUAD       1.00      1.00      1.00        23
        PRAD       1.00      1.00      1.00        22

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161

Accuracy: 0.988 (0.007)
[[61  0  0  0  0]
 [ 0 23  0  0  0]
 [ 0  0 28  0  0]
 [ 0  0  0 25  0]
 [ 0  0  0  0 24]]
Classification report(PCA+SMOTE)
              precision    recall  f1-score   support

        BRCA       1.00      1.00      1.00        61
        COAD       1.00      1.00      1.00        23
        KIRC       1.00      1.00      1.00        2

## Clustering

In [27]:
#kmeans results on original data
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test)
print("K_means clustering(original data) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

#kmeans results on PCA
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train_pca)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_pca)
print("K_means clustering(PCA+SMOTE) Normalised mutual info score: ", normalized_mutual_info_score(y_test_pca, y_pred))

#kmeans results on SVD
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train_svd)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_svd)
print("K_means clustering(SVD+SMOTE) Normalised mutual info score: ", normalized_mutual_info_score(y_test_svd, y_pred))

K_means clustering(original data) Normalised mutual info score:  0.9530128471386684
K_means clustering(PCA+SMOTE) Normalised mutual info score:  0.8974956952091451
K_means clustering(SVD+SMOTE) Normalised mutual info score:  0.8597225086679011


In [28]:
#kmeans results on original data + SMOTE
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train_resampled)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test)
print("K_means clustering(original data) Normalised mutual info score: ", normalized_mutual_info_score(y_test, y_pred))

#kmeans results on PCA+SMOTE
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train_resampled_pca)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_pca)
print("K_means clustering(PCA+SMOTE) Normalised mutual info score: ", normalized_mutual_info_score(y_test_pca, y_pred))

#kmeans results on SVD+SMOTE
kmeans = KMeans(n_clusters=len(np.unique(handler.labels)), random_state=0).fit(X_train_resampled_svd)
labels_kmeans = le.fit_transform(kmeans.labels_)
y_pred = kmeans.predict(X_test_svd)
print("K_means clustering(SVD+SMOTE) Normalised mutual info score: ", normalized_mutual_info_score(y_test_svd, y_pred))

K_means clustering(original data) Normalised mutual info score:  0.9643604827827398
K_means clustering(PCA+SMOTE) Normalised mutual info score:  0.8907128452927723
K_means clustering(SVD+SMOTE) Normalised mutual info score:  0.9491267562297167


### Results
- Classification : we see good result with PCA + SMOTE 
- Clustering: we see good result with SVD+SMOTE

## Grid search


In [29]:
steps_KNN_pca = [('pca', PCA(n_components=0.95)), ('KNN', KNeighborsClassifier())]
model_KNN_gs_pca = Pipeline(steps=steps_KNN_pca)
parameters = {'KNN__n_neighbors':[1,3,5,7,9,11,13,15,17,19,21]}
KNN_gs_pca = GridSearchCV(model_KNN_gs_pca, parameters,cv=10, verbose = 3)
KNN_gs_pca.fit(X_train_resampled_pca,y_train_resampled_pca)

Fitting 10 folds for each of 11 candidates, totalling 110 fits
[CV 1/10] END ...............KNN__n_neighbors=1;, score=1.000 total time=   0.2s
[CV 2/10] END ...............KNN__n_neighbors=1;, score=0.983 total time=   0.2s
[CV 3/10] END ...............KNN__n_neighbors=1;, score=0.992 total time=   0.2s
[CV 4/10] END ...............KNN__n_neighbors=1;, score=1.000 total time=   0.2s
[CV 5/10] END ...............KNN__n_neighbors=1;, score=1.000 total time=   0.2s
[CV 6/10] END ...............KNN__n_neighbors=1;, score=1.000 total time=   0.2s
[CV 7/10] END ...............KNN__n_neighbors=1;, score=0.992 total time=   0.2s
[CV 8/10] END ...............KNN__n_neighbors=1;, score=1.000 total time=   0.2s
[CV 9/10] END ...............KNN__n_neighbors=1;, score=1.000 total time=   0.2s
[CV 10/10] END ..............KNN__n_neighbors=1;, score=1.000 total time=   0.2s
[CV 1/10] END ...............KNN__n_neighbors=3;, score=1.000 total time=   0.2s
[CV 2/10] END ...............KNN__n_neighbors=

[CV 2/10] END ..............KNN__n_neighbors=21;, score=1.000 total time=   0.2s
[CV 3/10] END ..............KNN__n_neighbors=21;, score=0.992 total time=   0.2s
[CV 4/10] END ..............KNN__n_neighbors=21;, score=0.992 total time=   0.2s
[CV 5/10] END ..............KNN__n_neighbors=21;, score=1.000 total time=   0.2s
[CV 6/10] END ..............KNN__n_neighbors=21;, score=1.000 total time=   0.2s
[CV 7/10] END ..............KNN__n_neighbors=21;, score=0.992 total time=   0.2s
[CV 8/10] END ..............KNN__n_neighbors=21;, score=1.000 total time=   0.2s
[CV 9/10] END ..............KNN__n_neighbors=21;, score=0.992 total time=   0.2s
[CV 10/10] END .............KNN__n_neighbors=21;, score=0.992 total time=   0.2s


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('pca', PCA(n_components=0.95)),
                                       ('KNN', KNeighborsClassifier())]),
             param_grid={'KNN__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
                                              21]},
             verbose=3)

In [30]:
print(KNN_gs_pca.best_params_)

{'KNN__n_neighbors': 1}


In [38]:
#gridsearch for decision tree classifier
steps_PCA = [('pca', PCA(n_components=0.95)), ('tree', DecisionTreeClassifier())]
model = Pipeline(steps=steps_PCA)
parameters = {'tree__criterion':('entropy', 'gini'), 'tree__max_depth':[2,4,6,8,10,12,15,18,20], 'tree__max_features': ('sqrt', 'log2', None)}
pca_pipeline = GridSearchCV(model, parameters, verbose = 3)
pca_pipeline.fit(X_train_resampled_pca, y_train_resampled_pca)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV 1/5] END tree__criterion=entropy, tree__max_depth=2, tree__max_features=sqrt;, score=0.275 total time=   0.2s
[CV 2/5] END tree__criterion=entropy, tree__max_depth=2, tree__max_features=sqrt;, score=0.358 total time=   0.2s
[CV 3/5] END tree__criterion=entropy, tree__max_depth=2, tree__max_features=sqrt;, score=0.379 total time=   0.2s
[CV 4/5] END tree__criterion=entropy, tree__max_depth=2, tree__max_features=sqrt;, score=0.475 total time=   0.2s
[CV 5/5] END tree__criterion=entropy, tree__max_depth=2, tree__max_features=sqrt;, score=0.279 total time=   0.2s
[CV 1/5] END tree__criterion=entropy, tree__max_depth=2, tree__max_features=log2;, score=0.388 total time=   0.2s
[CV 2/5] END tree__criterion=entropy, tree__max_depth=2, tree__max_features=log2;, score=0.221 total time=   0.2s
[CV 3/5] END tree__criterion=entropy, tree__max_depth=2, tree__max_features=log2;, score=0.388 total time=   0.2s
[CV 4/5] END tree__criteri

[CV 3/5] END tree__criterion=entropy, tree__max_depth=10, tree__max_features=None;, score=0.950 total time=   0.5s
[CV 4/5] END tree__criterion=entropy, tree__max_depth=10, tree__max_features=None;, score=0.946 total time=   0.5s
[CV 5/5] END tree__criterion=entropy, tree__max_depth=10, tree__max_features=None;, score=0.971 total time=   0.5s
[CV 1/5] END tree__criterion=entropy, tree__max_depth=12, tree__max_features=sqrt;, score=0.579 total time=   0.2s
[CV 2/5] END tree__criterion=entropy, tree__max_depth=12, tree__max_features=sqrt;, score=0.746 total time=   0.2s
[CV 3/5] END tree__criterion=entropy, tree__max_depth=12, tree__max_features=sqrt;, score=0.821 total time=   0.2s
[CV 4/5] END tree__criterion=entropy, tree__max_depth=12, tree__max_features=sqrt;, score=0.767 total time=   0.2s
[CV 5/5] END tree__criterion=entropy, tree__max_depth=12, tree__max_features=sqrt;, score=0.821 total time=   0.2s
[CV 1/5] END tree__criterion=entropy, tree__max_depth=12, tree__max_features=log

[CV 5/5] END tree__criterion=gini, tree__max_depth=2, tree__max_features=log2;, score=0.446 total time=   0.1s
[CV 1/5] END tree__criterion=gini, tree__max_depth=2, tree__max_features=None;, score=0.571 total time=   0.2s
[CV 2/5] END tree__criterion=gini, tree__max_depth=2, tree__max_features=None;, score=0.592 total time=   0.2s
[CV 3/5] END tree__criterion=gini, tree__max_depth=2, tree__max_features=None;, score=0.575 total time=   0.2s
[CV 4/5] END tree__criterion=gini, tree__max_depth=2, tree__max_features=None;, score=0.575 total time=   0.2s
[CV 5/5] END tree__criterion=gini, tree__max_depth=2, tree__max_features=None;, score=0.575 total time=   0.2s
[CV 1/5] END tree__criterion=gini, tree__max_depth=4, tree__max_features=sqrt;, score=0.504 total time=   0.2s
[CV 2/5] END tree__criterion=gini, tree__max_depth=4, tree__max_features=sqrt;, score=0.487 total time=   0.2s
[CV 3/5] END tree__criterion=gini, tree__max_depth=4, tree__max_features=sqrt;, score=0.396 total time=   0.2s
[

[CV 4/5] END tree__criterion=gini, tree__max_depth=12, tree__max_features=log2;, score=0.508 total time=   0.2s
[CV 5/5] END tree__criterion=gini, tree__max_depth=12, tree__max_features=log2;, score=0.637 total time=   0.2s
[CV 1/5] END tree__criterion=gini, tree__max_depth=12, tree__max_features=None;, score=0.963 total time=   0.3s
[CV 2/5] END tree__criterion=gini, tree__max_depth=12, tree__max_features=None;, score=0.967 total time=   0.3s
[CV 3/5] END tree__criterion=gini, tree__max_depth=12, tree__max_features=None;, score=0.958 total time=   0.3s
[CV 4/5] END tree__criterion=gini, tree__max_depth=12, tree__max_features=None;, score=0.963 total time=   0.3s
[CV 5/5] END tree__criterion=gini, tree__max_depth=12, tree__max_features=None;, score=0.975 total time=   0.3s
[CV 1/5] END tree__criterion=gini, tree__max_depth=15, tree__max_features=sqrt;, score=0.671 total time=   0.2s
[CV 2/5] END tree__criterion=gini, tree__max_depth=15, tree__max_features=sqrt;, score=0.733 total time=

GridSearchCV(estimator=Pipeline(steps=[('pca', PCA(n_components=0.95)),
                                       ('tree', DecisionTreeClassifier())]),
             param_grid={'tree__criterion': ('entropy', 'gini'),
                         'tree__max_depth': [2, 4, 6, 8, 10, 12, 15, 18, 20],
                         'tree__max_features': ('sqrt', 'log2', None)},
             verbose=3)

In [40]:
plt.figure(11)
tree.plot_tree(pca_pipeline.best_estimator_['tree'],filled=True, fontsize=5)

[Text(0.5369318181818182, 0.9444444444444444, 'X[1] <= 44.052\ngini = 0.8\nsamples = 1200\nvalue = [238, 242, 242, 240, 238]'),
 Text(0.5066287878787878, 0.8333333333333334, 'X[0] <= -43.028\ngini = 0.752\nsamples = 965\nvalue = [238, 242, 242, 240, 3]'),
 Text(0.3522727272727273, 0.7222222222222222, 'X[1] <= -3.072\ngini = 0.034\nsamples = 229\nvalue = [4, 0, 225, 0, 0]'),
 Text(0.32196969696969696, 0.6111111111111112, 'gini = 0.0\nsamples = 225\nvalue = [0, 0, 225, 0, 0]'),
 Text(0.38257575757575757, 0.6111111111111112, 'gini = 0.0\nsamples = 4\nvalue = [4, 0, 0, 0, 0]'),
 Text(0.6609848484848485, 0.7222222222222222, 'X[0] <= 43.915\ngini = 0.684\nsamples = 736\nvalue = [234, 242, 17, 240, 3]'),
 Text(0.4431818181818182, 0.6111111111111112, 'X[4] <= 0.177\ngini = 0.565\nsamples = 503\nvalue = [231, 15, 17, 237, 3]'),
 Text(0.25757575757575757, 0.5, 'X[6] <= 22.352\ngini = 0.22\nsamples = 242\nvalue = [9, 15, 2, 213, 3]'),
 Text(0.15151515151515152, 0.3888888888888889, 'X[291] <= 6.61

In [41]:
print(pca_pipeline.best_params_)
print(pca_pipeline.score(X_test_pca, y_test_pca))

{'tree__criterion': 'gini', 'tree__max_depth': 8, 'tree__max_features': None}
0.968944099378882


In [34]:
#print(pca_pipeline.cv_results_['mean_test_score'])

In [35]:
#gridsearch for forest classifier
steps_PCA = [('pca', PCA(n_components=0.95)), ('forest', RandomForestClassifier(n_jobs=-1))]
model = Pipeline(steps=steps_PCA)
parameters = {'forest__criterion':('entropy', 'gini'), 'forest__max_depth':[2,4,6,8,10,12,15,18,20], 'forest__max_features': ('auto', 'sqrt', 'log2', None)}
pca_pipeline_forest = GridSearchCV(model, parameters, verbose = 3)
pca_pipeline_forest.fit(X_train_resampled_pca, y_train_resampled_pca)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END forest__criterion=entropy, forest__max_depth=2, forest__max_features=auto;, score=0.950 total time=   0.5s
[CV 2/5] END forest__criterion=entropy, forest__max_depth=2, forest__max_features=auto;, score=0.979 total time=   0.5s
[CV 3/5] END forest__criterion=entropy, forest__max_depth=2, forest__max_features=auto;, score=0.971 total time=   0.6s
[CV 4/5] END forest__criterion=entropy, forest__max_depth=2, forest__max_features=auto;, score=0.946 total time=   0.5s
[CV 5/5] END forest__criterion=entropy, forest__max_depth=2, forest__max_features=auto;, score=0.950 total time=   0.5s
[CV 1/5] END forest__criterion=entropy, forest__max_depth=2, forest__max_features=sqrt;, score=0.958 total time=   0.6s
[CV 2/5] END forest__criterion=entropy, forest__max_depth=2, forest__max_features=sqrt;, score=0.958 total time=   0.6s
[CV 3/5] END forest__criterion=entropy, forest__max_depth=2, forest__max_features=sqrt;, score=0.9

[CV 4/5] END forest__criterion=entropy, forest__max_depth=8, forest__max_features=sqrt;, score=0.992 total time=   0.6s
[CV 5/5] END forest__criterion=entropy, forest__max_depth=8, forest__max_features=sqrt;, score=1.000 total time=   0.7s
[CV 1/5] END forest__criterion=entropy, forest__max_depth=8, forest__max_features=log2;, score=1.000 total time=   0.6s
[CV 2/5] END forest__criterion=entropy, forest__max_depth=8, forest__max_features=log2;, score=0.979 total time=   0.6s
[CV 3/5] END forest__criterion=entropy, forest__max_depth=8, forest__max_features=log2;, score=0.996 total time=   0.6s
[CV 4/5] END forest__criterion=entropy, forest__max_depth=8, forest__max_features=log2;, score=0.975 total time=   0.6s
[CV 5/5] END forest__criterion=entropy, forest__max_depth=8, forest__max_features=log2;, score=0.983 total time=   0.5s
[CV 1/5] END forest__criterion=entropy, forest__max_depth=8, forest__max_features=None;, score=0.971 total time=   2.6s
[CV 2/5] END forest__criterion=entropy, 

[CV 2/5] END forest__criterion=entropy, forest__max_depth=15, forest__max_features=None;, score=0.979 total time=   2.7s
[CV 3/5] END forest__criterion=entropy, forest__max_depth=15, forest__max_features=None;, score=0.983 total time=   2.9s
[CV 4/5] END forest__criterion=entropy, forest__max_depth=15, forest__max_features=None;, score=0.971 total time=   2.7s
[CV 5/5] END forest__criterion=entropy, forest__max_depth=15, forest__max_features=None;, score=0.988 total time=   2.7s
[CV 1/5] END forest__criterion=entropy, forest__max_depth=18, forest__max_features=auto;, score=0.996 total time=   0.8s
[CV 2/5] END forest__criterion=entropy, forest__max_depth=18, forest__max_features=auto;, score=0.992 total time=   0.7s
[CV 3/5] END forest__criterion=entropy, forest__max_depth=18, forest__max_features=auto;, score=0.992 total time=   0.8s
[CV 4/5] END forest__criterion=entropy, forest__max_depth=18, forest__max_features=auto;, score=0.992 total time=   0.7s
[CV 5/5] END forest__criterion=e

[CV 1/5] END forest__criterion=gini, forest__max_depth=4, forest__max_features=sqrt;, score=0.963 total time=   0.5s
[CV 2/5] END forest__criterion=gini, forest__max_depth=4, forest__max_features=sqrt;, score=0.988 total time=   0.5s
[CV 3/5] END forest__criterion=gini, forest__max_depth=4, forest__max_features=sqrt;, score=0.971 total time=   0.6s
[CV 4/5] END forest__criterion=gini, forest__max_depth=4, forest__max_features=sqrt;, score=0.963 total time=   0.5s
[CV 5/5] END forest__criterion=gini, forest__max_depth=4, forest__max_features=sqrt;, score=0.996 total time=   0.5s
[CV 1/5] END forest__criterion=gini, forest__max_depth=4, forest__max_features=log2;, score=0.967 total time=   0.6s
[CV 2/5] END forest__criterion=gini, forest__max_depth=4, forest__max_features=log2;, score=0.933 total time=   0.5s
[CV 3/5] END forest__criterion=gini, forest__max_depth=4, forest__max_features=log2;, score=0.958 total time=   0.6s
[CV 4/5] END forest__criterion=gini, forest__max_depth=4, forest

[CV 1/5] END forest__criterion=gini, forest__max_depth=10, forest__max_features=None;, score=0.971 total time=   1.5s
[CV 2/5] END forest__criterion=gini, forest__max_depth=10, forest__max_features=None;, score=0.979 total time=   1.5s
[CV 3/5] END forest__criterion=gini, forest__max_depth=10, forest__max_features=None;, score=0.975 total time=   1.6s
[CV 4/5] END forest__criterion=gini, forest__max_depth=10, forest__max_features=None;, score=0.967 total time=   1.5s
[CV 5/5] END forest__criterion=gini, forest__max_depth=10, forest__max_features=None;, score=0.975 total time=   1.5s
[CV 1/5] END forest__criterion=gini, forest__max_depth=12, forest__max_features=auto;, score=0.988 total time=   0.6s
[CV 2/5] END forest__criterion=gini, forest__max_depth=12, forest__max_features=auto;, score=0.983 total time=   0.6s
[CV 3/5] END forest__criterion=gini, forest__max_depth=12, forest__max_features=auto;, score=0.992 total time=   0.6s
[CV 4/5] END forest__criterion=gini, forest__max_depth=1

[CV 1/5] END forest__criterion=gini, forest__max_depth=20, forest__max_features=sqrt;, score=0.988 total time=   0.5s
[CV 2/5] END forest__criterion=gini, forest__max_depth=20, forest__max_features=sqrt;, score=0.988 total time=   0.5s
[CV 3/5] END forest__criterion=gini, forest__max_depth=20, forest__max_features=sqrt;, score=0.988 total time=   0.6s
[CV 4/5] END forest__criterion=gini, forest__max_depth=20, forest__max_features=sqrt;, score=0.996 total time=   0.5s
[CV 5/5] END forest__criterion=gini, forest__max_depth=20, forest__max_features=sqrt;, score=0.996 total time=   0.5s
[CV 1/5] END forest__criterion=gini, forest__max_depth=20, forest__max_features=log2;, score=0.983 total time=   0.5s
[CV 2/5] END forest__criterion=gini, forest__max_depth=20, forest__max_features=log2;, score=0.963 total time=   0.5s
[CV 3/5] END forest__criterion=gini, forest__max_depth=20, forest__max_features=log2;, score=0.996 total time=   0.6s
[CV 4/5] END forest__criterion=gini, forest__max_depth=2

GridSearchCV(estimator=Pipeline(steps=[('pca', PCA(n_components=0.95)),
                                       ('forest',
                                        RandomForestClassifier(n_jobs=-1))]),
             param_grid={'forest__criterion': ('entropy', 'gini'),
                         'forest__max_depth': [2, 4, 6, 8, 10, 12, 15, 18, 20],
                         'forest__max_features': ('auto', 'sqrt', 'log2',
                                                  None)},
             verbose=3)

In [36]:
print(pca_pipeline_forest.best_params_)
print(pca_pipeline_forest.score(X_test_pca, y_test_pca))

{'forest__criterion': 'entropy', 'forest__max_depth': 10, 'forest__max_features': 'sqrt'}
1.0


### Ensemble

In [37]:
knn_tree_ensemble = VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=1)), ('tree', DecisionTreeClassifier())], voting='hard')
knn_tree_ensemble.fit(X_train_pca, y_train_pca)

y_pred = knn_tree_ensemble.predict(X_test_pca)
print(confusion_matrix(y_test_pca, y_pred))
print("Accuracy",)
print("Classification report(PCA)")
print(classification_report(y_test_pca, y_pred, target_names=['BRCA','COAD','KIRC','LUAD','PRAD']))

[[61  0  0  0  0]
 [ 0 23  0  0  0]
 [ 1  1 26  0  0]
 [ 1  1  0 23  0]
 [ 1  1  0  0 22]]
Accuracy
Classification report(PCA)
              precision    recall  f1-score   support

        BRCA       0.95      1.00      0.98        61
        COAD       0.88      1.00      0.94        23
        KIRC       1.00      0.93      0.96        28
        LUAD       1.00      0.92      0.96        25
        PRAD       1.00      0.92      0.96        24

    accuracy                           0.96       161
   macro avg       0.97      0.95      0.96       161
weighted avg       0.97      0.96      0.96       161

