In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from sklearn import model_selection, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from keras.models import load_model

In [3]:
#import CSV to be read as a pandas dataframe
labels = pd.read_csv('/content/drive/MyDrive/Preprocessed_BRCA_RNA_data_normalized.csv', header = 0)
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Encoded_SAEdata.csv', header = None)

#encode categorical variables to numerical
labelenc = LabelEncoder()
labels['Class'] = labelenc.fit_transform(labels['Class'])

print(data)

#get labels from original data and use encoded data as x
x = data
y = labels['Class']
print(y)

print(x.shape)
print(y.shape)

            0     1     2           3     ...        3996        3997  3998  3999
0     34624196.0   0.0   0.0  25319092.0  ...  40169164.0  50800664.0   0.0   0.0
1     35555280.0   0.0   0.0  25999470.0  ...  41248964.0  52166092.0   0.0   0.0
2     35597272.0   0.0   0.0  26029184.0  ...  41295596.0  52225976.0   0.0   0.0
3     38360768.0   0.0   0.0  28050728.0  ...  44505036.0  56281900.0   0.0   0.0
4     36705600.0   0.0   0.0  26840622.0  ...  42585956.0  53853840.0   0.0   0.0
...          ...   ...   ...         ...  ...         ...         ...   ...   ...
1090  37631308.0   0.0   0.0  27518624.0  ...  43658788.0  55213036.0   0.0   0.0
1091  35085672.0   0.0   0.0  25654976.0  ...  40702624.0  51475188.0   0.0   0.0
1092  37345780.0   0.0   0.0  27314730.0  ...  43335368.0  54796792.0   0.0   0.0
1093  39085112.0   0.0   0.0  28580688.0  ...  45344092.0  57343432.0   0.0   0.0
1094  36038656.0   0.0   0.0  26352948.0  ...  41809144.0  52874280.0   0.0   0.0

[1095 rows x 40

In [4]:
#divide into train and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(876, 4000)
(219, 4000)
(876,)
(219,)


In [5]:
#scale data & normalize
t = MinMaxScaler()
t.fit(x_train)
t.fit(x_test)

x_train = t.transform(x_train)
x_test = t.transform(x_test)

In [6]:
#set classifier to vote of 3 neighbors, weighted by distance from k
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='distance')

In [7]:
#compare labels to preduction from KNN model
y_expect = y_test
y_pred = knn.predict(x_test)

#get metrics, SAE
print(metrics.classification_report(y_expect, y_pred))

              precision    recall  f1-score   support

           0       0.40      0.08      0.13        25
           1       0.22      0.12      0.15        17
           2       0.82      0.95      0.88       177

    accuracy                           0.79       219
   macro avg       0.48      0.38      0.39       219
weighted avg       0.73      0.79      0.74       219

