<a href="https://colab.research.google.com/github/Elvira-Zainulina/ml_MRI_age_prediction/blob/master/Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Baseline model for classification on the image data

In [0]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.decomposition import KernelPCA

In [0]:
def load_data(path):
    """Load images, age labels and gender labels from a saved NumPy dataset.

    Parameters
    ----------
    path : str
        Location of a file readable by ``np.load`` that exposes the entries
        ``'images'``, ``'ages'`` and ``'genders'``.

    Returns
    -------
    tuple
        ``(X, y, g)``: the ``'images'``, ``'ages'`` and ``'genders'`` entries,
        in that order.
    """
    data = np.load(path)
    try:
        X = data['images']
        y = data['ages']
        g = data['genders']
    finally:
        # For archive files ``np.load`` returns an object that keeps the file
        # open; release it once the entries have been read. Plain arrays have
        # no ``close`` method and are left untouched.
        close = getattr(data, 'close', None)
        if close is not None:
            close()
    return X, y, g

### Data loading

In [0]:
# Folder containing the train/val/test dataset files loaded in the following cells.
# NOTE(review): this is a relative path, so it resolves against the current
# working directory; Drive is mounted at '/content/drive', so this presumably
# relies on the working directory being '/content' -- confirm.
path = r"drive/My Drive/Colab Notebooks/ml-project/data"

In [0]:
# Training split: images, age labels and gender labels.
train_file = os.path.join(path, 'train_dataset.npy')
X_train, y_train, g_train = load_data(train_file)

In [0]:
# Validation split: images, age labels and gender labels.
val_file = os.path.join(path, 'val_dataset.npy')
X_val, y_val, g_val = load_data(val_file)

In [0]:
# Test split: images, age labels and gender labels.
test_file = os.path.join(path, 'test_dataset.npy')
X_test, y_test, g_test = load_data(test_file)

In [0]:
# Stack the training and validation images along the sample axis.
X_train_full = np.concatenate((X_train, X_val), axis=0)
X_train_full.shape

(800, 1, 128, 128, 128)

### Data reshaping

In [0]:
# Flatten each image into a single feature row (one row per sample).
X_train_full = X_train_full.reshape(len(X_train_full), -1)
X_train_full.shape

(800, 2097152)

In [0]:
# Flatten each training image into a single feature row.
X_train = X_train.reshape(len(X_train), -1)
X_train.shape

(700, 2097152)

In [0]:
# Flatten each validation image into a single feature row.
X_val = X_val.reshape(len(X_val), -1)
X_val.shape

(100, 2097152)

In [0]:
# Flatten each test image into a single feature row.
X_test = X_test.reshape(len(X_test), -1)
X_test.shape

(99, 2097152)

In [0]:
# Turn each gender vector into a single-column matrix so it can be
# concatenated column-wise with the KernelPCA features later on.
g_train, g_test, g_val = (
    genders.reshape(-1, 1) for genders in (g_train, g_test, g_val)
)

### Fitting KernelPCA

In [0]:
# RBF-kernel PCA keeping 100 components; the seed is fixed for reproducibility.
pca = KernelPCA(
    n_components=100,
    kernel='rbf',
    copy_X=False,
    random_state=42,
    n_jobs=-1,
)

In [0]:
%%time
pca.fit_transform(X_train_full)

CPU times: user 6min 37s, sys: 36.2 s, total: 7min 14s
Wall time: 2min 49s


### Obtain features via KernelPCA

In [0]:
# Project the flattened training images with the fitted KernelPCA model.
X_train_tr = pca.transform(X_train)

In [0]:
# Project the flattened validation images with the fitted KernelPCA model.
X_val_tr = pca.transform(X_val)

In [0]:
# Project the flattened test images with the fitted KernelPCA model.
X_test_tr = pca.transform(X_test)

## kNN

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

### $4$-class classification

In [0]:
# Validation search over the number of neighbours k = 3..9.
# The feature matrices (KernelPCA features + gender column) do not depend on
# k, so they are built once instead of on every iteration.
train_features = np.hstack([X_train_tr, g_train])
val_features = np.hstack([X_val_tr, g_val])

scores = []
for n in range(3, 10):
    knn = KNeighborsClassifier(n_neighbors=n, n_jobs=-1)
    knn.fit(train_features, y_train)
    pred = knn.predict(val_features)
    # Micro-averaged F1 on the validation split for this k.
    scores.append(f1_score(y_val, pred, average='micro'))

In [0]:
# Keep every candidate k (3..9) whose validation score equals the best one.
is_best = np.asarray(scores) == max(scores)
best_param = np.arange(3, 10)[is_best]
best_param

array([4])

In [0]:
# Refit with the k selected on the validation split rather than a hard-coded
# value, so this cell stays correct if the search result changes.
# Ties are resolved in favour of the smallest k (first entry of best_param).
knn = KNeighborsClassifier(n_neighbors=int(best_param[0]), n_jobs=-1)
knn.fit(np.hstack([X_train_tr, g_train]), y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=4, p=2,
                     weights='uniform')

In [0]:
# Micro-averaged F1 of the fitted kNN on the training split.
train_inputs = np.hstack((X_train_tr, g_train))
y_train_pred = knn.predict(train_inputs)
f1_score(y_true=y_train, y_pred=y_train_pred, average='micro')

0.6557142857142857

In [0]:
# Micro-averaged F1 of the fitted kNN on the validation split.
val_inputs = np.hstack((X_val_tr, g_val))
y_val_pred = knn.predict(val_inputs)
f1_score(y_true=y_val, y_pred=y_val_pred, average='micro')

0.42999999999999994

In [0]:
# Micro-averaged F1 of the fitted kNN on the test split.
test_inputs = np.hstack((X_test_tr, g_test))
y_test_pred = knn.predict(test_inputs)
f1_score(y_true=y_test, y_pred=y_test_pred, average='micro')

0.45454545454545453

### $2$-class classification

Classification for $2$ classes: age labels below $2$ are merged into class $0$, and all remaining labels into class $1$.

In [0]:
def regroup_ages(y, threshold=2):
    """Collapse age labels into two classes.

    Parameters
    ----------
    y : array-like
        Age labels. The input is flattened, so the result is always 1-D.
    threshold : number, optional
        Labels strictly below this value become class 0; every other label
        becomes class 1. Defaults to 2.

    Returns
    -------
    numpy.ndarray
        1-D integer array of 0/1 labels with one entry per input label.
    """
    ages = np.asarray(y).ravel()
    # Vectorised form of "0 if x < threshold else 1". Written with ``<`` (not
    # ``>=``) so values that fail the comparison, e.g. NaN, land in class 1.
    return np.where(ages < threshold, 0, 1)

In [0]:
# Two-class versions of the age labels for every split.
y_train_2, y_val_2, y_test_2 = (
    regroup_ages(labels) for labels in (y_train, y_val, y_test)
)

In [0]:
# Validation search over the number of neighbours k = 3..9 (two-class labels).
# The feature matrices (KernelPCA features + gender column) do not depend on
# k, so they are built once instead of on every iteration.
train_features = np.hstack([X_train_tr, g_train])
val_features = np.hstack([X_val_tr, g_val])

scores = []
for n in range(3, 10):
    knn = KNeighborsClassifier(n_neighbors=n, n_jobs=-1)
    knn.fit(train_features, y_train_2)
    pred = knn.predict(val_features)
    # Micro-averaged F1 on the validation split for this k.
    scores.append(f1_score(y_val_2, pred, average='micro'))

In [0]:
# Keep every candidate k (3..9) whose validation score equals the best one,
# and display the best score next to it.
best_score = max(scores)
best_param = np.arange(3, 10)[np.array(scores) == best_score]
best_param, best_score

(array([3]), 0.61)

In [0]:
# Refit with the k selected on the validation split rather than a hard-coded
# value, so this cell stays correct if the search result changes.
# Ties are resolved in favour of the smallest k (first entry of best_param).
knn = KNeighborsClassifier(n_neighbors=int(best_param[0]), n_jobs=-1)
knn.fit(np.hstack([X_train_tr, g_train]), y_train_2)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
                     weights='uniform')

In [0]:
# Micro-averaged F1 of the two-class kNN on the training split.
train_inputs = np.hstack((X_train_tr, g_train))
y_train_pred = knn.predict(train_inputs)
f1_score(y_true=y_train_2, y_pred=y_train_pred, average='micro')

0.797142857142857

In [0]:
# Micro-averaged F1 of the two-class kNN on the validation split.
val_inputs = np.hstack((X_val_tr, g_val))
y_val_pred = knn.predict(val_inputs)
f1_score(y_true=y_val_2, y_pred=y_val_pred, average='micro')

0.61

In [0]:
# Micro-averaged F1 of the two-class kNN on the test split.
test_inputs = np.hstack((X_test_tr, g_test))
y_test_pred = knn.predict(test_inputs)
f1_score(y_true=y_test_2, y_pred=y_test_pred, average='micro')

0.6161616161616161