In [None]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier

In [2]:
from importlib import reload
import helper
# Re-execute the already-imported helper module so that edits made to helper.py
# are picked up without restarting the kernel. The module repr is the cell output.
reload(helper)

<module 'helper' from '/home/davinci/term3/ml_proj/helper.py'>

In [3]:
# Locations of the validation images and of their label files.
image_dir = './data/validation'
label_dir = './data/validation_labels'

# helper.load_dataset returns one label matrix per problem (MF, BP, CC).
# NOTE(review): cut_per_set=None presumably disables per-set truncation — confirm in helper.py.
y_MF, y_BP, y_CC = helper.load_dataset(
    image_dir,
    label_dir,
    cut_per_set=None,
)

In [4]:
# Display the shapes of the three label matrices returned by helper.load_dataset.
y_MF.shape, y_BP.shape, y_CC.shape

((2904, 489), (2904, 1943), (2904, 320))

In [5]:
# Load the pre-computed feature matrix for the validation set.
# np.load on an .npz archive returns a lazy NpzFile that keeps the file open, so it
# is used as a context manager: the array is read into memory inside the block and
# the archive is closed afterwards. This also stops `X` from briefly naming the
# archive object instead of the array.
# SECURITY NOTE: allow_pickle=True can execute arbitrary code if the archive holds
# pickled objects — only load files you created yourself. If the stored array has a
# plain numeric dtype, this flag can be dropped (TODO confirm and remove).
with np.load('./data/vectorization/validation.npz', allow_pickle=True) as npz_file:
    X = npz_file['arr_0']

In [6]:
# Hold out 30% of the samples for evaluation; only the BP labels are used as targets.
# NOTE(review): no seed is passed here — confirm that helper.split_dataset splits deterministically.
X_train, X_test, y_train, y_test = helper.split_dataset(
    X,
    y_BP,
    test_size=0.3,
)

In [7]:
# Display the shapes of the train/test label matrices produced by helper.split_dataset.
# NOTE(review): the recorded output has fewer label columns than y_BP had before the
# split — presumably split_dataset filters label columns; confirm in helper.py.
y_train.shape, y_test.shape

((2032, 1720), (872, 1720))

# Dimensionality reduction

Let's figure out the best way to reduce dimensionality. We will try PCA, FastICA and Isomap (as in HW2). We will use a KNeighborsClassifier as the downstream classifier to keep training time low.

### PCA

In [None]:
from sklearn.decomposition import PCA

# Baseline: standardise the features, project onto 256 principal components,
# then classify with 3 nearest neighbours under cosine distance.
n_components = 256
pipeline_pca = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(random_state=42, n_components=n_components)),
    ('clf', KNeighborsClassifier(metric='cosine', n_neighbors=3)),
])

# Pipeline.fit returns the fitted pipeline, so fitting and predicting are chained.
y_pred = pipeline_pca.fit(X_train, y_train).predict(X_test)
print(format(helper.count_f1_max(y_pred, y_test), '.5f'))

0.19974


In [None]:
# "Bigger" PCA variant: same pipeline as the 256-component one, with more components.
# NOTE(review): the component count is derived from the number of label columns in
# y_train (half of it), not from the feature count of X_train — confirm this is intended.
n_components = y_train.shape[-1] // 2
pipeline_pca_bigger = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(random_state=42, n_components=n_components)),
    ('clf', KNeighborsClassifier(metric='cosine', n_neighbors=3)),
])

# Fit on the training split, predict the held-out split, and report the score.
y_pred = pipeline_pca_bigger.fit(X_train, y_train).predict(X_test)
print(format(helper.count_f1_max(y_pred, y_test), '.5f'))

0.20126


### ICA

In [10]:
from sklearn.decomposition import FastICA

# "Small" FastICA variant: standardise, extract 256 independent components
# (at most 2000 iterations), then 3-NN with cosine distance.
n_components = 256
pipeline_ica = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ica', FastICA(max_iter=2_000, random_state=42, n_components=n_components)),
    ('clf', KNeighborsClassifier(metric='cosine', n_neighbors=3)),
])

# Fit on the training split, predict the held-out split, and report the score.
y_pred = pipeline_ica.fit(X_train, y_train).predict(X_test)
print(format(helper.count_f1_max(y_pred, y_test), '.5f'))



0.28014


In [11]:
# "Bigger" FastICA variant: more components and a higher iteration cap (3000).
# NOTE(review): the component count is derived from the number of label columns in
# y_train (half of it), not from the feature count of X_train — confirm this is intended.
n_components = y_train.shape[-1] // 2
pipeline_ica_bigger = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ica', FastICA(max_iter=3_000, random_state=42, n_components=n_components)),
    ('clf', KNeighborsClassifier(metric='cosine', n_neighbors=3)),
])

# Fit on the training split, predict the held-out split, and report the score.
y_pred = pipeline_ica_bigger.fit(X_train, y_train).predict(X_test)
print(format(helper.count_f1_max(y_pred, y_test), '.5f'))

0.24146


### Isomap

In [None]:
from sklearn.manifold import Isomap

# Isomap variant: standardise, embed into 256 dimensions, then 3-NN with cosine distance.
# Isomap is not given a random_state here, unlike the PCA/FastICA pipelines.
n_components = 256
pipeline_isomap = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('isomap', Isomap(n_jobs=10, max_iter=1_000, n_components=n_components)),
    ('clf', KNeighborsClassifier(metric='cosine', n_neighbors=3)),
])

# Fit on the training split, predict the held-out split, and report the score.
y_pred = pipeline_isomap.fit(X_train, y_train).predict(X_test)
print(format(helper.count_f1_max(y_pred, y_test), '.5f'))

0.14137


In [None]:
# "Bigger" Isomap variant: same pipeline as the 256-dimensional one, with more dimensions.
# NOTE(review): the component count is derived from the number of label columns in
# y_train (half of it), not from the feature count of X_train — confirm this is intended.
n_components = y_train.shape[-1] // 2
pipeline_isomap_bigger = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('isomap', Isomap(n_jobs=10, max_iter=1_000, n_components=n_components)),
    ('clf', KNeighborsClassifier(metric='cosine', n_neighbors=3)),
])

# Fit on the training split, predict the held-out split, and report the score.
y_pred = pipeline_isomap_bigger.fit(X_train, y_train).predict(X_test)
print(format(helper.count_f1_max(y_pred, y_test), '.5f'))

0.15148


That's settled then. We will use the "small" FastICA (256 components), which gave the best score above (0.28014).

# SVC

We will check out several SVC kernels: linear, RBF, sigmoid.

In [9]:
from sklearn.decomposition import FastICA

# Linear-kernel SVC on 256 FastICA components. SVC is wrapped in
# MultiOutputClassifier, which fits one classifier per label column.
# Note: max_iter is 3000 here, whereas the KNN "small" FastICA run used 2000.
n_components = 256
pipeline_linear_svc = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ica', FastICA(max_iter=3_000, random_state=42, n_components=n_components)),
    ('clf', MultiOutputClassifier(SVC(random_state=42, kernel='linear'))),
])

# Fit on the training split, predict the held-out split, and report the score.
y_pred = pipeline_linear_svc.fit(X_train, y_train).predict(X_test)
print(format(helper.count_f1_max(y_pred, y_test), '.5f'))



0.12721


In [10]:
# RBF-kernel SVC on 256 FastICA components, one classifier per label column.
n_components = 256
pipeline_rbf_svc = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ica', FastICA(max_iter=3_000, random_state=42, n_components=n_components)),
    ('clf', MultiOutputClassifier(SVC(random_state=42, kernel='rbf'))),
])

# Fit on the training split, predict the held-out split, and report the score.
y_pred = pipeline_rbf_svc.fit(X_train, y_train).predict(X_test)
print(format(helper.count_f1_max(y_pred, y_test), '.5f'))



0.05806


In [11]:
# Sigmoid-kernel SVC on 256 FastICA components, one classifier per label column.
n_components = 256
pipeline_sigmoid_svc = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ica', FastICA(max_iter=3_000, random_state=42, n_components=n_components)),
    ('clf', MultiOutputClassifier(SVC(random_state=42, kernel='sigmoid'))),
])

# Fit on the training split, predict the held-out split, and report the score.
y_pred = pipeline_sigmoid_svc.fit(X_train, y_train).predict(X_test)
print(format(helper.count_f1_max(y_pred, y_test), '.5f'))



0.12583


# Other ML models

### RF

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import FastICA

# Random forest on 256 FastICA components. The forest is wrapped in
# MultiOutputClassifier, so a separate forest is fitted per label column;
# n_jobs=10 applies inside each forest.
n_components = 256

pipeline_rf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ica', FastICA(max_iter=3_000, random_state=42, n_components=n_components)),
    ('clf', MultiOutputClassifier(RandomForestClassifier(n_jobs=10, random_state=42))),
])

# Fit on the training split, predict the held-out split, and report the score.
y_pred_rf = pipeline_rf.fit(X_train, y_train).predict(X_test)
print(format(helper.count_f1_max(y_pred_rf, y_test), '.5f'))



0.11835


### LogReg

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import FastICA

n_components = 256

# Logistic regression on 256 FastICA components, one model per label column.
# Parallelism is requested on MultiOutputClassifier, which fits the per-label
# models in parallel. LogisticRegression's own n_jobs only parallelises over
# classes in one-vs-rest mode, so setting it on the inner estimator left the
# per-label fits running serially. The fitted models and predictions are unchanged.
pipeline_logreg = Pipeline([
    ('scaler', StandardScaler()),
    ('ica', FastICA(n_components=n_components, random_state=42, max_iter=3_000)),
    ('clf', MultiOutputClassifier(
        LogisticRegression(random_state=42, max_iter=1_000),
        n_jobs=10,
    )),
])

# Fit on the training split, predict the held-out split, and report the score.
pipeline_logreg.fit(X_train, y_train)
y_pred_logreg = pipeline_logreg.predict(X_test)
print(f"{helper.count_f1_max(y_pred_logreg, y_test):.5f}")



0.15791


# Conclusion:
The differences in results between the 3 classification problems (MF, CC, BP) were similar across the different models. Thus, the best model is KNN, and the best kernel among the SVC kernels is linear.

Script to get SVC results for all 3 problems is called `get_svc.py`.