In [116]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier


In [82]:
LABEL = 'label_2'
OTHER_LABELS = ['label_1', 'label_3', 'label_4']
LAYERS = ['7', '8', '9', '10', '11', '12']

## Load data

In [83]:
def load_data(x_train_df, y_train_df, x_valid_df, y_valid_df, x_test_df, y_test_df):

    for layer in LAYERS:
        train_df = pd.read_csv(f"dataset/layer_{layer}_train.csv").drop(OTHER_LABELS, axis=1)
        valid_df = pd.read_csv(f"dataset/layer_{layer}_valid.csv").drop(OTHER_LABELS, axis=1)
        test_df = pd.read_csv(f"dataset/layer_{layer}_test.csv").drop(OTHER_LABELS, axis=1)

        train_df = train_df[~np.isnan(train_df[LABEL])]
        valid_df = valid_df[~np.isnan(valid_df[LABEL])]
        test_df = test_df[~np.isnan(test_df[LABEL])]

        x_train_df[layer] = train_df.drop(LABEL, axis=1)
        y_train_df[layer] = train_df[LABEL]
        x_valid_df[layer] = valid_df.drop(LABEL, axis=1)
        y_valid_df[layer] = valid_df[LABEL]
        x_test_df[layer] = test_df.drop(LABEL, axis=1)
        y_test_df[layer] = test_df[LABEL]

## Preprocess

#### Standardize

In [84]:
def standardize(scalar, x_train, x_valid, x_test, layer):
    features = [f'feature_{i}' for i in range(1, 769)]
    x_train[layer] = pd.DataFrame(scalar.fit_transform(x_train[layer]), columns=features)
    x_valid[layer] = pd.DataFrame(scalar.transform(x_valid[layer]), columns=features)
    x_test[layer] = pd.DataFrame(scalar.transform(x_test[layer]), columns=features)

#### PCA

In [85]:
def apply_PCA(x_train, x_valid, x_test, layer, threshold):
    pca = PCA(n_components=threshold, svd_solver='full')
    x_train_pca = pd.DataFrame(pca.fit_transform(x_train[layer]))
    x_valid_pca = pd.DataFrame(pca.transform(x_valid[layer]))
    x_test_pca = pd.DataFrame(pca.transform(x_test[layer]))

    x_train[layer] = x_train_pca
    x_valid[layer] = x_valid_pca
    x_test[layer] = x_test_pca

    print(f"No of features after PCA for layer {layer} = {len(pca.explained_variance_ratio_)}")

In [86]:
def l7_preprocess(x_train, x_valid, x_test, layer):
    scalar = RobustScaler()
    standardize(scalar, x_train, x_valid, x_test, layer)    
    threshold = 0.97
    apply_PCA(x_train, x_valid, x_test, layer, threshold)


def l8_preprocess(x_train, x_valid, x_test, layer):
    scalar = RobustScaler()
    standardize(scalar, x_train, x_valid, x_test, layer)
    threshold = 0.96
    apply_PCA(x_train, x_valid, x_test, layer, threshold)


def l9_preprocess(x_train, x_valid, x_test, layer):
    scalar = StandardScaler()
    standardize(scalar, x_train, x_valid, x_test, layer)
    threshold = 0.96
    apply_PCA(x_train, x_valid, x_test, layer, threshold)


def l10_preprocess(x_train, x_valid, x_test, layer):
    scalar = StandardScaler()
    standardize(scalar, x_train, x_valid, x_test, layer)
    threshold = 0.96
    apply_PCA(x_train, x_valid, x_test, layer, threshold)


def l11_preprocess(x_train, x_valid, x_test, layer):
    scalar = RobustScaler()
    standardize(scalar, x_train, x_valid, x_test, layer)
    threshold = 0.98
    apply_PCA(x_train, x_valid, x_test, layer, threshold)


def l12_preprocess(x_train, x_valid, x_test, layer):
    scalar = RobustScaler()
    standardize(scalar, x_train, x_valid, x_test, layer)
    threshold = 0.99
    apply_PCA(x_train, x_valid, x_test, layer, threshold)

In [87]:
def preprocess(x_train, x_valid, x_test, layer):
    if layer == '7':
        l7_preprocess(x_train, x_valid, x_test, layer)
    elif layer == '8':
        l8_preprocess(x_train, x_valid, x_test, layer)
    elif layer == '9':
        l9_preprocess(x_train, x_valid, x_test, layer)
    elif layer == '10':
        l10_preprocess(x_train, x_valid, x_test, layer)
    elif layer == '11':
        l11_preprocess(x_train, x_valid, x_test, layer)
    elif layer == '12':
        l12_preprocess(x_train, x_valid, x_test, layer)

## Classifires

In [88]:
def svm_classifier(x_train, y_train, x_valid, y_valid, params):

  kernel = params['kernel']
  gamma = params['gamma']
  degree = params['degree']
  class_weight = params['class_weight']
  c = params['C']
  
  if (degree):
    model = SVC(kernel=kernel, gamma=gamma, degree=degree, class_weight=class_weight, C=c)
  else: 
    model = SVC(kernel=kernel, gamma=gamma, class_weight=class_weight, C=c)
  model.fit(x_train, y_train)
  y_predict = model.predict(x_valid)
  accuracy = accuracy_score(y_valid, y_predict)

  return {"accuracy": accuracy, "predictions": y_predict, "model": model}

## Random grid search

In [89]:
def random_grid_search(model, param_dist, cv, n_iter, x_train, y_train):

  random_search = RandomizedSearchCV(
      estimator=model,
      param_distributions=param_dist,
      scoring='accuracy',
      cv=cv,
      verbose=1,
      n_jobs=-1,
      n_iter=n_iter
  )

  random_search.fit(x_train, y_train)

  return random_search

## Process

In [90]:
x_train_df = {}
y_train_df = {}
x_valid_df = {}
y_valid_df = {}
x_test_df = {}
y_test_df = {}

load_data(x_train_df, y_train_df, x_valid_df, y_valid_df, x_test_df, y_test_df)

In [91]:
for layer in LAYERS:
    preprocess(x_train_df, x_valid_df, x_test_df, layer)

No of features after PCA for layer 7 = 411
No of features after PCA for layer 8 = 349
No of features after PCA for layer 9 = 350
No of features after PCA for layer 10 = 310
No of features after PCA for layer 11 = 277
No of features after PCA for layer 12 = 280


In [92]:
params = {
    '7': {'kernel': 'rbf', 'gamma': 'scale', 'degree': 2, 'class_weight': 'balanced', 'C': 1},
    '8': {'kernel': 'poly', 'gamma': 10.0, 'degree': False, 'class_weight': 'balanced', 'C': 1.0},
    '9': {'kernel': 'poly', 'gamma': 100.0, 'degree': False, 'class_weight': 'balanced', 'C': 100.0},
    '10': {'kernel': 'poly', 'gamma': 0.1, 'degree': False, 'class_weight': 'balanced', 'C': 1000.0},
    '11': {'kernel': 'poly', 'gamma': 1000.0, 'degree': False, 'class_weight': 'balanced', 'C': 1.0},
    '12': {'kernel': 'poly', 'gamma': 0.1, 'degree': 1, 'class_weight': 'balanced' ,'C': 10}
}

In [93]:
history = {}

for layer in LAYERS:
    data = svm_classifier(x_train_df[layer], y_train_df[layer], x_valid_df[layer], y_valid_df[layer], params[layer])
    print(f"Accuracy for layer {layer} = {data['accuracy']}")
    history[layer] = data

Accuracy for layer 7 = 0.9116847826086957
Accuracy for layer 8 = 0.9266304347826086
Accuracy for layer 9 = 0.9347826086956522
Accuracy for layer 10 = 0.9524456521739131
Accuracy for layer 11 = 0.8464673913043478
Accuracy for layer 12 = 0.6820652173913043


#### Meta Model

In [94]:
data = {}

for layer in LAYERS:
    data[layer] = history[layer]["predictions"]

data["label"] = y_valid_df['7']

data_df = pd.DataFrame(data)

In [97]:
test_data = {}

for layer in LAYERS:
    model = history[layer]["model"]
    test_data[layer] = model.predict(x_test_df[layer])

test_data["label"] = y_test_df['7']

test_data_df = pd.DataFrame(test_data)


In [120]:
x_train_meta = data_df.drop(['label'], axis=1)
y_train_meta = data_df['label']

# x_valid_meta = test_data_df.drop(['label'], axis=1)
# y_valid_meta = test_data_df['label']

x_train_meta, x_valid_meta, y_train_meta, y_valid_meta = train_test_split(x_train_meta, y_train_meta, test_size=0.2, random_state=42)


In [121]:
meta_model_svc = SVC(kernel='linear')
meta_model_svc.fit(x_train_meta, y_train_meta)
y_predict_meta_svc = meta_model_svc.predict(x_valid_meta)
accuracy_svc = accuracy_score(y_valid_meta, y_predict_meta_svc)

print(f"Accuracy of meta model (SVC): {accuracy_svc}")

Accuracy of meta model (SVC): 0.8986486486486487


In [122]:
meta_model_random = RandomForestClassifier(n_estimators=100, random_state=42)
meta_model_random.fit(x_train_meta, y_train_meta)
y_predict_meta_random = meta_model_random.predict(x_valid_meta)
accuracy_random = accuracy_score(y_valid_meta, y_predict_meta_random)

print(f"Accuracy of meta model (RandomForest): {accuracy_random}")

Accuracy of meta model (RandomForest): 0.9527027027027027


In [123]:
meta_model_catboost = CatBoostClassifier(iterations=1000, random_seed=42)
meta_model_catboost.fit(x_train_meta, y_train_meta)
y_predict_meta_catboost = meta_model_catboost.predict(x_valid_meta)
accuracy_catboost = accuracy_score(y_valid_meta, y_predict_meta_catboost)

print(f"Accuracy of meta model (CatBoost): {accuracy_catboost}")

Learning rate set to 0.077013
0:	learn: 2.4928865	total: 7.91ms	remaining: 7.9s
1:	learn: 2.2352803	total: 13.9ms	remaining: 6.94s
2:	learn: 2.0318803	total: 38.8ms	remaining: 12.9s
3:	learn: 1.8648556	total: 45.3ms	remaining: 11.3s
4:	learn: 1.7162606	total: 57.9ms	remaining: 11.5s
5:	learn: 1.6155036	total: 65.2ms	remaining: 10.8s
6:	learn: 1.5189347	total: 91.6ms	remaining: 13s
7:	learn: 1.4408550	total: 94.2ms	remaining: 11.7s
8:	learn: 1.3832065	total: 97.5ms	remaining: 10.7s
9:	learn: 1.3144856	total: 101ms	remaining: 9.98s
10:	learn: 1.2350259	total: 104ms	remaining: 9.31s
11:	learn: 1.1725866	total: 107ms	remaining: 8.79s
12:	learn: 1.1253872	total: 109ms	remaining: 8.29s
13:	learn: 1.0816481	total: 112ms	remaining: 7.87s
14:	learn: 1.0299762	total: 115ms	remaining: 7.58s
15:	learn: 0.9818565	total: 118ms	remaining: 7.26s
16:	learn: 0.9369927	total: 121ms	remaining: 6.97s
17:	learn: 0.8996970	total: 123ms	remaining: 6.71s
18:	learn: 0.8594310	total: 126ms	remaining: 6.49s
19:	l

38:	learn: 0.4823927	total: 199ms	remaining: 4.9s
39:	learn: 0.4723764	total: 203ms	remaining: 4.88s
40:	learn: 0.4628029	total: 207ms	remaining: 4.84s
41:	learn: 0.4533253	total: 211ms	remaining: 4.81s
42:	learn: 0.4428058	total: 214ms	remaining: 4.76s
43:	learn: 0.4331588	total: 216ms	remaining: 4.7s
44:	learn: 0.4273926	total: 219ms	remaining: 4.64s
45:	learn: 0.4222604	total: 221ms	remaining: 4.59s
46:	learn: 0.4151616	total: 225ms	remaining: 4.55s
47:	learn: 0.4083167	total: 228ms	remaining: 4.52s
48:	learn: 0.4005009	total: 231ms	remaining: 4.48s
49:	learn: 0.3918919	total: 233ms	remaining: 4.43s
50:	learn: 0.3850923	total: 236ms	remaining: 4.39s
51:	learn: 0.3756823	total: 239ms	remaining: 4.35s
52:	learn: 0.3685920	total: 241ms	remaining: 4.3s
53:	learn: 0.3638950	total: 244ms	remaining: 4.27s
54:	learn: 0.3573441	total: 246ms	remaining: 4.22s
55:	learn: 0.3531061	total: 248ms	remaining: 4.19s
56:	learn: 0.3475226	total: 251ms	remaining: 4.15s
57:	learn: 0.3439428	total: 254ms	

#### Hyper-parameter tuning

In [124]:
param_dist_svc = {
    'C': np.logspace(-3, 3, 7),
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': np.logspace(-3, 3, 7),
    'class_weight': ['balanced']
}
cv = 2
n_iter = 5
svm = SVC()

random_search_svc = random_grid_search(svm, param_dist_svc, cv, n_iter, x_train_meta, y_train_meta)
best_model_svc = random_search_svc.best_estimator_
best_accuracy_svc = random_search_svc.best_score_
best_param_svc = random_search_svc.best_params_

print(best_param_svc)
print(best_accuracy_svc)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
{'kernel': 'poly', 'gamma': 1000.0, 'class_weight': 'balanced', 'C': 100.0}
0.8945578231292517


In [125]:
y_pred_meta_svc_2 = best_model_svc.predict(x_valid_meta)
accuracy = accuracy_score(y_valid_meta, y_pred_meta_svc_2)
print(f"Accuracy meta model after: {accuracy}")

Accuracy meta model after: 0.9121621621621622


In [126]:
param_dist_random = {
    'n_estimators': np.arange(50, 1001, 50),
    'max_depth': np.arange(3, 20)
}
cv = 2
n_iter = 5
random = RandomForestClassifier()

random_search_random = random_grid_search(random, param_dist_random, cv, n_iter, x_train_meta, y_train_meta)
best_model_random = random_search_random.best_estimator_
best_accuracy_random = random_search_random.best_score_
best_param_random = random_search_random.best_params_

print(best_param_random)
print(best_accuracy_random)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
{'n_estimators': 900, 'max_depth': 8}
0.9506802721088435


In [127]:
y_pred_meta_random_2 = best_model_random.predict(x_valid_meta)
accuracy = accuracy_score(y_valid_meta, y_pred_meta_random_2)
print(f"Accuracy meta model after: {accuracy}")

Accuracy meta model after: 0.9459459459459459
