In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


In [2]:
layer_dict = dict()


In [3]:
# layer 7
# need to do the preprocessing for the data

layer_dict['layer_7'] = dict()

#label1 pipeline
layer_dict['layer_7']['label_1'] = Pipeline([
  ('scaler', StandardScaler()),
  ('pca', PCA(n_components=0.95, svd_solver='full')),
  ('clf', SVC(C=1000, gamma=0.001))
])

#label2 pipeline
layer_dict['layer_7']['label_2'] = Pipeline([
  ('scaler', StandardScaler()),
  ('pca', PCA(n_components=0.95, svd_solver='full')),
  ('clf', SVC(C=1000))
])

#label3 pipeline
layer_dict['layer_7']['label_3'] = Pipeline([
  ('scaler', StandardScaler()),
  ('pca', PCA(n_components=0.95, svd_solver='full')),
  ('clf', SVC())
])

#label4 pipeline
layer_dict['layer_7']['label_4'] = Pipeline([
  ('scaler', StandardScaler()),
  ('pca', PCA(n_components=0.95, svd_solver='full')),
  ('clf', SVC(C=1000, class_weight='balanced'))
])


In [4]:
# layer 8
# need to do the preprocessing for the data

layer_dict['layer_8'] = dict()

#label1 pipeline
layer_dict['layer_8']['label_1'] = Pipeline([
  ('pca', PCA(n_components=0.95, svd_solver='full')),
  ('clf', SVC(C=100, gamma='scale'))
])

#label2 pipeline
layer_dict['layer_8']['label_2'] = Pipeline([
  ('pca', PCA(n_components=0.95, svd_solver='full')),
  ('clf', SVC(C=30, gamma='scale'))
])

#label3 pipeline
layer_dict['layer_8']['label_3'] = Pipeline([
  ('pca', PCA(n_components=0.95, svd_solver='full')),
  ('clf', SVC(C=100, gamma='scale'))
])

#label4 pipeline
layer_dict['layer_8']['label_4'] = Pipeline([
  ('pca', PCA(n_components=0.95, svd_solver='full')),
  ('clf', SVC(C=30, gamma='scale'))
])


In [5]:
# layer 9
# need to do the preprocessing for the data

layer_dict['layer_9'] = dict()

#label1 pipeline
layer_dict['layer_9']['label_1'] = Pipeline([
  ('scaler', StandardScaler()),
  ('clf', LogisticRegression(C=100))
])

#label2 pipeline
layer_dict['layer_9']['label_2'] = Pipeline([
  ('scaler', StandardScaler()),
  ('clf', KNeighborsClassifier(n_neighbors=1, p=2))
])

#label3 pipeline
layer_dict['layer_9']['label_3'] = Pipeline([
  ('scaler', StandardScaler()),
  ('clf', SVC(C=100, kernel='rbf'))
])

#label4 pipeline
layer_dict['layer_9']['label_4'] = Pipeline([
  ('scaler', StandardScaler()),
  ('clf', SVC(C=1, kernel='linear'))
])


In [6]:
# layer 10
# need to do the preprocessing for the data

layer_dict['layer_10'] = dict()

#label1 pipeline
layer_dict['layer_10']['label_1'] = Pipeline([
  ('scaler', StandardScaler()),
  ('clf', LogisticRegression(C=1))
])

#label2 pipeline
layer_dict['layer_10']['label_2'] = Pipeline([
  ('scaler', StandardScaler()),
  ('clf', KNeighborsClassifier(n_neighbors=1, p=2))
])

#label3 pipeline
layer_dict['layer_10']['label_3'] = Pipeline([
  ('scaler', StandardScaler()),
  ('clf', SVC(C=100, kernel='rbf'))
])

#label4 pipeline
layer_dict['layer_10']['label_4'] = Pipeline([
  ('scaler', StandardScaler()),
  ('clf', SVC(C=100, kernel='linear'))
])


In [7]:
# layer 11
# need to do the preprocessing for the data

layer_dict['layer_11'] = dict()

#same for all labels
for label in ['label_1', 'label_2', 'label_3', 'label_4']:
  layer_dict['layer_11'][label] = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', SVC(kernel='rbf', C=100, gamma='scale', random_state=42))
  ])


In [8]:
# layer 12
# need to do the preprocessing for the data

layer_dict['layer_12'] = dict()

# label1 pipeline
layer_dict['layer_12']['label_1'] = Pipeline([
  ('scaler', StandardScaler()),
  ('pca', PCA(n_components=0.95, svd_solver='full')),
  ('clf', SVC(C=1500, gamma=0.001, kernel='rbf'))
])

# label2 pipeline
layer_dict['layer_12']['label_2'] = Pipeline([
  ('scaler', StandardScaler()),
  ('pca', PCA(n_components=0.95, svd_solver='full')),
  ('clf', SVC(C=100, gamma=0.001, kernel='rbf'))
])

# label3 pipeline
layer_dict['layer_12']['label_3'] = Pipeline([
  ('scaler', StandardScaler()),
  ('pca', PCA(n_components=0.95, svd_solver='full')),
  ('clf', SVC(C=100, gamma=0.001, kernel='rbf'))
])

# label4 pipeline
layer_dict['layer_12']['label_4'] = Pipeline([
  ('scaler', StandardScaler()),
  ('pca', PCA(n_components=0.95, svd_solver='full')),
  ('clf', SVC(C=1000, gamma='auto', class_weight='balanced'))
])


In [9]:
data_dict = dict()


In [10]:
FEATURES = []
for i in range(1, 769):
  FEATURES.append('feature_' + str(i))


In [11]:
# add the data to the data_dict from the csv files
for layer in ['layer_7', 'layer_8', 'layer_9', 'layer_10', 'layer_11', 'layer_12']:

  data_dict[layer] = dict()

  # read the data from the csv files
  df_train = pd.read_csv('../Dataset/'+layer+'_train.csv')
  df_valid = pd.read_csv('../Dataset/'+layer+'_valid.csv')
  df_test = pd.read_csv('../Dataset/'+layer+'_test.csv')

  # add the data to the data_dict
  for label in ['label_1', 'label_2', 'label_3', 'label_4']:
    data_dict[layer][label] = dict()

    data_dict[layer][label]['x_train'] = df_train[df_train[label].notna()][FEATURES].values
    data_dict[layer][label]['y_train'] = df_train[df_train[label].notna()][label].values
    data_dict[layer][label]['x_valid'] = df_valid[df_valid[label].notna()][FEATURES].values
    data_dict[layer][label]['y_valid'] = df_valid[df_valid[label].notna()][label].values
    data_dict[layer][label]['x_test'] = df_test[df_test[label].notna()][FEATURES].values


In [16]:
# train pipelines
for layer in layer_dict.keys():
  for label in layer_dict[layer].keys():
    layer_dict[layer][label].fit(data_dict[layer][label]['x_train'], data_dict[layer][label]['y_train'])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
# predict the labels for the train data (used in Meta Classifier)
for layer in layer_dict.keys():
  for label in layer_dict[layer].keys():
    data_dict[layer][label]['y_train_pred'] = layer_dict[layer][label].predict(data_dict[layer][label]['x_train'])


In [18]:
# wrtie the predictions to the csv files
for label in ['label_1', 'label_2', 'label_3', 'label_4']:
  label_df = pd.DataFrame(columns=['layer_7', 'layer_8', 'layer_9', 'layer_10', 'layer_11', 'layer_12'])
  for layer in ['layer_7', 'layer_8', 'layer_9', 'layer_10', 'layer_11', 'layer_12']:
    label_df[layer] = data_dict[layer][label]['y_train_pred']
  label_df.to_csv('../Outputs/'+label+'_train_pred.csv', index=False)


In [17]:
#import xgb
from xgboost import XGBClassifier

meta_model_dict = dict()
meta_model_data_dict = dict()
df = pd.read_csv('../Dataset/layer_7_train.csv')

for label in ['label_1', 'label_2', 'label_3', 'label_4']:
  meta_model_dict[label] = SVC()
  meta_model_data_dict[label] = dict()
  meta_model_data_dict[label]['x_train'] = pd.read_csv('../Outputs/'+label+'_train_pred.csv')
  meta_model_data_dict[label]['y_train'] = df[df[label].notna()][label].values

# train the meta model for each label
for label in meta_model_dict.keys():

  print('Training Meta Model for', label)
  # new y_train for meta model
  # y_train_meta = data_dict['layer_7'][label]['y_train']
  y_train_meta = meta_model_data_dict[label]['y_train']

  print(y_train_meta.shape)
  # print(pd.DataFrame(x_train_meta).head())
  # print(pd.DataFrame(y_train_meta).head())
  # train the meta model
  meta_model_dict[label].fit(meta_model_data_dict[label]['x_train'], y_train_meta)


Training Meta Model for label_1
(28520,)
Training Meta Model for label_2
(28040,)
Training Meta Model for label_3
(28520,)
Training Meta Model for label_4
(28520,)


In [27]:
valid_predictions = dict()

# predict the labels for the valid data
for label in ['label_1', 'label_2', 'label_3', 'label_4']:
  valid_predictions[label] = dict()
  for layer in layer_dict.keys():
    valid_predictions[label][layer] = layer_dict[layer][label].predict(data_dict[layer][label]['x_valid'])

# wrtie the predictions to the csv files
for label in ['label_1', 'label_2', 'label_3', 'label_4']:
  label_df = pd.DataFrame(columns=['layer_7', 'layer_8', 'layer_9', 'layer_10', 'layer_11', 'layer_12'])
  for layer in ['layer_7', 'layer_8', 'layer_9', 'layer_10', 'layer_11', 'layer_12']:
    label_df[layer] = valid_predictions[label][layer]
  label_df.to_csv('../Outputs/'+label+'_valid_pred.csv', index=False)


In [18]:
# predict the labels for the valid data (used in Meta Classifier)
df = pd.read_csv('../Dataset/layer_7_valid.csv')
for label in ['label_1', 'label_2', 'label_3', 'label_4']:
  meta_model_data_dict[label]['x_valid'] = pd.read_csv('../Outputs/'+label+'_valid_pred.csv')
  meta_model_data_dict[label]['y_valid'] = df[df[label].notna()][label].values

  # predict the labels for the valid data
  # valid_predictions[label]['meta'] = meta_model_dict[label].predict(x_valid_meta)
  accuracy = meta_model_dict[label].score(meta_model_data_dict[label]['x_valid'], meta_model_data_dict[label]['y_valid'])
  print('Meta Classifier Accuracy for label', label, ':', accuracy)


Meta Classifier Accuracy for label label_1 : 0.8373333333333334
Meta Classifier Accuracy for label label_2 : 0.8097826086956522
Meta Classifier Accuracy for label label_3 : 0.9986666666666667
Meta Classifier Accuracy for label label_4 : 0.8826666666666667


In [None]:
final_pedictions = dict()

# predict the labels for the test data (used in Meta Classifier)
for layer in layer_dict.keys():
  for label in layer_dict[layer].keys():
    data_dict[layer][label]['y_test_pred'] = layer_dict[layer][label].predict(data_dict[layer][label]['x_test'])

# wrtie the predictions to the csv files
for label in ['label_1', 'label_2', 'label_3', 'label_4']:
  label_df = pd.DataFrame(columns=['layer_7', 'layer_8', 'layer_9', 'layer_10', 'layer_11', 'layer_12'])
  for layer in ['layer_7', 'layer_8', 'layer_9', 'layer_10', 'layer_11', 'layer_12']:
    label_df[layer] = data_dict[layer][label]['y_test_pred']
  label_df.to_csv('../Outputs/'+label+'_train_pred.csv', index=False)

# predict from meta model
for label in meta_model_dict.keys():
  # new x_test for meta model
  x_test_meta = []
  for layer in layer_dict.keys():
    x_test_meta.append(data_dict[layer][label]['y_test_pred'])
  x_test_meta = np.array(x_test_meta).T

  # predict the labels
  final_pedictions[label] = meta_model_dict[label].predict(x_test_meta)
