In [16]:
# import libraries

import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score


In [17]:
# column names of the 768 feature columns: feature_1 ... feature_768

FEATURES = ['feature_' + str(idx) for idx in range(1, 769)]


In [18]:
# names of the target columns; one model is trained per entry

LABELS = [
  'label_1',
  'label_2',
  'label_3',
  'label_4',
]


In [19]:
# load the train / validation / test CSV files into data frames

df_train, df_valid, df_test = (
  pd.read_csv(csv_path)
  for csv_path in (
    '../Dataset/layer_11_train.csv',
    '../Dataset/layer_11_valid.csv',
    '../Dataset/layer_11_test.csv',
  )
)


In [20]:
# build, for every label, the feature / target arrays of each split

data_dict = {}

for label in LABELS:
  # rows whose value for this label is missing are left out of train/valid
  train_rows = df_train[df_train[label].notna()]
  valid_rows = df_valid[df_valid[label].notna()]

  data_dict[label] = {
    'x_train': train_rows[FEATURES].values,
    'y_train': train_rows[label].values,
    'x_valid': valid_rows[FEATURES].values,
    'y_valid': valid_rows[label].values,
    # the test frame is used in full, without any row filtering
    'x_test': df_test[FEATURES].values,
  }


In [21]:
# a scaler object, a standalone SVC, and one identically configured
# RBF-kernel SVC per label (stored in model_dict, keyed by label name)

scaler = StandardScaler()

model = SVC(kernel='rbf', C=100, gamma='scale', random_state=42)

model_dict = {
  label: SVC(kernel='rbf', C=100, gamma='scale', random_state=42)
  for label in LABELS
}


In [22]:
# train and test the models for each label
#
# For every label this cell:
#   1. fits a StandardScaler on that label's training features,
#   2. replaces x_train / x_valid / x_test in data_dict with their scaled versions,
#   3. fits the label's SVC and prints its accuracy on the validation split,
#   4. stores the validation predictions ('y_pred') and the test predictions
#      ('y_pred_test'), and adds the latter as a column of df_test_pred.
#
# NOTE: the scaled arrays overwrite the arrays stored in data_dict, so this
# cell expects data_dict to hold the unscaled features when it runs.

df_test_pred = pd.DataFrame()

# fitted scaler of every label, keyed like model_dict
scaler_dict = dict()

for label in LABELS:
  label_data = data_dict[label]

  # Use a fresh scaler per label and keep it: refitting one shared object
  # would leave only the last label's scaling parameters available, losing
  # the preprocessing that belongs to the other models. The global name
  # `scaler` is still rebound so it ends up as the most recently fitted
  # scaler, as before.
  scaler = scaler_dict[label] = StandardScaler()
  scaler.fit(label_data['x_train'])

  label_data['x_train'] = scaler.transform(label_data['x_train'])
  label_data['x_valid'] = scaler.transform(label_data['x_valid'])
  label_data['x_test'] = scaler.transform(label_data['x_test'])

  model_dict[label].fit(label_data['x_train'], label_data['y_train'])
  label_data['y_pred'] = model_dict[label].predict(label_data['x_valid'])
  print('accuracy_score for {}: '.format(label), accuracy_score(label_data['y_valid'], label_data['y_pred']))

  label_data['y_pred_test'] = model_dict[label].predict(label_data['x_test'])
  df_test_pred[label] = label_data['y_pred_test']


accuracy_score for label_1:  0.952
accuracy_score for label_2:  0.9225543478260869
accuracy_score for label_3:  0.9986666666666667
accuracy_score for label_4:  0.96


In [23]:
# df_test_pred.index += 1
# df_test_pred.index.name = 'ID'
# df_test_pred.to_csv('submission_final.csv')


In [24]:
# predict on each label's stored training features, keep the result in
# data_dict under 'y_pred_train', and write it to one CSV file per label
for label in LABELS:
  train_pred = model_dict[label].predict(data_dict[label]['x_train'])
  data_dict[label]['y_pred_train'] = train_pred

  out_path = 'layer_11_train_pred_{}.csv'.format(label)
  pd.DataFrame(train_pred).to_csv(out_path, index=False)


In [25]:
# number of training-set predictions stored for each label
for label in ('label_1', 'label_2', 'label_3', 'label_4'):
  print(len(data_dict[label]['y_pred_train']))


28520
28040
28520
28520


In [26]:
# display the predictions the label_1 model made on its validation features
data_dict['label_1']['y_pred']


array([45, 45, 45, 30, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 30,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 22, 60, 60,
       60, 60, 60, 60, 60, 60, 51, 60, 19, 19, 19, 19, 19, 19, 19, 19, 19,
       19, 19, 19, 19, 19,  1, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 52, 52, 52, 52, 52, 52, 52, 52, 52,
       52, 52, 25, 25, 25, 25, 25, 25, 51, 25, 25, 25, 25, 46, 46, 46, 46,
       46, 46, 46, 46, 46, 46, 46, 51, 51, 51, 51, 51, 51, 51, 51, 35, 35,
       35, 35, 35, 35, 35, 35, 56, 56, 56, 56, 52, 56, 56, 56, 56, 56, 53,
       53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3, 40, 40, 40, 40, 40, 40, 40, 19,
       40, 40, 40, 40, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 58,
       58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 59, 58, 58, 58, 58, 58, 58,
       58, 58, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 23,
       44, 44, 44, 44, 37