In [1]:
# import libraries

import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score


In [2]:
# Column names of the input features: 'feature_1' through 'feature_768'

FEATURES = ['feature_' + str(idx) for idx in range(1, 769)]


In [3]:
# Names of the target columns; one model is trained per entry

LABELS = [
  'label_1',
  'label_2',
  'label_3',
  'label_4',
]


In [4]:
# Load the train / validation / test splits from CSV

df_train, df_valid, df_test = map(
  pd.read_csv,
  (
    '../Dataset/layer_8_train.csv',
    '../Dataset/layer_8_valid.csv',
    '../Dataset/layer_8_test.csv',
  ),
)


In [20]:
# store the data in a dictionary for each label
#
# data_dict[label] holds numpy arrays under the keys
# 'x_train', 'y_train', 'x_valid', 'y_valid' and 'x_test'.
# Train/validation rows whose value for `label` is NaN are dropped, so the
# number of rows can differ from one label to the next. The test features are
# taken unfiltered.

data_dict = dict()

for label in LABELS:
  # Build each row mask once and reuse it for both X and y. Selecting rows and
  # columns in a single .loc call avoids first copying every column of the
  # filtered frame just to pick the feature columns out of it afterwards.
  train_mask = df_train[label].notna()
  valid_mask = df_valid[label].notna()

  data_dict[label] = {
    'x_train': df_train.loc[train_mask, FEATURES].values,
    'y_train': df_train.loc[train_mask, label].values,
    'x_valid': df_valid.loc[valid_mask, FEATURES].values,
    'y_valid': df_valid.loc[valid_mask, label].values,
    'x_test': df_test[FEATURES].values,
  }


KeyboardInterrupt: 

In [6]:
# One RBF-kernel SVC per label. The labels share gamma='scale' and differ only
# in the regularisation parameter C, listed below in LABELS order.

model_dict = {
  LABELS[position]: SVC(C=c_value, gamma='scale', kernel='rbf')
  for position, c_value in enumerate((100, 30, 100, 30))
}


In [7]:
# PCA for label 2, 3, 4
#
# Every label except the first gets its own PCA, fitted on that label's
# training features only and then applied to its validation and test features.
# n_components=0.95 asks for as many components as are needed to explain 95%
# of the variance. The first label keeps its original features.

for label in LABELS[1:]:
  splits = data_dict[label]

  # The fitted PCA is stored alongside the arrays it transformed. Finding it
  # here means this label was already reduced, so re-running the cell is a
  # no-op instead of stacking a second PCA on top of the first.
  if 'pca' in splits:
    continue

  pca = PCA(n_components=0.95, svd_solver='full')

  # Dict entries are evaluated top to bottom, so the fit on the training split
  # happens before the two transform calls.
  reduced = {
    'x_train': pca.fit_transform(splits['x_train']),
    'x_valid': pca.transform(splits['x_valid']),
    'x_test': pca.transform(splits['x_test']),
  }

  # Commit all three arrays and the marker together, so a failure above
  # cannot leave the label half-transformed.
  splits.update(reduced, pca=pca)


In [8]:
# Fit each label's model, print its validation accuracy, and collect the
# test-set predictions into one DataFrame with a column per label.

df_test_pred = pd.DataFrame()

for label in LABELS:
  clf = model_dict[label]
  splits = data_dict[label]

  clf.fit(splits['x_train'], splits['y_train'])

  # validation predictions are kept in data_dict next to the ground truth
  splits['y_pred'] = clf.predict(splits['x_valid'])
  valid_accuracy = accuracy_score(splits['y_valid'], splits['y_pred'])
  print('accuracy_score for {}: '.format(label), valid_accuracy)

  splits['y_pred_test'] = clf.predict(splits['x_test'])
  df_test_pred[label] = splits['y_pred_test']


accuracy_score for label_1:  0.9746666666666667
accuracy_score for label_2:  0.9442934782608695
accuracy_score for label_3:  0.9986666666666667
accuracy_score for label_4:  0.984


In [9]:
# df_test_pred.index += 1
# df_test_pred.index.name = 'ID'
# df_test_pred.to_csv('submission_final.csv')


In [18]:
# Predict on each label's own training features and write the predictions to
# 'layer_8_train_pred_<label>.csv' (one file per label, no index column).
#
# NOTE: x_train only contains the rows where that label is not NaN, so the
# files can have different row counts and are not row-aligned with each other.

for label in LABELS:
  splits = data_dict[label]
  splits['y_pred_train'] = model_dict[label].predict(splits['x_train'])

  out_path = 'layer_8_train_pred_{}.csv'.format(label)
  pd.DataFrame(splits['y_pred_train']).to_csv(out_path, index=False)


In [19]:
# Number of training predictions per label, one count per line
print(
  *(
    len(data_dict[key]['y_pred_train'])
    for key in ('label_1', 'label_2', 'label_3', 'label_4')
  ),
  sep='\n',
)


28520
28040
28520
28520
