In [None]:
# import libraries

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler


In [None]:
# feature column names: 'feature_1' through 'feature_768'

FEATURES = ['feature_' + str(idx) for idx in range(1, 769)]

In [None]:
# target columns; the cells below fit one model per entry

LABELS = [
  'label_1',
  'label_2',
  'label_3',
  'label_4',
]

In [None]:
# load the train / validation / test splits (paths are relative to the working directory)

df_train, df_valid, df_test = map(pd.read_csv, ('train.csv', 'valid.csv', 'test.csv'))

In [None]:
# count nulls per training column, then display only the columns that have any

missing_values_sum = df_train.isna().sum()
missing_values_sum.loc[missing_values_sum.gt(0)]

Label 2 (`label_2`) has missing values. Rows where a label is missing must be dropped when building the training and validation data for that label.

In [None]:
# per-label arrays for training, validation and test;
# rows whose label is missing are dropped from the train and validation splits,
# while the test features are taken as-is
data_dict = {}

for label in LABELS:
  train_has_label = df_train[label].notna()
  valid_has_label = df_valid[label].notna()

  data_dict[label] = {
    'x_train': df_train.loc[train_has_label, FEATURES].values,
    'y_train': df_train.loc[train_has_label, label].values,
    'x_valid': df_valid.loc[valid_has_label, FEATURES].values,
    'y_valid': df_valid.loc[valid_has_label, label].values,
    'x_test': df_test[FEATURES].values,
  }

In [None]:
# create a function to write the predictions to a csv file

def write_to_csv(df_test, file_name):
  """Write a predictions frame to csv with a 1-based index column named 'ID'.

  The index shift and rename are applied to a copy, so the frame passed in is
  left untouched and calling this function again on the same frame produces
  the same file (the IDs do not drift upwards on every call).

  Parameters
  ----------
  df_test : pandas.DataFrame
      Predictions to write, one column per label. Its index must support
      ``+ 1`` (e.g. the default integer index).
  file_name : str
      Destination path, passed straight to ``DataFrame.to_csv``.
  """
  df_out = df_test.copy()

  # index.name = ID and start index from 1
  df_out.index = df_out.index + 1
  df_out.index.name = 'ID'
  df_out.to_csv(file_name)

### Baseline: no feature engineering or hyperparameter tuning

#### KNN

In [None]:
# baseline: distance-weighted 3-nearest-neighbours on the raw features, refit for each label
knn_model = KNeighborsClassifier(n_neighbors=3, weights='distance', n_jobs=7)

df_test_pred_knn = pd.DataFrame()

for label in LABELS:
  splits = data_dict[label]

  # fit on the training split and score on the validation split
  knn_model.fit(splits['x_train'], splits['y_train'])
  knn_pred = knn_model.predict(splits['x_valid'])
  valid_accuracy = accuracy_score(splits['y_valid'], knn_pred)
  print('accuracy_score for {}: '.format(label), valid_accuracy)

  # the test predictions become this label's column in the submission frame
  knn_test_pred = knn_model.predict(splits['x_test'])
  df_test_pred_knn[label] = knn_test_pred

df_test_pred_knn.head()

In [None]:
# save the baseline KNN test predictions as a submission file
write_to_csv(df_test=df_test_pred_knn, file_name='submission_initial_knn.csv')

#### SVC

In [None]:
# baseline: linear-kernel SVC on the raw features, refit for each label
svc_model = SVC(kernel='linear', C=1.0, random_state=42)

df_test_pred_svc = pd.DataFrame()

for label in LABELS:
  splits = data_dict[label]

  # fit on the training split and score on the validation split
  svc_model.fit(splits['x_train'], splits['y_train'])
  svc_pred = svc_model.predict(splits['x_valid'])
  valid_accuracy = accuracy_score(splits['y_valid'], svc_pred)
  print('accuracy_score for {}: '.format(label), valid_accuracy)

  # the test predictions become this label's column in the submission frame
  svc_test_pred = svc_model.predict(splits['x_test'])
  df_test_pred_svc[label] = svc_test_pred

df_test_pred_svc.head()

In [None]:
# save the baseline SVC test predictions as a submission file
write_to_csv(df_test=df_test_pred_svc, file_name='submission_initial_svc.csv')


## Visualize the distribution of the labels

### Training data

In [None]:
# one count plot per label on a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# axes.flat walks the grid row by row, so the labels fill it left-to-right, top-to-bottom
for ax, label in zip(axes.flat, LABELS):
    sns.countplot(data=df_train, x=label, ax=ax)
    ax.set_title(f'Distribution of {label}')
    ax.set_xlabel(label)
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()


`label_1` shows a balanced distribution, `label_3` and `label_4` each have one dominant class, and `label_2` has a skewed distribution.

### PCA

In [None]:
# linear SVC on PCA-reduced features; PCA keeps enough components to explain 97% of the variance
svc_model_pca = SVC(kernel='linear', C=1.0, random_state=42)
pca = PCA(n_components=0.97,svd_solver='full')

df_test_pred = pd.DataFrame()

for label in LABELS:
  splits = data_dict[label]

  # the projection is fitted on the training split only and reused for valid / test
  x_train_pca = pca.fit_transform(splits['x_train'])
  x_valid_pca = pca.transform(splits['x_valid'])
  x_test_pca = pca.transform(splits['x_test'])

  svc_model_pca.fit(x_train_pca, splits['y_train'])

  svc_pred = svc_model_pca.predict(x_valid_pca)
  valid_accuracy = accuracy_score(splits['y_valid'], svc_pred)
  print('accuracy_score for {}: '.format(label), valid_accuracy)

  svc_test_pred = svc_model_pca.predict(x_test_pca)
  df_test_pred[label] = svc_test_pred

df_test_pred.head()

#### SVC hyperparameter tuning

In [None]:
# halving grid search for best parameters
# (the search is run on label_1 only)

param_grid = {'C': [1, 10, 100]}
base_estimator = SVC(gamma='scale', kernel='rbf', random_state=42)

# successive halving scores the candidates on random subsamples of the training rows,
# so the search itself is seeded to make the selected parameters reproducible across runs
search = HalvingGridSearchCV(base_estimator, param_grid, cv=5, verbose=1, n_jobs=7, random_state=42)
search.fit(data_dict['label_1']['x_train'], data_dict['label_1']['y_train'])

print(search.best_params_)


In [None]:
# RBF SVC with C fixed at 100 (hard-coded here, not read from a search object), refit for each label
svc_model_after_halving_search = SVC(kernel='rbf', gamma='scale', C=100, random_state=42)

df_test_pred_after_halving_search_svc = pd.DataFrame()

for label in LABELS:
  splits = data_dict[label]

  svc_model_after_halving_search.fit(splits['x_train'], splits['y_train'])

  # validation accuracy for this label
  svc_pred = svc_model_after_halving_search.predict(splits['x_valid'])
  valid_accuracy = accuracy_score(splits['y_valid'], svc_pred)
  print('accuracy_score for {}: '.format(label), valid_accuracy)

  # test predictions for the submission frame
  svc_test_pred_after_halving_search = svc_model_after_halving_search.predict(splits['x_test'])
  df_test_pred_after_halving_search_svc[label] = svc_test_pred_after_halving_search

df_test_pred_after_halving_search_svc.head()

In [None]:
# save the tuned RBF SVC test predictions as a submission file
write_to_csv(df_test=df_test_pred_after_halving_search_svc, file_name='submission_tune_svc.csv')


### PCA + GridSearchCV + SVM

In [None]:
# empty dataframe for test predictions
df_test_pred = pd.DataFrame()

# PCA with 95% variance
pca = PCA(n_components=0.95,svd_solver='full')

# the search space and the base estimator are identical for every label,
# so they are built once instead of on every loop iteration
param_grid = {
              'C': [1, 10, 20, 30, 40, 50, 100],
              'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
              'gamma': ['scale', 'auto']
              }
base_estimator = SVC(random_state=42)

for label in LABELS:
  # PCA fit and transform on train data
  x_train_pca = pca.fit_transform(data_dict[label]['x_train'])
  # PCA transform on valid data
  x_valid_pca = pca.transform(data_dict[label]['x_valid'])
  # PCA transform on test data
  x_test_pca = pca.transform(data_dict[label]['x_test'])

  # grid search for best parameters; successive halving subsamples the training
  # rows at random, so the search is seeded to keep the chosen parameters reproducible
  search = HalvingGridSearchCV(base_estimator, param_grid, cv=5, verbose=1, n_jobs=7, random_state=42)
  search.fit(x_train_pca, data_dict[label]['y_train'])

  print(search.best_params_)

  # SVC model with best parameters: with refit=True (the default) the search has
  # already refit the best candidate on the full training split, so it is reused
  # here instead of fitting an identical model a second time
  svc_model_pca_and_grid_search = search.best_estimator_
  # predict on valid data
  svc_pred = svc_model_pca_and_grid_search.predict(x_valid_pca)
  print('accuracy_score for {}: '.format(label), accuracy_score(data_dict[label]['y_valid'], svc_pred))
  # predict on test data
  svc_test_pred = svc_model_pca_and_grid_search.predict(x_test_pca)
  df_test_pred[label] = svc_test_pred

df_test_pred.head()

In [None]:
# save the PCA + tuned SVC test predictions as a submission file
write_to_csv(df_test=df_test_pred, file_name='submission_tune_svc_pca.csv')


### StandardScaler + SVC

In [None]:
# standardise the features, then fit the RBF SVC (C=100) separately for each label
scaler = StandardScaler()
svc_model_after_halving_search = SVC(kernel='rbf', gamma='scale', C=100, random_state=42)

df_test_pred = pd.DataFrame()

for label in LABELS:
  splits = data_dict[label]

  # scaling statistics come from the training split only and are reused for valid / test
  x_train = scaler.fit_transform(splits['x_train'])
  x_valid = scaler.transform(splits['x_valid'])
  x_test = scaler.transform(splits['x_test'])

  svc_model_after_halving_search.fit(x_train, splits['y_train'])

  # validation accuracy for this label
  svc_pred = svc_model_after_halving_search.predict(x_valid)
  valid_accuracy = accuracy_score(splits['y_valid'], svc_pred)
  print('accuracy_score for {}: '.format(label), valid_accuracy)

  # test predictions for the submission frame
  svc_test_pred = svc_model_after_halving_search.predict(x_test)
  df_test_pred[label] = svc_test_pred

df_test_pred.head()

In [None]:
# save the scaled-feature SVC test predictions as a submission file
write_to_csv(df_test=df_test_pred, file_name='submission_scaled_svc.csv')


### Layer steps
  - Scale using StandardScaler
  - SVC(kernel='rbf', gamma='scale', C=100, random_state=42) 

**The final model for this layer is in `layer-11-final.ipynb`.**