In [8]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tqdm import tqdm

In [2]:
# Shell escape: print the kernel's current working directory.
!pwd

/Users/abhijithroy/Public/Projects/H1N1andSeasonalV


In [3]:
# Show the entries of the current working directory as the cell result.
os.listdir(os.curdir + '/')

['submission1.csv',
 'H1N1predictor.ipynb',
 'submission2.csv',
 'submission3.csv',
 'run2.ipynb',
 '.ipynb_checkpoints',
 'Run3.ipynb',
 'data']

In [4]:
# Load the training/test features, the training labels and the submission
# template from data_path. The respondent_id column is dropped from the
# feature and label frames; the submission template is kept exactly as read.
data_path = './data/'
X_train = pd.read_csv(os.path.join(data_path, 'training_set_features.csv')).drop(columns='respondent_id')
X_test = pd.read_csv(os.path.join(data_path, 'test_set_features.csv')).drop(columns='respondent_id')
y_train = pd.read_csv(os.path.join(data_path, 'training_set_labels.csv')).drop(columns='respondent_id')
submission_format = pd.read_csv(os.path.join(data_path, 'submission_format.csv'))

In [5]:
# Split the feature columns by dtype and build one preprocessing branch per group.
#   * non-object columns: standard-scale, then fill gaps with a 10-neighbour
#     KNN imputer (TODO: work out the best neighbour count)
#   * object columns: fill gaps with the literal 'missing', then one-hot encode
#     with the first level dropped
#     (TODO: try ordinal encoding, possibly with a hand-written mapping)

is_object_col = X_train.dtypes == "object"
num_features = X_train.columns[~is_object_col]
cat_features = X_train.columns[is_object_col]

# Numerical branch: scale first, impute second.
num_transformer = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('impute', KNNImputer(n_neighbors=10)),
])

# Categorical branch: constant-fill first, encode second.
cat_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encode', OneHotEncoder(drop='first')),
])

# Route each column group through its own branch.
preprocesser = ColumnTransformer(transformers=[
    ('numerical', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])

In [13]:
def get_model(C = 1, max_iter = 100, random_state = 0):
    """Build the unfitted preprocessing + classification pipeline for one C.

    The module-level ``preprocesser`` is followed by an L1-penalised
    ``LogisticRegression`` wrapped in ``MultiOutputClassifier``.

    Parameters
    ----------
    C : float, default 1
        Inverse regularisation strength handed to ``LogisticRegression``.
    max_iter : int, default 100
        Iteration cap for the solver. 100 is scikit-learn's own default, so
        existing ``get_model(C)`` calls keep the same cap; raise it if the
        solver reports that it did not converge.
    random_state : int or None, default 0
        Seed for the solver. ``saga`` shuffles the data, so without a fixed
        seed repeated runs can give slightly different coefficients and
        scores. Pass ``None`` to restore the unseeded behaviour.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'pre'`` and ``'model'``.
    """
    model = Pipeline([
        ('pre', preprocesser),
        ('model', MultiOutputClassifier(LogisticRegression(
            penalty='l1',
            C=C,
            solver='saga',  # saga is one of the solvers that supports the L1 penalty
            max_iter=max_iter,
            random_state=random_state,
        )))
    ])
    return model

In [24]:
# Sweep C over 10 log-spaced values between 1e-2 and 1e1, scoring each
# candidate pipeline with 5-fold cross-validated ROC AUC.
Cs = np.logspace(-2,1, num=10, base=10)
means = []
stds = []
best_auc = 0
# Reset the winner explicitly. It is only assigned inside the `if` below, so
# without this line best_C would be undefined on a fresh kernel (or silently
# keep the value from an earlier run of this cell) whenever no mean score
# beats best_auc, e.g. when the scores come back as NaN.
best_C = None
for C in tqdm(Cs):
    cv = cross_validate(
        estimator=get_model(C),
        X=X_train,
        y=y_train,
        cv=5,
        n_jobs=-1,
        scoring='roc_auc',
    )
    fold_scores = cv['test_score']
    means.append(np.mean(fold_scores))
    stds.append(np.std(fold_scores))
    # Note: this message is emitted after the folds for this C have finished.
    print(f'processing for C={C}')
    if means[-1] > best_auc:
        best_C = C
        best_auc = means[-1]
        print(f'New Best AUC Score : {best_auc} for C={best_C}')
        

  0%|                                                    | 0/10 [00:00<?, ?it/s]python(5808) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(5809) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(5810) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(5811) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(5812) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
 10%|████▍                                       | 1/10 [01:13<11:00, 73.38s/it]

processing for C=0.01
New Best AUC Score : 0.8359902310003999 for C=0.01


python(5844) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
 20%|████████▌                                  | 2/10 [03:49<16:17, 122.21s/it]

processing for C=0.021544346900318832
New Best AUC Score : 0.838518992111848 for C=0.021544346900318832


python(6014) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
 30%|████████████▉                              | 3/10 [05:48<14:05, 120.84s/it]

processing for C=0.046415888336127774
New Best AUC Score : 0.8415656950159353 for C=0.046415888336127774


 40%|█████████████████▏                         | 4/10 [07:56<12:21, 123.56s/it]

processing for C=0.1
New Best AUC Score : 0.8426722735413492 for C=0.1


python(6245) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
 50%|█████████████████████▌                     | 5/10 [24:28<36:24, 436.82s/it]

processing for C=0.21544346900318834
New Best AUC Score : 0.8430769757808347 for C=0.21544346900318834


python(6364) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6365) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6366) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6367) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6368) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
 60%|█████████████████████████▊                 | 6/10 [46:40<49:23, 740.99s/it]

processing for C=0.46415888336127775
New Best AUC Score : 0.843101595847074 for C=0.46415888336127775


python(6485) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6486) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
 70%|██████████████████████████████             | 7/10 [50:43<28:54, 578.15s/it]

processing for C=1.0


python(6569) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
 80%|████████████████████████████████▊        | 8/10 [1:07:30<23:49, 714.72s/it]

processing for C=2.154434690031882


python(6654) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6655) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6656) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
 90%|████████████████████████████████████▉    | 9/10 [1:20:51<12:21, 741.79s/it]

processing for C=4.6415888336127775


python(6749) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
100%|████████████████████████████████████████| 10/10 [1:22:11<00:00, 493.18s/it]

processing for C=10.0





In [25]:
# Highest mean cross-validated ROC AUC recorded by the C sweep.
best_auc

0.843101595847074

In [26]:
# Value of C that produced best_auc.
best_C

0.46415888336127775

In [27]:
# Per-C mean and standard deviation of the fold scores, in the same order as Cs.
means, stds

([0.8359902310003999,
  0.838518992111848,
  0.8415656950159353,
  0.8426722735413492,
  0.8430769757808347,
  0.843101595847074,
  0.8430427369232248,
  0.8429654628813299,
  0.8429157919223755,
  0.8428968304262883],
 [0.004571904569803102,
  0.004337830458229356,
  0.004196630161593488,
  0.004263008528861613,
  0.004260395375044933,
  0.004266344162345417,
  0.004265279000440527,
  0.0042738833279207045,
  0.004277829038645269,
  0.004270154281318641])