# Imports

In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.utils.multiclass import unique_labels

import torch
from torch import nn
from d2l import torch as d2l

# Load dataset

In [5]:
X = pd.read_csv("../../data/training_set_features.csv").drop("respondent_id", axis=1)
Y = pd.read_csv("../../data/training_set_labels.csv").drop("respondent_id", axis=1)

features = list(X)
targets = list(Y)

target = targets[0]

In [6]:
X_df_train, X_df_test, y_df_train, y_df_test = train_test_split(X, Y, test_size=0.3, random_state=42)

cat_features = [i for i in range(len(features))]
nominal_features = list(X_df_train.select_dtypes(object))
nominal_features_idx = [features.index(feature) for feature in nominal_features]
ordinal_features_idx = [feature_idx for feature_idx in cat_features if feature_idx not in nominal_features_idx]

# Preprocessing

In [7]:
def to_str(x):
    return x.astype(str)

def to_df(x):
    return pd.DataFrame(x)

In [8]:
pipeline = Pipeline(steps=[
    (
        'preprocessing',
        Pipeline(steps=[
            ('fillna', SimpleImputer(strategy='constant', fill_value="nan", copy=False)),
            ('to_str', FunctionTransformer(to_str)),
            ('encoder', ColumnTransformer(
                [('nominal', OneHotEncoder(), nominal_features_idx),
                 ('ordinal', OrdinalEncoder(), ordinal_features_idx)], remainder='passthrough')),
            ('to_df', FunctionTransformer(to_df))
        ])
    )
])

pipeline.fit(X_df_train)
X_train_processed = pipeline.transform(X_df_train)
X_test_processed = pipeline.transform(X_df_test)

In [9]:
X_train, y_train = torch.tensor(X_train_processed.values), torch.tensor(y_df_train[target].values)
X_test, y_test = torch.tensor(X_test_processed.values), torch.tensor(y_df_test[target].values)

# Training

In [19]:
num_features = len(features)
num_classes = 2

batch_size = 256
train_iter, test_iter = d2l.load_array([X_train_processed, y_train], batch_size)

class Reshape(torch.nn.Module):
    def forward(self, x):
        return x.view(-1, num_features)

net = nn.Sequential(Reshape(), nn.Linear(num_features, num_classes))

def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.normal_(m.weight, std=0.01)

net.apply(init_weights)

TypeError: 'numpy.int32' object is not callable

In [11]:
loss = nn.CrossEntropyLoss()
trainer = torch.optim.SGD(net.parameters(), lr=0.1)

num_epochs = 10
train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)

NameError: name 'train_ch3' is not defined

# Submission

In [82]:
X_holdout = pd.read_csv("../../data/test_set_features.csv")
X_holdout_processed = pipeline.transform(X_holdout.drop("respondent_id", axis=1))

holdout_predictions = cce.predict_proba(X_holdout_processed)

In [83]:
submission_df = pd.DataFrame(holdout_predictions, columns=['h1n1_vaccine', 'seasonal_vaccine'])
submission_df['respondent_id'] = X_holdout['respondent_id']
submission_df[['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']].to_csv("../submissions/classifier_chain_gb.csv", index=False)