# Imports

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.utils.multiclass import unique_labels

import torch
from torch import nn
from torch.utils import data
from d2l import torch as d2l

from IPython import display

# Load dataset

In [2]:
X = pd.read_csv("../../data/training_set_features.csv").drop("respondent_id", axis=1)
Y = pd.read_csv("../../data/training_set_labels.csv").drop("respondent_id", axis=1)

features = list(X)
targets = list(Y)

target = targets[0]

In [3]:
X_df_train, X_df_test, y_df_train, y_df_test = train_test_split(X, Y, test_size=0.3, random_state=42)

cat_features = [i for i in range(len(features))]
nominal_features = list(X_df_train.select_dtypes(object))
nominal_features_idx = [features.index(feature) for feature in nominal_features]
ordinal_features_idx = [feature_idx for feature_idx in cat_features if feature_idx not in nominal_features_idx]

# Preprocessing

In [4]:
def to_str(x):
    return x.astype(str)

def to_tensor(x):
    return torch.tensor(x.astype(np.float32))

In [5]:
y_train_seasonal = torch.tensor(y_df_train['seasonal_vaccine'].values.astype(np.float32)).long()
y_train_h1n1 = torch.tensor(y_df_train['h1n1_vaccine'].values.astype(np.float32)).long()

y_test_seasonal = torch.tensor(y_df_test['seasonal_vaccine'].values.astype(np.float32)).long()
y_test_h1n1 = torch.tensor(y_df_test['h1n1_vaccine'].values.astype(np.float32)).long()

In [6]:
pipeline = Pipeline(steps=[
    (
        'preprocessing',
        Pipeline(steps=[
            ('fillna', SimpleImputer(strategy='constant', fill_value="nan", copy=False)),
            ('to_str', FunctionTransformer(to_str)),
            ('encoder', ColumnTransformer(
                [('nominal', OneHotEncoder(), nominal_features_idx),
                 ('ordinal', OrdinalEncoder(), ordinal_features_idx)], remainder='passthrough')),
            ('to_tensor', FunctionTransformer(to_tensor))
        ])
    )
])

pipeline.fit(X_df_train)
X_train = pipeline.transform(X_df_train)
X_test = pipeline.transform(X_df_test)

# Training

In [7]:
def accuracy(y_hat, y):  #@save
    """Compute the number of correct predictions."""
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)
    cmp = y_hat.type(y.dtype) == y
    return float(d2l.reduce_sum(cmp.type(y.dtype)))


def evaluate_accuracy(net, data_iter):  #@save
    """Compute the accuracy for a model on a dataset."""
    if isinstance(net, torch.nn.Module):
        net.eval()  # Set the model to evaluation mode
    metric = Accumulator(2)  # No. of correct predictions, no. of predictions
    for X, y in data_iter:
        metric.add(accuracy(net(X), y), d2l.size(y))
    return metric[0] / metric[1]


class Accumulator:  #@save
    """For accumulating sums over `n` variables."""
    def __init__(self, n):
        self.data = [0.0] * n

    def add(self, *args):
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def reset(self):
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
    
    
def train_epoch_ch3(net, train_iter, loss, updater):  #@save
    """The training loop defined in Chapter 3."""
    # Set the model to training mode
    if isinstance(net, torch.nn.Module):
        net.train()
    # Sum of training loss, sum of training accuracy, no. of examples
    metric = Accumulator(3)
    for X, y in train_iter:
        # Compute gradients and update parameters
        y_hat = net(X)
        l = loss(y_hat, y)
        if isinstance(updater, torch.optim.Optimizer):
            # Using PyTorch in-built optimizer & loss criterion
            updater.zero_grad()
            l.backward()
            updater.step()
            metric.add(float(l) * len(y), accuracy(y_hat, y),
                       y.size().numel())
        else:
            # Using custom built optimizer & loss criterion
            l.sum().backward()
            updater(X.shape[0])
            metric.add(float(l.sum()), accuracy(y_hat, y), y.numel())
    # Return training loss and training accuracy
    return metric[0] / metric[2], metric[1] / metric[2]


class Animator:  #@save
    """For plotting data in animation."""
    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(5, 3)):
        # Incrementally plot multiple lines
        if legend is None:
            legend = []
        d2l.use_svg_display()
        self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            self.axes = [self.axes, ]
        # Use a lambda function to capture arguments
        self.config_axes = lambda: d2l.set_axes(
            self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
        self.X, self.Y, self.fmts = None, None, fmts

    def add(self, x, y):
        # Add multiple data points into the figure
        if not hasattr(y, "__len__"):
            y = [y]
        n = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * n
        if not self.X:
            self.X = [[] for _ in range(n)]
        if not self.Y:
            self.Y = [[] for _ in range(n)]
        for i, (a, b) in enumerate(zip(x, y)):
            if a is not None and b is not None:
                self.X[i].append(a)
                self.Y[i].append(b)
        self.axes[0].cla()
        for x, y, fmt in zip(self.X, self.Y, self.fmts):
            self.axes[0].plot(x, y, fmt)
        self.config_axes()
        display.display(self.fig)
        display.clear_output(wait=True)
        
        
def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater):  #@save
    """Train a model (defined in Chapter 3)."""
    animator = Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0.3, 0.9],
                        legend=['train loss', 'train acc', 'test acc'])
    for epoch in range(num_epochs):
        train_metrics = train_epoch_ch3(net, train_iter, loss, updater)
        test_acc = evaluate_accuracy(net, test_iter)
        animator.add(epoch + 1, train_metrics + (test_acc,))
    train_loss, train_acc = train_metrics
    assert train_loss < 0.5, train_loss
    assert train_acc <= 1 and train_acc > 0.7, train_acc
    assert test_acc <= 1 and test_acc > 0.7, test_acc
    
    
numpy = lambda x, *args, **kwargs: x.detach().numpy(*args, **kwargs)

In [116]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.normal_(m.weight, std=0.01)

class NeuralNetwork():
    def __init__(self, net, loss, optimizer, lr, num_epochs, batch_size=32):
        self.net = net
        self.loss = loss
        self.optimizer = optimizer(self.net.parameters(), lr=lr)
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        
    def fit(self, X_tensor, y_tensor):
        self.net.apply(init_weights)
        train_tensor = data.TensorDataset(X_tensor, y_tensor) 
        train_iter = data.DataLoader(dataset = train_tensor, batch_size = self.batch_size, shuffle = True)
        
        for epoch in range(self.num_epochs):
            for X, y in train_iter:
                # Compute gradients and update parameters
                y_hat = self.net(X)
                l = self.loss(y_hat, y)
                self.optimizer.zero_grad()
                l.backward()
                self.optimizer.step()
        
        return self
        
    def predict(self, X_tensor):
        return numpy(self.net(X_tensor))[:, 1]
    
    def score(self, X_tensor, y_tensor):
        predictions = self.predict(X_tensor)
        return roc_auc_score(y_tensor.numpy(), predictions)

## H1N1

In [117]:
num_features = X_train.shape[1]
num_classes = 2
num_hidden = 128
dropout1, dropout2 = 0., 0.

lr = 0.1
num_epochs = 16

model_params = {"net": nn.Sequential(nn.Linear(num_features, num_hidden),
                                     nn.ReLU(),
                                     # Add a dropout layer after the first fully connected layer
                                     nn.Dropout(dropout1),
                                     nn.Linear(num_hidden, num_hidden),
                                     nn.ReLU(),
                                     # Add a dropout layer after the second fully connected layer
                                     nn.Dropout(dropout2),
                                     nn.Linear(num_hidden, num_classes)),
                "loss": nn.CrossEntropyLoss(),
                "optimizer": torch.optim.SGD,
                "lr": lr,
                "num_epochs": num_epochs,
                "batch_size": 32
                }

In [118]:
nn_model_h1n1 = NeuralNetwork(**model_params)

nn_model_h1n1.fit(X_train, y_train_h1n1)
nn_model_h1n1.score(X_test, y_test_h1n1)

0.8601649828397444

## Seasonal

In [119]:
num_features = X_train.shape[1]
num_classes = 2
num_hidden = 128
dropout1, dropout2 = 0., 0.

lr = 0.1
num_epochs = 16

model_params = {"net": nn.Sequential(nn.Linear(num_features, num_hidden),
                                     nn.ReLU(),
                                     # Add a dropout layer after the first fully connected layer
                                     nn.Dropout(dropout1),
                                     nn.Linear(num_hidden, num_hidden),
                                     nn.ReLU(),
                                     # Add a dropout layer after the second fully connected layer
                                     nn.Dropout(dropout2),
                                     nn.Linear(num_hidden, num_classes)),
                "loss": nn.CrossEntropyLoss(),
                "optimizer": torch.optim.SGD,
                "lr": lr,
                "num_epochs": num_epochs,
                "batch_size": 32
                }

In [120]:
nn_model_seasonal = NeuralNetwork(**model_params)

nn_model_seasonal.fit(X_train, y_train_seasonal)
nn_model_seasonal.score(X_test, y_test_seasonal)

0.858506945206877

## Performance

In [121]:
test_predictions_seasonal = nn_model_seasonal.predict(X_test)
test_predictions_h1n1 = nn_model_h1n1.predict(X_test)

combined_predictions = pd.DataFrame({"h1n1_vaccine": test_predictions_h1n1,
                                     "seasonal_vaccine": test_predictions_seasonal})

print(roc_auc_score(y_df_test, combined_predictions))

0.8593359640233107


In [122]:
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda

GeForce RTX 2060
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


# Submission

In [82]:
X_holdout = pd.read_csv("../../data/test_set_features.csv")
X_holdout_processed = pipeline.transform(X_holdout.drop("respondent_id", axis=1))

holdout_predictions = cce.predict_proba(X_holdout_processed)

In [83]:
submission_df = pd.DataFrame(holdout_predictions, columns=['h1n1_vaccine', 'seasonal_vaccine'])
submission_df['respondent_id'] = X_holdout['respondent_id']
submission_df[['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']].to_csv("../submissions/classifier_chain_gb.csv", index=False)