In [1]:
import pandas as pd

# read the csv-files:

X_train_df = pd.read_csv("training_set_features.csv")
y_train_df = pd.read_csv("training_set_labels.csv")
X_test_df = pd.read_csv("test_set_features.csv")

#X_train_df.shape
# Output:
# (26707, 36)

#X_test_df.shape
# out:
# (26708, 36)

In [2]:
# onehot encoding categorical columns:

categorial_columns = ["h1n1_concern", "h1n1_knowledge", "opinion_h1n1_vacc_effective", "opinion_h1n1_risk", "opinion_h1n1_sick_from_vacc",
"opinion_seas_vacc_effective", "opinion_seas_risk", "opinion_seas_sick_from_vacc", "age_group", "education", "race", "sex", "income_poverty", "marital_status",
"rent_or_own", "employment_status", "hhs_geo_region", "census_msa", "household_adults", "household_children", "employment_industry", "employment_occupation"]

binary_columns = ["h1n1_concern", "h1n1_knowledge", "behavioral_antiviral_meds", "behavioral_avoidance", "behavioral_face_mask", "behavioral_wash_hands", 
"behavioral_large_gatherings", "behavioral_outside_home", "behavioral_touch_face", "doctor_recc_h1n1", "doctor_recc_seasonal",	"chronic_med_condition",
"child_under_6_months",	"health_worker", "health_insurance"]



In [3]:
# fill missing values with 0 for all binary columns:

X_train_df[binary_columns] = X_train_df[binary_columns].fillna(0)
X_test_df[binary_columns] = X_test_df[binary_columns].fillna(0)

In [4]:
# categorial one-hot-encoding with drop first and dummy-variable for missing values:

X_train_df = pd.concat([X_train_df, pd.get_dummies(X_train_df[categorial_columns], drop_first=True, dummy_na=True)], axis=1)
X_train_df.drop(categorial_columns, axis=1, inplace=True)

X_test_df = pd.concat([X_test_df, pd.get_dummies(X_test_df[categorial_columns], drop_first=True, dummy_na=True)], axis=1)
X_test_df.drop(categorial_columns, axis=1, inplace=True)


In [7]:
# train test split:
from sklearn.model_selection import train_test_split

X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(X_train_df.iloc[:,1:].to_numpy(), y_train_df.iloc[:,1:].to_numpy(), test_size=0.2, shuffle=True)

# output types are numpy.ndarray

In [8]:
# prepare the labels for the two predictions we have to make:

y_train_h1n1 = y_train_np[:,:1].ravel()
y_train_seasonal = y_train_np[:,1:].ravel()
y_test_h1n1 = y_test_np[:,:1].ravel()
y_test_seasonal = y_test_np[:,1:].ravel()

for s, a in zip(["y_train_h1n1", "y_train_seasonal", "y_test_h1n1", "y_test_seasonal"], [y_train_h1n1, y_train_seasonal, y_test_h1n1, y_test_seasonal]):
    print(f"Shape of {s}: {a.shape}")

Shape of y_train_h1n1: (21365,)
Shape of y_train_seasonal: (21365,)
Shape of y_test_h1n1: (5342,)
Shape of y_test_seasonal: (5342,)


In [40]:
# map the two-valued y's to 4 one-hot encoded classes:
# y = 0 ,0 -> class = 1,0,0,0 "not vaccinated"
# 1, 0 -> 0,1,0,0 "only seasonal"
# 0, 1 -> 0,0,1,0 "only h1n1"
# 1, 1 -> 0,0,0,1 "seasonal and h1n1"

import torch
import torch.nn.functional as F

def multiLable2OneHot(l):
    if l == [0,0]:
        return torch.Tensor([0,0,0,0])
    if l == [0,1]:
        return torch.Tensor([0,1,0,0])
    if l == [1,0]:
        return torch.Tensor([0,0,1,0])
    if l == [1,1]:
        return torch.Tensor([0,0,0,1])

# test
#multiLable2OneHot([0,0])
#out:
# tensor([0., 0., 0., 0.])

tensor([0., 0., 0., 0.])

### CNN ###
Pad the (1, 95) feature vectors to (1, 100) and reshape to a 10 x 10 image...

In [9]:
import numpy as np

pre_pad = np.zeros((X_train_np.shape[0], 5))
X_train_pad = np.concatenate((pre_pad, X_train_np), axis=1)
X_train_imgs = np.reshape(X_train_pad, (X_train_pad.shape[0],10,10))
#X_train_imgs.shape
X_train_imgs[0,:,:]

array([[0., 0., 0., 0., 0., 0., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]])

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class fluNet(nn.Module):
    def __init__(self):
        super(fluNet, self).__init__()
        self.conv1 = nn.Conv2d(1,16,kernel_size=3,padding=1)
        self.act1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(16, 8, kernel_size=3, padding=1)
        self.act2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(32, 16)
        self.act3 = nn.ReLU()
        self.fc2 = nn.Linear(16, 1)
    
    def forward(self, x):
        x = self.pool1(self.act1(self.conv1(x)))
        x = self.pool2(self.act2(self.conv2(x)))
        x = torch.flatten(x)
        x = self.act3(self.fc1(x))
        x = self.fc2(x)
        p = F.log_softmax(x, dim=0)
        return p



In [None]:
import torch

#X = torch.rand(1, 1, 10, 10)
#print(f" X.shape: {X.shape}")

flu_clf = fluNet()
#print(flu_clf)
flu_clf(torch.Tensor(img0))