## Pseudo-Images with an CNN ##
<br>
Because so many columns are replaced by one-hot-encoding, the records consist nearly only of zero-one entries. <br>
In this note-book we reshape each record into a 10 x 10 pseudo-image consisting of zero and ones and train a convolutional network on these data. <br>

### Result: ###
It does not work - the accuracy of the CNN is merely above 50 percent.

### Other Models to Consider: ###
1. Net-In-Net - one-dimensional CNN
2. Logistic Regression with label-smoothing

In [1]:
import pandas as pd

# read the csv-files:

X_train_df = pd.read_csv("training_set_features.csv")
y_train_df = pd.read_csv("training_set_labels.csv")
X_test_df = pd.read_csv("test_set_features.csv")

#X_train_df.shape
# Output:
# (26707, 36)

#X_test_df.shape
# out:
# (26708, 36)

In [2]:
# onehot encoding categorical columns:

categorial_columns = ["h1n1_concern", "h1n1_knowledge", "opinion_h1n1_vacc_effective", "opinion_h1n1_risk", "opinion_h1n1_sick_from_vacc",
"opinion_seas_vacc_effective", "opinion_seas_risk", "opinion_seas_sick_from_vacc", "age_group", "education", "race", "sex", "income_poverty", "marital_status",
"rent_or_own", "employment_status", "hhs_geo_region", "census_msa", "household_adults", "household_children", "employment_industry", "employment_occupation"]

binary_columns = ["h1n1_concern", "h1n1_knowledge", "behavioral_antiviral_meds", "behavioral_avoidance", "behavioral_face_mask", "behavioral_wash_hands", 
"behavioral_large_gatherings", "behavioral_outside_home", "behavioral_touch_face", "doctor_recc_h1n1", "doctor_recc_seasonal",	"chronic_med_condition",
"child_under_6_months",	"health_worker", "health_insurance"]



In [3]:
# fill missing values with 0 for all binary columns:

X_train_df[binary_columns] = X_train_df[binary_columns].fillna(0)
X_test_df[binary_columns] = X_test_df[binary_columns].fillna(0)

In [4]:
# categorial one-hot-encoding with drop first and dummy-variable for missing values:

X_train_df = pd.concat([X_train_df, pd.get_dummies(X_train_df[categorial_columns], drop_first=True, dummy_na=True)], axis=1)
X_train_df.drop(categorial_columns, axis=1, inplace=True)

X_test_df = pd.concat([X_test_df, pd.get_dummies(X_test_df[categorial_columns], drop_first=True, dummy_na=True)], axis=1)
X_test_df.drop(categorial_columns, axis=1, inplace=True)


In [5]:
# train eval split:
import numpy as np
from sklearn.model_selection import train_test_split

eval_size = 0.1

X_train_np, X_eval_np, y_train_np, y_eval_np = train_test_split(X_train_df.iloc[:,1:].to_numpy(), y_train_df.iloc[:,1:].to_numpy(), test_size=eval_size, shuffle=True)

# output types are numpy.ndarray

In [8]:
# prepare the labels for the two predictions we have to make:

#y_train_h1n1 = y_train_np[:,:1].ravel()
#y_train_seasonal = y_train_np[:,1:].ravel()
#y_eval_h1n1 = y_eval_np[:,:1].ravel()
#y_eval_seasonal = y_eval_np[:,1:].ravel()

#for s, a in zip(["y_train_h1n1", "y_train_seasonal", "y_eval_h1n1", "y_eval_seasonal"], [y_train_h1n1, y_train_seasonal, y_eval_h1n1, y_eval_seasonal]):
#    print(f"Shape of {s}: {a.shape}")

Shape of y_train_h1n1: (21365,)
Shape of y_train_seasonal: (21365,)
Shape of y_test_h1n1: (5342,)
Shape of y_test_seasonal: (5342,)


In [6]:
# map the two-valued y's to 4 categories:

# y = 0 ,0 -> 0 - "not vaccinated"
# 1, 0 -> 1 = "only seasonal"
# 0, 1 -> 2 = "only h1n1"
# 1, 1 -> 3 = "seasonal and h1n1"

flu_shot_categories = { 0: "not vaccinated", 1: "only seasonal", 2 : "only h1n1", 3 : "seasonal and h1n1"}

import numpy as np
import torch
import torch.nn.functional as F

def multiLableTocategory(l):
    if np.array_equal(l, [0,0]):
        return 0
    if np.array_equal(l,[0,1]):
        return 1
    if np.array_equal(l, [1,0]):
        return 2
    if np.array_equal(l, [1,1]):
        return 3

# test
multiLableTocategory([0,1])
#out:
# 1
flu_shot_categories[multiLableTocategory([0,1])]

'only seasonal'

### CNN ###
Pad the (1, 95) feature vectors to (1, 100) and reshape to a 10 x 10 image...

In [93]:
import torch.nn as nn
import torch.nn.functional as F

class fluNet(nn.Module):
    def __init__(self, batch_size):
        super(fluNet, self).__init__()
        self.conv1 = nn.Conv2d(1,16,kernel_size=3,padding=1)
        self.act1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(16, 8, kernel_size=3, padding=1)
        self.act2 = nn.ReLU()
        self.flattened = nn.Flatten()
        self.fc1 = nn.Linear(200, 16)
        self.act3 = nn.ReLU()
        self.fc2 = nn.Linear(16, 4)
    
    def forward(self, x):
        x = self.pool1(self.act1(self.conv1(x)))
        x = self.act2(self.conv2(x))
        x = self.flattened(x)
        x = self.act3(self.fc1(x))
        x = self.fc2(x)
        p = F.log_softmax(x, dim=0)
        return p



In [99]:
# check if fluNet works nummerically:
import torch

batch_size = 2

T = torch.rand(batch_size, 1, 10, 10)
print(f" T.shape: {T.shape}")

flu_clf = fluNet(batch_size)
#print(flu_clf)
flu_clf(torch.Tensor(T))

# out:
# T.shape: torch.Size([1, 1, 10, 10])
# tensor([-1.2340, -1.3331, -1.4486, -1.5591], grad_fn=<LogSoftmaxBackward>)

 T.shape: torch.Size([2, 1, 10, 10])


tensor([[-0.6972, -0.7027, -0.7024, -0.6982],
        [-0.6891, -0.6837, -0.6840, -0.6881]], grad_fn=<LogSoftmaxBackward>)

In [11]:
def training_loop(n_epochs, optimizer, model, loss_fn, train_loader):
    for epoch in range(1, n_epochs+1):
        for x, y in train_loader:
            out = model(x)
            loss = loss_fn(out, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_train += loss.item()

            if epoch ==1 or epoch % 10 ==0:
                print(f"Epoch: {epoch}, Training loss: {loss_train / len(train_loader)}")

In [12]:
def evaluation_loop(model, loss_fn, train_loader, test_loader):
    for name, loader in [("train", train_loader), ("test", test_loader)]:
        correct = 0
        total = 0
        with torch.no_grad():
            for xs,ys in loader:
                outs = model(xs)
                y_preds = torch.argmax(outs, dim=1)
                total += y_preds.shape[0]
                correct += (y_preds == ys).sum().item()
        
        print(f"Accuracy {name}: {correct/ total:.2} ")


In [140]:
def training_eval_loop(n_epochs, optimizer, model, loss_fn, train_loader, eval_loader):
    for epoch in range(1, n_epochs+1):
        model.train()
        loss_train = 0
        loss_eval = 0
        total = 0
        correct = 0
        for x, y in train_loader:
            out = model(x)
            loss = loss_fn(out, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_train += loss.item()
        
        model.eval()
        with torch.no_grad():
            for ex, ey in eval_loader:
                eout = model(ex)
                eloss = loss_fn(eout, ey)
                loss_eval += eloss.item()
                y_preds = torch.argmax(eout, dim=1)
                total += y_preds.shape[0]
                correct += (y_preds == ey).sum().item()


        if epoch == 1 or epoch % 20 == 0:
            print(f"Epoch: {epoch}, Training loss: {loss_train / len(train_loader):.3}")
            print(f"Epoch evaluation loss: {loss_eval / len(eval_loader):.3}")
            print(f"Epoch evaluation accuracy: {(correct/ total):.2}")

In [117]:
from torch.utils.data import Dataset

# data_np is a numpy array with shape (Batch x 1 x W x H) = (NumberOfImages_in_batch, 1, width, hight); batching will be done by the dataloader class

class pseudoImagesDataset(Dataset):
    def __init__(self, data_np, labels_np):
        self.pseudoImages = torch.tensor(data_np.astype("float32"))
        self.labels = torch.tensor(labels_np, dtype= torch.long)
    def __len__(self):
        return self.pseudoImages.shape[0]
    def __getitem__(self, idx):
        img = self.pseudoImages[idx,:,:]
        label = self.labels[idx]
        return (img, label)


In [133]:
# convert all entries of y_train_np (and y_test_np) and stack them:

y_train_c = np.array([multiLableTocategory(r) for r in y_train_np]).astype("int")
y_eval_c = np.array([multiLableTocategory(r) for r in y_eval_np]).astype("int")

# check:
print(type(y_train_c[0]))
print(len(y_train_c))

<class 'numpy.int32'>
24036


In [134]:
import numpy as np

pre_pad = np.zeros((X_train_np.shape[0], 5))
X_train_pad = np.concatenate((pre_pad, X_train_np), axis=1)
X_train_imgs = np.reshape(X_train_pad, (X_train_pad.shape[0], 1, 10,10))

print(f"X_train_imgs.shape: {X_train_imgs.shape}")
## out:
# (21365, 10, 10)
## check:
#img0=X_train_imgs[0,:,:]
#print(img0)

pre_pad = np.zeros((X_eval_np.shape[0], 5))
X_eval_pad = np.concatenate((pre_pad, X_eval_np), axis=1)
X_eval_imgs = np.reshape(X_eval_pad, (X_eval_pad.shape[0], 1, 10, 10))

print(f"X_eval_imgs.shape: {X_eval_imgs.shape}")

X_test_np = X_test_df.iloc[:,1:].to_numpy()
pre_pad = np.zeros((X_test_np.shape[0], 5))
X_test_pad = np.concatenate((pre_pad, X_test_np), axis=1)
X_test_imgs = np.reshape(X_test_pad, (X_test_pad.shape[0], 1, 10, 10))

print(f"X_test_imgs.shape: {X_test_imgs.shape}")

X_train_imgs.shape: (24036, 1, 10, 10)
X_eval_imgs.shape: (2671, 1, 10, 10)
X_test_imgs.shape: (26708, 1, 10, 10)


In [141]:
from torch.utils.data import DataLoader

batch_size = 64

train_set = pseudoImagesDataset(X_train_imgs, y_train_c)
eval_set = pseudoImagesDataset(X_eval_imgs, y_eval_c)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(eval_set, batch_size=batch_size, shuffle=True)

# train the model:

n_epochs = 100
model = fluNet(batch_size)
#optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

#loss_fn = torch.nn.CrossEntropyLoss()
loss_fn = torch.nn.NLLLoss()


training_eval_loop(n_epochs = n_epochs, optimizer = optimizer, model = model, loss_fn = loss_fn, train_loader = train_loader, eval_loader = eval_loader)

#y_test = model(X_test_imgs)


Epoch: 1, Training loss: 4.0
Epoch evaluation loss: 3.95
Epoch evaluation accuracy: 0.51
Epoch: 20, Training loss: 3.92
Epoch evaluation loss: 3.96
Epoch evaluation accuracy: 0.54
Epoch: 40, Training loss: 3.9
Epoch evaluation loss: 3.96
Epoch evaluation accuracy: 0.53
Epoch: 60, Training loss: 3.89
Epoch evaluation loss: 3.97
Epoch evaluation accuracy: 0.51
Epoch: 80, Training loss: 3.89
Epoch evaluation loss: 3.98
Epoch evaluation accuracy: 0.52
Epoch: 100, Training loss: 3.88
Epoch evaluation loss: 3.99
Epoch evaluation accuracy: 0.51
