In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn import CrossEntropyLoss

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import copy
import time


from zipfile import ZipFile
from io import BytesIO
from typing import Any, Dict

# Training configuration shared by the plain PyTorch run and the `%run titanic.py` cell.
num_epochs = 50
batch_size = 10

# Seed every RNG the notebook relies on so that Restart & Run All reproduces the
# same train/test split, weight initialisation and batch order.
SEED = 42
np.random.seed(SEED)     # scikit-learn falls back to NumPy's global RNG when random_state is None
torch.manual_seed(SEED)  # weight initialisation and DataLoader shuffling

## Prepare Data

In [2]:
# Load the raw Titanic passenger table (path is relative to the notebook's working directory).
df = pd.read_csv('./data/titanic_dataset.csv')
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Keep the six model features plus the 'Survived' label.
# .copy() makes this an independent frame, so later column assignments do not
# write into a slice of the raw table (which triggers SettingWithCopyWarning).
df = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Survived']].copy()
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,3,male,22.0,1,0,7.25,0
1,1,female,38.0,1,0,71.2833,1
2,3,female,26.0,0,0,7.925,1
3,1,female,35.0,1,0,53.1,1
4,3,male,35.0,0,0,8.05,0


In [4]:
# Encode 'Sex' as 0 (male) / 1 (female).
# The dtype guard makes the cell safe to re-run: mapping an already-encoded column
# would turn every value into NaN, because 0 and 1 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    # assign() returns a new frame instead of writing into a possible slice.
    df = df.assign(Sex=df['Sex'].map({"male": 0, "female": 1}))
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,3,0,22.0,1,0,7.25,0
1,1,1,38.0,1,0,71.2833,1
2,3,1,26.0,0,0,7.925,1
3,1,1,35.0,1,0,53.1,1
4,3,0,35.0,0,0,8.05,0


In [5]:
# Inspect dtypes and non-null counts to find columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    int64  
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Survived  891 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 48.9 KB


In [6]:
# Replace missing values with the mean of their column.
# Rebinding (rather than inplace=True) avoids mutating a frame that may still be
# a slice of the raw table.
df = df.fillna(df.mean(numeric_only=True))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    int64  
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Survived  891 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 48.9 KB


## Create Dataset

In [7]:
class TitanicDataset(Dataset):
    """Titanic features and survival labels with a built-in train/test split.

    The CSV is read once, 'Sex' is encoded as 0 (male) / 1 (female), missing
    values are replaced by the column mean, and the rows are split into a
    training and a test partition. The instance serves one partition at a
    time; switch with `set_train()` / `set_test()`.

    Features are served unscaled (no standardisation is applied).

    Args:
        csv_path: Location of the Titanic CSV file.
        test_size: Fraction of rows held out for the test partition.
        random_state: Seed forwarded to `train_test_split`. `None` keeps the
            original behaviour of drawing from the global RNG.
    """

    FEATURE_COLUMNS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
    LABEL_COLUMN = 'Survived'

    def __init__(self, csv_path='./data/titanic_dataset.csv', test_size=0.2, random_state=None):
        raw = pd.read_csv(csv_path)
        self.df_labels = raw[[self.LABEL_COLUMN]]

        # .copy() gives an independent frame, so the column assignment below
        # does not write into a slice of `raw` (SettingWithCopyWarning).
        features = raw[self.FEATURE_COLUMNS].copy()
        features['Sex'] = features['Sex'].map({"male": 0, "female": 1})
        # The mean is computed over all rows, before the split.
        self.df = features.fillna(features.mean(numeric_only=True))

        self.df_train, self.df_test, self.labels_train, self.labels_test = train_test_split(
            self.df, self.df_labels, test_size=test_size, random_state=random_state
        )

        # Serve the training partition by default.
        self.set_train(True)

    def __len__(self):
        """Return the number of rows in the active partition."""
        return len(self.dataset)

    def __getitem__(self, i):
        """Return `(features, label)` for row `i`: a float tensor and a Python int."""
        return self.dataset[i], int(self.labels[i])

    def set_train(self, train=True):
        """Activate the training partition (or the test partition if `train` is false).

        Returns:
            The dataset itself, so calls can be chained.
        """
        if train:
            frame, labels = self.df_train, self.labels_train
        else:
            frame, labels = self.df_test, self.labels_test

        self.dataset = torch.tensor(frame.to_numpy()).float()
        # Flatten the single-column label frame to one label per row.
        self.labels = torch.tensor(labels.to_numpy().reshape(-1)).long()
        return self

    def set_test(self):
        """Activate the test partition and return the dataset itself."""
        self.set_train(False)
        return self


In [8]:
# Build the dataset once, then snapshot it in each mode so the train and test
# loaders get independent objects that share the same split.
dataset = TitanicDataset()
train_dataset = copy.deepcopy(dataset)

# Flip the shared instance to its test partition before taking the second snapshot.
dataset.set_test()
test_dataset = copy.deepcopy(dataset)

# Show one (features, label) sample from each partition.
print(train_dataset[0], test_dataset[0])

(tensor([ 1.,  0., 31.,  1.,  0., 52.]), 0) (tensor([ 3.0000,  1.0000,  0.7500,  2.0000,  1.0000, 19.2583]), 1)


## Create DataLoader

In [9]:
# Shuffle training batches every epoch.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation does not depend on sample order, so keep the test loader deterministic.
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


## Build Model

In [10]:
class Titanic_Net(nn.Module):
    """Three-layer fully connected network with sigmoid outputs.

    Architecture: input_size -> 32 -> 8 -> num_classes, with ReLU after the two
    hidden layers and a sigmoid on the output, so each output lies in [0, 1].

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of output units.
    """

    def __init__(self, input_size=6, num_classes=1):
        super().__init__()
        # The constructor arguments now size the first and last layers; the
        # defaults reproduce the previous fixed 6 -> 32 -> 8 -> 1 layout.
        self.layer1 = nn.Linear(input_size, 32)
        self.layer2 = nn.Linear(32, 8)
        self.layer3 = nn.Linear(8, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a `(batch, input_size)` tensor to `(batch, num_classes)` sigmoid outputs."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.sigmoid(self.layer3(x))
        return x

## Define train and test functions

In [11]:
def train(num_epochs, network, loader, lr=1e-3, log_every=10):
    """Train `network` with Adam and binary cross-entropy.

    Args:
        num_epochs: Number of full passes over `loader`.
        network: Module whose outputs are probabilities in [0, 1] (BCELoss
            is applied directly to them).
        loader: Iterable yielding `(batch_x, batch_y)` pairs.
        lr: Adam learning rate.
        log_every: Print a timing line every `log_every` epochs; 0 or None
            disables logging.

    Returns:
        None. `network` is updated in place and left in training mode.
    """
    network.train()

    optimizer = torch.optim.Adam(network.parameters(), lr=lr)
    loss_func = nn.BCELoss()

    # Wall-clock time since the previous log line (not per single epoch).
    interval_start = time.time()
    for epoch in range(num_epochs):
        for batch_x, batch_y in loader:
            optimizer.zero_grad()
            pred = network(batch_x)
            # BCELoss needs float targets with the same shape as the predictions.
            target = batch_y.float().reshape(pred.shape)
            loss = loss_func(pred, target)

            loss.backward()
            optimizer.step()

        if log_every and epoch % log_every == 0:
            print(f'epoch: {epoch}, time: {time.time()-interval_start}')
            interval_start = time.time()

In [12]:
def test(network, loader):
    
    acc = 0
    samples = 0

    for (batch_x, batch_y) in loader:
        with torch.no_grad():
            pred = network(batch_x).round()

        for i, j in zip(pred, batch_y):
            acc += 1 if i == j else 0
            samples += 1
    print(acc/samples*100)

In [13]:
# Instantiate the baseline network with its default layer sizes.
titanic_net = Titanic_Net()
# Display the module structure.
titanic_net

Titanic_Net(
  (layer1): Linear(in_features=6, out_features=32, bias=True)
  (layer2): Linear(in_features=32, out_features=8, bias=True)
  (layer3): Linear(in_features=8, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [14]:
# Fit the baseline network on the training loader; train() prints a timing line every 10 epochs.
train(num_epochs, titanic_net, train_loader)

epoch: 0, time: 0.09510588645935059
epoch: 10, time: 0.7611098289489746
epoch: 20, time: 0.7779271602630615
epoch: 30, time: 0.7621631622314453
epoch: 40, time: 0.6360254287719727


In [15]:
# Accuracy (in percent) of the baseline network on the held-out test loader.
test(titanic_net, test_loader)

78.77094972067039


In [20]:
# Run titanic.py with the same epoch count and batch size as the baseline above.
# -i executes the script in this notebook's namespace; -t prints timing information.
# NOTE(review): presumably this trains the 'titanic_DPL' model and writes
# snapshot/titanic_DPL.pth, which is loaded further down; confirm in titanic.py.
# NOTE(review): this cell's recorded execution count is higher than those of the
# cells below it; re-run top to bottom so the loaded snapshot matches this run.
%run -i -t titanic.py {num_epochs} {batch_size}

titanic_DPL
Caching ACs
Training  for 50 epoch(s)
Iteration:  500 	s:0.3852 	Average Loss:  27.575759073696904
Iteration:  1000 	s:0.4218 	Average Loss:  27.63102111592876
Iteration:  1500 	s:0.3345 	Average Loss:  27.63102111592876
Iteration:  2000 	s:0.3525 	Average Loss:  27.63102111592876
Iteration:  2500 	s:0.3387 	Average Loss:  27.63102111592876
Iteration:  3000 	s:0.3263 	Average Loss:  27.63102111592876
Iteration:  3500 	s:0.3571 	Average Loss:  27.63102111592876

IPython CPU timings (estimated):
  User   :       2.59 s.
  System :       0.02 s.
Wall time:       2.63 s.


In [17]:
def load_from_dpl(path, name='titanic_net'):
    """Load `Titanic_Net` weights from a zipped snapshot file.

    Args:
        path: Path to a zip archive containing a `torch.save`-style member.
        name: Name of the archive member that holds the checkpoint.

    Returns:
        A new `Titanic_Net` whose parameters are loaded from the checkpoint's
        "model_state_dict" entry.

    Raises:
        KeyError: If the archive has no member called `name`, or the checkpoint
            has no "model_state_dict" entry.
    """
    net = Titanic_Net()

    with ZipFile(path) as zipf:
        with zipf.open(name) as f:
            # torch.load reads from an in-memory copy of the archive member.
            buffer = BytesIO(f.read())

    # SECURITY: torch.load unpickles the file; only load snapshots you trust.
    # map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only
    # machine; `net` is created on the CPU, so the weights end up there anyway.
    loaded: Dict[str, Any] = torch.load(buffer, map_location='cpu')
    net.load_state_dict(loaded["model_state_dict"])
    return net

In [18]:
# Rebuild a Titanic_Net from the weights stored in the snapshot archive.
titanic_DPL = load_from_dpl('snapshot/titanic_DPL.pth')
# Display the module structure.
titanic_DPL

Titanic_Net(
  (layer1): Linear(in_features=6, out_features=32, bias=True)
  (layer2): Linear(in_features=32, out_features=8, bias=True)
  (layer3): Linear(in_features=8, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [19]:
# Score the snapshot-loaded network on the same test_loader used for titanic_net.
test(titanic_DPL, test_loader)

64.24581005586593
