In [1]:
# 1. Introduction to Kaggle
# --- Torch ---
import numpy as np

test_t  = np.genfromtxt('KaggleData/kaggle_test.csv', delimiter=",")
print(test_t.shape)

train_t = np.genfromtxt('KaggleData/kaggle_train.csv', delimiter=",")
print(train_t.shape)

In [1]:
# --- Kaggle ---
import pandas as pd

test = pd.read_csv("KaggleData/test.csv")
test_shape = test.shape
print(test_shape)

train = pd.read_csv("KaggleData/train.csv")
train_shape = train.shape
print(train_shape)

(418, 11)
(891, 12)


In [2]:
train.head(2)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [3]:

def process_age(df,cut_points,label_names):
    df["Age"] = df["Age"].fillna(-0.5)
    df["Age_categories"] = pd.cut(df["Age"],cut_points,labels=label_names)
    return df

cut_points = [-1,0, 5, 12, 18, 35, 60, 100]
label_names = ["Missing", 'Infant', "Child", 'Teenager', "Young Adult", 'Adult', 'Senior']

train = process_age(train, cut_points, label_names)
test  = process_age(test,  cut_points, label_names)

In [4]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

def prepare_data(train, test):
    train = create_dummies(train,"Pclass")
    train = create_dummies(train,"Sex")
    train = create_dummies(train,"Age_categories")
    
    test = create_dummies(test,"Pclass")
    test = create_dummies(test,"Sex")
    test = create_dummies(test,"Age_categories")
    
    return train, test

In [5]:
columns = ['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male',
       'Age_categories_Missing','Age_categories_Infant',
       'Age_categories_Child', 'Age_categories_Teenager',
       'Age_categories_Young Adult', 'Age_categories_Adult',
       'Age_categories_Senior']

In [6]:
train, test = prepare_data(train, test)
train.head(2)  # TODO why it doesn't work?

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Pclass_3,Sex_female,Sex_male,Age_categories_Missing,Age_categories_Infant,Age_categories_Child,Age_categories_Teenager,Age_categories_Young Adult,Age_categories_Adult,Age_categories_Senior
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,1,0,1,0,0,0,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,0,1,0,0,0,0,0,0,1,0


In [7]:
from sklearn.model_selection import train_test_split

all_X = train[columns]
all_y = train['Survived']

train_X, test_X, train_Y, test_Y = train_test_split(all_X, all_y, test_size=0.2)

In [93]:
# --- Pandas to Torch ---
import torch as th
import torch.utils.data

print('---> train_X:', train_X.shape)
print('---> train_Y:', train_Y.shape)
print('---> test_X:', test_X.shape)
print('---> test_Y:', test_Y.shape)

train_X_t = th.tensor(train_X.values)
train_Y_t = th.tensor(train_Y.values)

test_X_t  = th.tensor(test_X.values)
test_Y_t  = th.tensor(test_Y.values)

print('---> train_X_t:', train_X_t.shape)
print('---> train_Y_t:', train_Y_t.shape)
print('---> test_X_t:', test_X_t.shape)
print('---> test_Y_t:', test_Y_t.shape)

---> train_X: (712, 12)
---> train_Y: (712,)
---> test_X: (179, 12)
---> test_Y: (179,)
---> train_X_t: torch.Size([712, 12])
---> train_Y_t: torch.Size([712])
---> test_X_t: torch.Size([179, 12])
---> test_Y_t: torch.Size([179])


In [116]:
# *********** Create the model **********
from torch import nn
from collections import OrderedDict

model = nn.Sequential(OrderedDict([
    ('fc1',  nn.Linear(12, 4)),
    ('relu', nn.ReLU()),
    ('fc2',  nn.Linear(4, 2)),
    ('output', nn.LogSoftmax(dim=1))
]))

criterion = nn.NLLLoss()

In [117]:
def model_learning(obj_model, obj_opt, obj_data, obj_target):
    obj_opt.zero_grad()
    
    obj_pred = obj_model(obj_data.float())
    obj_loss = criterion(obj_pred, obj_target.long())
    obj_loss.backward()
    
    obj_opt.step()

    return obj_loss

In [139]:
epochs = 5
train_sets_num = 89
train_set_size = 8


loss = th.FloatTensor
#opt = torch.optim.Adam(model.parameters(), lr=0.001)
opt  = th.optim.SGD(params=model.parameters(), lr=0.1)

for ts_i in range(0, train_sets_num, train_set_size):
    running_loss = 0
    for e in range(epochs):
        running_loss += model_learning(model, opt,
                                      train_X_t[ts_i: ts_i+train_set_size],
                                      train_Y_t[ts_i: ts_i+train_set_size ]).item()
        #print('Running loss {}/{}: {}:'.format(e, ts_i, running_loss/((epoch+1)*train_sets_num)))
        print( 'Training loss:', running_loss/(epochs*train_sets_num) )
    else: 
        with th.no_grad():
            model.eval()
            test_loss = 0
            accuracy  = 0
            
            for ts_i in range(0, test_sets_num-2, train_set_size):
            
                pred = model(test_X_t[ts_i:ts_i+train_set_size].float())
                loss = criterion(pred, test_Y_t[ts_i:ts_i+train_set_size].long())
                test_loss += loss.item()

                ps = th.exp(pred)
                #print( 'ps:', ps)
                #m = th.max(ps, 1)[0]
                #print( 'm:', m)
                #equality = top_class == test_Y_t[ts_i:ts_i+train_set_size].view(*top_class.shape)
                #accuracy += th.mean(m)

        print( 'Test loss:', test_loss/(test_sets_num-2) )
        #print( 'Accuracy:', accuracy/(test_sets_num-2) )
        model.train()

Training loss: 0.0006561326846647799
Training loss: 0.0012308368522129702
Training loss: 0.0017451517032773308
Training loss: 0.0022136345673143196
Training loss: 0.002647269441840354
Test loss: 0.06451988494259188
Training loss: 0.0009284287356258778
Training loss: 0.0017340826184562084
Training loss: 0.0024428565180703494
Training loss: 0.0030713966723238486
Training loss: 0.003633142656154847
Test loss: 0.05506265694382547
Training loss: 0.0007269740104675293
Training loss: 0.0013647327262364076
Training loss: 0.0019387030869387509
Training loss: 0.002465239100241929
Training loss: 0.0029551402906353554
Test loss: 0.055568430273012184
Training loss: 0.0020988369255923152
Training loss: 0.004122180751200472
Training loss: 0.006086099951454762
Training loss: 0.007996186111750228
Training loss: 0.009856690717547127
Test loss: 0.059940769412051675
Training loss: 0.0007090594661369752
Training loss: 0.0014052130533068368
Training loss: 0.0020889639184716044
Training loss: 0.0027606995587