In [1]:
# 1. Introduction to Kaggle
# --- Torch ---
import numpy as np

test_t  = np.genfromtxt('KaggleData/kaggle_test.csv', delimiter=",")
print(test_t.shape)

train_t = np.genfromtxt('KaggleData/kaggle_train.csv', delimiter=",")
print(train_t.shape)

In [1]:
# --- Kaggle ---
import pandas as pd

test = pd.read_csv("KaggleData/test.csv")
test_shape = test.shape
print(test_shape)

train = pd.read_csv("KaggleData/train.csv")
train_shape = train.shape
print(train_shape)

(418, 11)
(891, 12)


In [2]:
train.head(2)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [3]:

def process_age(df,cut_points,label_names):
    df["Age"] = df["Age"].fillna(-0.5)
    df["Age_categories"] = pd.cut(df["Age"],cut_points,labels=label_names)
    return df

cut_points = [-1,0, 5, 12, 18, 35, 60, 100]
label_names = ["Missing", 'Infant', "Child", 'Teenager', "Young Adult", 'Adult', 'Senior']

train = process_age(train, cut_points, label_names)
test  = process_age(test,  cut_points, label_names)

In [4]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

def prepare_data(data):
    data = create_dummies(data,"Pclass")
    data = create_dummies(data,"Sex")
    data = create_dummies(data,"Age_categories")
    
    return data

In [5]:
columns = ['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male',
       'Age_categories_Missing','Age_categories_Infant',
       'Age_categories_Child', 'Age_categories_Teenager',
       'Age_categories_Young Adult', 'Age_categories_Adult',
       'Age_categories_Senior']

In [6]:
train = prepare_data(train)
print('train.shape:', train.shape)
test = prepare_data(test)
print('test.shape:', test.shape)

train.shape: (891, 25)
test.shape: (418, 24)


In [7]:
from sklearn.model_selection import train_test_split

all_X = train[columns]
all_y = train['Survived']

train_X, test_X, train_Y, test_Y = train_test_split(all_X, all_y, test_size=0.2)

In [8]:
# --- Pandas to Torch ---
import torch as th

print('---> train_X:', train_X.shape)
print('---> train_Y:', train_Y.shape)
print('---> test_X:', test_X.shape)
print('---> test_Y:', test_Y.shape)

train_X_t = th.tensor(train_X.values)
train_Y_t = th.tensor(train_Y.values)

test_X_t  = th.tensor(test_X.values)
test_Y_t  = th.tensor(test_Y.values)

---> train_X: (712, 12)
---> train_Y: (712,)
---> test_X: (179, 12)
---> test_Y: (179,)


In [9]:
# *********** Create the model **********
from torch import nn
from collections import OrderedDict

model = nn.Sequential(OrderedDict([
    ('fc1',  nn.Linear(12, 4)),
    ('relu', nn.ReLU()),
    ('drop', nn.Dropout(p=0.2)),
    ('fc2',  nn.Linear(4, 2)),
    ('output', nn.Sigmoid())
]))

In [10]:
def model_learning(obj_model, obj_opt, obj_data, obj_target):
    obj_opt.zero_grad()
    
    obj_pred = obj_model(obj_data.float())
    obj_loss = criterion(obj_pred, obj_target.long())
    obj_loss.backward()
    
    obj_opt.step()

    return obj_loss

In [13]:
epochs = 50
train_sets_num = 89
train_set_size = 8


loss = th.FloatTensor
criterion = nn.CrossEntropyLoss()
opt = th.optim.Adam(model.parameters(), lr=0.01)

for e in range(epochs):
    running_loss = 0
    for ts_i in range(0, train_sets_num, train_set_size):
        running_loss += model_learning(model, opt,
                                      train_X_t[ts_i: ts_i+train_set_size],
                                      train_Y_t[ts_i: ts_i+train_set_size ]).item()
    else: 
        with th.no_grad():
            model.eval()
            test_loss = 0
            accuracy  = 0

            pred = model(test_X_t.float())
            loss = criterion(pred, test_Y_t.long())
            test_loss += loss.item()

            output = th.max(pred, 1)[1]
            accuracy = th.mean((output == test_Y_t.long()).float())
                
        print('Accuracy {:.2f}'.format(accuracy * 100.0))

        model.train()

Accuracy 48.60
Accuracy 58.66
Accuracy 58.66
Accuracy 59.78
Accuracy 68.16
Accuracy 78.77
Accuracy 77.09
Accuracy 77.09


Accuracy 77.09
Accuracy 77.09
Accuracy 77.09
Accuracy 77.09
Accuracy 77.09
Accuracy 78.77
Accuracy 78.77


Accuracy 78.77
Accuracy 78.77
Accuracy 78.77
Accuracy 79.33
Accuracy 79.33
Accuracy 79.33
Accuracy 79.33


Accuracy 79.33
Accuracy 79.33
Accuracy 79.33
Accuracy 79.89
Accuracy 79.89
Accuracy 79.89
Accuracy 79.89


Accuracy 79.89
Accuracy 79.89
Accuracy 79.89
Accuracy 79.89
Accuracy 79.89
Accuracy 79.89
Accuracy 79.89
Accuracy 79.89
Accuracy 80.45
Accuracy 79.89
Accuracy 79.89


Accuracy 79.89
Accuracy 79.89
Accuracy 79.89
Accuracy 79.89
Accuracy 80.45
Accuracy 80.45
Accuracy 80.45
Accuracy 80.45
Accuracy 80.45
Accuracy 80.45


In [24]:
d = test[columns]
d_tensor = th.tensor(d.values)
res = model(d_tensor.float()) 
holdout_predictions = th.max(res, 1)[1]

In [26]:
holdout_ids = test["PassengerId"]
print('holdout_predictions:', holdout_predictions)



In [28]:
submission_df = {"PassengerId": holdout_ids,
                 "Survived": holdout_predictions}
submission = pd.DataFrame(submission_df)


In [29]:
submission.to_csv('KaggleData/titanic_submission.csv', index=False)