In [137]:
import pandas as pd
import numpy as np
import random
import torch
from torch import tensor


In [138]:
# import train csv into dataframe
dfTrain = pd.read_csv("train.csv", index_col=0)
dfTrain.drop(columns={"Name", "Ticket", "Cabin"}, inplace=True)
dfTrain

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.2500,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.9250,S
4,1,1,female,35.0,1,0,53.1000,S
5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
887,0,2,male,27.0,0,0,13.0000,S
888,1,1,female,19.0,0,0,30.0000,S
889,0,3,female,,1,2,23.4500,S
890,1,1,male,26.0,0,0,30.0000,C


In [139]:
dfTrain.mode().loc[0]

Survived       0
Pclass         3
Sex         male
Age         24.0
SibSp          0
Parch          0
Fare        8.05
Embarked       S
Name: 0, dtype: object

In [140]:
# replace NaN values with the mode
dfTrain.fillna(value = dfTrain.mode().loc[0], inplace = True)
dfTrain

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.2500,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.9250,S
4,1,1,female,35.0,1,0,53.1000,S
5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
887,0,2,male,27.0,0,0,13.0000,S
888,1,1,female,19.0,0,0,30.0000,S
889,0,3,female,24.0,1,2,23.4500,S
890,1,1,male,26.0,0,0,30.0000,C


In [141]:
dfRefinedData = dfTrain[["SibSp", "Parch"]].copy()
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,1,0
3,0,0
4,1,0
5,0,0
...,...,...
887,0,0
888,0,0
889,1,2
890,0,0


In [142]:
# Normalize the Age column
dfRefinedData.loc[:, "Norm_Age"] = dfTrain.loc[:, "Age"]/dfTrain.loc[:, "Age"].max()
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0,0.2750
2,1,0,0.4750
3,0,0,0.3250
4,1,0,0.4375
5,0,0,0.4375
...,...,...,...
887,0,0,0.3375
888,0,0,0.2375
889,1,2,0.3000
890,0,0,0.3250


In [143]:
# Due to unbalanced and large distribution of "Fare" column, take the log base 10 instead of normalizing it
dfRefinedData["Log_Fare"] = np.log10(dfTrain["Fare"] + 1)
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age,Log_Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0,0.2750,0.916454
2,1,0,0.4750,1.859038
3,0,0,0.3250,0.950608
4,1,0,0.4375,1.733197
5,0,0,0.4375,0.956649
...,...,...,...,...
887,0,0,0.3375,1.146128
888,0,0,0.2375,1.491362
889,1,2,0.3000,1.388279
890,0,0,0.3250,1.491362


In [144]:
# Split categorical variables into booleans
dfRefinedData = dfRefinedData.join(pd.get_dummies(dfTrain.loc[:,['Sex', 'Embarked', 'Pclass']], columns = ['Sex', 'Embarked', 'Pclass']))
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age,Log_Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,0,0.2750,0.916454,0,1,0,0,1,0,0,1
2,1,0,0.4750,1.859038,1,0,1,0,0,1,0,0
3,0,0,0.3250,0.950608,1,0,0,0,1,0,0,1
4,1,0,0.4375,1.733197,1,0,0,0,1,1,0,0
5,0,0,0.4375,0.956649,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,0,0.3375,1.146128,0,1,0,0,1,0,1,0
888,0,0,0.2375,1.491362,1,0,0,0,1,1,0,0
889,1,2,0.3000,1.388279,1,0,0,0,1,0,0,1
890,0,0,0.3250,1.491362,0,1,1,0,0,1,0,0


In [145]:
# Create results dataFrame to test against
actualResults = dfTrain.loc[:, 'Survived']
actualResults

PassengerId
1      0
2      1
3      1
4      1
5      0
      ..
887    0
888    1
889    0
890    1
891    0
Name: Survived, Length: 891, dtype: int64

In [146]:
# Create validation set
cutoff = (int) (0.8 * dfRefinedData.shape[0])

dfValidationData = dfRefinedData[cutoff:]
dfRefinedData = dfRefinedData[:cutoff]

dfValidationTest = actualResults[cutoff:]
actualResults = actualResults[:cutoff]

dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age,Log_Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,0,0.2750,0.916454,0,1,0,0,1,0,0,1
2,1,0,0.4750,1.859038,1,0,1,0,0,1,0,0
3,0,0,0.3250,0.950608,1,0,0,0,1,0,0,1
4,1,0,0.4375,1.733197,1,0,0,0,1,1,0,0
5,0,0,0.4375,0.956649,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
708,0,0,0.5250,1.435964,0,1,0,0,1,1,0,0
709,0,0,0.2750,2.183412,1,0,0,0,1,1,0,0
710,1,1,0.3000,1.210741,0,1,1,0,0,0,0,1
711,0,0,0.3000,1.703327,1,0,1,0,0,1,0,0


In [147]:
# Make pytorch tensors for gradient descent
t_indep = tensor(dfRefinedData.values, dtype=torch.float)
t_dep = tensor(actualResults.values)
t_indep

tensor([[1.0000, 0.0000, 0.2750,  ..., 0.0000, 0.0000, 1.0000],
        [1.0000, 0.0000, 0.4750,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.3250,  ..., 0.0000, 0.0000, 1.0000],
        ...,
        [1.0000, 1.0000, 0.3000,  ..., 0.0000, 0.0000, 1.0000],
        [0.0000, 0.0000, 0.3000,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.3000,  ..., 1.0000, 0.0000, 0.0000]])

In [148]:
t_indep.shape

torch.Size([712, 12])

In [149]:
# Generate random coefficients for each column and enable gradients
coeffs = torch.rand(t_indep.shape[1]) - 0.5
coeffs.requires_grad_()
coeffs

tensor([-0.0258, -0.0114,  0.0609, -0.1768, -0.2518, -0.2723, -0.1756,  0.4147,
         0.4849,  0.1708, -0.0851, -0.1930], requires_grad=True)

In [150]:
t_indep.max(dim = 0)

torch.return_types.max(
values=tensor([8.0000, 6.0000, 1.0000, 2.7104, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000]),
indices=tensor([159, 678, 630, 258,   1,   0,   1,   5,   0,   1,   9,   0]))

In [151]:
# Maxes still range an order of magnitude, lets normalize them all
maxValues, maxIndices = t_indep.max(dim = 0)
t_indep = t_indep/maxValues
t_indep.max(dim = 0)

torch.return_types.max(
values=tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
indices=tensor([159, 678, 630, 258,   1,   0,   1,   5,   0,   1,   9,   0]))

In [152]:
# Make the first set of predictions
predictions = (t_indep * coeffs).sum(axis = 1)
predictions[:10]

tensor([-0.0266, -0.3521, -0.0021,  0.3143, -0.0161, -0.0959,  0.3121, -0.0781,
        -0.0138, -0.6024], grad_fn=<SliceBackward0>)

In [153]:
# Measure how well we did
loss = torch.abs(predictions-t_dep).mean()
loss

tensor(0.5043, grad_fn=<MeanBackward0>)

In [154]:
# define the above functions as we will call them repeatedly
def calc_predictions(coeffs, indeps): return (indeps*coeffs).sum(axis=1)
def calc_loss(coeffs, indeps, deps): return torch.abs(calc_predictions(coeffs, indeps)-deps).mean()

In [155]:
# Calculate the gradients
loss.backward()

In [156]:
# Use the calculated gradients (but careful not to trigger another gradient calculation when using 'coeffs')
with torch.no_grad():
    coeffs.sub_(coeffs.grad * 0.1) # Subtracts the gradient (trying to min loss, not max it) by an arbitrary step
    coeffs.grad.zero_() # Zero out the gradients (otherwise .backward() will add to it)
    print(calc_loss(coeffs, t_indep, t_dep)) # Let's see if we improved

tensor(0.4892)


In [157]:
# That worked well, let's make it a function
def update_coeffs(coeffs, lr):
    coeffs.sub_(coeffs.grad * lr)
    coeffs.grad.zero_()

In [158]:
# Now to wrap it all up in a function
def one_epoch(coeffs, lr):
    loss = calc_loss(coeffs, t_indep, t_dep)
    loss.backward()
    with torch.no_grad(): update_coeffs(coeffs, lr)
    print(f"{loss:.3f}", end="; ")

In [159]:
# Let's try it out
for i in range(20):
    one_epoch(coeffs, 0.05)
coeffs

0.489; 0.478; 0.469; 0.461; 0.453; 0.446; 0.440; 0.434; 0.428; 0.423; 0.417; 0.414; 0.406; 0.401; 0.396; 0.392; 0.385; 0.383; 0.377; 0.375; 

tensor([-0.0027,  0.0098,  0.0986, -0.0459, -0.0237, -0.3126,  0.0279,  0.4513,
         0.4327,  0.2924, -0.0681, -0.1438], requires_grad=True)

In [160]:
# How well is this truly doing though?
v_indep = tensor(dfValidationData.values, dtype=torch.float)
v_dep = tensor(dfValidationTest.values)

In [161]:
with torch.no_grad():
    print(calc_loss(coeffs, v_indep, v_dep))

tensor(0.3498)


In [163]:
# That looked pretty good, lets categorize the predictions more thoroughly
with torch.no_grad():
    predictions = calc_predictions(coeffs, v_indep)

results = v_dep.bool() == (predictions > 0.5)
results.float().mean()

tensor(0.7263)