In [1]:
import pandas as pd
import numpy as np
import random
import torch
from torch import tensor


In [2]:
# import train csv into dataframe
dfTrain = pd.read_csv("train.csv", index_col=0)
dfTrain.drop(columns={"Name", "Ticket", "Cabin"}, inplace=True)
dfTrain

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.2500,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.9250,S
4,1,1,female,35.0,1,0,53.1000,S
5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
887,0,2,male,27.0,0,0,13.0000,S
888,1,1,female,19.0,0,0,30.0000,S
889,0,3,female,,1,2,23.4500,S
890,1,1,male,26.0,0,0,30.0000,C


In [3]:
dfTrain.mode().loc[0]

Survived       0
Pclass         3
Sex         male
Age         24.0
SibSp          0
Parch          0
Fare        8.05
Embarked       S
Name: 0, dtype: object

In [4]:
# replace NaN values with the mode
dfTrain.fillna(value = dfTrain.mode().loc[0], inplace = True)
dfTrain

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.2500,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.9250,S
4,1,1,female,35.0,1,0,53.1000,S
5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
887,0,2,male,27.0,0,0,13.0000,S
888,1,1,female,19.0,0,0,30.0000,S
889,0,3,female,24.0,1,2,23.4500,S
890,1,1,male,26.0,0,0,30.0000,C


In [5]:
dfRefinedData = dfTrain[["SibSp", "Parch"]].copy()
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,1,0
3,0,0
4,1,0
5,0,0
...,...,...
887,0,0
888,0,0
889,1,2
890,0,0


In [6]:
# Normalize the Age column
dfRefinedData.loc[:, "Norm_Age"] = dfTrain.loc[:, "Age"]/dfTrain.loc[:, "Age"].max()
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0,0.2750
2,1,0,0.4750
3,0,0,0.3250
4,1,0,0.4375
5,0,0,0.4375
...,...,...,...
887,0,0,0.3375
888,0,0,0.2375
889,1,2,0.3000
890,0,0,0.3250


In [7]:
# Due to unbalanced and large distribution of "Fare" column, take the log base 10 instead of normalizing it
dfRefinedData["Log_Fare"] = np.log10(dfTrain["Fare"] + 1)
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age,Log_Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0,0.2750,0.916454
2,1,0,0.4750,1.859038
3,0,0,0.3250,0.950608
4,1,0,0.4375,1.733197
5,0,0,0.4375,0.956649
...,...,...,...,...
887,0,0,0.3375,1.146128
888,0,0,0.2375,1.491362
889,1,2,0.3000,1.388279
890,0,0,0.3250,1.491362


In [8]:
# Split categorical variables into booleans
dfRefinedData = dfRefinedData.join(pd.get_dummies(dfTrain.loc[:,['Sex', 'Embarked', 'Pclass']], columns = ['Sex', 'Embarked', 'Pclass']))
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age,Log_Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,0,0.2750,0.916454,0,1,0,0,1,0,0,1
2,1,0,0.4750,1.859038,1,0,1,0,0,1,0,0
3,0,0,0.3250,0.950608,1,0,0,0,1,0,0,1
4,1,0,0.4375,1.733197,1,0,0,0,1,1,0,0
5,0,0,0.4375,0.956649,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,0,0.3375,1.146128,0,1,0,0,1,0,1,0
888,0,0,0.2375,1.491362,1,0,0,0,1,1,0,0
889,1,2,0.3000,1.388279,1,0,0,0,1,0,0,1
890,0,0,0.3250,1.491362,0,1,1,0,0,1,0,0


In [9]:
# Create results dataFrame to test against
actualResults = dfTrain.loc[:, 'Survived']
actualResults

PassengerId
1      0
2      1
3      1
4      1
5      0
      ..
887    0
888    1
889    0
890    1
891    0
Name: Survived, Length: 891, dtype: int64

In [10]:
# Create validation set
cutoff = (int) (0.8 * dfRefinedData.shape[0])

dfValidationData = dfRefinedData[cutoff:]
dfRefinedData = dfRefinedData[:cutoff]

dfValidationTest = actualResults[cutoff:]
actualResults = actualResults[:cutoff]

dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age,Log_Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,0,0.2750,0.916454,0,1,0,0,1,0,0,1
2,1,0,0.4750,1.859038,1,0,1,0,0,1,0,0
3,0,0,0.3250,0.950608,1,0,0,0,1,0,0,1
4,1,0,0.4375,1.733197,1,0,0,0,1,1,0,0
5,0,0,0.4375,0.956649,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
708,0,0,0.5250,1.435964,0,1,0,0,1,1,0,0
709,0,0,0.2750,2.183412,1,0,0,0,1,1,0,0
710,1,1,0.3000,1.210741,0,1,1,0,0,0,0,1
711,0,0,0.3000,1.703327,1,0,1,0,0,1,0,0


In [11]:
# Make pytorch tensors for gradient descent
t_indep = tensor(dfRefinedData.values, dtype=torch.float)
t_dep = tensor(actualResults.values)
t_indep

tensor([[1.0000, 0.0000, 0.2750,  ..., 0.0000, 0.0000, 1.0000],
        [1.0000, 0.0000, 0.4750,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.3250,  ..., 0.0000, 0.0000, 1.0000],
        ...,
        [1.0000, 1.0000, 0.3000,  ..., 0.0000, 0.0000, 1.0000],
        [0.0000, 0.0000, 0.3000,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.3000,  ..., 1.0000, 0.0000, 0.0000]])

In [12]:
t_indep.shape

torch.Size([712, 12])

In [13]:
# Generate random coefficients for each column and enable gradients
coeffs = torch.rand(t_indep.shape[1]) - 0.5
coeffs.requires_grad_()
coeffs

tensor([ 0.4538,  0.1769, -0.4692, -0.2587, -0.4581, -0.4553, -0.4070,  0.1650,
        -0.2835,  0.3860, -0.1291, -0.2172], requires_grad=True)

In [14]:
t_indep.max(dim = 0)

torch.return_types.max(
values=tensor([8.0000, 6.0000, 1.0000, 2.7104, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000]),
indices=tensor([159, 678, 630, 258,   1,   0,   1,   5,   0,   1,   9,   0]))

In [15]:
# Maxes still range an order of magnitude, lets normalize them all
maxValues, maxIndices = t_indep.max(dim = 0)
t_indep = t_indep/maxValues
t_indep.max(dim = 0)

torch.return_types.max(
values=tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
indices=tensor([159, 678, 630, 258,   1,   0,   1,   5,   0,   1,   9,   0]))

In [16]:
# Make the first set of predictions
predictions = (t_indep * coeffs).sum(axis = 1)
predictions[:10]

tensor([-1.1158, -0.8227, -1.2021, -0.6696, -1.2527, -0.7415, -0.8341, -0.8964,
        -1.1617, -1.1621], grad_fn=<SliceBackward0>)

In [17]:
# Measure how well we did
loss = torch.abs(predictions-t_dep).mean()
loss

tensor(1.4159, grad_fn=<MeanBackward0>)

In [18]:
# define the above functions as we will call them repeatedly
def calc_predictions(coeffs, indeps): return (indeps*coeffs).sum(axis=1)
def calc_loss(coeffs, indeps, deps): return torch.abs(calc_predictions(coeffs, indeps)-deps).mean()

In [19]:
# Calculate the gradients
loss.backward()

In [20]:
# Use the calculated gradients (but careful not to trigger another gradient calculation when using 'coeffs')
with torch.no_grad():
    coeffs.sub_(coeffs.grad * 0.1) # Subtracts the gradient (trying to min loss, not max it) by an arbitrary step
    coeffs.grad.zero_() # Zero out the gradients (otherwise .backward() will add to it)
    print(calc_loss(coeffs, t_indep, t_dep)) # Let's see if we improved

tensor(1.2291)


In [21]:
# That worked well, let's make it a function
def update_coeffs(coeffs, lr):
    coeffs.sub_(coeffs.grad * lr)
    coeffs.grad.zero_()

In [22]:
# Now to wrap it all up in a function
def one_epoch(coeffs, lr):
    loss = calc_loss(coeffs, t_indep, t_dep)
    loss.backward()
    with torch.no_grad(): update_coeffs(coeffs, lr)
    print(f"{loss:.3f}", end="; ")

In [23]:
# Let's try it out
for i in range(80):
    one_epoch(coeffs, 0.05)
coeffs

1.229; 1.136; 1.043; 0.950; 0.858; 0.772; 0.698; 0.633; 0.575; 0.527; 0.489; 0.476; 0.467; 0.459; 0.452; 0.445; 0.438; 0.432; 0.427; 0.422; 0.417; 0.413; 0.409; 0.405; 0.402; 0.397; 0.394; 0.390; 0.387; 0.383; 0.379; 0.376; 0.372; 0.369; 0.365; 0.362; 0.359; 0.357; 0.354; 0.351; 0.348; 0.346; 0.343; 0.340; 0.338; 0.335; 0.333; 0.330; 0.328; 0.325; 0.324; 0.322; 0.320; 0.318; 0.317; 0.315; 0.313; 0.312; 0.310; 0.309; 0.308; 0.307; 0.305; 0.304; 0.303; 0.302; 0.300; 0.300; 0.298; 0.298; 0.297; 0.297; 0.295; 0.295; 0.293; 0.293; 0.291; 0.291; 0.290; 0.290; 

tensor([ 0.4133,  0.1802, -0.1203,  0.1333,  0.2749, -0.2908,  0.0776,  0.1848,
         0.1096,  0.5648,  0.1998,  0.1725], requires_grad=True)

In [24]:
# How well is this truly doing though?
v_indep = tensor(dfValidationData.values, dtype=torch.float)
v_dep = tensor(dfValidationTest.values)

In [25]:
with torch.no_grad():
    print(calc_loss(coeffs, v_indep, v_dep))

tensor(0.4685)


In [26]:
# That looked pretty good, lets categorize the predictions more thoroughly
with torch.no_grad():
    predictions = calc_predictions(coeffs, v_indep)

results = v_dep.bool() == (predictions > 0.5)
results.float().mean()

tensor(0.7486)

In [27]:
#let's also wrap up our accuracy function into a function for ease of testing
def calc_accuracy(coeffs):
    with torch.no_grad():
        return (v_dep.bool() == (calc_predictions(coeffs, v_indep) > 0.5)).float().mean()
calc_accuracy(coeffs)

tensor(0.7486)

In [28]:
#Start of notebook 2
#let's start by adding a sigmoid and see if that improves our prediction
def calc_predictions(coeffs, indeps): return torch.sigmoid((coeffs * indeps).sum(axis=1))


In [29]:
calc_accuracy(coeffs)

tensor(0.3631)

In [30]:
#retroatively defining this here after realizing I probably want it in the next segment
def show_coeffs(coeffs): return dict(zip(dfRefinedData.columns, coeffs.requires_grad_(False)))
print(show_coeffs(coeffs))

{'SibSp': tensor(0.4133), 'Parch': tensor(0.1802), 'Norm_Age': tensor(-0.1203), 'Log_Fare': tensor(0.1333), 'Sex_female': tensor(0.2749), 'Sex_male': tensor(-0.2908), 'Embarked_C': tensor(0.0776), 'Embarked_Q': tensor(0.1848), 'Embarked_S': tensor(0.1096), 'Pclass_1': tensor(0.5648), 'Pclass_2': tensor(0.1998), 'Pclass_3': tensor(0.1725)}


In [31]:
#hmmm, that didn't go so well. Let's see if that changes with we include it with the model since the beginning
def train_new_model(epochs=100, lr=0.05):
    #generate fresh coefficients
    coeffs = torch.rand(t_indep.shape[1]) - 0.5
    coeffs.requires_grad_()
    #train the model
    for i in range(epochs):
        one_epoch(coeffs, lr)
    print(show_coeffs(coeffs))
    print('Accuracy:', calc_accuracy(coeffs))

train_new_model()

0.544; 0.544; 0.543; 0.542; 0.542; 0.541; 0.540; 0.539; 0.539; 0.538; 0.537; 0.536; 0.536; 0.535; 0.534; 0.533; 0.532; 0.532; 0.531; 0.530; 0.529; 0.528; 0.528; 0.527; 0.526; 0.525; 0.524; 0.524; 0.523; 0.522; 0.521; 0.520; 0.519; 0.518; 0.518; 0.517; 0.516; 0.515; 0.514; 0.513; 0.512; 0.511; 0.510; 0.510; 0.509; 0.508; 0.507; 0.506; 0.505; 0.504; 0.503; 0.502; 0.501; 0.500; 0.499; 0.498; 0.497; 0.496; 0.496; 0.495; 0.494; 0.493; 0.492; 0.491; 0.490; 0.489; 0.488; 0.487; 0.486; 0.485; 0.484; 0.483; 0.482; 0.481; 0.480; 0.479; 0.478; 0.477; 0.476; 0.475; 0.474; 0.473; 0.472; 0.471; 0.470; 0.469; 0.468; 0.467; 0.466; 0.465; 0.464; 0.463; 0.462; 0.461; 0.460; 0.459; 0.458; 0.457; 0.456; 0.455; {'SibSp': tensor(0.1142), 'Parch': tensor(0.2721), 'Norm_Age': tensor(0.1357), 'Log_Fare': tensor(0.3442), 'Sex_female': tensor(0.5779), 'Sex_male': tensor(0.0097), 'Embarked_C': tensor(0.0184), 'Embarked_Q': tensor(-0.1369), 'Embarked_S': tensor(-0.0655), 'Pclass_1': tensor(-0.0903), 'Pclass_2': te

In [32]:
#better, for kicks lets try more epochs
train_new_model(epochs = 150)

0.545; 0.544; 0.543; 0.543; 0.542; 0.541; 0.541; 0.540; 0.539; 0.538; 0.538; 0.537; 0.536; 0.535; 0.534; 0.534; 0.533; 0.532; 0.531; 0.530; 0.530; 0.529; 0.528; 0.527; 0.526; 0.526; 0.525; 0.524; 0.523; 0.522; 0.521; 0.521; 0.520; 0.519; 0.518; 0.517; 0.516; 0.515; 0.514; 0.513; 0.513; 0.512; 0.511; 0.510; 0.509; 0.508; 0.507; 0.506; 0.505; 0.504; 0.503; 0.502; 0.502; 0.501; 0.500; 0.499; 0.498; 0.497; 0.496; 0.495; 0.494; 0.493; 0.492; 0.491; 0.490; 0.489; 0.488; 0.487; 0.486; 0.485; 0.484; 0.483; 0.482; 0.481; 0.480; 0.479; 0.478; 0.477; 0.476; 0.475; 0.474; 0.473; 0.472; 0.471; 0.470; 0.469; 0.468; 0.467; 0.466; 0.465; 0.464; 0.463; 0.462; 0.461; 0.460; 0.459; 0.458; 0.457; 0.456; 0.455; 0.454; 0.453; 0.452; 0.451; 0.450; 0.449; 0.448; 0.447; 0.446; 0.445; 0.445; 0.444; 0.443; 0.442; 0.441; 0.440; 0.439; 0.438; 0.437; 0.436; 0.435; 0.434; 0.433; 0.432; 0.431; 0.431; 0.430; 0.429; 0.428; 0.427; 0.426; 0.425; 0.424; 0.423; 0.423; 0.422; 0.421; 0.420; 0.419; 0.418; 0.417; 0.417; 0.416;