In [3]:
import pandas as pd
import numpy as np
import random
import torch
from torch import tensor


In [4]:
# import train csv into dataframe
dfTrain = pd.read_csv("train.csv", index_col=0)
dfTrain.drop(columns={"Name", "Ticket", "Cabin"}, inplace=True)
dfTrain

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.2500,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.9250,S
4,1,1,female,35.0,1,0,53.1000,S
5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
887,0,2,male,27.0,0,0,13.0000,S
888,1,1,female,19.0,0,0,30.0000,S
889,0,3,female,,1,2,23.4500,S
890,1,1,male,26.0,0,0,30.0000,C


In [5]:
dfTrain.mode().loc[0]

Survived       0
Pclass         3
Sex         male
Age         24.0
SibSp          0
Parch          0
Fare        8.05
Embarked       S
Name: 0, dtype: object

In [6]:
# replace NaN values with the mode
dfTrain.fillna(value = dfTrain.mode().loc[0], inplace = True)
dfTrain

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.2500,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.9250,S
4,1,1,female,35.0,1,0,53.1000,S
5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
887,0,2,male,27.0,0,0,13.0000,S
888,1,1,female,19.0,0,0,30.0000,S
889,0,3,female,24.0,1,2,23.4500,S
890,1,1,male,26.0,0,0,30.0000,C


In [7]:
dfRefinedData = dfTrain[["SibSp", "Parch"]].copy()
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,1,0
3,0,0
4,1,0
5,0,0
...,...,...
887,0,0
888,0,0
889,1,2
890,0,0


In [8]:
# Normalize the Age column
dfRefinedData.loc[:, "Norm_Age"] = dfTrain.loc[:, "Age"]/dfTrain.loc[:, "Age"].max()
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0,0.2750
2,1,0,0.4750
3,0,0,0.3250
4,1,0,0.4375
5,0,0,0.4375
...,...,...,...
887,0,0,0.3375
888,0,0,0.2375
889,1,2,0.3000
890,0,0,0.3250


In [9]:
# Due to unbalanced and large distribution of "Fare" column, take the log base 10 instead of normalizing it
dfRefinedData["Log_Fare"] = np.log10(dfTrain["Fare"] + 1)
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age,Log_Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0,0.2750,0.916454
2,1,0,0.4750,1.859038
3,0,0,0.3250,0.950608
4,1,0,0.4375,1.733197
5,0,0,0.4375,0.956649
...,...,...,...,...
887,0,0,0.3375,1.146128
888,0,0,0.2375,1.491362
889,1,2,0.3000,1.388279
890,0,0,0.3250,1.491362


In [10]:
# Split categorical variables into booleans
dfRefinedData = dfRefinedData.join(pd.get_dummies(dfTrain.loc[:,['Sex', 'Embarked', 'Pclass']], columns = ['Sex', 'Embarked', 'Pclass']))
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age,Log_Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,0,0.2750,0.916454,0,1,0,0,1,0,0,1
2,1,0,0.4750,1.859038,1,0,1,0,0,1,0,0
3,0,0,0.3250,0.950608,1,0,0,0,1,0,0,1
4,1,0,0.4375,1.733197,1,0,0,0,1,1,0,0
5,0,0,0.4375,0.956649,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,0,0.3375,1.146128,0,1,0,0,1,0,1,0
888,0,0,0.2375,1.491362,1,0,0,0,1,1,0,0
889,1,2,0.3000,1.388279,1,0,0,0,1,0,0,1
890,0,0,0.3250,1.491362,0,1,1,0,0,1,0,0


In [11]:
# Create results dataFrame to test against
actualResults = dfTrain.loc[:, 'Survived']
actualResults

PassengerId
1      0
2      1
3      1
4      1
5      0
      ..
887    0
888    1
889    0
890    1
891    0
Name: Survived, Length: 891, dtype: int64

In [12]:
# Create validation set
cutoff = (int) (0.8 * dfRefinedData.shape[0])

dfValidationData = dfRefinedData[cutoff:]
dfRefinedData = dfRefinedData[:cutoff]

dfValidationTest = actualResults[cutoff:]
actualResults = actualResults[:cutoff]

dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age,Log_Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,0,0.2750,0.916454,0,1,0,0,1,0,0,1
2,1,0,0.4750,1.859038,1,0,1,0,0,1,0,0
3,0,0,0.3250,0.950608,1,0,0,0,1,0,0,1
4,1,0,0.4375,1.733197,1,0,0,0,1,1,0,0
5,0,0,0.4375,0.956649,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
708,0,0,0.5250,1.435964,0,1,0,0,1,1,0,0
709,0,0,0.2750,2.183412,1,0,0,0,1,1,0,0
710,1,1,0.3000,1.210741,0,1,1,0,0,0,0,1
711,0,0,0.3000,1.703327,1,0,1,0,0,1,0,0


In [13]:
# Make pytorch tensors for gradient descent
t_indep = tensor(dfRefinedData.values, dtype=torch.float)
t_dep = tensor(actualResults.values)
t_indep

tensor([[1.0000, 0.0000, 0.2750,  ..., 0.0000, 0.0000, 1.0000],
        [1.0000, 0.0000, 0.4750,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.3250,  ..., 0.0000, 0.0000, 1.0000],
        ...,
        [1.0000, 1.0000, 0.3000,  ..., 0.0000, 0.0000, 1.0000],
        [0.0000, 0.0000, 0.3000,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.3000,  ..., 1.0000, 0.0000, 0.0000]])

In [14]:
t_indep.shape

torch.Size([712, 12])

In [15]:
# Generate random coefficients for each column and enable gradients
coeffs = torch.rand(t_indep.shape[1]) - 0.5
coeffs.requires_grad_()
coeffs

tensor([ 0.2206,  0.1680,  0.1906, -0.4218,  0.0956,  0.3735,  0.3451,  0.0685,
        -0.4024,  0.0705, -0.3185, -0.0860], requires_grad=True)

In [16]:
t_indep.max(dim = 0)

torch.return_types.max(
values=tensor([8.0000, 6.0000, 1.0000, 2.7104, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000]),
indices=tensor([159, 678, 630, 258,   1,   0,   1,   5,   0,   1,   9,   0]))

In [17]:
# Maxes still range an order of magnitude, lets normalize them all
maxValues, maxIndices = t_indep.max(dim = 0)
t_indep = t_indep/maxValues
t_indep.max(dim = 0)

torch.return_types.max(
values=tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
indices=tensor([159, 678, 630, 258,   1,   0,   1,   5,   0,   1,   9,   0]))

In [18]:
# Make the first set of predictions
predictions = (t_indep * coeffs).sum(axis = 1)
predictions[:10]

tensor([-0.1775,  0.3400, -0.4788, -0.3950, -0.1803,  0.2614, -0.0978, -0.2085,
        -0.4411, -0.0491], grad_fn=<SliceBackward0>)

In [19]:
# Measure how well we did
loss = torch.abs(predictions-t_dep).mean()
loss

tensor(0.6484, grad_fn=<MeanBackward0>)

In [20]:
# define the above functions as we will call them repeatedly
def calc_predictions(coeffs, indeps): return (indeps*coeffs).sum(axis=1)
def calc_loss(coeffs, indeps, deps): return torch.abs(calc_predictions(coeffs, indeps)-deps).mean()

In [21]:
# Calculate the gradients
loss.backward()

In [22]:
# Use the calculated gradients (but careful not to trigger another gradient calculation when using 'coeffs')
with torch.no_grad():
    coeffs.sub_(coeffs.grad * 0.1) # Subtracts the gradient (trying to min loss, not max it) by an arbitrary step
    coeffs.grad.zero_() # Zero out the gradients (otherwise .backward() will add to it)
    print(calc_loss(coeffs, t_indep, t_dep)) # Let's see if we improved

tensor(0.5441)


In [23]:
# That worked well, let's make it a function
def update_coeffs(coeffs, lr):
    coeffs.sub_(coeffs.grad * lr)
    coeffs.grad.zero_()

In [24]:
# Now to wrap it all up in a function
def one_epoch(coeffs, lr):
    loss = calc_loss(coeffs, t_indep, t_dep)
    loss.backward()
    with torch.no_grad(): update_coeffs(coeffs, lr)
    print(f"{loss:.3f}", end="; ")

In [25]:
# Let's try it out
for i in range(80):
    one_epoch(coeffs, 0.05)
coeffs

0.544; 0.523; 0.512; 0.502; 0.491; 0.481; 0.471; 0.461; 0.451; 0.441; 0.432; 0.424; 0.417; 0.410; 0.404; 0.398; 0.394; 0.389; 0.385; 0.382; 0.378; 0.375; 0.372; 0.369; 0.366; 0.363; 0.360; 0.358; 0.355; 0.353; 0.351; 0.349; 0.347; 0.346; 0.344; 0.342; 0.340; 0.338; 0.337; 0.335; 0.333; 0.332; 0.330; 0.329; 0.327; 0.326; 0.324; 0.323; 0.321; 0.320; 0.318; 0.317; 0.315; 0.313; 0.312; 0.310; 0.309; 0.307; 0.306; 0.304; 0.303; 0.301; 0.300; 0.298; 0.297; 0.295; 0.294; 0.292; 0.291; 0.289; 0.288; 0.287; 0.286; 0.284; 0.283; 0.282; 0.280; 0.279; 0.278; 0.277; 

tensor([ 0.1751,  0.1588,  0.1600, -0.2242,  0.7373,  0.0584,  0.2373,  0.0496,
         0.0508,  0.1342, -0.0574, -0.0842], requires_grad=True)

In [26]:
# How well is this truly doing though?
v_indep = tensor(dfValidationData.values, dtype=torch.float)
v_dep = tensor(dfValidationTest.values)

In [27]:
with torch.no_grad():
    print(calc_loss(coeffs, v_indep, v_dep))

tensor(0.3528)


In [28]:
# That looked pretty good, lets categorize the predictions more thoroughly
with torch.no_grad():
    predictions = calc_predictions(coeffs, v_indep)

results = v_dep.bool() == (predictions > 0.5)
results.float().mean()

tensor(0.7598)

In [29]:
#let's also wrap up our accuracy function into a function for ease of testing
def calc_accuracy(coeffs):
    with torch.no_grad():
        return (v_dep.bool() == (calc_predictions(coeffs, v_indep) > 0.5)).float().mean()
calc_accuracy(coeffs)

tensor(0.7598)

In [30]:
#Start of notebook 2
#let's start by adding a sigmoid and see if that improves our prediction
def calc_predictions(coeffs, indeps): return torch.sigmoid((coeffs * indeps).sum(axis=1))


In [31]:
calc_accuracy(coeffs)

tensor(0.7151)

In [32]:
#retroatively defining this here after realizing I probably want it in the next segment
def show_coeffs(coeffs): return dict(zip(dfRefinedData.columns, coeffs.requires_grad_(False)))
print(show_coeffs(coeffs))

{'SibSp': tensor(0.1751), 'Parch': tensor(0.1588), 'Norm_Age': tensor(0.1600), 'Log_Fare': tensor(-0.2242), 'Sex_female': tensor(0.7373), 'Sex_male': tensor(0.0584), 'Embarked_C': tensor(0.2373), 'Embarked_Q': tensor(0.0496), 'Embarked_S': tensor(0.0508), 'Pclass_1': tensor(0.1342), 'Pclass_2': tensor(-0.0574), 'Pclass_3': tensor(-0.0842)}


In [33]:
#hmmm, that didn't go so well. Let's see if that changes with we include it with the model since the beginning
def train_new_model(epochs=100, lr=0.05):
    #generate fresh coefficients
    coeffs = torch.rand(t_indep.shape[1]) - 0.5
    coeffs.requires_grad_()
    #train the model
    for i in range(epochs):
        one_epoch(coeffs, lr)
    print(show_coeffs(coeffs))
    print('Accuracy:', calc_accuracy(coeffs))

train_new_model()

0.533; 0.532; 0.531; 0.530; 0.529; 0.528; 0.527; 0.526; 0.525; 0.524; 0.523; 0.522; 0.521; 0.520; 0.519; 0.518; 0.517; 0.516; 0.515; 0.514; 0.513; 0.512; 0.511; 0.510; 0.509; 0.508; 0.507; 0.506; 0.505; 0.504; 0.503; 0.502; 0.501; 0.500; 0.499; 0.498; 0.497; 0.496; 0.496; 0.495; 0.494; 0.493; 0.492; 0.491; 0.490; 0.489; 0.488; 0.487; 0.486; 0.485; 0.484; 0.484; 0.483; 0.482; 0.481; 0.480; 0.479; 0.478; 0.477; 0.476; 0.476; 0.475; 0.474; 0.473; 0.472; 0.471; 0.470; 0.470; 0.469; 0.468; 0.467; 0.466; 0.465; 0.465; 0.464; 0.463; 0.462; 0.461; 0.461; 0.460; 0.459; 0.458; 0.458; 0.457; 0.456; 0.455; 0.455; 0.454; 0.453; 0.452; 0.452; 0.451; 0.450; 0.449; 0.449; 0.448; 0.447; 0.447; 0.446; 0.445; {'SibSp': tensor(-0.2868), 'Parch': tensor(0.4333), 'Norm_Age': tensor(-0.2331), 'Log_Fare': tensor(-0.1717), 'Sex_female': tensor(0.0756), 'Sex_male': tensor(-0.1042), 'Embarked_C': tensor(0.0697), 'Embarked_Q': tensor(0.2554), 'Embarked_S': tensor(-0.5197), 'Pclass_1': tensor(-0.2589), 'Pclass_2':

In [34]:
#better, for kicks lets try more epochs
train_new_model(epochs = 150)

0.436; 0.435; 0.435; 0.434; 0.434; 0.433; 0.432; 0.432; 0.431; 0.431; 0.430; 0.430; 0.429; 0.428; 0.428; 0.427; 0.427; 0.426; 0.426; 0.425; 0.425; 0.424; 0.424; 0.423; 0.422; 0.422; 0.421; 0.421; 0.420; 0.420; 0.419; 0.419; 0.418; 0.418; 0.417; 0.417; 0.416; 0.416; 0.416; 0.415; 0.415; 0.414; 0.414; 0.413; 0.413; 0.412; 0.412; 0.411; 0.411; 0.410; 0.410; 0.410; 0.409; 0.409; 0.408; 0.408; 0.407; 0.407; 0.407; 0.406; 0.406; 0.405; 0.405; 0.405; 0.404; 0.404; 0.403; 0.403; 0.403; 0.402; 0.402; 0.401; 0.401; 0.401; 0.400; 0.400; 0.400; 0.399; 0.399; 0.398; 0.398; 0.398; 0.397; 0.397; 0.397; 0.396; 0.396; 0.396; 0.395; 0.395; 0.394; 0.394; 0.394; 0.393; 0.393; 0.393; 0.392; 0.392; 0.392; 0.391; 0.391; 0.391; 0.390; 0.390; 0.390; 0.389; 0.389; 0.389; 0.388; 0.388; 0.388; 0.387; 0.387; 0.387; 0.387; 0.386; 0.386; 0.386; 0.385; 0.385; 0.385; 0.384; 0.384; 0.384; 0.384; 0.383; 0.383; 0.383; 0.382; 0.382; 0.382; 0.381; 0.381; 0.381; 0.381; 0.380; 0.380; 0.380; 0.379; 0.379; 0.379; 0.379; 0.378;

# Adding layers
Great, we've gotten a reasonable model using a single layer, a sigmoid, and 150 steps. Let's start by adding layers.

First step is to use matrix-vector products, which basically do our calculation of (variables * coeffs).sum(axis=1), except they will automatically repeat for multiple rows of coeffs and is heavily optimized. We do this using the '@' operator.

In [35]:
def calc_predictions(coeffs, indeps): return torch.sigmoid(indeps@coeffs)

In [36]:
train_new_model(epochs=150)

0.505; 0.504; 0.503; 0.502; 0.501; 0.500; 0.499; 0.498; 0.497; 0.496; 0.495; 0.494; 0.493; 0.492; 0.491; 0.490; 0.489; 0.488; 0.488; 0.487; 0.486; 0.485; 0.484; 0.483; 0.482; 0.481; 0.480; 0.479; 0.478; 0.477; 0.476; 0.475; 0.474; 0.473; 0.472; 0.471; 0.470; 0.469; 0.468; 0.467; 0.466; 0.465; 0.464; 0.463; 0.462; 0.461; 0.460; 0.459; 0.458; 0.458; 0.457; 0.456; 0.455; 0.454; 0.453; 0.452; 0.451; 0.450; 0.449; 0.448; 0.447; 0.446; 0.445; 0.444; 0.444; 0.443; 0.442; 0.441; 0.440; 0.439; 0.438; 0.437; 0.436; 0.436; 0.435; 0.434; 0.433; 0.432; 0.431; 0.430; 0.429; 0.429; 0.428; 0.427; 0.426; 0.425; 0.424; 0.424; 0.423; 0.422; 0.421; 0.420; 0.420; 0.419; 0.418; 0.417; 0.416; 0.416; 0.415; 0.414; 0.413; 0.413; 0.412; 0.411; 0.410; 0.410; 0.409; 0.408; 0.407; 0.407; 0.406; 0.405; 0.404; 0.404; 0.403; 0.402; 0.402; 0.401; 0.400; 0.400; 0.399; 0.398; 0.398; 0.397; 0.396; 0.396; 0.395; 0.394; 0.394; 0.393; 0.392; 0.392; 0.391; 0.390; 0.390; 0.389; 0.389; 0.388; 0.387; 0.387; 0.386; 0.386; 0.385;

In [37]:
#on a whim, I want to see what happens when we increase the learning rate
train_new_model(epochs = 150, lr = 0.1)

0.463; 0.462; 0.460; 0.458; 0.456; 0.454; 0.452; 0.450; 0.448; 0.447; 0.445; 0.443; 0.441; 0.439; 0.438; 0.436; 0.434; 0.432; 0.431; 0.429; 0.427; 0.426; 0.424; 0.422; 0.421; 0.419; 0.418; 0.416; 0.414; 0.413; 0.411; 0.410; 0.409; 0.407; 0.406; 0.404; 0.403; 0.401; 0.400; 0.399; 0.397; 0.396; 0.395; 0.394; 0.392; 0.391; 0.390; 0.389; 0.388; 0.386; 0.385; 0.384; 0.383; 0.382; 0.381; 0.380; 0.379; 0.378; 0.376; 0.375; 0.374; 0.373; 0.372; 0.372; 0.371; 0.370; 0.369; 0.368; 0.367; 0.366; 0.365; 0.364; 0.363; 0.362; 0.362; 0.361; 0.360; 0.359; 0.358; 0.358; 0.357; 0.356; 0.355; 0.354; 0.354; 0.353; 0.352; 0.352; 0.351; 0.350; 0.349; 0.349; 0.348; 0.347; 0.347; 0.346; 0.345; 0.345; 0.344; 0.343; 0.343; 0.342; 0.342; 0.341; 0.340; 0.340; 0.339; 0.339; 0.338; 0.337; 0.337; 0.336; 0.336; 0.335; 0.335; 0.334; 0.334; 0.333; 0.333; 0.332; 0.331; 0.331; 0.330; 0.330; 0.329; 0.329; 0.328; 0.328; 0.327; 0.327; 0.327; 0.326; 0.326; 0.325; 0.325; 0.324; 0.324; 0.323; 0.323; 0.322; 0.322; 0.322; 0.321;

### So far so good
Great, this seems to be working properly, but before we go on I want to increase the modularity of our learning function to make messing with the model easier

In [38]:
def make_coeffs(layers=1): return (torch.rand(n_coeff, 1, requires_grad=True) * 0.1)

In [39]:
def train_new_model(epochs=100, lr=0.1, layers=1):
    #generate fresh coefficients
    coeffs = make_coeffs(layers)
    #train the model
    for i in range(epochs):
        one_epoch(coeffs, lr)
    print(show_coeffs(coeffs))
    print('Accuracy:', calc_accuracy(coeffs))