<h1>Code Summary</h1>

<h2>NB (Linear model and neural net from scratch)</h2>

In [None]:
import os
from pathlib import Path
from torch import tensor

# Decide where the Titanic data lives. A non-empty KAGGLE_KERNEL_RUN_TYPE environment
# variable is treated as "running on Kaggle", in which case the data is read from
# ../input/titanic; otherwise a local ./titanic directory is used and populated on demand.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/titanic')
else:
    path = Path('titanic')
    if not path.exists():
        # Imported lazily so these modules are only required when a download is needed.
        import zipfile, kaggle
        kaggle.api.competition_download_cli(str(path))
        # Context manager guarantees the archive handle is closed after extraction.
        with zipfile.ZipFile(f'{path}.zip') as archive:
            archive.extractall(path)

import torch, numpy as np, pandas as pd
# Use a 140-character display width for pandas, numpy and torch output; torch also
# disables scientific notation and sets edgeitems to 7.
pd.set_option('display.width', 140)
np.set_printoptions(linewidth=140)
torch.set_printoptions(edgeitems=7, sci_mode=False, linewidth=140)

###### DATA PREP ######
# Load the training set into a pandas DataFrame
df = pd.read_csv(path/'train.csv')

# Replace missing values (NaN) with the mode of each column
print(f"Missing values per col BEFORE:\n{df.isna().sum()}")
# Select the mode (most common value) of each column. df.mode() returns a DataFrame,
# so .iloc[0] picks its first row by integer position.
modes = df.mode().iloc[0]
# Reassign rather than using inplace=True: same result, but no hidden mutation of the loaded frame.
df = df.fillna(modes)
print(f"\nMissing values per col AFTER:\n{df.isna().sum()}")

# Fare has much bigger values than the other columns and would end up dominating all results,
# so we compress it with a log transform. Fare contains zeros and log(0) is -inf, hence
# log(1 + Fare); np.log1p computes exactly that.
df['LogFare'] = np.log1p(df['Fare'])

# Handle non-numeric columns
print(f"\nNon-numeric columns in the dataframe:\n{df.describe(include=[object])}")
# For columns with low cardinality, we can use dummy variables (one-hot encoded).
# dtype=float makes the dummy columns numeric floats directly, so no later cast is needed.
df = pd.get_dummies(df, columns=["Sex","Pclass","Embarked"], dtype=float)

# Create the training input and target tensors
added_cols = ['Sex_male', 'Sex_female', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
indep_cols = ['Age', 'SibSp', 'Parch', 'LogFare'] + added_cols
# Build both tensors from the underlying numpy arrays (.values) rather than from pandas objects.
t_dep = tensor(df.Survived.values)
# NOTE(review): the selected columns are floats/ints, so this is presumably a float64 tensor;
# pass dtype=torch.float here if later cells combine it with float32 parameters - confirm.
t_indep = tensor(df[indep_cols].values)
print(f"\nInput/label tensors shapes:{t_indep.shape} / {t_dep.shape}")


###### LINEAR MODEL ######

Missing values per col BEFORE:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Missing values per col AFTER:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

Non-numeric columns in the dataframe:
                       Name   Sex Ticket    Cabin Embarked
count                   891   891    891      891      891
unique                  891     2    681      147        3
top     Dooley, Mr. Patrick  male   1601  B96 B98        S
freq                      1   577      7      691      646

Input/label tensors shapes:torch.Size([891, 12]) / torch.Size([891])


In [None]:
# Summary statistics for the numeric columns of df.
# NOTE(review): the saved output for this cell still lists Pclass and has no LogFare column,
# so it appears to come from a run made before the data-prep cell finished; re-run to refresh.
df.describe(include=[np.number])

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,28.56697,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.199572,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,24.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


<h1>Theory Summary</h1>

<h2>Book chapter 09</h2>