TABULAR MODEL FROM SCRATCH

In [None]:
%pip install torch numpy pandas fastai

Cleaning the data

In [None]:
import pandas as pd

# Load the Titanic training set; preview only the first rows instead of
# rendering the whole frame.
df = pd.read_csv('./Titanic/train.csv')
df.head()



In [None]:
# Number of missing values in each column.
df.isna().sum()



In [None]:

# First row of DataFrame.mode(): the most frequent value of every column.
modes = df.mode().iloc[0]

In [None]:
# Replace each missing value with the mode of its column.
df = df.fillna(modes)

In [None]:
import numpy as np
# Summary statistics restricted to the numeric columns.
df.describe(include=(np.number))

In [None]:
# Histogram of the raw Fare column.
df['Fare'].hist()

In [None]:
# log1p(x) = log(1 + x); the result is stored as a new LogFare column.
df['LogFare'] = np.log1p(df['Fare'])

In [None]:
# Histogram of the log-transformed fare, for comparison with the raw Fare plot.
df['LogFare'].hist()

In [None]:
# Sorted list of the distinct Pclass values.
pclasses = sorted(df.Pclass.unique())
pclasses

In [None]:
# Summary of the object-dtype columns.
df.describe(include='object')

In [None]:
# One-hot encode Sex, Pclass and Embarked: the listed columns are replaced by
# indicator columns named <column>_<value>.
# NOTE(review): df is rebound here, so re-running this cell alone fails because
# the source columns no longer exist.
# NOTE(review): presumably the indicator columns are bool on recent pandas
# versions — confirm before feeding them to torch.
df = pd.get_dummies(df,columns=["Sex","Pclass","Embarked"])
df.columns

In [None]:
import torch
from torch import tensor

# Dependent variable: the Survived column as a tensor.
t_dep = tensor(df.Survived)
# Indicator columns produced by pd.get_dummies, plus the continuous features.
added_cols = ['Sex_male','Sex_female','Pclass_1','Pclass_2','Pclass_3','Embarked_C','Embarked_Q','Embarked_S']
indep_cols = ['Age','SibSp','Parch','LogFare'] + added_cols
# Cast to float before taking .values: when get_dummies yields bool columns,
# mixing them with float columns gives an object-dtype array, which
# torch.tensor cannot convert.
t_indep = tensor(df[indep_cols].astype(float).values, dtype=torch.float32)
t_indep.shape
# Sanity check: no missing values may remain in the model inputs.
df[indep_cols].isna().sum()

In [None]:
# Fix the RNG so the random starting coefficients are reproducible.
torch.manual_seed(442)

# One coefficient per independent column.
n_coeff = t_indep.shape[1]
coeffs = torch.randn(n_coeff) - 0.5
# Broadcasting multiplies every row of t_indep element-wise by coeffs.
coeffs * t_indep

In [None]:
# Divide each column by its maximum so the features are on a comparable scale.
# `vals` holds the per-column maxima and is reused later to scale the test set.
# NOTE(review): t_indep is overwritten, so re-running this cell recomputes
# `vals` from the already-scaled tensor; `indices` is not used afterwards.
vals,indices = t_indep.max(dim=0)
t_indep = t_indep / vals
t_indep * coeffs

In [None]:
# Linear prediction per row: sum of feature * coefficient; show the first ten.
preds = (t_indep * coeffs).sum(axis=1)
preds[:10]

In [None]:
# Mean absolute error between the predictions and the Survived targets.
loss = torch.abs(preds - t_dep).mean()
loss

In [None]:
def calc_preds(coeffs,indeps):
    """Linear prediction for each row: sum over columns of feature * coefficient."""
    weighted = indeps * coeffs
    return weighted.sum(dim=1)

def calc_loss(coeffs,indeps,dep):
    """Mean absolute error between `calc_preds(coeffs, indeps)` and `dep`."""
    residuals = calc_preds(coeffs,indeps) - dep
    return residuals.abs().mean()

In [None]:
# Enable gradient tracking on coeffs in place.
coeffs.requires_grad_()

In [None]:
# Recompute the loss now that coeffs tracks gradients.
loss = calc_loss(coeffs,t_indep,t_dep)
loss

In [None]:
# Backpropagate: populates coeffs.grad with d(loss)/d(coeffs).
loss.backward()

In [None]:
# Gradient of the loss with respect to each coefficient.
coeffs.grad

In [None]:
# One manual gradient-descent step (learning rate 0.1), done without
# recording the update itself in the autograd graph.
with torch.no_grad():
    coeffs.sub_(coeffs.grad * 0.1)
    # backward() accumulates into .grad; clear it so the next backward pass
    # starts from zero instead of adding to this step's gradient.
    coeffs.grad.zero_()
    print(calc_loss(coeffs,t_indep,t_dep))

In [None]:
from fastai.data.transforms import RandomSplitter
# Seeded random split of the row indices into training and validation sets.
# The tensors are sliced with these indices in the next cell, so the duplicate
# slicing that used to live here has been dropped.
trn_split,val_split=RandomSplitter(seed=42)(df)

In [None]:
# Slice the independent and dependent tensors with the split indices.
trn_indep,val_indep = t_indep[trn_split],t_indep[val_split]
trn_dep,val_dep = t_dep[trn_split],t_dep[val_split]
len(trn_indep),len(val_indep)

In [None]:
def update_coeffs(coeffs,lr):
    """Apply one gradient-descent step to `coeffs` in place and reset its gradient.

    coeffs: tensor whose `.grad` has been populated by `backward()`.
    lr: learning rate that scales the gradient.
    """
    coeffs.data.sub_(coeffs.grad * lr)
    # backward() adds to .grad rather than overwriting it; without this reset
    # every epoch would step along the sum of all previous gradients.
    coeffs.grad.zero_()

In [None]:
def one_epoch(coeffs,lr):
    """Run one full-batch training step on the training split and print the loss.

    Computes the mean absolute error of `calc_preds` on `trn_indep` against
    `trn_dep`, backpropagates, then delegates the parameter update to
    `update_coeffs` with autograd recording disabled.
    """
    epoch_loss = torch.abs(calc_preds(coeffs,trn_indep) - trn_dep).mean()
    epoch_loss.backward()
    with torch.no_grad():
        update_coeffs(coeffs,lr)
    print(f"{epoch_loss:.4f}",end='; ')

In [None]:
def init_coeffs():
    """Return `n_coeff` random coefficients (standard normal minus 0.5) with gradients enabled."""
    start = torch.randn(n_coeff) - 0.5
    return start.requires_grad_()

In [None]:
def train_model(epochs=30,lr=0.01):
    """Train fresh coefficients for `epochs` full-batch steps and return them.

    epochs: number of calls to `one_epoch`.
    lr: learning rate passed through to `one_epoch`.

    The global torch RNG is seeded first so the random initialisation from
    `init_coeffs`, and therefore the whole run, is reproducible.
    """
    torch.manual_seed(442)
    coeffs = init_coeffs()
    for _ in range(epochs):
        one_epoch(coeffs,lr)
    return coeffs

In [None]:
# Train the linear model for 100 epochs and display the learned coefficients.
coeffs = train_model(100,lr=0.02)
coeffs

In [None]:
def shows_coeffs():
    """Map each independent column name to its current coefficient.

    Reads the module-level `indep_cols` and `coeffs`. The values come from a
    detached view, so displaying them no longer switches off gradient
    tracking on `coeffs` as a side effect.
    """
    return dict(zip(indep_cols,coeffs.detach()))

shows_coeffs()

In [None]:
# Validation predictions; a row counts as correct when the 0.5-thresholded
# prediction matches the boolean target.
preds = calc_preds(coeffs,val_indep)
results = val_dep.bool() == (preds > 0.5)
results[:15]

In [None]:
# Accuracy = mean of the correct/incorrect flags.
results.float().mean()

In [None]:
def acc(coeffs):
    """Validation accuracy: share of rows whose 0.5-thresholded prediction equals `val_dep`."""
    predicted = calc_preds(coeffs,val_indep) > 0.5
    hits = val_dep.bool() == predicted
    return hits.float().mean()

acc(coeffs)

In [None]:
def calc_preds(coeffs,indeps):
    """Linear prediction per row squashed through a sigmoid, giving values in (0, 1)."""
    linear = (indeps * coeffs).sum(dim=1)
    return linear.sigmoid()

In [None]:
# Retrain using the sigmoid calc_preds defined above.
coeffs = train_model(150,lr=2)

In [None]:
# Validation accuracy of the sigmoid model.
acc(coeffs)

In [None]:
# Load the test set and fill missing Fare values with 0.
tst_df = pd.read_csv('./Titanic/test.csv')
tst_df['Fare'] = tst_df.Fare.fillna(0)

In [None]:
# Apply the same preprocessing steps as the training frame: fill gaps with
# `modes`, add LogFare, one-hot encode the categoricals.
tst_df.fillna(modes, inplace=True)
tst_df['LogFare'] = np.log1p(tst_df['Fare'])
tst_df = pd.get_dummies(tst_df,columns=["Sex","Pclass","Embarked"])

# Cast to float before taking .values: when get_dummies yields bool columns,
# mixing them with float columns gives an object-dtype array, which
# torch.tensor cannot convert.
tst_indep = tensor(tst_df[indep_cols].astype(float).values, dtype=torch.float)
# Scale with the per-column maxima (`vals`) computed on the training tensor.
tst_indep = tst_indep / vals

In [None]:
# Threshold the test predictions at 0.5 and store them as integers.
tst_df['Survived'] = (calc_preds(coeffs,tst_indep) > 0.5).int()

In [None]:
# Keep only the id and prediction columns and write them without the index.
sub_df = tst_df[['PassengerId','Survived']]
sub_df.to_csv('submission.csv',index=False)

In [None]:
# Shell command: show the first lines of the submission file just written.
!head submission.csv

In [None]:

def calc_preds(coeffs,indeps):
    """Sigmoid of the matrix product `indeps @ coeffs`."""
    linear = torch.matmul(indeps, coeffs)
    return linear.sigmoid()


In [None]:
def init_coeffs():
    """Return an (n_coeff, 1) matrix of uniform random values scaled by 0.1, with gradients enabled."""
    small_uniform = torch.rand(n_coeff,1) * 0.1
    return small_uniform.requires_grad_()

# Add a trailing unit axis so the targets are column vectors, matching the
# (rows, 1) output of the matrix-product calc_preds.
# The ndim guard keeps this cell idempotent: without it every re-run would
# append yet another axis.
if trn_dep.ndim == 1: trn_dep = trn_dep[:,None]
if val_dep.ndim == 1: val_dep = val_dep[:,None]

In [None]:
# Confirm the validation targets now have a trailing unit axis.
val_dep.shape

In [None]:
# Train the matrix-product version of the model.
coeffs = train_model(1000,lr=0.1)

In [None]:
def init_coeffs(n_hidden=20):
    """Random parameters for a network with one hidden layer.

    Returns a tuple `(layer1, layer2, const)`: an (n_coeff, n_hidden) matrix,
    an (n_hidden, 1) matrix and a scalar constant, all with gradients enabled.
    """
    hidden_w = (torch.rand(n_coeff,n_hidden) - 0.5) / n_hidden
    output_w = torch.rand(n_hidden,1) - 0.3
    bias = torch.rand(1)[0]
    params = (hidden_w,output_w,bias)
    # requires_grad_ works in place, so the tuple members themselves are flagged.
    for param in params: param.requires_grad_()
    return params

In [None]:
import torch.nn.functional as F

def calc_preds(coeffs,indeps):
    """Forward pass of the one-hidden-layer network.

    coeffs: tuple `(l1, l2, const)`.
    Returns sigmoid(relu(indeps @ l1) @ l2 + const).
    """
    hidden_w,output_w,bias = coeffs
    hidden = F.relu(indeps.matmul(hidden_w))
    return torch.sigmoid(hidden.matmul(output_w) + bias)

def update_coeffs(coeffs,lr):
    """Gradient-descent step for every tensor in `coeffs`, then reset its gradient.

    coeffs: iterable of tensors whose `.grad` has been populated.
    lr: learning rate.

    Uses in-place `sub_` directly on tensors that require grad, so it must be
    called with autograd recording disabled (e.g. inside `torch.no_grad()`).
    """
    for layer in coeffs:
        layer.sub_(layer.grad * lr)
        # backward() accumulates into .grad; clear it so the next epoch's
        # gradient is not added on top of this one.
        layer.grad.zero_()

In [None]:
# Train the one-hidden-layer network with the default number of epochs.
coeffs = train_model(lr=2)

In [None]:
# Validation accuracy of the one-hidden-layer network.
acc(coeffs)

In [None]:
def init_coeffs():
    """Random parameters for a network with two hidden layers of 10 units.

    Returns `(layers, consts)`: one weight matrix per consecutive pair of
    layer sizes and one scalar constant per weight matrix, all with
    gradients enabled.
    """
    hiddens = [10,10]
    sizes = [n_coeff] + hiddens + [1]
    # All weight matrices are drawn before any constant.
    layers = [(torch.rand(n_in,n_out)-0.3) / n_out*4 for n_in,n_out in zip(sizes[:-1],sizes[1:])]
    consts = [(torch.rand(1)[0]-0.5)*0.1 for _ in layers]
    for param in layers+consts: param.requires_grad_()
    return layers,consts

In [None]:
def calc_preds(coeffs,indeps):
    """Forward pass through an arbitrary stack of layers.

    coeffs: `(layers, consts)`; layer i is applied as `x @ layers[i] + consts[i]`.
    ReLU follows every layer except the last; the final output goes through
    a sigmoid.
    """
    layers,consts = coeffs
    last = len(layers) - 1
    acts = indeps
    for idx,weights in enumerate(layers):
        acts = acts@weights + consts[idx]
        if idx < last: acts = F.relu(acts)
    return torch.sigmoid(acts)

In [None]:
def update_coeffs(coeffs,lr):
    """Gradient-descent step for every weight matrix and constant, then reset gradients.

    coeffs: `(layers, consts)`, two lists of tensors with populated `.grad`.
    lr: learning rate.

    Uses in-place `sub_` directly on tensors that require grad, so it must be
    called with autograd recording disabled (e.g. inside `torch.no_grad()`).
    """
    layers,consts = coeffs
    for param in layers+consts:
        param.sub_(param.grad * lr)
        # backward() accumulates into .grad; clear it so the next epoch's
        # gradient is not added on top of this one.
        param.grad.zero_()

In [None]:
# Train the deep network with the default number of epochs.
# NOTE(review): this learning rate is several orders of magnitude smaller than
# the ones used for the earlier models — confirm it is intentional.
coeffs = train_model(lr=0.0002)

In [None]:
# Validation accuracy of the deep network.
acc(coeffs)

FRAMEWORK

In [None]:
# Start again from the raw training CSV for the framework-based approach.
df = pd.read_csv('./Titanic/train.csv')

def add_features(df):
    """Add engineered columns to `df` in place.

    LogFare: log1p of Fare.
    Deck: first character of Cabin grouped into "ABC", "DE" or "FG"; any other
    or missing value maps to NaN.
    """
    deck_groups = {letter: group for group in ("ABC","DE","FG") for letter in group}
    df['LogFare'] = np.log1p(df['Fare'])
    df['Deck'] = df['Cabin'].str[0].map(deck_groups)

# Mutates df in place: adds the LogFare and Deck columns.
add_features(df)

In [None]:
# Seeded random train/validation index split for the fastai pipeline.
splits = RandomSplitter(seed=42)(df)

In [None]:
from fastai.tabular.all import *

# Build the fastai DataLoaders: the procs handle categorical encoding,
# missing-value filling and normalisation; Survived is the categorical target.
dls = TabularPandas(df, splits=splits,
procs=[Categorify,FillMissing,Normalize]
,cat_names=["Sex","Pclass","Embarked","Deck"]
,cont_names=["Age","SibSp","Parch","LogFare"]
,y_names="Survived",y_block=CategoryBlock()
).dataloaders(path=".")

In [None]:
# Tabular learner with two hidden layers of 10 units, reporting accuracy.
learn = tabular_learner(dls,metrics=accuracy,layers=[10,10])

# Learning-rate finder with the `slide` and `valley` suggestion functions.
learn.lr_find(suggest_funcs=(slide,valley))

In [None]:
# Train for 16 epochs at a fixed learning rate.
learn.fit(16,lr=0.03)

In [None]:
# Load the test set and apply the same feature engineering as for training.
tst_df = pd.read_csv('./Titanic/test.csv')
tst_df["Fare"] = tst_df.Fare.fillna(0)
add_features(tst_df)

# test_dl reuses the preprocessing attached to the training DataLoaders.
tst_dl = learn.dls.test_dl(tst_df)

preds,_ = learn.get_preds(dl=tst_dl)
print(preds)
# get_preds returns one probability column per class, in class order (0, 1).
# Column 1 is P(Survived == 1); thresholding column 0 would invert every
# prediction in the submission.
tst_df['Survived'] = (preds[:,1]>0.5).int()
sub_df = tst_df[['PassengerId','Survived']]
sub_df.to_csv('submission.csv',index=False)

In [None]:
# Shell command: show the last lines of the submission file just written.
!tail submission.csv

In [None]:
import torch

# create two 2x3 tensors
a = torch.tensor([[1, 2,3], [3, 4,3]])
b = torch.tensor([[5, 6,3], [7, 8,3]])

# perform matrix multiplication
# matmul needs the inner dimensions to agree: (2,3) @ (2,3) raises a
# RuntimeError, so b is transposed to (3,2), giving a (2,2) result.
c = torch.matmul(a, b.T)
c

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# load data
data = pd.read_csv('./Titanic/train.csv')
# Fill gaps with the per-column modes of *this* frame. The modes were
# previously taken from the unrelated global `df`, which only worked by
# accident of earlier cells having run.
modes = data.mode().iloc[0]
data = data.fillna(modes)
data['Fare'] = np.log1p(data['Fare'])  # log(1 + Fare)
# Drop the target and the non-numeric columns. PassengerId is dropped too:
# it is a row identifier, not a feature.
X = data.drop(['Survived', 'PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1).values
y = data['Survived'].values

# split data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Preview a few rows rather than printing the full arrays.
print(X_train[:5])
print(y_train[:5])
# define model architecture
class TabularModel(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    The forward pass returns the raw output of the second linear layer; no
    activation is applied to it.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# create model (seeded so the random weight initialisation is reproducible)
torch.manual_seed(42)
model = TabularModel(input_size=X_train.shape[1], hidden_size=64, output_size=1)

# define loss function and optimizer
# BCEWithLogitsLoss applies the sigmoid itself, so the model outputs raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# convert the training data to PyTorch tensors once; it does not change
# between epochs
n_epochs = 100
train_inputs = torch.tensor(X_train.astype(np.float32))
train_labels = torch.tensor(y_train.astype(np.float32))

# train model (full-batch)
for epoch in range(n_epochs):
    # zero the parameter gradients
    optimizer.zero_grad()

    # forward + backward + optimize
    outputs = model(train_inputs)
    # squeeze only the feature axis so a single-row batch keeps its batch axis
    loss = criterion(outputs.squeeze(1), train_labels)
    loss.backward()
    optimizer.step()

    # print statistics
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, n_epochs, loss.item()))

# evaluate model
with torch.no_grad():
    # convert data to PyTorch tensors
    inputs = torch.tensor(X_test.astype(np.float32))
    labels = torch.tensor(y_test.astype(np.float32))

    # make predictions: a logit above 0 corresponds to a probability above 0.5
    outputs = model(inputs)
    predicted = (outputs > 0).squeeze(1).long()

    # calculate accuracy
    # Stored as `test_accuracy`: the name `accuracy` is the fastai metric
    # passed to tabular_learner earlier in this notebook and must not be
    # overwritten with a float.
    total = labels.size(0)
    correct = (predicted == labels).sum().item()
    test_accuracy = correct / total
    print('Accuracy: {:.2f}%'.format(test_accuracy * 100))

RANDOM FOREST / BINARY SPLIT

In [None]:
%pip install seaborn

In [None]:
# Reload both raw frames for the binary-split / random-forest section.
df = pd.read_csv('./Titanic/train.csv')
tst_df = pd.read_csv('./Titanic/test.csv')
# Modes come from the training frame only; proc_data reads this global.
modes = df.mode().iloc[0]

def proc_data(df):
    """Clean `df` in place for the tree-based models.

    Fills missing Fare with 0, fills every other gap from the global `modes`,
    adds a LogFare column and converts Embarked and Sex to pandas
    categoricals.
    """
    df['Fare'] = df['Fare'].fillna(0)
    df.fillna(modes, inplace=True)
    df['LogFare'] = np.log1p(df.Fare)
    for cat_col in ('Embarked','Sex'):
        df[cat_col] = pd.Categorical(df[cat_col])
    

# Both frames are modified in place.
proc_data(df)
proc_data(tst_df)

In [None]:
# Column groups used throughout this section: categorical inputs, continuous
# inputs, and the dependent variable.
cats=['Sex','Embarked']
conts=['Age','SibSp','Parch','LogFare','Pclass']
dep="Survived"

df.Sex.head()

In [None]:
# Integer codes behind the categorical Sex column.
df.Sex.cat.codes.head()

Binary Splits

In [None]:
import seaborn as sns

# Side-by-side plots: mean of the dependent variable per Sex, and row counts per Sex.
# NOTE(review): `plt` is never imported explicitly in the visible cells;
# presumably it comes from the earlier `from fastai.tabular.all import *` — confirm.
fig,axs = plt.subplots(1,2,figsize=(11,5))
sns.barplot(data=df,y=dep,x="Sex",ax=axs[0]).set(title="Survival Rate")
sns.countplot(data=df,x="Sex",ax=axs[1]).set(title="Histogram")

In [None]:
from numpy import random
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG; train_test_split is called without random_state,
# so presumably it draws from this global state — confirm.
random.seed(42)
trn_df,val_df = train_test_split(df,test_size=0.2)
# Replace the categorical columns with their integer codes.
trn_df[cats] = trn_df[cats].apply(lambda x: x.cat.codes)
val_df[cats] = val_df[cats].apply(lambda x: x.cat.codes)


In [None]:
def xs_y(df):
    """Split `df` into `(features, target)`.

    features: a copy of the `cats + conts` columns.
    target: the `dep` column, or None when `df` does not contain it.
    """
    features = df[cats+conts].copy()
    target = df[dep] if dep in df else None
    return features,target

# Feature frames and target series for the training and validation sets.
trn_xs,trn_y = xs_y(trn_df)
val_xs,val_y = xs_y(val_df)



In [None]:
# Simplest possible model: predict survival exactly when the Sex code is 0.
preds = val_xs.Sex == 0

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the Sex-based split on the validation set.
mean_absolute_error(val_y,preds)

In [None]:
# Restrict to rows with a positive LogFare, then plot its distribution per
# value of the dependent variable.
df_fare = trn_df[trn_df.LogFare > 0]
fig,axs= plt.subplots(1,2,figsize=(11,5))
sns.boxenplot(data=df_fare,x=dep,y="LogFare",ax=axs[0])
sns.kdeplot(data=df_fare,x="LogFare",hue=dep,ax=axs[1])

In [None]:
# Alternative one-rule model: predict survival when LogFare exceeds 2.7.
preds = val_xs.LogFare > 2.7
mean_absolute_error(val_y,preds)

In [None]:
def _side_score(side,y):
    tot= side.sum()
    if tot <= 1: return 0
    return y[side].std() * tot

In [None]:
def score(col,y,split):
    """Impurity of splitting `col` at `split`.

    Sums `_side_score` for the rows with `col <= split` and for the remaining
    rows, then divides by the total number of rows.
    """
    at_or_below = col <= split
    total = _side_score(at_or_below,y) + _side_score(~at_or_below,y)
    return total / len(y)

In [None]:
# Compare the two hand-picked splits tried above: Sex at 0.5 (between the
# codes 0 and 1) and LogFare at 2.7. The 2.7 threshold belongs to LogFare,
# not Sex, and returning a tuple displays both scores instead of only the last.
score(trn_xs["Sex"],trn_y,0.5), score(trn_xs["LogFare"],trn_y,2.7)

In [None]:
def iscore(nm,split):
    """Score a split of the training column named `nm` at `split`, against `trn_y`."""
    return score(trn_xs[nm],trn_y,split)

In [None]:
from ipywidgets import interact
# Interactive widget: choose a continuous column and a split point, see its score.
interact(nm=conts,split=15.5)(iscore)

In [None]:
# Same widget for the categorical columns.
interact(nm=cats,split=2)(iscore)

In [None]:
# Candidate split points for Age: its sorted distinct values.
nm = "Age"
col = trn_xs[nm]
unq = col.unique()
unq.sort()
unq

In [None]:
# Score every candidate and return the one with the lowest impurity.
# NOTE(review): NaN candidates are skipped when scoring but `unq` is indexed
# unfiltered; positions only line up if any NaN sits at the end of the sorted
# array — confirm.
scores = np.array([score(col,trn_y,o) for o in unq if not np.isnan(o)])
unq[scores.argmin()]

In [None]:
def min_col(df,nm):
    """Find the best split point for column `nm` of `df`.

    Tries every distinct non-missing value of the column as a threshold and
    returns `(threshold, score)` for the one with the lowest `score` against
    the `dep` column.
    """
    values,target = df[nm],df[dep]
    candidates = values.dropna().unique()
    split_scores = np.array([score(values,target,c) for c in candidates if not np.isnan(c)])
    best = split_scores.argmin()
    return candidates[best],split_scores[best]

# Best split point and score for Age on the training frame.
min_col(trn_df,"Age")

In [None]:
# Best (split, score) pair for every input column.
cols = cats + conts
{o: min_col(trn_df,o) for o in cols}