In [216]:
import csv
import math

def import_data(pathname, train_or_test, default_fare=14.45):
    """Read a Titanic-style CSV file into four parallel feature lists.

    Parameters
    ----------
    pathname : str
        Path of the CSV file. Its first row must be a header containing the
        columns 'Fare', 'Pclass' and 'Sex' (and 'Survived' when
        ``train_or_test == 'train'``).
    train_or_test : str
        ``'train'`` reads the 'Survived' column as integer labels; any other
        value fills the label list with the placeholder ``'?'``.
    default_fare : float, optional
        Value substituted when a row's fare is missing or not numeric.
        Defaults to 14.45, the training data median fare.

    Returns
    -------
    tuple
        ``(survived, fare, first_class, female_flag)`` -- four lists with one
        entry per data row. ``first_class`` is 1 when Pclass is '1', and
        ``female_flag`` is 1 when Sex is 'female'; both are 0 otherwise.
        All four lists are empty when the file has no rows at all.

    Raises
    ------
    ValueError
        If the header lacks a required column, or a 'Survived' value cannot
        be parsed as an integer.
    """
    is_train = train_or_test == 'train'
    survived, fare, first_class, female_flag = [], [], [], []

    with open(pathname, newline='') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',')

        # Consume the header once instead of testing the row index on every row.
        header = next(filereader, None)
        if header is None:
            return survived, fare, first_class, female_flag

        fare_ix = header.index('Fare')
        fc_ix = header.index('Pclass')
        sex_ix = header.index('Sex')
        if is_train:
            surv_ix = header.index('Survived')

        for row in filereader:
            if is_train:
                survived.append(int(row[surv_ix]))
            else:
                survived.append('?')
            first_class.append(int(row[fc_ix] == '1'))
            try:
                fare.append(float(row[fare_ix]))
            except (ValueError, IndexError):
                # Only a missing/unparseable fare is imputed; anything else
                # (e.g. KeyboardInterrupt) must not be swallowed.
                fare.append(default_fare)
            female_flag.append(int(row[sex_ix] == 'female'))

    return survived, fare, first_class, female_flag

def sigmoid_function(x):
    """Return the logistic sigmoid ``1 / (1 + e**-x)`` of a real number.

    The two branches are algebraically identical; each one only ever
    exponentiates a non-positive number, so ``math.exp`` cannot overflow
    for large-magnitude inputs (the naive form raises OverflowError for
    very negative ``x``).

    Parameters
    ----------
    x : float
        The input (logit).

    Returns
    -------
    float
        A value in the closed interval [0, 1].
    """
    if x >= 0:
        return 1/(1 + math.exp(-x))
    exp_x = math.exp(x)
    return exp_x/(1 + exp_x)


In [218]:
# Load both splits into parallel feature lists. The test split is read in
# 'test' mode, so its label list holds '?' placeholders instead of 0/1.
(survived_tr,
 fare_tr,
 first_class_tr,
 female_flag_tr) = import_data('titanic_train.csv', 'train')

(survived_tst,
 fare_tst,
 first_class_tst,
 female_flag_tst) = import_data('titanic_test.csv', 'test')

In [277]:
# train logistic regression model (batch gradient descent on the mean
# binary cross-entropy loss)

FARE_MAX = 513   # min max scale fare using train max
LOG_EPS = 1e-15  # keeps math.log away from 0 if the sigmoid saturates

w1, w2, w3, b = 0, 0, 0, 0 #initialize
learning_rate = .06

# Loop-invariant work hoisted out of the 10,001 passes over the data.
n_samples = len(survived_tr)
fare_scaled_tr = [fare / FARE_MAX for fare in fare_tr]

for k in range(10_001):
    J, dw1, dw2, dw3, db = 0, 0, 0, 0, 0 #initialize
    for i in range(n_samples):
        y  = survived_tr[i]
        x1 = fare_scaled_tr[i]
        x2 = first_class_tr[i]
        x3 = female_flag_tr[i]

        z  = (w1*x1 +
              w2*x2 +
              w3*x3 +
              b)
        a  = sigmoid_function(z)
        dz = a - y  # gradient of the per-sample loss with respect to z

        # Clip only for the loss: math.log(0) raises ValueError when the
        # prediction is exactly 0.0 or 1.0. Gradients still use the raw `a`.
        a_safe = min(max(a, LOG_EPS), 1 - LOG_EPS)
        J   += -(y*math.log(a_safe) + (1-y)*math.log(1-a_safe))
        dw1 += x1*dz
        dw2 += x2*dz
        dw3 += x3*dz
        db  += dz

    # Average the accumulated sums over the training set.
    dw1 /= n_samples
    dw2 /= n_samples
    dw3 /= n_samples
    db  /= n_samples
    J   /= n_samples

    w1 -= learning_rate*(dw1)
    w2 -= learning_rate*(dw2)
    w3 -= learning_rate*(dw3)
    b  -= learning_rate*(db)

    # Progress report every 1,000 iterations. J was computed with the
    # parameters as they were *before* this iteration's update.
    if k%1_000 == 0:
        print(
            'iteration: ',k,
            ' loss: ',round(J,3),
            '|| w1={}, w2={}, w3={}, b={}'
                .format(round(w1,3),round(w2,3),round(w3,3),round(b,3))
             )

iteration:  0  loss:  0.693 || w1=0.0, w2=0.002, w3=0.005, b=-0.007
iteration:  1000  loss:  0.482 || w1=0.227, w2=1.137, w3=2.208, b=-1.596
iteration:  2000  loss:  0.477 || w1=0.32, w2=1.416, w3=2.519, b=-1.836
iteration:  3000  loss:  0.477 || w1=0.383, w2=1.495, w3=2.593, b=-1.9
iteration:  4000  loss:  0.477 || w1=0.436, w2=1.514, w3=2.613, b=-1.92
iteration:  5000  loss:  0.477 || w1=0.483, w2=1.516, w3=2.619, b=-1.926
iteration:  6000  loss:  0.477 || w1=0.527, w2=1.513, w3=2.62, b=-1.929
iteration:  7000  loss:  0.477 || w1=0.568, w2=1.509, w3=2.62, b=-1.93
iteration:  8000  loss:  0.477 || w1=0.607, w2=1.504, w3=2.619, b=-1.931
iteration:  9000  loss:  0.477 || w1=0.643, w2=1.5, w3=2.618, b=-1.932
iteration:  10000  loss:  0.477 || w1=0.678, w2=1.495, w3=2.618, b=-1.933


In [199]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [259]:
# Load the same two CSV files as DataFrames for the scikit-learn comparison:
# df1 is the training split, df2 the test split.
df1, df2 = [
    pd.read_csv(csv_path)
    for csv_path in ('titanic_train.csv', 'titanic_test.csv')
]

In [260]:
# Apply the same preprocessing as the from-scratch model to both splits.
for frame in (df1, df2):
    # do same preprocessing on fare: impute missing values with 14.45, then
    # divide by 513. The filled column is assigned back explicitly because
    # `frame.Fare.fillna(..., inplace=True)` acts on a temporary Series and
    # does not update the frame under pandas Copy-on-Write.
    frame['Fare'] = frame['Fare'].fillna(14.45)
    frame['fare'] = frame['Fare']/513

    # first class indicator
    frame['first_class'] = (frame['Pclass'] == 1).astype(int)

    # female indicator
    frame['female_flag'] = (frame['Sex'] == 'female').astype(int)

In [261]:
# Fit scikit-learn's logistic regression on the same three features.
# NOTE: LogisticRegression applies an L2 penalty by default (C=1.0), so its
# coefficients are not expected to match an unregularised fit exactly.
feature_columns = ['fare', 'first_class', 'female_flag']
X_train = df1[feature_columns]
y_train = df1.loc[:, 'Survived']

model = LogisticRegression(solver='liblinear', random_state=333)
model.fit(X_train, y_train)



### Logistic regression using sklearn

In [287]:
# Show the fitted scikit-learn parameters in the same w1/w2/w3/b layout
# used for the from-scratch model.
for label, coefficient in zip(('w1: ', 'w2: ', 'w3: '), model.coef_[0]):
    print(label, coefficient)
print('b: ', model.intercept_[0])

w1:  0.6759643072666862
w2:  1.3928607842089231
w3:  2.4987085910576283
b:  -1.8429052148114269


### Logistic regression from scratch

In [288]:
# Show the parameters learned by the hand-written gradient descent loop.
for label, value in (('w1: ', w1), ('w2: ', w2), ('w3: ', w3), ('b: ', b)):
    print(label, value)

w1:  0.6778369918314936
w2:  1.4953049780875323
w3:  2.6177557917653225
b:  -1.9325952928155834
