In [84]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import math

%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


**Seperate Data into X_train, y_train & X_test**

Gender column changed from categorical to numerical

Changed from pd.df to np array

In [85]:
X_df = pd.read_csv('/kaggle/input/titanic/train.csv')
y_train = X_df['Survived']
X_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [86]:
X_df['Sex'].replace(['female','male'],[1,0],inplace=True)
#Female = 1, Male = 0
X_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [87]:
X_train = X_df[['Pclass','Sex', 'Parch']]
y_train = y_train.to_numpy()
X_train = X_train.to_numpy()

In [88]:
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')
test_df['Sex'].replace(['female','male'],[1,0],inplace=True)
#Female = 1, Male = 0
X_test = test_df[['Pclass','Sex', 'Parch']]
X_test = X_test.to_numpy()

**Functions for applying logistic regression w/ gradient descent**

In [89]:
def sigmoid(z):   
    
    g = 1 / (1 + np.exp(-z))
    
    return g


In [90]:
def compute_cost(X, y, w, b, *argv):
    m, n = X.shape

    loss_sum = 0 

    for i in range(m): 

        z_wb = 0 

        for j in range(n): 

            z_wb_ij = w[j]*X[i][j] 
            z_wb += z_wb_ij

        z_wb += b 

        f_wb = sigmoid(z_wb) 
        loss = -y[i] * np.log(f_wb) - (1 - y[i]) * np.log(1 - f_wb)

        loss_sum += loss

    total_cost = (1 / m) * loss_sum  

    return total_cost

In [91]:
def compute_gradient(X, y, w, b, *argv): 
        m, n = X.shape
        dj_dw = np.zeros(w.shape)
        dj_db = 0.

        
        for i in range(m):

            z_wb = 0

            for j in range(n): 

                z_wb_ij = X[i, j] * w[j]
                z_wb += z_wb_ij

            z_wb += b
            f_wb = sigmoid(z_wb)
            
            dj_db_i = f_wb - y[i]
            dj_db += dj_db_i

            for j in range(n):

                dj_dw_ij =  dj_db_i * X[i, j]
                dj_dw[j] += dj_dw_ij

        dj_dw = dj_dw / m
        dj_db = dj_db / m

        return dj_db, dj_dw

In [92]:
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters, lambda_): 
    

    m = len(X)
    
    J_history = []
    w_history = []
    
    for i in range(num_iters):

        dj_db, dj_dw = gradient_function(X, y, w_in, b_in, lambda_)   

        w_in = w_in - alpha * dj_dw               
        b_in = b_in - alpha * dj_db              
       
        if i<100000:      
            cost =  cost_function(X, y, w_in, b_in, lambda_)
            J_history.append(cost)

        if i% math.ceil(num_iters/10) == 0 or i == (num_iters-1):
            print(f"Iteration {i:4}: Cost {float(J_history[-1]):8.2f}   ")
        
    return w_in, b_in, J_history, w_history 

**Set random initial parameters for w & b for gradient descent**

In [93]:
np.random.seed(1)
initial_w = 0.01 * (np.random.rand(3) - 0.5)
initial_b = -8

# Some gradient descent settings
iterations = 10000
alpha = 0.001
print(initial_w)
w,b, J_history,_ = gradient_descent(X_train ,y_train, initial_w, initial_b, 
                                   compute_cost, compute_gradient, alpha, iterations, 0)

[-0.00082978  0.00220324 -0.00499886]
Iteration    0: Cost     3.07   
Iteration 1000: Cost     2.27   
Iteration 2000: Cost     1.55   
Iteration 3000: Cost     1.22   
Iteration 4000: Cost     1.15   
Iteration 5000: Cost     1.11   
Iteration 6000: Cost     1.08   
Iteration 7000: Cost     1.05   
Iteration 8000: Cost     1.03   
Iteration 9000: Cost     1.00   
Iteration 9999: Cost     0.97   


In [94]:
def predict(X, w, b): 

    m, n = X.shape   
    p = np.zeros(m)


    for i in range(m):   

        f_wb = sigmoid(np.dot(X[i], w) + b)

        #Threshold for predicting whether or not passenger survives
        p[i] = f_wb >= 0.6


    return p

**Testing training set accuracy & predictions on test set**

If passenger has >60% chance of surviving, deemed to survive

In [95]:
p = predict(X_train, w,b)
print('Train Accuracy: %f'%(np.mean(p == y_train) * 100))

Train Accuracy: 61.279461


In [96]:
p = predict(X_test, w,b)
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': p})
print(output)
output.to_csv('submission.csv', index=False)

     PassengerId  Survived
0            892       0.0
1            893       1.0
2            894       0.0
3            895       0.0
4            896       1.0
..           ...       ...
413         1305       0.0
414         1306       0.0
415         1307       0.0
416         1308       0.0
417         1309       0.0

[418 rows x 2 columns]
