In [152]:
# Implementing logistic regression (LR) from scratch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [153]:
def read_data(path='titanic_dataset.csv'):
    """Load a CSV file and return its contents as a NumPy array.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to 'titanic_dataset.csv'
        (resolved against the current working directory), which is the
        file this function always read before the parameter existed.

    Returns
    -------
    numpy.ndarray
        The result of `DataFrame.to_numpy()`: one row per CSV record and
        one column per CSV field.

    Notes
    -----
    `pd.read_csv` is called with its default options, so the first line of
    the file is consumed as the header row and is not part of the result.
    NOTE(review): the notebook's main cell treats the last column as the
    label and min-max scales every column, so the file is presumably
    all-numeric with the target last - confirm against the data file.
    """
    df = pd.read_csv(path)
    return df.to_numpy()

In [154]:
def normalize(data):
    """Min-max scale each column of `data` into the range [0, 1].

    Parameters
    ----------
    data : array-like
        Numeric values; the minimum and maximum are taken along axis 0,
        i.e. per column for a 2-D input.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `data` in which every column's minimum
        maps to 0 and its maximum maps to 1. A column whose values are all
        equal maps to 0 everywhere.
    """
    minimums = np.min(data, axis = 0)
    maximums = np.max(data, axis = 0)
    rng = maximums - minimums

    # A constant column has zero range; dividing by it would be 0/0 and fill
    # the column with NaN, which then poisons every downstream computation.
    # Divide by 1 instead and pin those columns to 0 afterwards.
    constant = rng == 0
    safe_rng = np.where(constant, 1, rng)

    # Algebraically (data - minimums) / rng. The original form is kept so
    # non-constant columns produce bit-for-bit the same values as before.
    normalized_data = 1 - ((maximums - data)/safe_rng)
    return np.where(constant, 0.0, normalized_data)

In [155]:
def visualize(X, y, thetas):
    """Scatter two feature columns by class and overlay the fitted line.

    Rows of `X` are split by whether the matching entry of `y` equals 0.0
    or 1.0. Columns 1 and 2 of `X` are plotted against each other (column 0
    is skipped; the main cell puts the all-ones intercept column there).
    The straight line drawn is the set of points where
    thetas[0,0] + thetas[0,1]*x1 + thetas[0,2]*x2 == 0, sampled for x1 in
    np.arange(0, 1, 0.1).

    `thetas` must support 2-D indexing such as thetas[0, 2]. The figure is
    shown with plt.show(); nothing is returned.
    """
    negatives = X[np.where(y == 0.0)]
    positives = X[np.where(y == 1.0)]

    # One scatter call per class, class 0 first so the legend order is stable.
    for points, colour, tag in ((negatives, 'b', 'y = 0'),
                                (positives, 'r', 'y = 1')):
        plt.scatter([points[:, 1]], [points[:, 2]], c=colour, label=tag)

    # Solve the linear boundary equation for x2 at each sampled x1.
    line_x = np.arange(0, 1, 0.1)
    line_y = -(thetas[0, 0] + thetas[0, 1] * line_x) / thetas[0, 2]
    plt.plot(line_x, line_y, c='k', label='reg line')

    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.legend()

    plt.show()

In [156]:
def logistc_function(thetas, X):
    """Return the sigmoid of the linear scores np.dot(X, thetas.T).

    With `X` of shape (m, n) and `thetas` of shape (1, n) the result has
    shape (m, 1): one value in (0, 1) per row of `X`.
    """
    # sigmoid: 1 / (1 + e^(-z)), where z is the linear score of each row
    scores = np.dot(X, thetas.T)
    return 1.0 / (1 + np.exp(-scores))

In [157]:
def cost_func(thetas, X, y):
    """Binary cross-entropy of the model `thetas` on (X, y), summed over samples.

    Parameters
    ----------
    thetas : array-like
        Model coefficients, passed straight to `logistc_function`.
    X : array-like
        Design matrix, one row per sample.
    y : array-like
        Labels (0 or 1), one per row of `X`; any shape that flattens to
        that length.

    Returns
    -------
    numpy.float64
        sum_i -[ y_i*log(p_i) + (1 - y_i)*log(1 - p_i) ] with
        p = logistc_function(thetas, X).

    Notes
    -----
    The loss is summed, not averaged. That is the scale `log_gradient`
    uses (it does not divide by the sample count) and the scale the
    convergence threshold in `gadient_decent` is compared against.

    Both operands are flattened to plain 1-D float arrays first, so the
    result does not depend on whether the caller passes np.matrix or
    ndarray objects. Multiplying a 1-D `y` by an (m, 1) ndarray with `*`
    would otherwise broadcast to an (m, m) array.
    """
    probabilities = np.asarray(logistc_function(thetas, X), dtype=float).reshape(-1)
    labels = np.asarray(y, dtype=float).reshape(-1)

    # Keep probabilities strictly inside (0, 1): a saturated sigmoid returns
    # exactly 0.0 or 1.0, and log(0) = -inf turns the cost into inf or NaN.
    tiny = np.finfo(float).eps
    probabilities = np.clip(probabilities, tiny, 1 - tiny)

    step1 = np.dot(labels, np.log(probabilities))
    step2 = np.dot(1 - labels, np.log(1 - probabilities))
    return -step1 - step2

In [158]:
def log_gradient(thetas, X, y):
    """Gradient of the summed cross-entropy loss with respect to `thetas`.

    Computes (sigmoid(X . thetas^T) - y)^T . X. `y` is reshaped to a column
    of X.shape[0] rows so it lines up with the per-row predictions. For an
    (m, n) `X` the result has shape (1, n). It is not divided by m.
    """
    residuals = logistc_function(thetas, X) - y.reshape(X.shape[0], 1)
    return np.dot(residuals.T, X)

In [159]:
def gadient_decent(X, y, thetas, learning_rate, convergance_criteria, max_iterations=None):
    """Fit `thetas` by batch gradient descent on `cost_func`.

    Parameters
    ----------
    X, y : array-like
        Training design matrix and labels, forwarded to `cost_func` and
        `log_gradient`.
    thetas : array-like
        Starting coefficients. The argument is not modified; updated
        values are returned.
    learning_rate : float
        Multiplier applied to the gradient on every step.
    convergance_criteria : float
        Iteration continues while the cost drop of the latest step is
        strictly greater than this value.
    max_iterations : int or None, optional
        Upper bound on the number of gradient steps. None (the default)
        means no bound, which is the historical behaviour.

    Returns
    -------
    (thetas, iter_count)
        The fitted coefficients, and a counter that starts at 1 and is
        incremented once per gradient step attempted.

    Notes
    -----
    A step that raises the cost, or turns it into NaN, is discarded: the
    loop stops and the coefficients from before that step are returned.
    """
    cost = cost_func(thetas , X, y)
    change = 1
    iter_count = 1

    while change > convergance_criteria:
        # iter_count - 1 steps have been taken so far; stop at the cap.
        if max_iterations is not None and iter_count > max_iterations:
            break

        old_cost = cost
        candidate = thetas - (learning_rate * log_gradient(thetas, X, y))
        cost = cost_func(candidate, X, y)
        change = old_cost - cost
        iter_count += 1

        # `not change >= 0` is deliberately NaN-safe: an overshooting or
        # numerically broken step must not replace the better coefficients
        # we already hold.
        if not change >= 0:
            break
        thetas = candidate

    return thetas , iter_count

In [162]:
def predict(thetas, X, threshold=0.1):
    """Classify each row of `X` as 0 or 1 from its predicted probability.

    Parameters
    ----------
    thetas : array-like
        Fitted coefficients, forwarded to `logistc_function`.
    X : array-like
        Design matrix, one row per sample.
    threshold : float, optional
        Rows whose probability is >= `threshold` are labelled 1, all
        others 0. Defaults to 0.1, the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        Integer labels with length-1 axes removed by `np.squeeze`.

    Notes
    -----
    A threshold of 0.5 corresponds to the linear boundary thetas . x = 0
    (sigmoid(0) == 0.5), which is the line `visualize` draws.
    NOTE(review): the 0.1 default is kept only for backward compatibility
    and is far from that boundary - confirm it is intentional.
    """
    prob = logistc_function(thetas, X)
    predicted_value = np.where(prob >= threshold, 1, 0) #decision boundary
    return np.squeeze(predicted_value)

In [163]:
#main cell

# Hyper-parameters of the gradient-descent fit.
learning_rate = 0.01
convergance_criteria = 0.001

# Load the CSV and min-max scale every column (the label column included).
dataset = read_data()
normalized = normalize(dataset)

# The last column is the label; the remaining columns are the features.
y = normalized[:,-1]
# Prepend a column of ones so thetas[0, 0] acts as the intercept term.
# np.matrix is used on purpose: it makes X, thetas and the sigmoid output
# matrices, which the downstream helpers are written against.
X = np.hstack((np.matrix(np.ones(normalized.shape[0])).T, normalized[:,:-1]))

# Start the descent from all-zero coefficients, one per column of X.
thetas = np.matrix(np.zeros(X.shape[1]))

final_thetas, iter_count = gadient_decent(X, y, thetas, learning_rate, convergance_criteria)

# Accuracy is measured on the same rows the model was fitted on.
y_pred = predict(final_thetas, X)
corrects = np.sum(y == y_pred)
total = len(y)
print("corrects : ",corrects)
print("total : ", total)
print("Accuracy : ", corrects/total *100)

corrects :  722
total :  890
Accuracy :  81.12359550561797
