In [168]:
#importing libraries
import numpy as np
import pandas as pd
from scipy.optimize import minimize, fmin_tnc

In [169]:
#Removing warnings
import warnings
warnings.filterwarnings('ignore')

In [170]:
# Importing training and testing sets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [171]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [172]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [173]:
#finding null values
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [174]:
# 0 for female 1 for male - Label Encoding
for i in range(train_df.shape[0]):
    if( train_df.Sex[i]=='female'): train_df.Sex[i] = 0
    else : train_df.Sex[i] =1

In [175]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [176]:
df = train_df.copy()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [177]:
# Dropping useless columns (Dropped cabin because it had many null values)
df.drop('PassengerId',axis=1,inplace=True)
df.drop('Name', axis=1,inplace=True)
df.drop('Ticket',axis=1,inplace=True)
df.drop('Cabin',axis=1,inplace=True)

In [178]:
# 0 for C, 1 for Q, 2 for S => S is Maximum, so null value replaced by S - Label Encoding
for i in range(train_df.shape[0]): 
    if( df.Embarked[i]== 'C'): df.Embarked[i] = 0
    elif(df.Embarked[i]== 'Q') : df.Embarked[i] = 1
    else : df.Embarked[i] = 2

In [179]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [180]:
# Age has 177 null values, replacing with the mean value
mean = df.Age.mean(skipna=True)
print(mean)

29.69911764705882


In [181]:
df.Age.fillna(mean,inplace=True)

In [182]:
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [183]:
# Ready for modelling
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [184]:
X = df.drop('Survived',axis=1).to_numpy()
print(X)

[[3 1 22.0 ... 0 7.25 2]
 [1 0 38.0 ... 0 71.2833 0]
 [3 0 26.0 ... 0 7.925 2]
 ...
 [3 0 29.69911764705882 ... 2 23.45 2]
 [1 1 26.0 ... 0 30.0 0]
 [3 1 32.0 ... 0 7.75 1]]


In [185]:
X = np.c_[np.ones((X.shape[0])),X] #Adding bias column

In [186]:
print(X.shape)

(891, 8)


In [187]:
Y = df.Survived.to_numpy().reshape(X.shape[0],1)

In [188]:
print(Y.shape)

(891, 1)


In [189]:
# Using logistic regression
def sigmoid(x,theta): 
    z = np.dot(x,theta)
    z = z.astype(float)
    return (1/(1+np.exp(-z)))


In [190]:
def hypothesis(x,theta):
    return sigmoid(x,theta)

In [191]:
hypothesis(X,theta).shape

(891, 1)

In [192]:
def cost(theta,x,y):
    m = x.shape[0]
    h  = hypothesis(x,theta)
    return -(1/m)*np.sum(y*np.log(h) +(1-y)*np.log(1-h))

In [193]:
cost(theta,X,Y)

0.6931471805599454

In [194]:
def gradient(theta,x,y):
    m = x.shape[0]
    h = hypothesis(x,theta)
    return (1/m)*np.dot(x.T, h-y)

In [195]:
gradient(theta,X,Y)

array([[0.11616161616161617],
       [0.40572390572390576],
       [0.20145903479236815],
       [3.891058130322843],
       [0.07968574635241303],
       [0.01234567901234568],
       [-2.4739110549943892],
       [0.24298540965207635]], dtype=object)

In [198]:
# Using optimization function 
theta = np.zeros((X.shape[1],1))
def optimize(x,y,theta):
    ans = fmin_tnc(func=cost,x0=theta,fprime=gradient,approx_grad=True,args=(x,y.flatten()))
    return ans[0]
parameters = optimize(X,Y,theta)

In [199]:
print(parameters)

[ 1.41845695e+00  1.92790896e-03 -2.47086771e+00 -9.75186884e-03
 -5.80960006e-01 -7.39419600e-01  2.53843815e-02 -2.36187401e-01]


In [200]:
def prediction(h):
    out = []
    for i in range(len(h)):
        if h[i] >= 0.5 : out.append(1)
        else: out.append(0)
    return out
    

In [201]:
y_pred = prediction(hypothesis(X,parameters))
print(y_pred)

[0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 

In [202]:
# finding accuracy of the model
accuracy=0
for i in range(len(y_pred)):
    if( y_pred[i] == Y[i]) : accuracy+=1


In [203]:
print(accuracy*100/len(Y))

78.67564534231201


In [204]:
# Applying the model to test cases
test_df.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [205]:
df2= test_df.copy()

In [206]:
# 0 for female 1 for male
for i in range(df2.shape[0]):
    if( df2.Sex[i]=='female'): df2.Sex[i] = 0
    else : df2.Sex[i] =1

In [207]:
df2.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,S


In [208]:
df2.drop('PassengerId',inplace=True,axis=1)
df2.drop('Name',inplace=True,axis=1)
df2.drop('Ticket',inplace=True,axis=1)
df2.drop('Cabin',inplace=True,axis=1)

In [210]:
df2.isnull().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [214]:
# Filling both null Age and Fare with mean values
meanAge = df2.Age.mean(skipna=True)
meanFare = df2.Fare.mean(skipna=True)

In [215]:
df2.Age.fillna(meanAge,inplace=True)

In [216]:
df2.Fare.fillna(meanFare,inplace=True)

In [217]:
df2.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [218]:
# 0 for C, 1 for Q, 2 for S => S is Maximum, so null value replaced by S
for i in range(df2.shape[0]):
    if( df2.Embarked[i]== 'C'): df2.Embarked[i] = 0
    elif(df2.Embarked[i]== 'Q') : df2.Embarked[i] = 1
    else : df2.Embarked[i] = 2

In [219]:
X_test = df2.to_numpy()
print(X_test)

[[3 1 34.5 ... 0 7.8292 1]
 [3 0 47.0 ... 0 7.0 2]
 [2 1 62.0 ... 0 9.6875 1]
 ...
 [3 1 38.5 ... 0 7.25 2]
 [3 1 30.272590361445783 ... 0 8.05 2]
 [3 1 30.272590361445783 ... 1 22.3583 0]]


In [220]:
X_test = np.c_[np.ones((X_test.shape[0])),X_test]

In [221]:
test_pred = prediction(hypothesis(X_test,parameters))
print(test_pred)

[0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 

In [224]:
test_df['Survived_Prediction'] = test_pred

In [225]:
test_df.head() # Final predicted data on the test set

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived_Prediction
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0


In [None]:
#The END
#AUTHOR  - Praveen Singh