# Logistic Regression

### importing libraries

In [192]:
import numpy as np 
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns

### importing data

In [193]:
test = pd.read_csv("D:/titanic/test.csv")
train = pd.read_csv('D:/titanic/train.csv')

In [194]:
print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


In [195]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Cleaning data

In [196]:
train1 = train['Survived']
train1.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [197]:
train2 = train.drop('Survived', axis =1)
train2.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [198]:
np.sum(train2.isna())

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [199]:
train2['Family_members'] = train2['SibSp'] + train2['Parch'] #making a new variable family_members to check if the person was alone or with family 

In [200]:

train2.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_members
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [201]:
#creating dummies for all categorical values
embarked = pd.get_dummies(train2.Embarked, prefix = 'Embarked')
embarked.head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [202]:
sex = pd.get_dummies(train2.Sex)
sex.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [203]:
Pclass = pd.get_dummies(train2.Pclass, prefix = 'Pclass')
Pclass.head()


Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1


In [204]:
train2.Age = train2.fillna(train2.Age.mean())
np.sum(train2.isna())

PassengerId         0
Pclass              0
Name                0
Sex                 0
Age                 0
SibSp               0
Parch               0
Ticket              0
Fare                0
Cabin             687
Embarked            2
Family_members      0
dtype: int64

In [205]:
train2 = train2.drop('Ticket', axis=1)

In [206]:
train2 = train2.drop('Cabin', axis=1)

In [207]:
train2 = train2.drop('Name', axis=1)

In [208]:
train2 = train2.drop('PassengerId', axis=1)

In [209]:
train2 = train2.drop('Sex', axis=1)
train2 = train2.drop('Embarked', axis=1)

In [210]:
train2 = train2.drop('Pclass', axis=1)
train2 = train2.drop('SibSp', axis=1)
train2 = train2.drop('Parch', axis=1)

In [211]:
train2 = train2.drop('Age', axis=1)

In [212]:
train2.head()

Unnamed: 0,Fare,Family_members
0,7.25,1
1,71.2833,1
2,7.925,0
3,53.1,1
4,8.05,0


In [213]:
#filling na values in age with its mean
age = train['Age']

age.fillna(age.mean(),inplace=True)
age.mean()

29.699117647058763

In [214]:
age.isna().sum()

0

In [215]:
result = pd.concat([train2, sex,Pclass,embarked,age], axis=1)
result.head()

Unnamed: 0,Fare,Family_members,female,male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Age
0,7.25,1,0,1,0,0,1,0,0,1,22.0
1,71.2833,1,1,0,1,0,0,1,0,0,38.0
2,7.925,0,1,0,0,0,1,0,0,1,26.0
3,53.1,1,1,0,1,0,0,0,0,1,35.0
4,8.05,0,0,1,0,0,1,0,0,1,35.0


In [216]:
train = train.drop('Survived',axis =1)
train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Combining all data and cleaning it

In [217]:
combined = pd.concat([train,test],keys =['train','test'])
combined.shape

(1309, 11)

In [218]:
age =combined['Age']

age.fillna(age.mean(),inplace=True)
age.mean()

29.854794622673253

In [219]:
combined['Family_members'] = combined['SibSp'] + combined['Parch']
embarked = pd.get_dummies(combined.Embarked, prefix = 'Embarked')
embarked.head()
sex = pd.get_dummies(combined.Sex)
sex.head()
Pclass = pd.get_dummies(combined.Pclass, prefix = 'Pclass')
Pclass.head()
combined = combined.drop('Ticket', axis=1)
combined = combined.drop('Cabin', axis=1)
combined = combined.drop('Name', axis=1)
combined = combined.drop('PassengerId', axis=1)
combined = combined.drop('Sex', axis=1)
combined = combined.drop('Embarked', axis=1)
combined = combined.drop('Pclass', axis=1)
combined = combined.drop('SibSp', axis=1)
combined = combined.drop('Parch', axis=1)
combined = combined.drop('Age', axis=1)


In [220]:
combined.head()

Unnamed: 0,Unnamed: 1,Fare,Family_members
train,0,7.25,1
train,1,71.2833,1
train,2,7.925,0
train,3,53.1,1
train,4,8.05,0


In [221]:
Pclass.head()

Unnamed: 0,Unnamed: 1,Pclass_1,Pclass_2,Pclass_3
train,0,0,0,1
train,1,1,0,0
train,2,0,0,1
train,3,1,0,0
train,4,0,0,1


In [222]:
result = pd.concat([combined, sex,Pclass,embarked,age], axis=1)
result.head()

Unnamed: 0,Unnamed: 1,Fare,Family_members,female,male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Age
train,0,7.25,1,0,1,0,0,1,0,0,1,22.0
train,1,71.2833,1,1,0,1,0,0,1,0,0,38.0
train,2,7.925,0,1,0,0,0,1,0,0,1,26.0
train,3,53.1,1,1,0,1,0,0,0,0,1,35.0
train,4,8.05,0,0,1,0,0,1,0,0,1,35.0


### Spliting train and test

In [223]:
X = result.loc['train']
X.head()

Unnamed: 0,Fare,Family_members,female,male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Age
0,7.25,1,0,1,0,0,1,0,0,1,22.0
1,71.2833,1,1,0,1,0,0,1,0,0,38.0
2,7.925,0,1,0,0,0,1,0,0,1,26.0
3,53.1,1,1,0,1,0,0,0,0,1,35.0
4,8.05,0,0,1,0,0,1,0,0,1,35.0


In [224]:
Y = train1
Y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [225]:
X_test = result.loc['test']

In [226]:
x = np.array(X)
y = np.array(Y)
x_test = np.array(X_test)

In [227]:
x.shape

(891, 11)

### Sigmoid function

In [228]:
def sig(x):
    return 1/(1+np.exp(-x))

### Defining Logistic Regression
As we know that we are trying to find a decision boundary using logistic regression, a line which will help us differntiate to whether classify it into one class or another. In our function it gives out values 0 or 1 depending on that if the person survived or not. 
We take a linear function :=>
<b>z = w.X+ b</b> 
and then to limit this between 0 and 1 we apply a sigmoid function 
<b>1/(1+np.exp(-z))</b>
Now to find out appropriate values of W and b we minimise the loss function by differntiating it with W and b. 
After this we update the value of W : W-alpha*dW where dW = dJ/dW = (sig(z)-y)Z
and b : b -alpha*db where db = dJ/db = sig(z)-y


In [255]:
i = 0
W = np.array([[1,2,3,4,4,3,5,6,3,2,5]])
b = np.array([1])

n = len(y)
alpha = 0.0000000005
iteration = 1000
#LogisticRegression(alpha,iteration)

#def LogisticRegression(alpha,iteration):
for i in range(iteration):
    pred = sig(np.dot(x,W.T)+ b)
    
    J = np.sum((pred-y).T)/2/n
    print(J)
    print(i)
    W = W - (alpha*sum(np.dot((pred-y).T,x)))
    b = b -(alpha*sum(pred-y))

print(pred)

274.49999999999983
0
274.49999999999983
1
274.4999999999998
2
274.4999999999998
3
274.4999999999997
4
274.4999999999997
5
274.49999999999966
6
274.49999999999966
7
274.4999999999996
8
274.4999999999996
9
274.4999999999995
10
274.4999999999995
11
274.4999999999995
12
274.49999999999943
13
274.49999999999943
14
274.4999999999994
15
274.49999999999926
16
274.49999999999926
17
274.4999999999992
18
274.4999999999991
19
274.4999999999991
20
274.499999999999
21
274.49999999999886
22
274.49999999999875
23
274.4999999999987
24
274.4999999999986
25
274.49999999999847
26
274.49999999999835
27
274.4999999999982
28
274.49999999999807
29
274.4999999999979
30
274.4999999999977
31
274.4999999999975
32
274.49999999999727
33
274.4999999999971
34
274.4999999999968
35
274.49999999999653
36
274.49999999999625
37
274.4999999999959
38
274.4999999999955
39
274.4999999999951
40
274.49999999999466
41
274.49999999999426
42
274.49999999999375
43
274.4999999999932
44
274.4999999999925
45
274.49999999999187
46
274.

215.6583388612581
371
215.41236850526548
372
215.17777181209104
373
214.95228936374906
374
214.733692910236
375
214.52018725462887
376
214.31018231631603
377
214.10164923670072
378
213.8914285809641
379
213.67486171771256
380
213.4460376587802
381
213.19889298172066
382
212.92927085192852
383
212.63744156714594
384
212.3294884972598
385
212.01580656824675
386
211.70719853288935
387
211.41142772888335
388
211.13220360992705
389
210.87002504949047
390
210.62338885305755
391
210.38956759008028
392
210.16504416802198
393
209.94612471372588
394
209.73009908768165
395
209.51658880676044
396
209.30791882757995
397
209.1076533787641
398
208.91814825569256
399
208.73902357612454
400
208.56733209293608
401
208.39862775843292
402
208.22794671018823
403
208.05039480657902
404
207.86155519678076
405
207.6580303332154
406
207.43825282332517
407
207.2033577294432
408
206.957559184404
409
206.70742799763227
410
206.45998917621117
411
206.2203886692924
412
205.99024426409343
413
205.76731229348374
414


135.380332375893
738
135.11162139259088
739
134.84279654485493
740
134.57419856927692
741
134.30605483282258
742
134.03846922833742
743
133.77142584720445
744
133.50480347087225
745
133.2383967609333
746
132.9719400738137
747
132.7051306821302
748
132.4376493608538
749
132.16917740670553
750
131.89940999181033
751
131.62806624828832
752
131.35489667465208
753
131.07968842382957
754
130.80226885951737
755
130.52250752856963
756
130.24031645910463
757
129.95564851571652
758
129.6684934735151
759
129.3788715440078
760
129.08682430374233
761
128.7924033116584
762
128.49565708847524
763
128.19661748307925
764
127.89528667870434
765
127.59162613447963
766
127.28554860087623
767
126.97691402621523
768
126.66552975772463
769
126.35115501848297
770
126.03350928133433
771
125.7122839030398
772
125.3871562336988
773
125.05780535900033
774
124.72392863444603
775
124.38525820001664
776
124.0415766994507
777
123.69273146352269
778
123.33864645862597
779
122.97933136752661
780
122.61488727793002
781


### Applyling Logistic Regression
change the values of alpha and iterations to change the values of W and b. These values are for alpha = 0.000000005 and iteratons =173

# after doing various iterations the lowest value of J was 59.50390228324221


In [242]:
y_pred = sig(np.dot(x_test,W.T)+ b)