In [235]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
df = pd.read_csv('/Users/monikaju/Desktop/train.csv', index_col=0)

In [232]:
# Our data set contains 12 different columns
df.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked', 'Fare_range'],
      dtype='object')

In [17]:
df.head(3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


# Featuring Engineering

In [9]:
#Knowing which variables are missing

df.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [234]:
#Replace missing values with the average age differentiated by Pclass

m = df['Age'].median()
df['Age'].fillna(m, inplace=True)

In [233]:
df['Age'].fillna(df.groupby('Pclass').transform('mean')['Age'], inplace=True)

In [23]:
# Replace with dummies 'Sex'

onehot = pd.get_dummies(df['Sex'])  # --> DF
onehot.head(3)

Unnamed: 0_level_0,female,male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,1
2,1,0
3,1,0


In [50]:
df2 = pd.concat([df, onehot], axis=1) # <-- concatenate columns
df2.head(3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,0


In [25]:
df2.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
female        0
male          0
dtype: int64

In [61]:
df2.shape

(891, 13)

In [65]:
# Change Embarked to dummies 

df2['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [67]:
df2['Survived'].groupby(df['Embarked']).mean()

Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64

In [70]:
onehot2 = pd.get_dummies(df['Embarked'])  # --> DF
onehot2.head(3)

Unnamed: 0_level_0,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0,1
2,1,0,0
3,0,0,1


In [90]:
df3 = pd.concat([df2, onehot2], axis=1)
df3.head(3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,male,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,1,0,0,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,1,0,0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,0,0,0,1


In [158]:
df3['Fare'].max()

512.3292

In [None]:
# SCALING FARE

In [161]:
Fare_range = df3['Fare'].max() - df3['Fare'].min()
Fare_range

512.3292

In [176]:
df['Fare_range'] = ( df['Fare'] - df['Fare'].min() ) / Fare_range
x = df[['Fare_range']]

In [177]:
x.head(3)

Unnamed: 0_level_0,Fare_range
PassengerId,Unnamed: 1_level_1
1,2.8e-05
2,0.000272
3,3e-05


In [178]:
df4 = pd.concat([df3, x], axis=1)

In [240]:
df4.head(3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,male,C,Q,S,Fare_range,Name_Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,1,0,0,1,2.8e-05,Mr.
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,1,0,0,0.000272,Mrs.
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,0,0,0,1,3e-05,Miss.


In [226]:
df4['Name_Title'] = df4['Name'].apply(lambda x: x.split(',')[1]).apply(lambda x: x.split()[0])
df4['Name_Title'].value_counts()

Mr.          517
Miss.        182
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Mlle.          2
Major.         2
Col.           2
Capt.          1
Sir.           1
Lady.          1
Don.           1
the            1
Jonkheer.      1
Mme.           1
Ms.            1
Name: Name_Title, dtype: int64

In [241]:
# it seems to be a significant difference between Mr and Mrs or the lenght of the name (Taken from kaggle)

df4['Survived'].groupby(df4['Name_Title']).mean()

Name_Title
Capt.        0.000000
Col.         0.500000
Don.         0.000000
Dr.          0.428571
Jonkheer.    0.000000
Lady.        1.000000
Major.       0.500000
Master.      0.575000
Miss.        0.697802
Mlle.        1.000000
Mme.         1.000000
Mr.          0.156673
Mrs.         0.792000
Ms.          1.000000
Rev.         0.000000
Sir.         1.000000
the          1.000000
Name: Survived, dtype: float64

In [243]:
df4['Name_Len'] = df4['Name'].apply(lambda x: len(x))

In [396]:
df4.head(3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,male,C,Q,S,Fare_range,Name_Title,Name_Len
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,1,0,0,1,2.8e-05,Mr.,23
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,1,0,0,0.000272,Mrs.,51
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,0,0,0,1,3e-05,Miss.,22


In [245]:
df4['Survived'].groupby(pd.qcut(df4['Name_Len'],5)).mean()

Name_Len
(11.999, 19.0]    0.220588
(19.0, 23.0]      0.301282
(23.0, 27.0]      0.319797
(27.0, 32.0]      0.442424
(32.0, 82.0]      0.674556
Name: Survived, dtype: float64

# Modeling/Logistic Regression 1

In [351]:
df2.head(3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,0


In [352]:
X = df2 [['Pclass','Age','Fare','female','male']]

In [353]:
y= df2 [['Survived']]

In [354]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

In [355]:
X_train.shape, Y_train.shape

((668, 5), (668, 1))

In [356]:
X_test.shape, y_test.shape

((223, 5), (223, 1))

In [357]:
m = LogisticRegression(random_state=10, max_iter=300)

In [None]:
#FIRST_REGRESSION (TRAINING DATA)

In [358]:
m.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=10, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [359]:
a = m.predict(X_train)

In [360]:
len(a)

668

In [361]:
#W:Coeficients 
m.coef_

array([[-1.04462993e+00, -2.46996884e-02,  2.60849316e-04,
         1.19769989e+00, -1.19736919e+00]])

In [362]:
#B:Intercept
m.intercept_

array([2.89457082])

# --SCORE 1

In [363]:
# SCORE FOR THE TRAIN GROUP
m.score(X_train, y_train)

0.7784431137724551

In [364]:
b = m.predict(X_test)

In [120]:
len(b)

223

In [365]:
# SCORE FOR THE TEST GROUP

m.score(X_test, y_test)

0.8430493273542601

In [None]:
# SCORE FOR THE WHOLE DATA

In [366]:
d = m.predict(X)

In [367]:
len(d)

891

In [368]:
 m.score(X,y)

0.7946127946127947

# Modeling/Logistic Regression 2: Including the embarking variables/ Fare range

In [372]:
df4.head(3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,male,C,Q,S,Fare_range,Name_Title,Name_Len
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,1,0,0,1,2.8e-05,Mr.,23
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,1,0,0,0.000272,Mrs.,51
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,0,0,0,1,3e-05,Miss.,22


In [373]:
X2 = df4 [['Pclass','Age','Fare_range','female','male','C','Q','S']]

In [374]:
y2= df4 ['Survived']

In [375]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=10)

In [376]:
m.fit(X2_train,y2_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=10, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [377]:
c=m.predict(X2_train)

In [378]:
len(c)

668

In [379]:
#COEFICIENTS
m.coef_

array([[-1.04413401e+00, -2.45461993e-02, -7.15488530e-04,
         1.18837491e+00, -1.18811663e+00,  1.48075243e-01,
         1.23357494e-01, -4.90277847e-01]])

In [380]:
#INTECEPT
m.intercept_

array([3.20970479])

# --SCORE 2

In [381]:
m.score(X2_train, y2_train)

0.7754491017964071

In [382]:
m.score(X2_test,y2_test)

0.8340807174887892

In [383]:
m.score(X2,y2)

0.7901234567901234

# Modeling/Logistic Regression 3: Including the lenght of the name

In [386]:
X3 = df4[['Pclass','Age','Fare_range','female','male','C','Q','S','Name_Len']]

In [387]:
y3= df4 ['Survived']

In [388]:
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, random_state=10)

In [389]:
m.fit(X3_train,y3_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=10, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [391]:
m.coef_

array([[-1.02565794e+00, -2.53666392e-02, -7.89512069e-04,
         1.10947208e+00, -1.10817832e+00,  1.41612128e-01,
         1.68172902e-01, -5.37290865e-01,  2.10935771e-02]])

# -- SCORE 3

In [392]:
m.score(X3_train, y3_train)

0.7754491017964071

In [393]:
m.score(X3_test, y3_test)

0.852017937219731

In [394]:
m.score(X3,y3)

0.7946127946127947

# Optimization