In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
dataframe_train = pd.read_csv('train.csv')
dataframe_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
dataframe_train.shape

(891, 12)

In [6]:
dataframe_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [7]:
dataframe_train.Cabin.unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [8]:
dataframe_train.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [9]:
dataframe_train.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [10]:
dataframe_train.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [11]:
dataframe_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [12]:
dataframe_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [13]:
# All these columns seems to be irrelevant. Though you can add SibSp and Parch in your data.
columns_to_drop = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin']

In [14]:
dataframe_train = dataframe_train.drop(columns_to_drop, axis = 1)
dataframe_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,male,22.0,7.25,S
1,1,1,female,38.0,71.2833,C
2,1,3,female,26.0,7.925,S
3,1,1,female,35.0,53.1,S
4,0,3,male,35.0,8.05,S


In [14]:
dataframe_train[dataframe_train['Age'].isna()]

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
5,0,3,male,,8.4583,Q
17,1,2,male,,13.0000,S
19,1,3,female,,7.2250,C
26,0,3,male,,7.2250,C
28,1,3,female,,7.8792,Q
...,...,...,...,...,...,...
859,0,3,male,,7.2292,C
863,0,3,female,,69.5500,S
868,0,3,male,,9.5000,S
878,0,3,male,,7.8958,S


In [15]:
# Smart way of replacing null values in age column by looking into Pclass as reference column
dataframe_train.groupby('Pclass').mean()[['Age']]

Unnamed: 0_level_0,Age
Pclass,Unnamed: 1_level_1
1,38.233441
2,29.87763
3,25.14062


In [16]:
def age_approx(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        if Pclass == 1 :
            return 39
        elif Pclass == 2:
            return 30
        else:
            return 24
    else:
        return Age

In [17]:
dataframe_train['Age'] = dataframe_train[['Age','Pclass']].apply(age_approx, axis = 1)
dataframe_train

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,male,22.0,7.2500,S
1,1,1,female,38.0,71.2833,C
2,1,3,female,26.0,7.9250,S
3,1,1,female,35.0,53.1000,S
4,0,3,male,35.0,8.0500,S
...,...,...,...,...,...,...
886,0,2,male,27.0,13.0000,S
887,1,1,female,19.0,30.0000,S
888,0,3,female,24.0,23.4500,S
889,1,1,male,26.0,30.0000,C


In [18]:
dataframe_train.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
Embarked    2
dtype: int64

In [19]:
dataframe_train.dropna(inplace = True)
dataframe_train.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
Embarked    0
dtype: int64

In [20]:
dataframe_train.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
Fare        float64
Embarked     object
dtype: object

In [24]:
# Both the columns are ordinal variables so applying one hot encoding
dataframe_train_one_hot = pd.get_dummies(dataframe_train, columns = ['Sex','Embarked'])

In [25]:
dataframe_train_one_hot

Unnamed: 0,Survived,Pclass,Age,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,7.2500,0,1,0,0,1
1,1,1,38.0,71.2833,1,0,1,0,0
2,1,3,26.0,7.9250,1,0,0,0,1
3,1,1,35.0,53.1000,1,0,0,0,1
4,0,3,35.0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,13.0000,0,1,0,0,1
887,1,1,19.0,30.0000,1,0,0,0,1
888,0,3,24.0,23.4500,1,0,0,0,1
889,1,1,26.0,30.0000,0,1,1,0,0


In [26]:
dataframe_train_one_hot.columns

Index(['Survived', 'Pclass', 'Age', 'Fare', 'Sex_female', 'Sex_male',
       'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [27]:
X = dataframe_train_one_hot[['Pclass', 'Age', 'Fare', 'Sex_female', 'Sex_male',
       'Embarked_C', 'Embarked_Q', 'Embarked_S']]
y = dataframe_train_one_hot['Survived']

In [28]:
X.shape,y.shape

((889, 8), (889,))

In [29]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [31]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((666, 8), (223, 8), (666,), (223,))

In [36]:
# from sklearn.preprocessing import StandardScaler, MinMaxScaler

# age_mm_scaler = MinMaxScaler()
# fare_std_scaler = StandardScaler()

# X_train['Age'] = age_mm_scaler.fit_transform(X_train[['Age']])
# X_train['Fare'] = fare_std_scaler.fit_transform(X_train[['Fare']])
# X_train

In [32]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [33]:
logreg.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [34]:
logreg.coef_, logreg.intercept_

(array([[-1.19366481e+00, -3.99939895e-02, -9.04326477e-06,
          1.65811229e+00, -9.57708878e-01,  3.72821345e-01,
          3.74283332e-01, -4.67012611e-02]]),
 array([3.26724539]))

In [35]:
y_pred = logreg.predict(X_test)

In [36]:
#Predicting probabilities
logreg.predict_proba(X_test)

array([[0.08590364, 0.91409636],
       [0.27459404, 0.72540596],
       [0.18980958, 0.81019042],
       [0.84349884, 0.15650116],
       [0.31893929, 0.68106071],
       [0.37808289, 0.62191711],
       [0.04132709, 0.95867291],
       [0.54542498, 0.45457502],
       [0.31892407, 0.68107593],
       [0.23662474, 0.76337526],
       [0.69661429, 0.30338571],
       [0.69227536, 0.30772464],
       [0.78989856, 0.21010144],
       [0.29085925, 0.70914075],
       [0.55537034, 0.44462966],
       [0.87633106, 0.12366894],
       [0.07282842, 0.92717158],
       [0.90704556, 0.09295444],
       [0.57497615, 0.42502385],
       [0.41636015, 0.58363985],
       [0.41639843, 0.58360157],
       [0.62030184, 0.37969816],
       [0.07990143, 0.92009857],
       [0.88060116, 0.11939884],
       [0.93073985, 0.06926015],
       [0.82120298, 0.17879702],
       [0.60628248, 0.39371752],
       [0.93073549, 0.06926451],
       [0.77633262, 0.22366738],
       [0.90704599, 0.09295401],
       [0.

In [41]:
#Using probabilities to set a new threshold. Sklearn's default threshold is 0.5
y_pred = [0 if x[0] > 0.4 else 1 for x in logreg.predict_proba(X_test)]

In [42]:
from sklearn import metrics

In [43]:
metrics.accuracy_score(y_test, y_pred)

0.7892376681614349

In [44]:
metrics.confusion_matrix(y_test,y_pred)

array([[127,  14],
       [ 33,  49]])

In [45]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.90      0.84       141
           1       0.78      0.60      0.68        82

    accuracy                           0.79       223
   macro avg       0.79      0.75      0.76       223
weighted avg       0.79      0.79      0.78       223



In [46]:
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [47]:
logreg.predict(test_data)

ValueError: could not convert string to float: 'Kelly, Mr. James'

In [48]:
dataframe_test = test_data.drop(columns_to_drop, axis = 1)
dataframe_test.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
0,3,male,34.5,7.8292,Q
1,3,female,47.0,7.0,S
2,2,male,62.0,9.6875,Q
3,3,male,27.0,8.6625,S
4,3,female,22.0,12.2875,S


In [49]:
dataframe_test[dataframe_test['Age'].isna()]

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
10,3,male,,7.8958,S
22,1,female,,31.6833,S
29,3,male,,21.6792,C
33,3,female,,23.4500,S
36,3,female,,8.0500,S
...,...,...,...,...,...
408,3,female,,7.7208,Q
410,3,female,,7.7500,Q
413,3,male,,8.0500,S
416,3,male,,8.0500,S


In [50]:
dataframe_test['Age'] = dataframe_test[['Age','Pclass']].apply(age_approx, axis = 1)
dataframe_test

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
0,3,male,34.5,7.8292,Q
1,3,female,47.0,7.0000,S
2,2,male,62.0,9.6875,Q
3,3,male,27.0,8.6625,S
4,3,female,22.0,12.2875,S
...,...,...,...,...,...
413,3,male,24.0,8.0500,S
414,1,female,39.0,108.9000,C
415,3,male,38.5,7.2500,S
416,3,male,24.0,8.0500,S


In [51]:
# Both the columns are ordinal variables so applying one hot encoding
dataframe_test_one_hot = pd.get_dummies(dataframe_test, columns = ['Sex','Embarked'])

In [53]:
dataframe_test_one_hot.columns

Index(['Pclass', 'Age', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [54]:
logreg.predict(dataframe_test_one_hot)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [55]:
dataframe_test_one_hot.isna().sum()

Pclass        0
Age           0
Fare          1
Sex_female    0
Sex_male      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [56]:
dataframe_test_one_hot.dropna(inplace=True)

In [57]:
logreg.predict(dataframe_test_one_hot)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,