In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
%matplotlib notebook

In [3]:
df = pd.read_csv("train.csv")
print(df.columns)
df.head(10)



Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


# Data exploration

In [4]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


We have lots of null values in cabin and a few in age column

In [6]:
survived = df[df['Survived'] == 1]

In [7]:
#Checking if gender has any role in survival chances 
gender_survived = survived.groupby("Sex")['Sex'].count()
gender_total = df.groupby("Sex")['Sex'].count()

survived_percent = gender_survived/gender_total
survived_percent

Sex
female    0.742038
male      0.188908
Name: Sex, dtype: float64

In [8]:
plt.figure()
sns.countplot(x = "Survived", hue = 'Sex', data = df)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f3280077f40>

74.2% Women survived and only 18.8% men survived

In [9]:
#Embarked column tells us where the passenger boarded the boat. Checking whether it has
#any effect on survival chances
embark_survived = survived.groupby("Embarked")['Embarked'].count()
embark_total = df.groupby('Embarked')["Embarked"].count()

survived_percent_embark = embark_survived/embark_total
survived_percent_embark

Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Embarked, dtype: float64

In [10]:
plt.figure()
sns.countplot(x = 'Survived', hue = 'Embarked', data = df)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f327c703df0>

So embarked column has a little effect on the survival chances:
        
    C = Cherbourg = 55.3% chance
    Q = Queenstown = 38.9% chance
    S = Southampton = 33.69% chance

In [11]:
#Cabin column has around 687 null values and only 204 values present, it will be practical
#to drop this feature as it will hamper the performance of the model
df.drop('Cabin', axis = 1, inplace = True)

In [12]:
pclass_survived = survived.groupby("Pclass")['Pclass'].count()
pclass_total = df.groupby('Pclass')["Pclass"].count()

survived_percent_pclass = pclass_survived/pclass_total
survived_percent_pclass

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Pclass, dtype: float64

In [13]:
plt.figure()
sns.countplot(x = 'Survived', hue = 'Pclass', data = df)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f327c6d3fd0>

As we see, Pclass 3 passengers had low chance of survival, while Pclass 1 passengers had high chance.

So wealth status played a huge role in Survival Chances

In [14]:
#SibSp is a discrete variable which shows the number of siblings or spouses passengers had
plt.figure()
plt.hist(df["SibSp"])
plt.title("SibSp distribution")

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'SibSp distribution')

Right Skewed data...will require normalization

In [15]:
#Parch is a discrete variable showing no of Parents accompaning the children or 
#children accompanying parents
plt.figure()
plt.hist(df["Parch"])
plt.title("Parch distribution")

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Parch distribution')

right skewed data, will require normalization

In [16]:
#Ticket
len(df["Ticket"].unique())

681

In [17]:
map_dict = df["Ticket"].value_counts().to_dict()
map_dict

{'CA. 2343': 7,
 '347082': 7,
 '1601': 7,
 'CA 2144': 6,
 '347088': 6,
 '3101295': 6,
 'S.O.C. 14879': 5,
 '382652': 5,
 'W./C. 6608': 4,
 '113781': 4,
 'LINE': 4,
 '113760': 4,
 '349909': 4,
 'PC 17757': 4,
 '19950': 4,
 '17421': 4,
 '2666': 4,
 '347077': 4,
 '4133': 4,
 '239853': 3,
 'SC/Paris 2123': 3,
 'C.A. 31921': 3,
 'C.A. 34651': 3,
 '29106': 3,
 '248727': 3,
 'PC 17572': 3,
 '363291': 3,
 '110413': 3,
 '371110': 3,
 '35273': 3,
 '345773': 3,
 '13502': 3,
 '347742': 3,
 '230080': 3,
 'PC 17760': 3,
 'PC 17755': 3,
 '24160': 3,
 'PC 17582': 3,
 '110152': 3,
 'F.C.C. 13529': 3,
 '19877': 2,
 '2659': 2,
 '11751': 2,
 '358585': 2,
 'S.O./P.P. 3': 2,
 'STON/O2. 3101279': 2,
 '113572': 2,
 '243847': 2,
 '7534': 2,
 '230433': 2,
 '19943': 2,
 'C.A. 33112': 2,
 'A/5. 3336': 2,
 '364516': 2,
 '17453': 2,
 '237736': 2,
 '367230': 2,
 '2627': 2,
 '349237': 2,
 '239865': 2,
 '2661': 2,
 'PC 17485': 2,
 '110465': 2,
 'W./C. 6607': 2,
 '231919': 2,
 'PC 17758': 2,
 '12749': 2,
 '11668': 2,
 

As we have 681 unique values in Ticket column, we will be keeping only top 10 frequent categories, and labelling the others in "other" column.

Source:
Kaggle: How to Handle large number of categorical values
(https://www.kaggle.com/getting-started/37489)

code credits:
(Video)Featuring Engineering- Handle Categorical Features Many Categories by Krish Naik 
(https://www.youtube.com/watch?v=MPnNC6kkNC4)

In [18]:
top_dict = df["Ticket"].value_counts().head(10).to_dict()
top_list = [x for x in top_dict]
top_list

['CA. 2343',
 '347082',
 '1601',
 'CA 2144',
 '347088',
 '3101295',
 'S.O.C. 14879',
 '382652',
 'W./C. 6608',
 '113781']

In [19]:
for i in top_list:
    df[i] = np.where(df["Ticket"]==i,1,0)
    

In [20]:
# AGE

plt.figure().set_size_inches(8, 4)
plt.subplot(1,2,1)
sns.distplot(a = df["Age"].dropna(), kde = True).set(yticklabels=[])
plt.title("Age distribution of all people")
plt.subplot(1,2,2)
sns.distplot(a = survived["Age"].dropna(), kde = True).set(yticklabels=[])
plt.title("Age distribution of survived people")

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Age distribution of survived people')

This is a normal distribution. There is no need for normalization.
Scaling required

we have multiple missing values in the column, as this follows a normal distribution, we can replace the nan values with the mean or median

In [21]:
df_age = df[["Age", "Fare", "Pclass", "Sex"]]

In [22]:
plt.figure()
sns.heatmap(df_age.corr(), annot = True)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f327c510220>

In [23]:
plt.figure()
sns.boxplot(y = 'Age', x = "Pclass", data = df_age)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f327c3fa970>

In [24]:
plt.figure()
sns.boxplot(y = 'Age', x = "Sex", data = df_age)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f327c374460>

In [25]:
#As we see, we can easily distinguish the median age for each pclass,
#so replacing age with the medians
pclass_median = df_age.groupby("Pclass")["Age"].agg(np.median)
pclass_median

Pclass
1    37.0
2    29.0
3    24.0
Name: Age, dtype: float64

In [26]:
def age_replace(row):
    if row["Pclass"] == 1:
        row["Age"] = 37
    elif row["Pclass"] == 2:
        row["Age"] = 29
    elif row["Pclass"] == 3:
        row["Age"] = 24
    return row
df = df.apply(age_replace, axis = 1)

In [27]:
df.isnull().sum()

PassengerId     0
Survived        0
Pclass          0
Name            0
Sex             0
Age             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Embarked        2
CA. 2343        0
347082          0
1601            0
CA 2144         0
347088          0
3101295         0
S.O.C. 14879    0
382652          0
W./C. 6608      0
113781          0
dtype: int64

## Handling Categorical Data

In [28]:
df = df.drop("Name", axis =1)

In [29]:
df = df.drop("Ticket", axis = 1)

In [30]:
#Gender one hot encoding
df_sex = pd.get_dummies(df["Sex"])
df = pd.concat([df_sex, df], axis = 1)
df.drop("Sex",axis = 1, inplace = True)

In [31]:
#Pclass one hot encoding
df_pclass = pd.get_dummies(df["Pclass"])
df = pd.concat([df_pclass, df], axis = 1)
df.drop("Pclass",axis = 1, inplace = True)

In [32]:
#Embarked one hot encoding
df_embarked = pd.get_dummies(df["Embarked"])
df = pd.concat([df_embarked, df], axis = 1)
df.drop("Embarked",axis = 1, inplace = True)

In [33]:
df.columns

Index([           'C',            'Q',            'S',              1,
                    2,              3,       'female',         'male',
        'PassengerId',     'Survived',          'Age',        'SibSp',
              'Parch',         'Fare',     'CA. 2343',       '347082',
               '1601',      'CA 2144',       '347088',      '3101295',
       'S.O.C. 14879',       '382652',   'W./C. 6608',       '113781'],
      dtype='object')

In [34]:
df.drop("PassengerId", axis = 1, inplace = True)

## Data Normalization

### Train test split

In [35]:
from sklearn.model_selection import train_test_split

X = df.drop("Survived", axis = 1)
y = df["Survived"]

X_train,X_test,y_train,y_test= train_test_split(X,y)

In [36]:
#Scaling and normalizing

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.fit_transform(X_test)

## Building models

In [37]:
#Decision Trees
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
dt_clf = DecisionTreeClassifier().fit(X_train_sc,y_train)
y_predict = dt_clf.predict(X_test_sc)
accuracy_DT = accuracy_score(y_test, y_predict)
accuracy_DT

0.8071748878923767

In [38]:
cv = cross_val_score(dt_clf,X_train_sc,y_train,cv=5)
print(cv)
print(cv.mean())

[0.80597015 0.79104478 0.76865672 0.84962406 0.78195489]
0.7994501178318931


In [39]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
max_acc = 0
k = 0
for i in range(1,50,2):
    knn_clf = KNeighborsClassifier(n_neighbors = i).fit(X_train_sc,y_train)
    y_predict = knn_clf.predict(X_test_sc)
    accuracy = accuracy_score(y_test, y_predict)
    if accuracy>max_acc:
        max_acc = accuracy
        k = i
    print("Accuracy score of k = {} is {}".format(i, accuracy))
print("Max accuracy score is {} for k = {}".format(max_acc, k))

Accuracy score of k = 1 is 0.7130044843049327
Accuracy score of k = 3 is 0.7847533632286996
Accuracy score of k = 5 is 0.8071748878923767
Accuracy score of k = 7 is 0.8026905829596412
Accuracy score of k = 9 is 0.8251121076233184
Accuracy score of k = 11 is 0.820627802690583
Accuracy score of k = 13 is 0.8251121076233184
Accuracy score of k = 15 is 0.8116591928251121
Accuracy score of k = 17 is 0.820627802690583
Accuracy score of k = 19 is 0.820627802690583
Accuracy score of k = 21 is 0.8251121076233184
Accuracy score of k = 23 is 0.8161434977578476
Accuracy score of k = 25 is 0.8251121076233184
Accuracy score of k = 27 is 0.8161434977578476
Accuracy score of k = 29 is 0.8071748878923767
Accuracy score of k = 31 is 0.8071748878923767
Accuracy score of k = 33 is 0.8071748878923767
Accuracy score of k = 35 is 0.8026905829596412
Accuracy score of k = 37 is 0.8026905829596412
Accuracy score of k = 39 is 0.7892376681614349
Accuracy score of k = 41 is 0.7847533632286996
Accuracy score of k =

In [40]:
knn_clf = KNeighborsClassifier(n_neighbors = 7).fit(X_train_sc,y_train)
cv = cross_val_score(knn_clf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.70895522 0.74626866 0.74626866 0.79699248 0.79699248]
0.7590954999438896


In [41]:
#Logistic regression
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(max_iter = 1000).fit(X_train_sc,y_train)
y_predict = lr_clf.predict(X_test_sc)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.7802690582959642


In [42]:
cv = cross_val_score(lr_clf,X_train_sc,y_train,cv=5)
print(cv)
print(cv.mean())

[0.8358209  0.76119403 0.74626866 0.84962406 0.76691729]
0.7919649870946021


In [43]:
#SVC
from sklearn.svm import SVC
svc_clf = SVC(kernel = 'linear').fit(X_train_sc,y_train)
y_predict = svc_clf.predict(X_test_sc)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.8161434977578476


In [44]:
cv = cross_val_score(svc_clf,X_train_sc,y_train,cv=5)
print(cv)
print(cv.mean())

[0.82089552 0.79104478 0.74626866 0.87218045 0.78195489]
0.802468858713949


In [45]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier().fit(X_train_sc, y_train)
y_predict = rf_clf.predict(X_test_sc)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.7982062780269058


In [46]:
cv = cross_val_score(rf_clf,X_train_sc,y_train,cv=5)
print(cv)
print(cv.mean())

[0.76119403 0.76119403 0.74626866 0.84962406 0.77443609]
0.7785433733587701


### Parameter Tuning

In [47]:
from sklearn.model_selection import GridSearchCV 
lr = LogisticRegression()
param_grid = {'max_iter' : [2000],
              'penalty' : ['l1', 'l2'],
              'C' : np.logspace(-4, 4, 20),
              'solver' : ['liblinear', 'sag']}

lr_clf = GridSearchCV(lr, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_lr_clf = lr_clf.fit(X_train_sc,y_train)
print('Best Score: ' + str(best_lr_clf.best_score_))
print('Best Parameters: ' + str(best_lr_clf.best_params_))

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.6s


Best Score: 0.8084614521378072
Best Parameters: {'C': 0.03359818286283781, 'max_iter': 2000, 'penalty': 'l2', 'solver': 'sag'}


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.4s finished


In [48]:
y_predict = best_lr_clf.predict(X_test_sc)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.8251121076233184
