In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [5]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Feature Engineering

## Missing Data - Fillna

In [7]:
df_train.info(), df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

(None, None)

In [9]:
# Age
df_train['Age'].mean(), df_train['Age'].median()

(29.69911764705882, 28.0)

In [10]:
df_train['Age'].fillna(df_train['Age'].median(), inplace=True)

In [11]:
df_test['Age'].fillna(df_train['Age'].median(), inplace=True)

In [14]:
# Embarked - mode
df_train['Embarked'].mode().iloc[0]

'S'

In [15]:
df_train['Embarked'].fillna(df_train['Embarked'].mode().iloc[0], inplace=True)
df_test['Embarked'].fillna(df_train['Embarked'].mode().iloc[0], inplace=True)

In [16]:
# Fare - median
df_train['Fare'].mean(), df_train['Fare'].median()

(32.2042079685746, 14.4542)

In [17]:
df_train['Fare'].fillna(df_train['Fare'].median(), inplace=True)
df_test['Fare'].fillna(df_train['Fare'].median(), inplace=True)

In [22]:
# Cabin
df_train['Cabin']=(~df_train['Cabin'].isna()).astype(int)
df_test['Cabin']=(~df_test['Cabin'].isna()).astype(int)

In [23]:
df_train.isna().sum(), df_test.isna().sum()

(PassengerId    0
 Survived       0
 Pclass         0
 Name           0
 Sex            0
 Age            0
 SibSp          0
 Parch          0
 Ticket         0
 Fare           0
 Cabin          0
 Embarked       0
 dtype: int64,
 PassengerId    0
 Pclass         0
 Name           0
 Sex            0
 Age            0
 SibSp          0
 Parch          0
 Ticket         0
 Fare           0
 Cabin          0
 Embarked       0
 dtype: int64)

# Binning

In [24]:
# Binnig of Age_cat
def get_age_cat(age):
    if age<18:
        return 'child'
    if age<50:
        return 'young'
    return 'old'

df_train['Age_cat'] = df_train['Age'].apply(get_age_cat)
df_test['Age_cat'] = df_test['Age'].apply(get_age_cat)

In [25]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_cat
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,0,S,young
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,1,C,young
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,0,S,young
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,1,S,young
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,0,S,young


# Encoding

In [26]:
# Sex, Age_cat, Embarked
df_train['Sex'] = df_train['Sex'].map({'male':0, 'female':1})
df_test['Sex'] = df_test['Sex'].map({'male':0, 'female':1})

df_train['Embarked'] = df_train['Embarked'].map({'S':0, 'C':1, 'Q':2})
df_test['Embarked'] = df_test['Embarked'].map({'S':0, 'C':1, 'Q':2})

df_train['Age_cat'] = df_train['Age_cat'].map({'child':0, 'young':1, 'old':2})
df_test['Age_cat'] = df_test['Age_cat'].map({'child':0, 'young':1, 'old':2})

In [27]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_cat
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,1,1,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,1,0,1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,0,0,1


In [28]:
selected_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Age_cat']

In [29]:
X = df_train[selected_columns]
y = df_train['Survived']

# Model Building

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.22, random_state=18)

In [31]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10, random_state=100)
model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=10, random_state=100)

In [33]:
model.score(X_train,y_train)

0.9798270893371758

# Hyper-parameter Tuning: Random

In [39]:
from sklearn.model_selection import RandomizedSearchCV

In [40]:
params = {'max_depth':[1,2,3,4,5]
         ,'min_samples_leaf':[1,2,3,4]
         ,'min_samples_split':[3,4,5,6]}

In [41]:
random_cv = RandomizedSearchCV(RandomForestClassifier(random_state=100),
                              param_distributions=params, cv=4, verbose=2)
random_cv.fit(X,y)

Fitting 4 folds for each of 10 candidates, totalling 40 fits
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=3; total time=   0.2s
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=3; total time=   0.2s
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=3; total time=   0.2s
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=3; total time=   0.2s
[CV] END max_depth=2, min_samples_leaf=3, min_samples_split=4; total time=   0.2s
[CV] END max_depth=2, min_samples_leaf=3, min_samples_split=4; total time=   0.2s
[CV] END max_depth=2, min_samples_leaf=3, min_samples_split=4; total time=   0.2s
[CV] END max_depth=2, min_samples_leaf=3, min_samples_split=4; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=3; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=3; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=3, min_samples_split=3; total time=   0.2s
[CV] END max_depth=5, min_samples_lea

RandomizedSearchCV(cv=4, estimator=RandomForestClassifier(random_state=100),
                   param_distributions={'max_depth': [1, 2, 3, 4, 5],
                                        'min_samples_leaf': [1, 2, 3, 4],
                                        'min_samples_split': [3, 4, 5, 6]},
                   verbose=2)

In [42]:
random_cv.best_score_

0.8204460065446613

In [43]:
random_cv.best_index_

2

# Make Predictions

In [44]:
yp = random_cv.predict(df_test[selected_columns])

In [45]:
yp

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

# Submission

In [46]:
df_test['Survived'] = yp

In [47]:
df_test[['PassengerId', 'Survived']].to_csv('sub1.csv', index=False)