In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

# Get the Data

In [3]:
# Load the passenger data. The path is relative, so the CSV must sit in the
# notebook's working directory.
titanic = pd.read_csv('titanic_train.csv')

In [4]:
# Preview the first five rows to confirm the columns parsed as expected.
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# (rows, columns) of the frame as loaded, before any cleaning.
titanic.shape

(891, 12)

In [6]:
# Age must be numeric for the mean-based imputation further down.
titanic['Age'].dtype

dtype('float64')

# Missing Data

In [7]:
# Count missing values per column to decide what to impute and what to drop.
print(titanic.isna().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


Roughly 20 percent of the Age values are missing (177 of 891), which is few enough to fill in with some form of imputation. The Cabin column is missing far more (687 of 891 values), which is too much to do anything useful with at a basic level.

# Data Cleaning
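We want to fill in the missing ages instead of dropping those rows. One way to do this is to fill in the mean age of all the passengers (imputation). We can be a little smarter and use the mean age of each passenger class instead, because the average age differs noticeably between classes, as the per-class means below show.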

In [8]:
# Mean age per passenger class; impute_age below hard-codes these values
# rounded to two decimals.
titanic.groupby('Pclass')['Age'].mean()

Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64

In [9]:
def impute_age(cols):
    """Return the passenger's age, substituting a per-class mean when it is missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order ``(Age, Pclass)``. This is what
        ``DataFrame[['Age', 'Pclass']].apply(impute_age, axis=1)`` passes for
        each row.

    Returns
    -------
    The original ``Age`` when it is not null; otherwise 38.23 for class 1,
    29.88 for class 2 and 25.14 for any other class value.
    """
    # Read the two values by position explicitly. A row Series is labelled with
    # strings ('Age', 'Pclass'), and plain ``cols[0]`` on such a Series relies
    # on pandas' deprecated integer-key positional fallback.
    if isinstance(cols, pd.Series):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    # Known ages pass through untouched.
    if not pd.isnull(age):
        return age

    # Fill values are the per-class mean ages, rounded to two decimals.
    if pclass == 1:
        return 38.23
    if pclass == 2:
        return 29.88
    return 25.14
        

In [10]:
# Row-wise pass over (Age, Pclass): missing ages are replaced by impute_age,
# present ages are kept as they are.
age_and_class = titanic[['Age', 'Pclass']]
titanic['Age'] = age_and_class.apply(impute_age, axis='columns')

In [11]:
# Re-check the missing-value counts: Age should now report zero.
print(titanic.isna().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


Now drop the Cabin column and the two rows where Embarked is NaN.

In [12]:
# Cabin is mostly missing, so remove the column. errors='ignore' keeps the
# cell re-runnable: a second run is a no-op instead of raising KeyError
# because the column is already gone.
titanic.drop(columns='Cabin', inplace=True, errors='ignore')

In [13]:
# Drop only the rows that lack an Embarked value. An unscoped dropna() removes
# every row with a null in any column, which would discard most of the data
# if a sparsely populated column were still present at this point.
titanic.dropna(subset=['Embarked'], inplace=True)

In [14]:
# Confirm that every remaining column is fully populated and check which
# columns are still non-numeric (object dtype) before encoding.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 83.3+ KB


## Converting Categorical Features 

In [15]:
# One indicator column per category of Sex, minus the first (drop_first) to
# avoid a redundant, perfectly collinear column. dtype=int pins the indicators
# to 0/1 integers; the default dummy dtype differs between pandas versions.
sex = pd.get_dummies(titanic['Sex'], drop_first=True, dtype=int)

In [16]:
# Indicator columns for the port of embarkation, minus the first category
# (drop_first). dtype=int pins the indicators to 0/1 integers; the default
# dummy dtype differs between pandas versions.
embarked = pd.get_dummies(titanic['Embarked'], drop_first=True, dtype=int)

In [17]:
# Remove the remaining string columns. Sex and Embarked were dummy-encoded in
# the cells above; Name and Ticket are not used as features.
titanic.drop(columns=['Sex', 'Embarked', 'Name', 'Ticket'], inplace=True)

In [18]:
# Append the indicator columns side by side (aligned on the row index).
# Note: re-running this cell appends the same columns a second time.
titanic = pd.concat([titanic, sex, embarked], axis='columns')

In [19]:
# Preview the frame after encoding: every remaining column should be numeric.
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,1,0,3,22.0,1,0,7.25,1,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,1
4,5,0,3,35.0,0,0,8.05,1,0,1


In [20]:
# (rows, columns) after cleaning and encoding.
titanic.shape

(889, 10)

## Train Test Split

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
# NOTE(review): PassengerId is still among the features passed to the model.
# It looks like a plain row identifier -- confirm whether it should be dropped
# before training.
X_train, X_test, y_train, y_test = train_test_split(
    titanic.drop(columns='Survived'),
    titanic['Survived'],
    test_size=0.2,
    random_state=101,
)

## Training and Predicting

In [30]:
from sklearn.ensemble import RandomForestClassifier

# Ten entropy-split trees; random_state fixes the forest's randomness so the
# scores below are reproducible.
forest = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
forest.fit(X_train, y_train)

# Accuracy on the training rows themselves. Compare it with the held-out
# accuracy further down before trusting it.
forest.score(X_train, y_train)

0.9831223628691983

In [31]:
# Predicted Survived label for every held-out row.
Predict = forest.predict(X_test)

In [32]:
# classification_report is not imported in any earlier visible cell, so a
# fresh Restart & Run All would stop here with a NameError. Import it in this
# cell, following the notebook's existing import-where-used style.
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_test, Predict))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87       107
           1       0.85      0.72      0.78        71

    accuracy                           0.84       178
   macro avg       0.84      0.82      0.82       178
weighted avg       0.84      0.84      0.83       178



In [33]:
# Mean accuracy on the held-out rows.
test_accuracy = forest.score(X_test, y_test)
print("Accuracy: ", test_accuracy)

Accuracy:  0.8370786516853933
