# Titanic Survival Prediction

Import all the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Read data

In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [11]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Since 'Cabin' has alot of null values, it's better to drop it

In [7]:
train.drop('Cabin',axis=1,inplace=True)

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(4)
memory usage: 76.6+ KB


Checking no. of survivals

In [9]:
train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

Since it doesn't seem like PassengerId and name is important for our model,we'll drop them( Also We can use the Title's from the name of the people for better predictions)

In [21]:
train.drop(['PassengerId','Name'],axis=1,inplace=True)

Now we'll change the string objects in Embarked and Sex into numbers for better analysis

In [34]:
train.loc[train['Sex'] == 'male', 'Sex'] = 0
train.loc[train['Sex'] == 'female', 'Sex'] = 1

In [28]:
train.loc[train['Embarked'] == 'S', 'Embarked'] = 0
train.loc[train['Embarked'] == 'C', 'Embarked'] = 1
train.loc[train['Embarked'] == 'Q', 'Embarked'] = 2

Checking importance of embarked

In [32]:
train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
1,1,0.553571
2,2,0.38961
0,0,0.336957


checking the data again for  null data

In [45]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


I'll drop the ticket column as well

In [44]:
train.drop('Ticket',axis=1,inplace=True)

Filling the 2 null fields of Embarked by 0

In [51]:
train['Embarked'].fillna(0,inplace=True)

In [52]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int64
dtypes: float64(2), int64(5), object(1)
memory usage: 55.8+ KB


Grouping the Age by Pclass

In [55]:
train.groupby('Pclass')['Age'].median()

Pclass
1    37.0
2    29.0
3    24.0
Name: Age, dtype: float64

Function to fill null fields in age column by using pclass

In [64]:
def impute_age(cols):
    Age=cols[0]
    Pclass=cols[1]
    if pd.isnull(Age):
        
       if Pclass is 1:
            return 37
       elif (Pclass==2):
            return 29
       else:
            return 24
    else:
        return Age
    
train['Age']=train[['Age','Pclass']].apply(impute_age,axis=1)

In [66]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int64
dtypes: float64(2), int64(5), object(1)
memory usage: 55.8+ KB


Since now we've cleared the data and now proceed to make the model

In [78]:
from sklearn.ensemble import RandomForestClassifier

In [79]:
from sklearn.cross_validation import train_test_split

Splitting the data

In [81]:
X=train.drop('Survived',axis=1)
y=train['Survived']
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=42)

Implementing Random forest Classifier and fitting the data

In [82]:
rf=RandomForestClassifier()

In [83]:
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

Checking the score for X_test

In [84]:
rf.score(X_test,y_test)

0.80970149253731338

Now clearing the final test data same as the training data 

In [95]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [100]:
test.drop('Ticket',axis=1,inplace=True)

In [106]:
test.loc[test['Sex'] == 'male', 'Sex'] = 0
test.loc[test['Sex'] == 'female', 'Sex'] = 1

In [107]:
test.loc[test['Embarked'] == 'S', 'Embarked'] = 0
test.loc[test['Embarked'] == 'C', 'Embarked'] = 1
test.loc[test['Embarked'] == 'Q', 'Embarked'] = 2

In [111]:
test['Fare'].fillna(value=test['Fare'].mean(),inplace=True)

In [114]:
test.groupby('Pclass')['Age'].median()

Pclass
1    42.0
2    26.5
3    24.0
Name: Age, dtype: float64

In [115]:
test['Age']=test[['Age','Pclass']].apply(impute_age,axis=1)

Fitting the random forest classifier to the complete training data

In [122]:
rf1=RandomForestClassifier()
rf1.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

Predicting the test dataset and saving it to csv file

In [124]:
predictions = rf1.predict(test)

In [131]:
pd.DataFrame(predictions).to_csv()

',0\n0,0\n1,0\n2,0\n3,0\n4,1\n5,0\n6,0\n7,0\n8,1\n9,0\n10,0\n11,0\n12,1\n13,0\n14,1\n15,1\n16,0\n17,1\n18,0\n19,0\n20,0\n21,0\n22,1\n23,0\n24,1\n25,0\n26,1\n27,0\n28,1\n29,0\n30,0\n31,0\n32,0\n33,0\n34,1\n35,1\n36,0\n37,0\n38,0\n39,1\n40,0\n41,1\n42,0\n43,1\n44,1\n45,0\n46,0\n47,0\n48,1\n49,0\n50,1\n51,0\n52,1\n53,1\n54,0\n55,0\n56,0\n57,0\n58,0\n59,1\n60,0\n61,0\n62,0\n63,1\n64,1\n65,1\n66,1\n67,0\n68,0\n69,1\n70,1\n71,0\n72,0\n73,0\n74,1\n75,0\n76,0\n77,1\n78,0\n79,1\n80,0\n81,0\n82,0\n83,0\n84,0\n85,0\n86,1\n87,0\n88,1\n89,1\n90,1\n91,0\n92,1\n93,0\n94,1\n95,0\n96,1\n97,0\n98,0\n99,0\n100,1\n101,0\n102,0\n103,0\n104,1\n105,0\n106,0\n107,0\n108,0\n109,0\n110,0\n111,1\n112,1\n113,1\n114,1\n115,0\n116,0\n117,1\n118,0\n119,1\n120,1\n121,0\n122,1\n123,0\n124,0\n125,1\n126,0\n127,1\n128,1\n129,0\n130,1\n131,0\n132,0\n133,0\n134,0\n135,0\n136,0\n137,0\n138,0\n139,0\n140,0\n141,1\n142,0\n143,0\n144,1\n145,0\n146,0\n147,0\n148,1\n149,0\n150,1\n151,0\n152,0\n153,0\n154,0\n155,0\n156,1\n157,0\