### Titanic Survival Analysis

    survival	Survival	0 = No, 1 = Yes
    pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
    sex	Sex	
    Age	Age in years	
    sibsp	# of siblings / spouses aboard the Titanic	
    parch	# of parents / children aboard the Titanic	
    ticket	Ticket number	
    fare	Passenger fare	
    cabin	Cabin number	
    embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


#### Setting the view of Data frame to show all the columns

In [2]:
# Show every column when a DataFrame is displayed.
# The fully qualified key is required: the short form 'max_columns' also
# matches 'styler.render.max_columns' on recent pandas releases, which makes
# set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)

#### Reading the train dataset

In [4]:
# Load the training CSV from the working directory and preview ten rows.
train = pd.read_csv(filepath_or_buffer='train.csv')
train.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


#### Train data description and information

In [5]:
# Report the size of the training frame as (rows, columns).
n_rows, n_cols = train.shape
(n_rows, n_cols)

(891, 12)

In [7]:
# Summarise only the object-dtype columns (count, unique, top, freq).
train.describe(include=[object])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Abelson, Mrs. Samuel (Hannah Wizosky)",male,CA. 2343,C23 C25 C27,S
freq,1,577,7,4,644


#### Finding the variables with null values

In [8]:
# Count the missing values in each training column.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

#### Embarked Variable null values filled with the mode

In [9]:
# Replace missing ports of embarkation with the most frequent port.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the `train['Embarked']` selection: that chained
# in-place form triggers a FutureWarning on pandas 2.x and, with
# Copy-on-Write enabled, updates only a temporary and leaves `train` unchanged.
embarked_mode = train['Embarked'].mode()[0]
train['Embarked'] = train['Embarked'].fillna(embarked_mode)

#### Cabin, Ticket and Name variables dropped because of their large number of unique and null values

In [10]:
# Discard the columns that will not be used as features.
unused_columns = ['Cabin', 'Ticket', 'Name']
train = train.drop(columns=unused_columns)

#### Mapping the categorical variable Sex into numeric

In [11]:
# Encode Sex numerically: male -> 1, female -> 0.
# Any value that is not a key of the mapping becomes NaN, so this cell must
# not be re-run on an already encoded column.
sex_codes = {'male': 1, 'female': 0}
train['Sex'] = train['Sex'].map(sex_codes)

#### Mapping the categorical variable Embarked into numeric

In [13]:
# Encode the port of embarkation numerically: S -> 1, C -> 2, Q -> 3.
# Any value that is not a key of the mapping becomes NaN.
port_codes = {'S': 1, 'C': 2, 'Q': 3}
train['Embarked'] = train['Embarked'].map(port_codes)

#### Filling the Age null values with the median

In [14]:
# Impute missing ages with the median age of the training set.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the `train['Age']` selection: that chained in-place
# form triggers a FutureWarning on pandas 2.x and, with Copy-on-Write enabled,
# updates only a temporary and leaves `train` unchanged.
age_median = train['Age'].median()
train['Age'] = train['Age'].fillna(age_median)

#### Setting the target variable

In [16]:
# Split off the target: `pop` returns the Survived column and removes it from
# `train` in the same step, leaving only the feature columns behind.
# NOTE(review): PassengerId has not been dropped in the cells shown here, so
# it remains among the features - confirm that this is intended.
y = train.pop('Survived')

#### Splitting the train and test sets

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows; random_state pins the shuffle so the
# split is reproducible.
# NOTE(review): test_X / test_Y are not referenced again in the cells shown
# here - confirm whether a hold-out evaluation was intended.
train_X, test_X, train_Y, test_Y = train_test_split(
    train,
    y,
    test_size=0.2,
    random_state=0,
)

### Applying the XGB Classifier for the survival analysis

In [18]:
from xgboost import XGBClassifier



In [19]:
# Fit a classifier with library-default hyperparameters on the training split.
model = XGBClassifier()
model.fit(X=train_X, y=train_Y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

#### Cross validating the accuracy

In [20]:
from sklearn.model_selection import cross_val_score

# Accuracy of `model` on each cross-validation fold of the training split
# (the number of folds is scikit-learn's default).
score = cross_val_score(
    estimator=model,
    X=train_X,
    y=train_Y,
    scoring='accuracy',
)

In [21]:
# Average accuracy across the cross-validation folds.
np.mean(score)

0.80338971031450557

In [22]:
# Fit a second classifier on all labelled rows (no hold-out); this is the
# model used to predict the rows read from test.csv.
model2 = XGBClassifier()
model2.fit(X=train, y=y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

#### Reading the test prediction data and its analysis

In [26]:
# Load the CSV of rows to predict from the working directory.
test = pd.read_csv(filepath_or_buffer='test.csv')

In [27]:
# Count the missing values in each column of the prediction set.
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [28]:
# Discard the same three columns that were removed from the training frame.
unused_columns = ['Cabin', 'Ticket', 'Name']
test = test.drop(columns=unused_columns)

In [29]:
# Encode Sex with the same codes used for the training frame:
# male -> 1, female -> 0. Values outside the mapping become NaN.
sex_codes = {'male': 1, 'female': 0}
test['Sex'] = test['Sex'].map(sex_codes)

In [30]:
# Encode the port of embarkation with the same codes used for the training
# frame: S -> 1, C -> 2, Q -> 3. Values outside the mapping become NaN.
port_codes = {'S': 1, 'C': 2, 'Q': 3}
test['Embarked'] = test['Embarked'].map(port_codes)

In [31]:
# Impute missing ages in the prediction set with that set's own median age.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the `test['Age']` selection: that chained in-place
# form triggers a FutureWarning on pandas 2.x and, with Copy-on-Write enabled,
# updates only a temporary and leaves `test` unchanged - NaNs would then
# reach the model at prediction time.
test_age_median = test['Age'].median()
test['Age'] = test['Age'].fillna(test_age_median)

In [32]:
# Impute the missing fare in the prediction set with that set's median fare.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the `test['Fare']` selection: that chained in-place
# form triggers a FutureWarning on pandas 2.x and, with Copy-on-Write enabled,
# updates only a temporary and leaves `test` unchanged.
test_fare_median = test['Fare'].median()
test['Fare'] = test['Fare'].fillna(test_fare_median)

In [33]:
# Predict with the model that was fitted on all labelled rows (`model2`).
# `test` is passed as prepared by the preceding cells, so it still contains
# PassengerId alongside the encoded and imputed feature columns.
predicted_values = model2.predict(test)

In [34]:
# Assemble the submission: one Survived prediction per row, indexed by the
# PassengerId column of the prediction set.
pred_df = pd.DataFrame(
    {"Survived": predicted_values},
    index=test["PassengerId"],
)
# Write the file with a header row and the index column labelled PassengerId.
pred_df.to_csv('submission.csv', index_label='PassengerId', header=True)