### Titanic: Machine Learning from Disaster

The legendary and classic Titanic ML competition

In [512]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

kaggle/input/titanic/train.csv
kaggle/input/titanic/test.csv
kaggle/input/titanic/gender_submission.csv


In [648]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [649]:
# Reading the train data and test data
train_data = pd.read_csv('kaggle/input/titanic/train.csv')
test_data = pd.read_csv('kaggle/input/titanic/test.csv')

In [650]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [651]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [652]:
# Number of missing values
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [653]:
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [656]:
# Values for learning
X = train_data.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
X_to_pred = test_data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [657]:
# Fill NaN values with median values in column 'Age' and with S (will be Southampton) in column 'Embarked'
X = X.fillna({'Age' : X.Age.median(),
              'Embarked' : 'S'})
X_to_pred = X_to_pred.fillna({'Age' : X_to_pred.Age.median(),
                              'Embarked' : 'S',
                              'Fare' : X_to_pred.Fare.median()})

In [658]:
# Convertation string variable 'Sex' into nominative variable (femail=0. male=1)
X = X.replace(to_replace=['female','male'],value=[0, 1])
X_to_pred = X_to_pred.replace(to_replace=['female','male'],value=[0, 1])

In [659]:
# Convertation string variable 'Embarked' into nominative variable by get_dummies
X = pd.get_dummies(X)
X_to_pred = pd.get_dummies(X_to_pred)

In [660]:
X.isnull().sum()

Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [661]:
X_to_pred.isnull().sum()

Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [662]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,1,22.0,1,0,7.25,0,0,1
1,1,0,38.0,1,0,71.2833,1,0,0
2,3,0,26.0,0,0,7.925,0,0,1
3,1,0,35.0,1,0,53.1,0,0,1
4,3,1,35.0,0,0,8.05,0,0,1


In [663]:
X_to_pred.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,1,34.5,0,0,7.8292,0,1,0
1,3,0,47.0,1,0,7.0,0,0,1
2,2,1,62.0,0,0,9.6875,0,1,0
3,3,1,27.0,0,0,8.6625,0,0,1
4,3,0,22.0,1,1,12.2875,0,0,1


In [777]:
# Target variable
y_train = train_data.Survived

In [794]:
# Spliting train data into random train and test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [795]:
# Choosing a model
clf_rf = RandomForestClassifier()

In [796]:
# Playing with RandomizedSearch to see width and variation of optimal model's paramets
'''
from sklearn.model_selection import RandomizedSearchCV

parametrs_test = {'n_estimators' : [300, 400, 500, 600, 700, 800, 900],
                  'criterion' : ['gini', 'entropy'],
                  #'max_depth' : [20, 50],
                  'min_samples_split' : [20, 25],
                  'min_samples_leaf' : [20, 30]} 
random_search_cv_clf = RandomizedSearchCV(clf_rf, parametrs_test, cv=5, n_jobs=-1)

#for i in range(0, 1):
random_search_cv_clf.fit(X_train, y_train)
print(random_search_cv_clf.best_params_)
temp_clf_rf = random_search_cv_clf.best_estimator_
print('Train score is ' + str(temp_clf_rf.score(X_train, y_train)) +
'\n Train score is ' + str(temp_clf_rf.score(X_test, y_test)))
print('-------------------------------------------------------------------------')
'''

"\nfrom sklearn.model_selection import RandomizedSearchCV\n\nparametrs_test = {'n_estimators' : [300, 400, 500, 600, 700, 800, 900],\n                  'criterion' : ['gini', 'entropy'],\n                  #'max_depth' : [20, 50],\n                  'min_samples_split' : [20, 25],\n                  'min_samples_leaf' : [20, 30]} \nrandom_search_cv_clf = RandomizedSearchCV(clf_rf, parametrs_test, cv=5, n_jobs=-1)\n\n#for i in range(0, 1):\nrandom_search_cv_clf.fit(X_train, y_train)\nprint(random_search_cv_clf.best_params_)\ntemp_clf_rf = random_search_cv_clf.best_estimator_\nprint('Train score is ' + str(temp_clf_rf.score(X_train, y_train)) +\n'\n Train score is ' + str(temp_clf_rf.score(X_test, y_test)))\nprint('-------------------------------------------------------------------------')\n"

In [797]:
# Variation of paramers for grid_search
parametrs = {'n_estimators' : [400, 500],          
             'criterion' : ['gini', 'entropy'],
             'min_samples_split' : [4, 5, 10, 15],
             'min_samples_leaf' : [5, 10]}      

In [798]:
# Choosing grid_search for searching best model's parametrs and cross-validation
grid_search_cv_clf = GridSearchCV(clf_rf, parametrs, cv=5)

In [799]:
# Fiting grid_search by train data
grid_search_cv_clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [784]:
grid_search_cv_clf.best_params_

{'criterion': 'gini',
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'n_estimators': 400}

In [785]:
# Choosing best best parametrs
best_clf_rf = grid_search_cv_clf.best_estimator_

In [786]:
# Accurance score train data
best_clf_rf.score(X_train, y_train)

0.8665730337078652

In [787]:
# Accurance score test data
best_clf_rf.score(X_test, y_test)

0.8156424581005587

In [788]:
# Let's see which features are most important
feature_importances = best_clf_rf.feature_importances_

In [789]:
# Put features and their importances into DataFrame
feature_importances_df = pd.DataFrame({'feature' : list(X_train),
                                       'feature_importances' : feature_importances})

In [790]:
# Showing importance of features
feature_importances_df.sort_values('feature_importances', ascending=False)

Unnamed: 0,feature,feature_importances
1,Sex,0.43557
5,Fare,0.20099
2,Age,0.133121
0,Pclass,0.12488
3,SibSp,0.039637
4,Parch,0.027288
6,Embarked_C,0.016992
8,Embarked_S,0.016207
7,Embarked_Q,0.005316


In [791]:
# Making a binary prediction: 1 for survived, 0 for deceased)
y_pred = best_clf_rf.predict(X_to_pred)

In [792]:
len(y_pred)

418

In [793]:
# Making output (saving results in the csv-file)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y_pred})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
