# Titanic: Machine Learning from Disaster

In [1]:
# https://www.kaggle.com/c/titanic

In [2]:
import sklearn
import pandas as pd

In [220]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [222]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,Braund,Mr. Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,Cumings,Mrs. John Bradley (Florence Briggs Thayer),female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,Heikkinen,Miss. Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,Futrelle,Mrs. Jacques Heath (Lily May Peel),female,35.0,1,0,113803,53.1,C123,S
5,0,3,Allen,Mr. William Henry,male,35.0,0,0,373450,8.05,,S


In [5]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
train_dummies = pd.get_dummies(train, columns=['Sex', 'Pclass', 'Embarked'])
test_dummies = pd.get_dummies(test, columns=['Sex', 'Pclass', 'Embarked'])

In [126]:
len(train_dummies)

891

## Preprocess data

In [8]:
# Extract features
X_train = train_dummies.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
X_test = test_dummies.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [216]:
# Extract targets
y_train = train_dummies['Survived']
y_train[:3]

1    3
2    1
3    3
Name: Survived, dtype: int64

In [132]:
# Fill NA values
from sklearn.preprocessing import Imputer
# Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)
imp=Imputer(missing_values="NaN", strategy="mean",axis=0)
imp.fit(X_train)
X_train=imp.fit_transform(X_train).ravel()
X_train



array([ 22.,   1.,   0., ...,   0.,   1.,   0.])

In [163]:
# Scale features
from sklearn.preprocessing import StandardScaler
import numpy as np
# StandardScaler(copy=True, with_mean=True, with_std=True)
scaler = StandardScaler()
scaler.fit(X_train)
X_scaled = scaler.transform(X_train)
# Fit logistic regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
X_scaled=X_scaled.reshape(891,-1)



## Train model and make predictions

In [202]:
# LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, 
#                    fit_intercept=True, intercept_scaling=1, class_weight=None, 
#                    random_state=None, solver='liblinear', max_iter=100,
#                    multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)
model = RandomForestClassifier().fit(X_scaled, y_train)

k_fold = KFold(n=10, shuffle=True, random_state=100)

In [203]:
from sklearn.metrics import make_scorer
scores = cross_val_score(RandomForestClassifier(),
                         X_scaled, y_train, 
                         scoring = make_scorer(accuracy_score),
                         cv = k_fold)
pipeline = Pipeline(steps=[
                           ('scale', scaler),
                           ('model', RandomForestClassifier())])
from sklearn.grid_search import GridSearchCV
param = {
    
    'model__max_depth': (2,3,4,5,6,7,8,9)
}

grid = GridSearchCV(pipeline, param, 
                    scoring=make_scorer(accuracy_score),
                    cv=k_fold)
model = grid.fit(X_scaled, y_train)

In [204]:
model.best_score_

0.80000000000000004

In [205]:
model.cv

sklearn.cross_validation.KFold(n=10, n_folds=3, shuffle=True, random_state=100)

In [207]:
model.best_params_

{'model__max_depth': 4}

## Estimate quality

In [223]:
# Estimate quality
from sklearn.metrics import accuracy_score
# accuracy_score(y_true, y_pred, normalize=True, sample_weight=None)
predicted = grid.best_estimator_.predict(X_scaled)
predicted[:5]

array([3, 1, 3, 3, 3])

## Create submission 

In [224]:
with open('submission.txt', 'w') as out:
    out.write('PassengerId,Survived\n')
    for passenger, y in zip(test['PassengerId'], predicted):
        out.write('%s,%s\n' % (passenger, y))