In [131]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

In [132]:
print(train.describe())
print(test.describe())
train = train.drop(['Name', 'Cabin'], axis=1)
test = test.drop(['Name', 'Cabin'], axis=1)



       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  
       PassengerId      Pclass         Age       SibSp       Parch        Fare
co

In [133]:
for df in [train, test]:
    df['Sex_binary']=df['Sex'].map({'male':1, 'female':0})

features = ['Pclass', 'Age', 'Sex_binary', 'SibSp', 'Parch', 'Fare']

pipe = Pipeline([('imputer', SimpleImputer(strategy='most_frequent', add_indicator=True)),
                 ('poly', PolynomialFeatures()),
                 ('scaler', RobustScaler()),
                ])

pipe.fit(train[features])
train_features = pipe.transform(train[features])
test_features = pipe.transform(test[features])

target = 'Survived'

In [134]:
tuned_parameters = [{'n_estimators': [5, 10, 100, 500], 'max_features': ['auto', 'sqrt', 2, 5, 10, 25], 'max_depth': [None, 3, 5, 10]}]

clf = GridSearchCV(
    RandomForestClassifier(), tuned_parameters
)

In [135]:
clf.fit(train_features, train[target])
print(clf.best_params_)
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']

        

{'max_depth': 5, 'max_features': 25, 'n_estimators': 100}


In [136]:
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 1.96, params))

0.813 (+/-0.064) for {'max_depth': None, 'max_features': 'auto', 'n_estimators': 5}
0.811 (+/-0.065) for {'max_depth': None, 'max_features': 'auto', 'n_estimators': 10}
0.822 (+/-0.063) for {'max_depth': None, 'max_features': 'auto', 'n_estimators': 100}
0.817 (+/-0.069) for {'max_depth': None, 'max_features': 'auto', 'n_estimators': 500}
0.782 (+/-0.051) for {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 5}
0.809 (+/-0.078) for {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 10}
0.820 (+/-0.052) for {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 100}
0.814 (+/-0.071) for {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 500}
0.799 (+/-0.048) for {'max_depth': None, 'max_features': 2, 'n_estimators': 5}
0.807 (+/-0.061) for {'max_depth': None, 'max_features': 2, 'n_estimators': 10}
0.818 (+/-0.071) for {'max_depth': None, 'max_features': 2, 'n_estimators': 100}
0.813 (+/-0.064) for {'max_depth': None, 'max_features': 2, 'n_estimators': 50

[0.7733098989391751, 0.7745213734228862, 0.7822923859142552, 0.7833908731404181, 0.7980290000627707, 0.7991400414286611, 0.799146318498525, 0.8002699140041427, 0.8002699140041429, 0.8036218693114054, 0.8036344234511331, 0.8047454648170234, 0.8069801016885318, 0.8069863787583955, 0.8069926558282594, 0.8081099742640134, 0.8081288054736049, 0.809277509258678, 0.8103257799259305, 0.810369719414977, 0.8114305442219573, 0.8114556525014125, 0.8114933149205952, 0.812591802146758, 0.812591802146758, 0.8125980792166217, 0.8137091205825122, 0.8137467830016949, 0.8148138848785387, 0.8148327160881301, 0.814864101437449, 0.815956311593748, 0.8159625886636117, 0.8159939740129308, 0.8170547988199109, 0.8170610758897746, 0.8170799070993662, 0.8170987383089574, 0.8181595631159375, 0.8181595631159375, 0.8181721172556651, 0.8181783943255289, 0.8181909484652564, 0.8181909484652564, 0.8182035026049839, 0.8182160567447114, 0.8182223338145753, 0.818234887954303, 0.819295712761283, 0.8193019898311469, 0.819308

In [138]:
predictions = clf.predict(test_features)

In [139]:
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})

In [145]:
filename = 'data/Titanic Predictions 5.csv'

In [146]:
submission.to_csv(filename, index=False)

In [142]:
print('Saved File: ' + filename)

Saved File: Titanic Predictions 5.csv
