In [117]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

In [118]:
print(train.describe())
print(test.describe())
train = train.drop(['Name', 'Cabin'], axis=1)
test = test.drop(['Name', 'Cabin'], axis=1)



       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  
       PassengerId      Pclass         Age       SibSp       Parch        Fare
co

In [94]:
train['Age'] = np.nan_to_num(train['Age'])
test['Age'] = np.nan_to_num(test['Age'])
test['Fare'] = np.nan_to_num(test['Fare'])

for df in [train, test]:
    df['Sex_binary']=df['Sex'].map({'male':1, 'female':0})

features = ['Pclass', 'Age', 'Sex_binary', 'SibSp', 'Parch', 'Fare']

pipe = Pipeline(['scaler', RobustScaler()])


pf = PolynomialFeatures()
pf.fit(train[features])
train_features = pd.DataFrame(pf.transform(train[features]))
test_features = pd.DataFrame(pf.transform(test[features]))

si = SimpleImputer(strategy='most_frequent', add_indicator=True)
si.fit(train[features])



print(test_features)


target = 'Survived'

      0    1     2    3    4    5         6    7      8    9   ...   18   19  \
0    1.0  3.0  34.5  1.0  0.0  0.0    7.8292  9.0  103.5  3.0  ...  1.0  0.0   
1    1.0  3.0  47.0  0.0  1.0  0.0    7.0000  9.0  141.0  0.0  ...  0.0  0.0   
2    1.0  2.0  62.0  1.0  0.0  0.0    9.6875  4.0  124.0  2.0  ...  1.0  0.0   
3    1.0  3.0  27.0  1.0  0.0  0.0    8.6625  9.0   81.0  3.0  ...  1.0  0.0   
4    1.0  3.0  22.0  0.0  1.0  1.0   12.2875  9.0   66.0  0.0  ...  0.0  0.0   
..   ...  ...   ...  ...  ...  ...       ...  ...    ...  ...  ...  ...  ...   
413  1.0  3.0   0.0  1.0  0.0  0.0    8.0500  9.0    0.0  3.0  ...  1.0  0.0   
414  1.0  1.0  39.0  0.0  0.0  0.0  108.9000  1.0   39.0  0.0  ...  0.0  0.0   
415  1.0  3.0  38.5  1.0  0.0  0.0    7.2500  9.0  115.5  3.0  ...  1.0  0.0   
416  1.0  3.0   0.0  1.0  0.0  0.0    8.0500  9.0    0.0  3.0  ...  1.0  0.0   
417  1.0  3.0   0.0  1.0  1.0  1.0   22.3583  9.0    0.0  3.0  ...  1.0  1.0   

      20       21   22   23       24   

In [95]:
tuned_parameters = [{'n_estimators': [5, 10, 100, 500], 'max_features': ['auto', 'sqrt', 2, 5, 10, 25], 'max_depth': [None, 3, 5, 50]}]

clf = GridSearchCV(
    RandomForestClassifier(), tuned_parameters
)

In [96]:
clf.fit(train_features, train[target])
print(clf.best_params_)
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']

        

{'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 100}
0.794 (+/-0.082) for {'max_depth': None, 'max_features': 'auto', 'n_estimators': 5}
0.801 (+/-0.065) for {'max_depth': None, 'max_features': 'auto', 'n_estimators': 10}
0.822 (+/-0.067) for {'max_depth': None, 'max_features': 'auto', 'n_estimators': 100}
0.816 (+/-0.078) for {'max_depth': None, 'max_features': 'auto', 'n_estimators': 500}
0.794 (+/-0.039) for {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 5}
0.806 (+/-0.051) for {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 10}
0.823 (+/-0.069) for {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 100}
0.814 (+/-0.077) for {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 500}
0.796 (+/-0.038) for {'max_depth': None, 'max_features': 2, 'n_estimators': 5}
0.811 (+/-0.039) for {'max_depth': None, 'max_features': 2, 'n_estimators': 10}
0.805 (+/-0.054) for {'max_depth': None, 'max_features': 2, 'n_estimators': 100}
0.816 (+/-0.065)

In [None]:
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 1.96, params))

In [102]:
print(sorted(means))

[0.7677735233193145, 0.7801079656016572, 0.7834348126294646, 0.7901763856631725, 0.7912434875400163, 0.7923670830456343, 0.7924110225346809, 0.7924298537442722, 0.7935032326909799, 0.7935095097608438, 0.7935157868307074, 0.793528340970435, 0.7935534492498901, 0.7935660033896178, 0.7946707676856444, 0.7957567007720796, 0.7957629778419435, 0.7957692549118071, 0.795775531981671, 0.7968426338585148, 0.7980352771326344, 0.7980541083422259, 0.8013558470905782, 0.801393509509761, 0.8024480572468772, 0.8024731655263324, 0.8024919967359236, 0.802510827945515, 0.8025296591551063, 0.8025359362249702, 0.8035842068922227, 0.8035967610319503, 0.8036093151716779, 0.8036469775908607, 0.804732910677296, 0.8047705730964786, 0.8058251208335949, 0.8058439520431862, 0.8058690603226413, 0.8058690603226415, 0.805881614462369, 0.8069549934090766, 0.8069549934090766, 0.8069863787583955, 0.8069989328981233, 0.8070177641077144, 0.8080660347749671, 0.8080660347749671, 0.8080660347749671, 0.8080660347749671, 0.808

In [97]:
predictions = clf.predict(test_features)

In [98]:
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})

In [99]:
filename = 'Titanic Predictions 4.csv'

In [100]:
submission.to_csv(filename, index=False)

In [101]:
print('Saved File: ' + filename)

Saved File: Titanic Predictions 4.csv
