In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import feature_process_helper
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
%matplotlib inline

## RF 18

#### Load data & transform variables

In [11]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket', 'SibSp', 'Parch'])

#### Tune hyper-parameters

In [5]:
rf14 = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf14,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [6]:
print(gs.best_score_)
print(gs.best_params_)
#print(gs.cv_results_)

0.83950617284
{'min_samples_split': 4, 'n_estimators': 400, 'criterion': 'gini', 'min_samples_leaf': 1}


#### Fit model

In [7]:
rf14 = RandomForestClassifier(criterion='gini', 
                             n_estimators=400,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf14.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf14.oob_score_ 

0.8350


#### Obtain cross-validation score with optimal hyperparameters

In [4]:
scores1 = cross_val_score(rf14, train.iloc[:, 2:], train.iloc[:, 1], n_jobs=-1)
scores1.mean()

0.83838383838383834

#### Inspect feature ranking

In [8]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf14.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
2,Name_Len,0.113751
1,Fare,0.107715
33,Name_Title_Mr.,0.105001
0,Age,0.102882
11,Sex_male,0.086788
12,Sex_female,0.078397
7,Ticket_Len,0.03797
8,Pclass_3,0.036294
35,Name_Title_Miss.,0.029315
41,Fam_Size_Big,0.024041


#### Generate submission file

In [9]:
predictions = rf14.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test18.csv'), sep=",", index = False)

Leaderboard score: 0.77990  

## RF 19

In [27]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [18]:
np.shape(train)

(891, 37)

In [19]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [20]:
print(gs.best_score_)
print(gs.best_params_)

0.83164983165
{'min_samples_split': 16, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [28]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8215


In [None]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

In [29]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_testss.csv'), sep=",", index = False)

score: 0.79904

## RF 20

In [24]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.lda(train, test, train.iloc[:, 1])
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [25]:
np.shape(train)

(891, 36)

In [26]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [27]:
print(gs.best_score_)
print(gs.best_params_)

0.832772166105
{'min_samples_split': 16, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [28]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8283


In [29]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
10,Sex_male,0.140371
11,Sex_female,0.138155
5,0,0.134577
23,Name_Title_Mr.,0.106221
0,Name_Len,0.08159
7,Pclass_3,0.053532
6,Ticket_Len,0.040853
15,Cabin_Letter_n,0.040356
33,Fam_Size_Big,0.03024
31,Fam_Size_Nuclear,0.028742


In [30]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test20.csv'), sep=",", index = False)

In [33]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=500,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8316


In [34]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test22.csv'), sep=",", index = False)

## RF 23

In [36]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [37]:
np.shape(train)

(891, 34)

In [38]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [39]:
print(gs.best_score_)
print(gs.best_params_)

0.832772166105
{'min_samples_split': 12, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 5}


In [40]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=12,
                             min_samples_leaf=5,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8272


In [41]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
9,Sex_female,0.177362
21,Name_Title_Mr.,0.14072
1,Fare,0.106389
8,Sex_male,0.090796
0,Age,0.075417
2,Name_Len,0.06748
5,Pclass_3,0.061468
31,Fam_Size_Big,0.034148
6,Pclass_1,0.033386
13,Cabin_Letter_n,0.031946


In [42]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test23.csv'), sep=",", index = False)

score: 0.80383

## RF 24

In [44]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
for i in [train, test]:
    i['Fam_Size'] = i['SibSp']+i['Parch']
    i['Ticket_Len'] = i['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket', 'SibSp', 'Parch'])

In [45]:
np.shape(train)

(891, 32)

In [46]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [47]:
print(gs.best_score_)
print(gs.best_params_)

0.829405162738
{'min_samples_split': 10, 'n_estimators': 100, 'criterion': 'gini', 'min_samples_leaf': 1}


In [48]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=100,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8227


In [49]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
9,Sex_male,0.116926
10,Sex_female,0.115533
1,Fare,0.113527
22,Name_Title_Mr.,0.101934
0,Age,0.095598
2,Name_Len,0.092359
6,Pclass_3,0.058144
4,Fam_Size,0.052039
5,Ticket_Len,0.044522
14,Cabin_Letter_n,0.035665


In [50]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test24.csv'), sep=",", index = False)

score: 0.77

## RF 24

In [53]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [54]:
np.shape(train)

(891, 43)

In [55]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [56]:
print(gs.best_score_)
print(gs.best_params_)

0.83950617284
{'min_samples_split': 4, 'n_estimators': 700, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [57]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=700,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8384


In [58]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
1,Fare,0.12746
0,Age,0.1252
2,Name_Len,0.122841
30,Name_Title_Mr.,0.082411
8,Sex_male,0.074802
9,Sex_female,0.072543
4,Ticket_Len,0.043962
5,Pclass_3,0.036189
22,Cabin_Letter_n,0.024189
38,Fam_Size_Nuclear,0.022195


In [59]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test24.csv'), sep=",", index = False)

score: 0.77033

## RF 25

In [None]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [60]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=500,
                             min_samples_split=12,
                             min_samples_leaf=5,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8272


In [61]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test25.csv'), sep=",", index = False)

score: 0.81340

## RF 26

In [62]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=500,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8350


In [63]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test26.csv'), sep=",", index = False)

score: 0.77990

## RF 27

In [4]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [5]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [6]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 10, 'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}


In [9]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=500,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8283


In [10]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test28.csv'), sep=",", index = False)

score: .77

## RF 29

In [20]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [25]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [26]:
print(gs.best_score_)
print(gs.best_params_)

0.83950617284
{'min_samples_split': 4, 'n_estimators': 700, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [23]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=700,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8328


In [24]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test34.csv'), sep=",", index = False)

score: 0.81340

## RF 25

In [None]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [60]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=500,
                             min_samples_split=12,
                             min_samples_leaf=5,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8272


In [61]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test25.csv'), sep=",", index = False)

score: 0.81340

## RF 36

In [32]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(train['Age'].mean())
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [18]:
np.shape(train)

(891, 37)

In [33]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [34]:
print(gs.best_score_)
print(gs.best_params_)

0.832772166105
{'min_samples_split': 16, 'n_estimators': 1000, 'criterion': 'gini', 'min_samples_leaf': 1}


In [36]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=1000,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8339


In [37]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
23,Name_Title_Mr.,0.112752
1,Fare,0.111009
10,Sex_male,0.100946
11,Sex_female,0.100348
2,Name_Len,0.094834
0,Age,0.084043
6,Ticket_Len,0.050457
7,Pclass_3,0.049251
15,Cabin_Letter_n,0.032286
25,Name_Title_Miss.,0.030424


In [38]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test36.csv'), sep=",", index = False)

score: 0.79904

## RF 37

In [39]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [40]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [41]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 4, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [42]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8272


In [43]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test37.csv'), sep=",", index = False)

score: .77

## RF 38

In [126]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
#train, test = feature_process_helper.age_impute(train, test)
del train['Age']
del test['Age']
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [51]:
np.shape(train)

(891, 49)

In [47]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [48]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 12, 'n_estimators': 1000, 'criterion': 'gini', 'min_samples_leaf': 1}


In [127]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=1000,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8339


In [52]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
36,Name_Title_Mr.,0.124768
10,Sex_female,0.123612
9,Sex_male,0.114358
1,Name_Len,0.097597
0,Fare,0.096677
6,Pclass_3,0.045402
38,Name_Title_Miss.,0.033037
5,Ticket_Len,0.032207
28,Cabin_Letter_n,0.0304
37,Name_Title_Mrs.,0.029873


In [128]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test60.csv'), sep=",", index = False)

score: 0.80861

## RF 39

In [54]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
#train, test = feature_process_helper.age_impute(train, test)
del train['Age']
del test['Age']
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
#test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [51]:
np.shape(train)

(891, 49)

In [55]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [56]:
print(gs.best_score_)
print(gs.best_params_)

0.846240179574
{'min_samples_split': 12, 'n_estimators': 100, 'criterion': 'gini', 'min_samples_leaf': 1}


In [57]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=100,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8373


In [58]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
31,Name_Title_Mr.,0.12775
9,Sex_male,0.117048
10,Sex_female,0.109127
1,Name_Len,0.103448
0,Fare,0.099746
6,Pclass_3,0.042916
32,Name_Title_Mrs.,0.03182
5,Ticket_Len,0.031763
23,Cabin_Letter_n,0.031427
33,Name_Title_Miss.,0.031422


In [59]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test39.csv'), sep=",", index = False)

score: 0.80383

## RF 40

In [123]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
#train, test = feature_process_helper.age_impute(train, test)
del train['Age']
del test['Age']
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [76]:
np.shape(train)

(891, 46)

In [77]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [78]:
print(gs.best_score_)
print(gs.best_params_)

0.845117845118
{'min_samples_split': 12, 'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}


In [124]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8361


In [80]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
7,Sex_female,0.139713
33,Name_Title_Mr.,0.114777
1,Name_Len,0.09893
6,Sex_male,0.092425
0,Fare,0.090744
3,Pclass_3,0.045756
2,Ticket_Len,0.038454
34,Name_Title_Mrs.,0.038107
35,Name_Title_Miss.,0.032036
25,Cabin_Letter_n,0.031778


In [125]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test40.csv'), sep=",", index = False)

score: 0.80383

## RF 41

In [2]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
#train, test = feature_process_helper.age_impute(train, test)
del train['Age']
del test['Age']
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [3]:
np.shape(train)

(891, 49)

In [4]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [5]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 12, 'n_estimators': 1000, 'criterion': 'gini', 'min_samples_leaf': 1}


In [7]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=1000,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8339


In [8]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
36,Name_Title_Mr.,0.124768
10,Sex_female,0.123612
9,Sex_male,0.114358
1,Name_Len,0.097597
0,Fare,0.096677
6,Pclass_3,0.045402
38,Name_Title_Miss.,0.033037
5,Ticket_Len,0.032207
28,Cabin_Letter_n,0.0304
37,Name_Title_Mrs.,0.029873


In [9]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test41.csv'), sep=",", index = False)

score: 0.79904

## RF 42

In [22]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)

In [23]:
from sklearn.preprocessing import StandardScaler
def scale(train, test, cols=['Age', 'Fare']):
    for i in cols:
        sc = StandardScaler()
        train[i] = sc.fit_transform(train[i].reshape(-1, 1))
        test[i] = sc.transform(test[i].reshape(-1, 1))
    return train, test

In [24]:
train, test = scale(train, test)

In [27]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = scale(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)

train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [28]:
np.shape(train)

(891, 51)

In [29]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [30]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 4, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [31]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8260


In [32]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
2,Name_Len,0.117427
0,Age,0.110091
1,Fare,0.107424
38,Name_Title_Mr.,0.097366
11,Sex_male,0.076799
12,Sex_female,0.07153
8,Pclass_3,0.04785
40,Name_Title_Miss.,0.038935
7,Ticket_Len,0.03587
30,Cabin_Letter_n,0.032825


In [33]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test42.csv'), sep=",", index = False)

score: 0.76

## RF 43

In [34]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [35]:
np.shape(train)

(891, 51)

In [36]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [37]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 4, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [38]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [39]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
2,Name_Len,0.129119
1,Fare,0.122111
0,Age,0.115754
38,Name_Title_Mr.,0.084287
12,Sex_female,0.06626
11,Sex_male,0.062517
7,Ticket_Len,0.04347
8,Pclass_3,0.042318
40,Name_Title_Miss.,0.034178
30,Cabin_Letter_n,0.025585


In [40]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test43.csv'), sep=",", index = False)

score: 0.77

## RF 44

In [42]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
#train, test = feature_process_helper.age_impute(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.titles_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket', 'Age'])

In [43]:
np.shape(train)

(891, 47)

In [44]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [45]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 10, 'n_estimators': 100, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [46]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=100,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8373


In [47]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
0,Fare,0.131729
1,Name_Len,0.12517
36,Name_Title_Mr.,0.110004
10,Sex_female,0.084183
9,Sex_male,0.084101
6,Pclass_3,0.047644
5,Ticket_Len,0.039481
28,Cabin_Letter_n,0.035526
37,Name_Title_Mrs.,0.031796
38,Name_Title_Miss.,0.030978


In [48]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test44.csv'), sep=",", index = False)

score: 0.76

## RF 45

In [66]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [69]:
len(train.columns)

46

In [54]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [55]:
print(gs.best_score_)
print(gs.best_params_)

0.838383838384
{'min_samples_split': 10, 'n_estimators': 700, 'criterion': 'gini', 'min_samples_leaf': 1}


In [67]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [75]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
12,Sex_female,0.111215
11,Sex_male,0.109769
33,Name_Title_Mr.,0.109746
1,Fare,0.088209
2,Name_Len,0.087904
0,Age,0.078651
8,Pclass_3,0.043268
35,Name_Title_Miss.,0.031292
7,Ticket_Len,0.031079
34,Name_Title_Mrs.,0.028852


In [57]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test45.csv'), sep=",", index = False)

score:  0.79426

## RF 46

In [60]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [61]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [62]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 4, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [63]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [64]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test46.csv'), sep=",", index = False)

score:  0.77

## RF 47

In [118]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [77]:
len(train.columns)

46

In [54]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [55]:
print(gs.best_score_)
print(gs.best_params_)

0.838383838384
{'min_samples_split': 10, 'n_estimators': 700, 'criterion': 'gini', 'min_samples_leaf': 1}


In [78]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [79]:
feats = pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False).iloc[:23, 0]

In [119]:
train = pd.concat((train.iloc[:, :2], train[feats]), axis=1)
test = pd.concat((test.iloc[:, 0], test[feats]), axis=1)

In [112]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [113]:
print(gs.best_score_)
print(gs.best_params_)

0.846240179574
{'min_samples_split': 10, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [115]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8316


In [116]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
3,Fare,0.129091
5,Age,0.11407
4,Name_Len,0.103566
2,Name_Title_Mr.,0.097082
0,Sex_female,0.096812
1,Sex_male,0.096616
6,Pclass_3,0.046606
8,Ticket_Len,0.045846
10,Cabin_Letter_n,0.032464
13,Pclass_1,0.025647


In [120]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test47.csv'), sep=",", index = False)

score:  

## RF 48

In [126]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [127]:
len(train.columns)

46

In [54]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [55]:
print(gs.best_score_)
print(gs.best_params_)

0.838383838384
{'min_samples_split': 10, 'n_estimators': 700, 'criterion': 'gini', 'min_samples_leaf': 1}


In [128]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [129]:
feats = pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False).iloc[:30, 0]

In [130]:
train = pd.concat((train.iloc[:, :2], train[feats]), axis=1)
test = pd.concat((test.iloc[:, 0], test[feats]), axis=1)

In [131]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [132]:
print(gs.best_score_)
print(gs.best_params_)

0.841750841751
{'min_samples_split': 4, 'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}


In [133]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8328


In [134]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
2,Name_Title_Mr.,0.137337
5,Age,0.11189
4,Name_Len,0.110997
3,Fare,0.103998
1,Sex_male,0.092999
0,Sex_female,0.069961
6,Pclass_3,0.050742
8,Ticket_Len,0.040917
12,Fam_Size_Nuclear,0.024748
11,Fam_Size_Big,0.023612


In [135]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test48.csv'), sep=",", index = False)

score:  

## RF 49

In [136]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [137]:
len(train.columns)

46

In [54]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [55]:
print(gs.best_score_)
print(gs.best_params_)

0.838383838384
{'min_samples_split': 10, 'n_estimators': 700, 'criterion': 'gini', 'min_samples_leaf': 1}


In [138]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [140]:
feats = pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False).iloc[:10, 0]

In [141]:
train = pd.concat((train.iloc[:, :2], train[feats]), axis=1)
test = pd.concat((test.iloc[:, 0], test[feats]), axis=1)

In [142]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [144]:
print(gs.best_score_)
print(gs.best_params_)

0.838383838384
{'min_samples_split': 10, 'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}


In [145]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8373


In [148]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
3,Fare,0.179483
2,Name_Title_Mr.,0.140746
1,Sex_male,0.13494
5,Age,0.132249
4,Name_Len,0.12142
0,Sex_female,0.096419
6,Pclass_3,0.085237
8,Ticket_Len,0.068108
7,Name_Title_Miss.,0.022077
9,Name_Title_Mrs.,0.019321


In [149]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test49.csv'), sep=",", index = False)

score:  

## RF 50

In [151]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [152]:
len(train.columns)

37

In [153]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [154]:
print(gs.best_score_)
print(gs.best_params_)

0.83164983165
{'min_samples_split': 16, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [155]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8215


In [156]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
11,Sex_male,0.121257
24,Name_Title_Mr.,0.115977
1,Fare,0.098159
0,Age,0.087345
2,Name_Len,0.08623
12,Sex_female,0.077304
8,Pclass_3,0.048533
7,Ticket_Len,0.046897
25,Name_Title_Mrs.,0.03704
9,Pclass_1,0.030009


In [157]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test50.csv'), sep=",", index = False)

score:  0.80383

## RF 51

In [158]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
#train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
#test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
#test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [159]:
len(train.columns)

36

In [160]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [161]:
print(gs.best_score_)
print(gs.best_params_)

0.832772166105
{'min_samples_split': 12, 'n_estimators': 400, 'criterion': 'gini', 'min_samples_leaf': 1}


In [162]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=400,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [163]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
1,Fare,0.130523
2,Name_Len,0.110326
0,Age,0.107603
23,Name_Title_Mr.,0.098299
10,Sex_male,0.095867
11,Sex_female,0.094893
7,Pclass_3,0.049144
15,Cabin_Letter_n,0.032542
25,Name_Title_Miss.,0.029018
24,Name_Title_Mrs.,0.028046


In [164]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test51.csv'), sep=",", index = False)

score:  

## RF 52

In [2]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))

In [10]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
#train, test = feature_process_helper.cabin(train, test)
train['Cabin'] = train['Cabin'].apply(lambda x: 1 if pd.isnull(x) else 0)
test['Cabin'] = test['Cabin'].apply(lambda x: 1 if pd.isnull(x) else 0)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
#test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [11]:
len(train.columns)

30

In [12]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [13]:
print(gs.best_score_)
print(gs.best_params_)

0.832772166105
{'min_samples_split': 16, 'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}


In [15]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [16]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
13,Sex_female,0.207785
1,Fare,0.110804
12,Sex_male,0.087572
17,Name_Title_Mr.,0.084191
3,Name_Len,0.082542
0,Age,0.080094
9,Pclass_3,0.050099
8,Ticket_Len,0.036147
27,Fam_Size_Big,0.033621
2,Cabin,0.030448


In [17]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test52.csv'), sep=",", index = False)

score:  0.77512

## RF 53 & 54

In [18]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
#test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [152]:
len(train.columns)

37

In [153]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [154]:
print(gs.best_score_)
print(gs.best_params_)

0.83164983165
{'min_samples_split': 16, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [22]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=500,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8316


In [156]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
11,Sex_male,0.121257
24,Name_Title_Mr.,0.115977
1,Fare,0.098159
0,Age,0.087345
2,Name_Len,0.08623
12,Sex_female,0.077304
8,Pclass_3,0.048533
7,Ticket_Len,0.046897
25,Name_Title_Mrs.,0.03704
9,Pclass_1,0.030009


In [23]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test54.csv'), sep=",", index = False)

score:  0.80383

## RF 55

In [24]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))

In [25]:
train['Age'], bins = pd.qcut(train['Age'],4, retbins=True)
test['Age'] = pd.cut(test['Age'], bins=bins, include_lowest=True)
train = pd.concat((train, pd.get_dummies(train['Age'], prefix = 'Age')), axis = 1)
test = pd.concat((test, pd.get_dummies(test['Age'], prefix = 'Age')), axis = 1)
del train['Age']
del test['Age']

In [27]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train['Age'], bins = pd.qcut(train['Age'],4, retbins=True)
test['Age'] = pd.cut(test['Age'], bins=bins, include_lowest=True)
train = pd.concat((train, pd.get_dummies(train['Age'], prefix = 'Age')), axis = 1)
test = pd.concat((test, pd.get_dummies(test['Age'], prefix = 'Age')), axis = 1)
del train['Age']
del test['Age']
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
#test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [28]:
len(train.columns)

40

In [29]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [30]:
print(gs.best_score_)
print(gs.best_params_)

0.83164983165
{'min_samples_split': 12, 'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}


In [31]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [32]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
14,Sex_male,0.12247
15,Sex_female,0.119988
27,Name_Title_Mr.,0.115475
0,Fare,0.104996
1,Name_Len,0.094754
11,Pclass_3,0.04928
10,Ticket_Len,0.042706
29,Name_Title_Miss.,0.038645
19,Cabin_Letter_n,0.031996
37,Fam_Size_Big,0.029974


In [33]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test55.csv'), sep=",", index = False)

score:  0.79904

## RF 55

In [34]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))

In [54]:
train['Ticket_Num'] = train['Ticket'].apply(lambda x: x.split()[-1])
train['Ticket_Num'] = train['Ticket_Num'].apply(lambda x: x if x[-1] in ['0','1','2','3','4','5','6','7','8','9'] else 0)
train['Ticket_Num'] = train['Ticket_Num'].apply(lambda x: int(x))

In [55]:
pd.qcut(train['Ticket_Num'], 4).value_counts()

(112058, 347082]     227
(14312.5, 112058]    223
[0, 14312.5]         223
(347082, 3101317]    218
Name: Ticket_Num, dtype: int64

In [80]:
train['Survived'].groupby(pd.qcut(train['Ticket_Num'], 5)).mean()

Ticket_Num
[0, 10482]            0.318436
(10482, 28665]        0.595506
(28665, 2.38e+05]     0.511236
(2.38e+05, 349222]    0.258427
(349222, 3101317]     0.235955
Name: Survived, dtype: float64

In [71]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
for i in [train, test]:
    i['Ticket_Num'] = i['Ticket'].apply(lambda x: x.split()[-1])
    i['Ticket_Num'] = i['Ticket_Num'].apply(lambda x: x if x[-1] in ['0','1','2','3','4','5','6','7','8','9'] else 0)
    i['Ticket_Num'] = i['Ticket_Num'].apply(lambda x: int(x))
#train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
#test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [65]:
len(train.columns)

38

In [66]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [67]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 4, 'n_estimators': 400, 'criterion': 'gini', 'min_samples_leaf': 1}


In [68]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=400,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8361


In [69]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
8,Ticket_Num,0.110752
1,Fare,0.102201
25,Name_Title_Mr.,0.100219
2,Name_Len,0.099071
0,Age,0.092186
13,Sex_female,0.089803
12,Sex_male,0.087747
9,Pclass_3,0.037016
7,Ticket_Len,0.036582
17,Cabin_Letter_n,0.025833


In [72]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test56.csv'), sep=",", index = False)

score:  0.78469

In [77]:
corrs = {}

for i in train.columns:
    for z in train.columns:
        if i <> z and i.split('_') <> z.split('_') and i+'&'+z not in corrs and z+'&'+i not in corrs:
            if train[i].corr(train[z]) > .5:
                corrs[i+'&'+z] = train[i].corr(train[z])

## RF 56

In [34]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))

In [54]:
train['Ticket_Num'] = train['Ticket'].apply(lambda x: x.split()[-1])
train['Ticket_Num'] = train['Ticket_Num'].apply(lambda x: x if x[-1] in ['0','1','2','3','4','5','6','7','8','9'] else 0)
train['Ticket_Num'] = train['Ticket_Num'].apply(lambda x: int(x))

In [55]:
pd.qcut(train['Ticket_Num'], 4).value_counts()

(112058, 347082]     227
(14312.5, 112058]    223
[0, 14312.5]         223
(347082, 3101317]    218
Name: Ticket_Num, dtype: int64

In [80]:
train['Survived'].groupby(pd.qcut(train['Ticket_Num'], 5)).mean()

Ticket_Num
[0, 10482]            0.318436
(10482, 28665]        0.595506
(28665, 2.38e+05]     0.511236
(2.38e+05, 349222]    0.258427
(349222, 3101317]     0.235955
Name: Survived, dtype: float64

In [81]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
for i in [train, test]:
    i['Ticket_Num'] = i['Ticket'].apply(lambda x: x.split()[-1])
    i['Ticket_Num'] = i['Ticket_Num'].apply(lambda x: x if x[-1] in ['0','1','2','3','4','5','6','7','8','9'] else 0)
    i['Ticket_Num'] = i['Ticket_Num'].apply(lambda x: int(x))
train['Ticket_Num'], bins = pd.qcut(train['Ticket_Num'],5, retbins=True)
test['Ticket_Num'] = pd.cut(test['Ticket_Num'], bins=bins, include_lowest=True)
train = pd.concat((train, pd.get_dummies(train['Ticket_Num'], prefix = 'Ticket_Num')), axis = 1)
test = pd.concat((test, pd.get_dummies(test['Ticket_Num'], prefix = 'Ticket_Num')), axis = 1)
del train['Ticket_Num']
del test['Ticket_Num']
#train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
#test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [82]:
len(train.columns)

42

In [83]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [84]:
print(gs.best_score_)
print(gs.best_params_)

0.838383838384
{'min_samples_split': 12, 'n_estimators': 700, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [85]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=700,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8316


In [86]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
1,Fare,0.104391
29,Name_Title_Mr.,0.10421
16,Sex_male,0.095338
17,Sex_female,0.092336
0,Age,0.090017
2,Name_Len,0.085018
7,Ticket_Len,0.040202
13,Pclass_3,0.040135
21,Cabin_Letter_n,0.029172
31,Name_Title_Miss.,0.028489


In [87]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test57.csv'), sep=",", index = False)

score:  0.77512

In [77]:
corrs = {}

for i in train.columns:
    for z in train.columns:
        if i <> z and i.split('_') <> z.split('_') and i+'&'+z not in corrs and z+'&'+i not in corrs:
            if train[i].corr(train[z]) > .5:
                corrs[i+'&'+z] = train[i].corr(train[z])

## RF 58

In [34]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))

In [54]:
train['Ticket_Num'] = train['Ticket'].apply(lambda x: x.split()[-1])
train['Ticket_Num'] = train['Ticket_Num'].apply(lambda x: x if x[-1] in ['0','1','2','3','4','5','6','7','8','9'] else 0)
train['Ticket_Num'] = train['Ticket_Num'].apply(lambda x: int(x))

In [55]:
pd.qcut(train['Ticket_Num'], 4).value_counts()

(112058, 347082]     227
(14312.5, 112058]    223
[0, 14312.5]         223
(347082, 3101317]    218
Name: Ticket_Num, dtype: int64

In [80]:
train['Survived'].groupby(pd.qcut(train['Ticket_Num'], 5)).mean()

Ticket_Num
[0, 10482]            0.318436
(10482, 28665]        0.595506
(28665, 2.38e+05]     0.511236
(2.38e+05, 349222]    0.258427
(349222, 3101317]     0.235955
Name: Survived, dtype: float64

In [89]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
#train, test = feature_process_helper.cabin(train, test)
train['Cabin'] = train['Cabin'].apply(lambda x: 1 if pd.isnull(x) else 0)
test['Cabin'] = test['Cabin'].apply(lambda x: 1 if pd.isnull(x) else 0)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
for i in [train, test]:
    i['Ticket_Num'] = i['Ticket'].apply(lambda x: x.split()[-1])
    i['Ticket_Num'] = i['Ticket_Num'].apply(lambda x: x if x[-1] in ['0','1','2','3','4','5','6','7','8','9'] else 0)
    i['Ticket_Num'] = i['Ticket_Num'].apply(lambda x: int(x))
#train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
#test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [90]:
len(train.columns)

31

In [91]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [92]:
print(gs.best_score_)
print(gs.best_params_)

0.842873176207
{'min_samples_split': 4, 'n_estimators': 400, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [93]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=400,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8418


In [94]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
9,Ticket_Num,0.130073
1,Fare,0.121581
0,Age,0.10696
3,Name_Len,0.103874
18,Name_Title_Mr.,0.087724
13,Sex_male,0.071646
14,Sex_female,0.069107
8,Ticket_Len,0.045645
10,Pclass_3,0.037004
2,Cabin,0.023797


In [95]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test58.csv'), sep=",", index = False)

score:  0.78469

In [120]:
corrs = {}

for i in train.columns:
    for z in train.columns:
        if i <> z and i.split('_') <> z.split('_') and i+'&'+z not in corrs and z+'&'+i not in corrs:
            if train[i].corr(train[z]) > .5:
                corrs[i+'&'+z] = train[i].corr(train[z])

AttributeError: 'long' object has no attribute 'split'

In [99]:
test.columns

Index([u'PassengerId', u'Age', u'Fare', u'Cabin', u'Name_Len',
       u'Age_Null_Flag', u'Cabin_num_[2, 28.667]',
       u'Cabin_num_(28.667, 65.667]', u'Cabin_num_(65.667, 148]',
       u'Ticket_Len', u'Ticket_Num', u'Pclass_3', u'Pclass_1', u'Pclass_2',
       u'Sex_male', u'Sex_female', u'Embarked_S', u'Embarked_C', u'Embarked_Q',
       u'Name_Title_Mr.', u'Name_Title_Mrs.', u'Name_Title_Miss.',
       u'Name_Title_Master.', u'Name_Title_Rev.', u'Name_Title_Dr.',
       u'Name_Title_Ms.', u'Name_Title_Col.', u'Fam_Size_Nuclear',
       u'Fam_Size_Solo', u'Fam_Size_Big'],
      dtype='object')

In [109]:
for i in test.columns[1:]:
    print i, "%.1f, %.1f, %.1f" % (test[i].mean(), train[i].mean(), (test[i].mean()-train[i].mean())/train[i].mean())

Age 29.7, 29.4, 0.0
Fare 35.6, 32.2, 0.1
Cabin 0.8, 0.8, 0.0
Name_Len 27.5, 27.0, 0.0
Age_Null_Flag 0.2, 0.2, 0.0
Cabin_num_[2, 28.667] 0.1, 0.1, -0.2
Cabin_num_(28.667, 65.667] 0.1, 0.1, 0.3
Cabin_num_(65.667, 148] 0.1, 0.1, -0.3
Ticket_Len 6.9, 6.8, 0.0
Ticket_Num 253404.4, 296989.1, -0.1
Pclass_3 0.5, 0.6, -0.1
Pclass_1 0.3, 0.2, 0.1
Pclass_2 0.2, 0.2, 0.1
Sex_male 0.6, 0.6, -0.0
Sex_female 0.4, 0.4, 0.0
Embarked_S 0.6, 0.7, -0.1
Embarked_C 0.2, 0.2, 0.3
Embarked_Q 0.1, 0.1, 0.3
Name_Title_Mr. 0.6, 0.6, -0.0
Name_Title_Mrs. 0.2, 0.1, 0.2
Name_Title_Miss. 0.2, 0.2, -0.1
Name_Title_Master. 0.1, 0.0, 0.1
Name_Title_Rev. 0.0, 0.0, -0.3
Name_Title_Dr. 0.0, 0.0, -0.7
Name_Title_Ms. 0.0, 0.0, 1.1
Name_Title_Col. 0.0, 0.0, 1.1
Fam_Size_Nuclear 0.3, 0.3, 0.1
Fam_Size_Solo 0.6, 0.6, 0.0
Fam_Size_Big 0.0, 0.1, -0.3


## RF 59

In [34]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))

In [54]:
train['Ticket_Num'] = train['Ticket'].apply(lambda x: x.split()[-1])
train['Ticket_Num'] = train['Ticket_Num'].apply(lambda x: x if x[-1] in ['0','1','2','3','4','5','6','7','8','9'] else 0)
train['Ticket_Num'] = train['Ticket_Num'].apply(lambda x: int(x))

In [55]:
pd.qcut(train['Ticket_Num'], 4).value_counts()

(112058, 347082]     227
(14312.5, 112058]    223
[0, 14312.5]         223
(347082, 3101317]    218
Name: Ticket_Num, dtype: int64

In [80]:
train['Survived'].groupby(pd.qcut(train['Ticket_Num'], 5)).mean()

Ticket_Num
[0, 10482]            0.318436
(10482, 28665]        0.595506
(28665, 2.38e+05]     0.511236
(2.38e+05, 349222]    0.258427
(349222, 3101317]     0.235955
Name: Survived, dtype: float64

In [112]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
#train, test = feature_process_helper.cabin(train, test)
train['Cabin'] = train['Cabin'].apply(lambda x: 1 if pd.isnull(x) else 0)
test['Cabin'] = test['Cabin'].apply(lambda x: 1 if pd.isnull(x) else 0)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
for i in [train, test]:
    i['Ticket_Num'] = i['Ticket'].apply(lambda x: x.split()[-1])
    i['Ticket_Num'] = i['Ticket_Num'].apply(lambda x: x if x[-1] in ['0','1','2','3','4','5','6','7','8','9'] else 0)
    i['Ticket_Num'] = i['Ticket_Num'].apply(lambda x: int(x))
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.lda(train, test, train.iloc[:, 1], cols=['Ticket_Num', 'Fare'])
#train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
#test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [113]:
len(train.columns)

30

In [115]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [116]:
print(gs.best_score_)
print(gs.best_params_)

0.830527497194
{'min_samples_split': 12, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 5}


In [117]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=12,
                             min_samples_leaf=5,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8159


In [118]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
13,Sex_female,0.174297
8,0,0.135271
0,Age,0.088772
17,Name_Title_Mr.,0.083211
12,Sex_male,0.082767
2,Name_Len,0.076034
9,Pclass_3,0.050384
1,Cabin,0.045079
7,Ticket_Len,0.040713
19,Name_Title_Miss.,0.037849


In [119]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test59.csv'), sep=",", index = False)

score:  0.77512

In [122]:
for i in test.columns:
    print i, type(test[i][0])

PassengerId <type 'numpy.int64'>
Age <type 'numpy.float64'>
Cabin <type 'numpy.int64'>
Name_Len <type 'numpy.int64'>
Age_Null_Flag <type 'numpy.int64'>
Cabin_num_[2, 28.667] <type 'numpy.float64'>
Cabin_num_(28.667, 65.667] <type 'numpy.float64'>
Cabin_num_(65.667, 148] <type 'numpy.float64'>
Ticket_Len <type 'numpy.int64'>
0 <type 'numpy.float64'>
Pclass_3 <type 'numpy.float64'>
Pclass_1 <type 'numpy.float64'>
Pclass_2 <type 'numpy.float64'>
Sex_male <type 'numpy.float64'>
Sex_female <type 'numpy.float64'>
Embarked_S <type 'numpy.float64'>
Embarked_C <type 'numpy.float64'>
Embarked_Q <type 'numpy.float64'>
Name_Title_Mr. <type 'numpy.float64'>
Name_Title_Mrs. <type 'numpy.float64'>
Name_Title_Miss. <type 'numpy.float64'>
Name_Title_Master. <type 'numpy.float64'>
Name_Title_Rev. <type 'numpy.float64'>
Name_Title_Dr. <type 'numpy.float64'>
Name_Title_Ms. <type 'numpy.float64'>
Name_Title_Col. <type 'numpy.float64'>
Fam_Size_Nuclear <type 'numpy.float64'>
Fam_Size_Solo <type 'numpy.float

## RF 61

In [129]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
#train, test = feature_process_helper.age_impute(train, test)
del train['Age']
del test['Age']
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
for i in [train, test]:
    i['Ticket_Num'] = i['Ticket'].apply(lambda x: x.split()[-1])
    i['Ticket_Num'] = i['Ticket_Num'].apply(lambda x: x if x[-1] in ['0','1','2','3','4','5','6','7','8','9'] else 0)
    i['Ticket_Num'] = i['Ticket_Num'].apply(lambda x: int(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [130]:
np.shape(train)

(891, 47)

In [131]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [132]:
print(gs.best_score_)
print(gs.best_params_)

0.846240179574
{'min_samples_split': 4, 'n_estimators': 400, 'criterion': 'gini', 'min_samples_leaf': 1}


In [133]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=400,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8451


In [134]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
3,Ticket_Num,0.130635
1,Name_Len,0.115233
0,Fare,0.105707
34,Name_Title_Mr.,0.096536
7,Sex_male,0.092064
8,Sex_female,0.08823
4,Pclass_3,0.034339
2,Ticket_Len,0.033763
36,Name_Title_Miss.,0.029376
26,Cabin_Letter_n,0.027193


In [135]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test61.csv'), sep=",", index = False)

score: 0.78469

## RF 62

In [136]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [137]:
len(train.columns)

34

In [138]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [139]:
print(gs.best_score_)
print(gs.best_params_)

0.832772166105
{'min_samples_split': 12, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 5}


In [140]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=12,
                             min_samples_leaf=5,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8272


In [141]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
9,Sex_female,0.177362
21,Name_Title_Mr.,0.14072
1,Fare,0.106389
8,Sex_male,0.090796
0,Age,0.075417
2,Name_Len,0.06748
5,Pclass_3,0.061468
31,Fam_Size_Big,0.034148
6,Pclass_1,0.033386
13,Cabin_Letter_n,0.031946


In [142]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test62.csv'), sep=",", index = False)

score:  0.80383

## RF 63

In [143]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train['FAg'] = train['Fare']/train['Age']
test['FAg'] = test['Fare']/test['Age']
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [144]:
len(train.columns)

35

In [145]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [146]:
print(gs.best_score_)
print(gs.best_params_)

0.833894500561
{'min_samples_split': 16, 'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}


In [147]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8260


In [148]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
9,Sex_male,0.128279
22,Name_Title_Mr.,0.125287
5,FAg,0.103436
10,Sex_female,0.10152
1,Fare,0.083556
2,Name_Len,0.073799
0,Age,0.058228
6,Pclass_3,0.04521
4,Ticket_Len,0.032584
24,Name_Title_Miss.,0.028803


In [150]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test63.csv'), sep=",", index = False)

score:  0.7751

## RF 64

In [163]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))

In [164]:
vowels = ['a', 'e', 'i', 'o', 'u']
import string
cons = list(string.ascii_lowercase)
for i in vowels:
    cons.remove(i)

In [166]:
for i in [train, test]:
    i['Vowels'] = i['Name'].apply(lambda x: sum([vowels.count(i) for i in x.lower()]))
    i['Consonants'] = i['Name'].apply(lambda x: sum([cons.count(i) for i in x.lower()]))
    i['Vowel_Ratio'] = i['Vowels']/i['Consonants']
    del i['Vowels']
    del i['Consonants']

In [167]:
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [168]:
len(train.columns)

38

In [158]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [159]:
print(gs.best_score_)
print(gs.best_params_)

0.833894500561
{'min_samples_split': 4, 'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}


In [160]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8328


In [161]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
13,Sex_female,0.108212
1,Fare,0.108173
3,Name_Len,0.098877
0,Age,0.097629
25,Name_Title_Mr.,0.085937
2,Vowel_Ratio,0.084496
12,Sex_male,0.075866
8,Ticket_Len,0.04413
9,Pclass_3,0.038257
17,Cabin_Letter_n,0.032911


In [169]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test64.csv'), sep=",", index = False)

score:  0.76

## RF 64

In [170]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))

In [171]:
vowels = ['a', 'e', 'i', 'o', 'u']
import string
cons = list(string.ascii_lowercase)
for i in vowels:
    cons.remove(i)

In [172]:
for i in [train, test]:
    i['Vowels'] = i['Name'].apply(lambda x: sum([vowels.count(i) for i in x.split(',')[0].lower()]))
    i['Consonants'] = i['Name'].apply(lambda x: sum([cons.count(i) for i in x.split(',')[0].lower()]))
    i['Vowel_Ratio'] = i['Vowels']/i['Consonants']
    del i['Vowels']
    del i['Consonants']

In [173]:
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [174]:
len(train.columns)

38

In [177]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [178]:
print(gs.best_score_)
print(gs.best_params_)

0.832772166105
{'min_samples_split': 12, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [179]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [180]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
1,Fare,0.115247
13,Sex_female,0.100511
25,Name_Title_Mr.,0.097403
3,Name_Len,0.090108
0,Age,0.083668
12,Sex_male,0.082144
2,Vowel_Ratio,0.063733
8,Ticket_Len,0.048427
9,Pclass_3,0.045988
17,Cabin_Letter_n,0.035968


In [181]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test65.csv'), sep=",", index = False)

score:  0.78

## logit 1

In [182]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [183]:
len(train.columns)

37

In [187]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

param_range = [0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [{'C': param_range}]

gs = GridSearchCV(estimator=LogisticRegression(random_state=0),
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'C': [0.01, 0.1, 1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [188]:
print(gs.best_score_)
print(gs.best_params_)

0.829405162738
{'C': 1.0}


In [189]:
log = LogisticRegression()
log.fit(train.iloc[:, 2:], train.iloc[:, 1])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [190]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = log.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test_log1.csv'), sep=",", index = False)

score:  0.80383

In [191]:
predictions['Survived'].mean()

0.41148325358851673

## RF 65

In [200]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.titles_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [201]:
len(train.columns)

35

In [202]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [203]:
print(gs.best_score_)
print(gs.best_params_)

0.830527497194
{'min_samples_split': 12, 'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}


In [204]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8328


In [205]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
12,Sex_female,0.124569
11,Sex_male,0.121315
24,Name_Title_Mr.,0.109422
1,Fare,0.109218
2,Name_Len,0.093007
0,Age,0.075889
8,Pclass_3,0.044035
32,Fam_Size_Big,0.039709
7,Ticket_Len,0.03967
16,Cabin_Letter_n,0.0253


In [206]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])

In [197]:
scores = rf.predict_proba(test.iloc[:,1:])[:, 0]

In [207]:

predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test65.csv'), sep=",", index = False)

score:  NOPE

## RF 66

In [240]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [225]:
len(train.columns)

36

In [243]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1],
             "min_samples_split" : [16, 18, 20],
             "n_estimators": [40, 50, 60]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [244]:
print(gs.best_score_)
print(gs.best_params_)

0.832772166105
{'min_samples_split': 16, 'n_estimators': 40, 'criterion': 'gini', 'min_samples_leaf': 1}


In [245]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=40,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8204


In [227]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
11,Sex_male,0.146621
12,Sex_female,0.120459
1,Fare,0.112045
0,Age,0.106245
2,Name_Len,0.100961
7,Ticket_Len,0.049314
8,Pclass_3,0.04682
24,Name_Title_Mrs.,0.035805
25,Name_Title_Miss.,0.033666
9,Pclass_1,0.033326


In [246]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test67.csv'), sep=",", index = False)

score:  0.80383

In [247]:
ys = []
for i in os.listdir('submission_files'):
    ys.append(i)

In [249]:
for i in ys:
    if len(i)<12:
        ys.remove(i)

In [253]:
ys = ys[28:]

In [264]:
y_test = test.iloc[:,0]

In [265]:
for i in ys:
    y_test = pd.concat((y_test, pd.read_csv(os.path.join('submission_files', i), usecols=[1])), axis=1)

In [266]:
y_test

Unnamed: 0,PassengerId,Survived,Survived.1,Survived.2,Survived.3,Survived.4,Survived.5,Survived.6,Survived.7,Survived.8,...,Survived.9,Survived.10,Survived.11,Survived.12,Survived.13,Survived.14,Survived.15,Survived.16,Survived.17,Survived.18
0,892,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,893,1,1,1,1,0,0,1,0,0,...,1,1,0,0,0,0,0,1,1,0
2,894,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,895,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,896,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
5,897,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,898,1,1,1,1,0,0,1,0,0,...,1,1,0,0,1,1,1,1,0,1
7,899,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,900,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
9,901,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [279]:
y_test33 = pd.concat((test.iloc[:, 0], y_test.iloc[:, 1:].mode(axis=1)[0]), axis=1)

In [280]:
y_test33[0] = y_test33[0].apply(lambda x: int(x))

In [281]:
y_test33 = y_test33.rename(index=str, columns={0: "Survived"})

In [282]:
y_test33.to_csv(os.path.join('submission_files', 'y_test68.csv'), sep=",", index = False)

In [283]:
y_test33

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


score: .794

## RF 69

In [2]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
for i in [train, test]:
    i['Ticket_Num'] = i['Ticket'].apply(lambda x: x.split()[-1])
    i['Ticket_Num'] = i['Ticket_Num'].apply(lambda x: x if x[-1] in ['0','1','2','3','4','5','6','7','8','9'] else 0)
    i['Ticket_Num'] = i['Ticket_Num'].apply(lambda x: int(x))
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.lda(train, test, train.iloc[:, 1], cols=['Ticket_Num', 'Fare', 'Age'])
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [3]:
len(train.columns)

36

In [4]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [154]:
print(gs.best_score_)
print(gs.best_params_)

0.83164983165
{'min_samples_split': 16, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [155]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8215


In [156]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
11,Sex_male,0.121257
24,Name_Title_Mr.,0.115977
1,Fare,0.098159
0,Age,0.087345
2,Name_Len,0.08623
12,Sex_female,0.077304
8,Pclass_3,0.048533
7,Ticket_Len,0.046897
25,Name_Title_Mrs.,0.03704
9,Pclass_1,0.030009


In [157]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test50.csv'), sep=",", index = False)

score:  0.80383