# <center> Random Forest Classifier
### Import Preliminaries

In [2]:
%matplotlib inline

# Import modules
import pandas as pd
import numpy as np

# Import data
train_df = pd.DataFrame(pd.read_csv('https://www.dropbox.com/s/3zeaj89zl0e607x/featured_train_df2.csv?dl=1',index_col='PassengerId'))
test_df = pd.DataFrame(pd.read_csv('https://www.dropbox.com/s/gr0rboxonowwh91/featured_test_df2.csv?dl=1', index_col='PassengerId'))

# Set pandas options
pd.set_option('precision',8)
pd.set_option('max_columns',100)
pd.set_option('max_rows',100)

dfs = [train_df, test_df]

### DataFrame to Values

In [3]:
# Convert pandas dataframe into prediction values and data
y = train_df.Survived.values
X = train_df[train_df.columns[1:]].values

X_test = test_df.values

### Setup Model

In [4]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

### Greedy Search Setup

In [5]:
from sklearn.grid_search import GridSearchCV

# Create the grid
grid = {'n_estimators':list(range(1,10)),
       'max_depth': list(range(1,10)),
       'min_samples_leaf':list(range(1,50))}

models = GridSearchCV(estimator=model, 
                            param_grid=grid,
                           scoring='accuracy',n_jobs=1,
                           refit=True, cv=10)

### Fit Models

In [6]:
models.fit(X,y)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

### Greedy Results

In [7]:
print('Accuracy of best parameters: %.3f'%models.best_score_)
print('Best parameters: %s' %models.best_params_)

Accuracy of best parameters: 0.819
Best parameters: {'max_depth': 9, 'min_samples_leaf': 1, 'n_estimators': 6}


### Tuned Model

In [8]:
classifier = RandomForestClassifier(max_depth=15, 
                                    min_samples_leaf=2,
                                    n_estimators=54)
classifier.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=54, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Model Prediction

In [9]:
# create predictions dataframe
prediction = pd.DataFrame(test_df.index)
survived = pd.DataFrame(classifier.predict(X_test),
                          columns=['Survived'])
prediction = pd.concat([prediction, survived],axis=1)
prediction

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0


### K-Fold Cross Validation

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

crossvalidation = KFold(20, random_state=1)
scores = cross_val_score(classifier, X, y, 
                scoring= 'neg_mean_squared_error',
                cv = crossvalidation, n_jobs =1)

print ('Folds: %i, mean squared error: %.2f std: %.2f' 
% (len(scores),np.mean(np.abs(scores)),np.std(scores)))

Folds: 20, mean squared error: 0.20 std: 0.05


### Export Results

In [11]:
prediction.to_csv('Submissions/titantic_predictions_rfc_tuned.csv',index=False)

In [12]:
prediction.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


### Accuracy Results

<br> Feature Engineering 1 - Score: 0.66985
<br> Feature Engineering 2 - Score: 
<br> Feature Engineering 3 - Score: 

### Kaggle Results

<br> Feature Engineering 1 - Score: 0.66985
<br> Feature Engineering 2 - Score: 
<br> Feature Engineering 3 - Score: 