In [50]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

try:
    # GridSearchCV is provided by sklearn.model_selection; the legacy
    # sklearn.grid_search module is kept only as a fallback for old installs.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

In [2]:
# Directory holding train.csv / test.csv. Defaults to the original local
# location; set the TITANIC_DATA_DIR environment variable to run elsewhere.
# os.path.join(..., "") guarantees a trailing separator, because the cells
# below build file names by plain concatenation (path + "train.csv").
path = os.path.join(
    os.environ.get(
        "TITANIC_DATA_DIR",
        "F:/for learn/Python_ML_and_Kaggle/Datasets/Titanic/",
    ),
    "",
)

In [3]:
# Read the training and test CSVs that live under `path`.
train, test = (
    pd.read_csv(path + csv_name) for csv_name in ("train.csv", "test.csv")
)

In [11]:
# 0. Data Preparation
## 0.1 initial features
selected_features = ["Sex", "Pclass", "Age", "Embarked", "SibSp", "Parch", "Fare"]
# Take explicit copies: the feature frames are filled in place later, and
# doing that on a slice of `train` / `test` raises SettingWithCopyWarning and
# may or may not write through to the original frames.
x_train = train[selected_features].copy()
x_test = test[selected_features].copy()
y_train = train["Survived"]

In [30]:
## 0.2 NA fill
def fillNA(df, na_features, fill_values):
    """Fill missing values in the given columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is modified (the listed columns are reassigned)
        and also returned.
    na_features : iterable of column labels
        Columns whose missing values should be replaced.
    fill_values : iterable
        Replacement value for each column, in the same order as
        ``na_features``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns filled.

    Raises
    ------
    ValueError
        If ``na_features`` and ``fill_values`` differ in length.
    """
    features = list(na_features)
    values = list(fill_values)
    if len(features) != len(values):
        # zip() would silently drop the unmatched entries.
        raise ValueError(
            "na_features and fill_values must have the same length "
            "(got %d and %d)" % (len(features), len(values))
        )
    for feature, value in zip(features, values):
        # Assign the filled column back instead of calling
        # df[feature].fillna(..., inplace=True): the latter operates on a
        # temporary Series and is not guaranteed to update ``df``.
        df[feature] = df[feature].fillna(value)
    return df

# All imputation values are computed from the training frame and then reused
# for the test frame.
# idxmax() returns the label of the most frequent Embarked value;
# argmax() returns its integer position in current pandas, which would fill
# the column with a number instead of a category.
fillna_Embarked = x_train["Embarked"].value_counts().idxmax()
fillna_Ages = x_train["Age"].mean()
fillna_Fare = x_train["Fare"].mean()
na_features = ["Embarked", "Age", "Fare"]
fill_values = [fillna_Embarked, fillna_Ages, fillna_Fare]

x_train = fillNA(x_train, na_features, fill_values)
x_test = fillNA(x_test, na_features, fill_values)

In [37]:
## 0.3 feature ETL
# Turn every row into a {feature: value} dict and vectorize it. The
# vectorizer is fitted on the training rows only and reused for the test rows.
# NOTE: x_train / x_test are rebound to the vectorizer output (no longer
# DataFrames), so re-run the preparation cells before re-running this one.
vec = DictVectorizer(sparse=False)
# "records" is the full orient name; the abbreviation "record" is rejected
# by current pandas.
x_train = vec.fit_transform(x_train.to_dict(orient="records"))
x_test = vec.transform(x_test.to_dict(orient="records"))

In [49]:
# 1. Train
# Fixed random_state so the random-forest predictions (and the submission
# file written from them) are reproducible across kernel restarts.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(x_train, y_train)
rfc_y_predict = rfc.predict(x_test)

xgb = XGBClassifier()
xgb.fit(x_train, y_train)
xgb_y_predict = xgb.predict(x_test)

# Optional sanity check: 5-fold cross-validated score of each model.
#cross_val_score(rfc, x_train, y_train, cv=5).mean()
#cross_val_score(xgb, x_train, y_train, cv=5).mean()
## 1.1 Single Model with default params
def subMission(y_predict, fileName, passenger_ids=None):
    """Write predictions to a two-column CSV (PassengerId, Survived).

    Parameters
    ----------
    y_predict : array-like
        Predicted ``Survived`` value for each passenger.
    fileName : str
        Destination path of the CSV file.
    passenger_ids : array-like, optional
        Passenger identifiers, one per prediction. When omitted, the
        ``PassengerId`` column of the module-level ``test`` frame is used.
    """
    if passenger_ids is None:
        passenger_ids = test["PassengerId"]
    # The header must be spelled "PassengerId" (lower-case "d"), matching the
    # column name in the source data.
    df = pd.DataFrame({"PassengerId": passenger_ids, "Survived": y_predict})
    df.to_csv(fileName, index=False)
# Write one submission file per default-parameter model.
subMission(y_predict=rfc_y_predict, fileName=path + "rfc_sub.csv")
subMission(y_predict=xgb_y_predict, fileName=path + "xgb_sub.csv")

In [55]:
# Hyper-parameter grid: 5 depths x 5 ensemble sizes x 5 learning rates
# = 125 candidates, each evaluated with 5-fold cross-validation.
params = dict(
    max_depth=list(range(2, 7)),
    n_estimators=list(range(100, 1000, 200)),
    learning_rate=[0.05, 0.1, 0.25, 0.5, 1.0],
)
xgb_best = XGBClassifier()
gs = GridSearchCV(xgb_best, params, cv=5, n_jobs=-1, verbose=1)
gs.fit(x_train, y_train)

Fitting 5 folds for each of 125 candidates, totalling 625 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   59.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed:  3.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': [2, 3, 4, 5, 6], 'n_estimators': [100, 300, 500, 700, 900], 'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [56]:
# Display the parameter combination selected by the grid search.
gs.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

In [58]:
# Predict the test set with the fitted grid search and write the submission.
xgb_best_y_predict = gs.predict(x_test)
subMission(y_predict=xgb_best_y_predict, fileName=path + "xgb_best_sub.csv")