In [None]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sklearn
from sklearn import tree
import sklearn.preprocessing as preprocessing
from sklearn.ensemble import BaggingClassifier,BaggingRegressor
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
from sklearn.ensemble import voting_classifier,VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split,learning_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve


## data preparation

In [None]:
# Load the train and test CSV files.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
# Stack train on top of test so every feature-engineering step is applied to both at once.
# The test file has no 'Survived' column, so those rows hold NaN there after the concat.
# sort=False keeps the original column order instead of sorting the non-aligned columns.
# ignore_index=True gives each row a unique label; without it the labels 0..len(test)-1 occur
# twice and any later index-aligned operation that has to reindex would fail.
combine = pd.concat([train, test], sort=False, ignore_index=True)

### feature engineering

In [None]:
# Name length, bucketed into 5 quantile bins
name_lengths = combine['Name'].map(len)
combine['Name_len'] = pd.qcut(name_lengths, 5)

In [None]:
# Preview the raw titles: the text between ', ' and the first '.' of each name.
raw_titles = combine['Name'].map(lambda full_name: full_name.split(', ')[1].split('.')[0])
combine['Title'] = raw_titles
combine['Title'].unique()

In [None]:
# Title from Name: the text between ', ' and the first '.' of each name.
combine['Title'] = combine['Name'].map(lambda full_name: full_name.split(', ')[1].split('.')[0])

# Collapse the less common titles into four broader groups, applied in this order.
title_groups = [
    ('Royalty', ['Master', 'Major', 'Col', 'Sir', 'Dr']),
    ('Goodman', ['Rev', 'Don', 'Capt', 'Jonkheer']),
    ('Miss', ['Mlle', 'Ms', 'Dona']),
    ('Mrs', ['the Countess', 'Mme', 'Lady']),
]
for group_label, member_titles in title_groups:
    combine['Title'] = combine['Title'].replace(member_titles, group_label)

# One-hot encode the grouped title; 'Title' itself is kept for the age imputation later on.
df = pd.get_dummies(combine['Title'], prefix='Title')
combine = pd.concat([combine, df], axis=1)

In [None]:
# Family size and family (sur)name
combine['Fname'] = combine['Name'].map(lambda full_name: full_name.split(',')[0])
combine['Familysize'] = combine['SibSp'] + combine['Parch']

# Passengers aged 12 or more who travel with more than one relative.
older_with_family = (combine['Age'] >= 12) & (combine['Familysize'] > 1)
dead_female_mask = (combine['Sex'] == 'female') & older_with_family & (combine['Survived'] == 0)
survive_male_mask = (combine['Sex'] == 'male') & older_with_family & (combine['Survived'] == 1)

# Surnames of families with a dead female / a surviving male among those passengers.
dead_female_Fname = list(set(combine.loc[dead_female_mask, 'Fname'].values))
survive_male_Fname = list(set(combine.loc[survive_male_mask, 'Fname'].values))

# Flag every passenger (train and test) who shares one of those surnames.
combine['Dead_female_family'] = combine['Fname'].isin(dead_female_Fname).astype(int)
combine['Survive_male_family'] = combine['Fname'].isin(survive_male_Fname).astype(int)
combine = combine.drop(columns=['Name', 'Fname'])

In [None]:
# Age
# 1. Impute missing ages with the median age of passengers sharing the same (Title, Pclass).
# 2. Add the binary feature 'IsChild' (age <= 12).
# 3. Discretize age into 5 equal-width bins.
group = combine.groupby(['Title', 'Pclass'])['Age']
combine['Age'] = group.transform(lambda ages: ages.fillna(ages.median()))
# A (Title, Pclass) group without any known age has a NaN median and would stay NaN;
# fall back to the overall median so that no NaN bin reaches the label-encoding step.
combine['Age'] = combine['Age'].fillna(combine['Age'].median())
combine = combine.drop('Title', axis=1)
combine['IsChild'] = np.where(combine['Age'] <= 12, 1, 0)
combine['Age'] = pd.cut(combine['Age'], 5)


In [None]:
# One-hot encode 'Familysize' as solo (0 relatives) / normal (1 to 3) / big (anything else)
relatives = combine['Familysize']
combine['Familysize'] = np.select(
    [relatives == 0, relatives <= 3],
    ['solo', 'normal'],
    default='big',
)
size_dummies = pd.get_dummies(combine['Familysize'], prefix='Familysize')
combine = pd.concat([combine, size_dummies], axis=1)
combine = combine.drop(['SibSp', 'Parch', 'Familysize'], axis=1)

In [None]:
# Cabin: keep only whether a cabin value was recorded.
# NOTE: despite its name, 'Cabin_isNull' is 1 when a cabin IS present and 0 when it is missing.
combine['Cabin_isNull'] = combine['Cabin'].notnull().astype(int)
combine = combine.drop(columns='Cabin')

In [None]:
# Embarked: fill missing values with 'C', then one-hot encode.
# Very few values are missing (two, according to the original author's note), so treating
# "missing" as its own category would add little.
combine['Embarked'] = combine['Embarked'].fillna('C')
embarked_dummies = pd.get_dummies(combine['Embarked'], prefix='Embarked')
combine = pd.concat([combine, embarked_dummies], axis=1)
combine = combine.drop('Embarked', axis=1)

In [None]:
# Pclass: replace the class column with one indicator column per class
pclass_dummies = pd.get_dummies(combine['Pclass'], prefix='Pclass')
combine = pd.concat([combine, pclass_dummies], axis=1)
combine = combine.drop('Pclass', axis=1)

In [None]:
# Sex: replace the column with one indicator column per value
sex_dummies = pd.get_dummies(combine['Sex'], prefix='Sex')
combine = pd.concat([combine, sex_dummies], axis=1)
combine = combine.drop('Sex', axis=1)

In [None]:
# Fare: split into 3 quantile bins and one-hot encode them.
# Labelling the bins 0/1/2 yields the columns Fare_0, Fare_1 and Fare_2 directly. The default
# interval labels would put the bin edges plus "(", "]" and "," into the column names, so the
# names would change with the data and any later rename keyed on them would silently miss.
# Rows with a missing Fare fall into no bin and therefore get 0 in all three dummy columns.
combine['Fare'] = pd.qcut(combine.Fare, 3, labels=[0, 1, 2])
df = pd.get_dummies(combine.Fare, prefix='Fare')
combine = pd.concat([combine, df], axis=1).drop('Fare', axis=1)

In [None]:
# Ticket: derive two flags from the first character of the ticket string
ticket_initial = combine['Ticket'].astype(str).str[0]

combine['High_Survival_Ticket'] = ticket_initial.isin(['1', '2', 'P']).astype(int)
combine['Low_Survival_Ticket'] = ticket_initial.isin(['A', 'W', '3', '7']).astype(int)
combine = combine.drop(columns='Ticket')

In [None]:
# Integer-encode every feature column (everything except the passenger id and the target).
le = preprocessing.LabelEncoder()
features = combine.columns.drop(["PassengerId", "Survived"])
for feature in features:
    # The encoder is re-fitted on each column, so codes are independent per column.
    combine[feature] = le.fit_transform(combine[feature])

In [None]:
# Inspect the column names produced by the feature engineering above.
combine.columns

In [None]:
# Give the interval-labelled Fare dummy columns short names.
# NOTE(review): presumably needed because bracketed names are rejected as feature names by
# XGBoost - confirm. Keys that do not match an existing column are ignored by rename().
fare_column_names = {
    "Fare_(-0.001, 8.662]": "Fare_0",
    "Fare_(8.662, 26.0]": "Fare_1",
    "Fare_(26.0, 512.329]": "Fare_2",
}
combine = combine.rename(columns=fare_column_names)

In [None]:
# Check the final column names after the Fare columns were renamed.
combine.columns

In [None]:
# Split the combined frame back into the labelled training rows and the unlabelled test rows.
# The training rows come first in `combine`; use len(train) instead of a hard-coded 891 so the
# split stays correct if the training file changes.
n_train = len(train)
X_all = combine.iloc[:n_train].drop(["PassengerId", "Survived"], axis=1)
# 'Survived' is float after being concatenated with the label-less test rows (NaN there);
# restore integer class labels for the classifiers.
Y_all = combine.iloc[:n_train]["Survived"].astype(int)
X_test = combine.iloc[n_train:].drop(["PassengerId", "Survived"], axis=1)

## build model and train

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
# sklearn.cross_validation and sklearn.grid_search were deprecated in scikit-learn 0.18 and
# removed in 0.20; sklearn.model_selection (already used at the top of this notebook) provides
# both KFold and GridSearchCV.
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score

In [None]:
# Candidate classifiers.
# rf and gbm_est were already seeded with 42; the other estimators with a random component
# (dt, rf0, gbdt) now share that seed so a Restart & Run All reproduces the same fitted models.
SEED = 42
lr = LogisticRegression()
svc = SVC()
knn = KNeighborsClassifier(n_neighbors=3)
dt = DecisionTreeClassifier(random_state=SEED)
rf0 = RandomForestClassifier(n_estimators=300, min_samples_leaf=4,
                             class_weight={0: 0.745, 1: 0.255}, random_state=SEED)
# n_jobs=-1 uses all available cores instead of a hard-coded 50 workers.
rf = RandomForestClassifier(n_estimators=750, criterion='gini', max_features='sqrt',
                            max_depth=3, min_samples_split=4, min_samples_leaf=2,
                            n_jobs=-1, random_state=SEED, verbose=1)
gbdt = GradientBoostingClassifier(n_estimators=500, learning_rate=0.03, max_depth=3,
                                  random_state=SEED)
gbm_est = GradientBoostingClassifier(n_estimators=900, learning_rate=0.0008, loss='exponential',
                                     min_samples_split=3, min_samples_leaf=2, max_features='sqrt',
                                     max_depth=3, random_state=SEED, verbose=1)
xgbGBDT = XGBClassifier(max_depth=5, n_estimators=300, learning_rate=0.05)
# All candidates in one list.
clfs = [lr, svc, knn, dt, rf0, rf, gbdt, gbm_est, xgbGBDT]

In [None]:

# Grid-search n_estimators and max_depth for XGBoost, scored by 50-fold cross-validated accuracy.
# GridSearchCV is bound to the sklearn.model_selection implementation right here because the
# results are read from cv_results_: the legacy sklearn.grid_search class does not have it, and
# its grid_scores_ attribute no longer exists in scikit-learn >= 0.20.
from sklearn.model_selection import GridSearchCV

# verbosity=0 replaces the deprecated silent=True. max_depth=2 is only a placeholder; the
# grid below overrides it for every candidate.
clf = XGBClassifier(learning_rate=0.1, max_depth=2, verbosity=0, objective='binary:logistic')
param_test = {
    'n_estimators': [50, 100, 300, 700],
    'max_depth': [1, 2, 3, 5]
}
grid_search = GridSearchCV(estimator=clf, param_grid=param_test, scoring='accuracy', cv=50)
grid_search.fit(X_all, Y_all)
# One row per parameter combination with its mean / std accuracy across the folds.
cv_summary = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_test_score', 'std_test_score']]
cv_summary, grid_search.best_params_, grid_search.best_score_


## prediction

In [None]:
# Hard-voting (majority vote) ensemble over five of the candidate models.
ensemble_members = [
    ('lr', lr),
    ('svc', svc),
    ('rf', rf),
    ('gbm_est', gbm_est),
    ('xgbGBDT', xgbGBDT),
]
voting = VotingClassifier(estimators=ensemble_members, voting='hard', n_jobs=-1)
voting.fit(X_all, Y_all)
predictions = voting.predict(X_test)

In [None]:
# Build the submission table (passenger id + predicted label as int32) and write it to CSV.
survived_labels = predictions.astype(np.int32)
submission = pd.DataFrame({'PassengerId': test["PassengerId"], 'Survived': survived_labels})
submission.to_csv('./v4.csv', mode='w+', index=False)