In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn import metrics

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from xgboost.sklearn import XGBClassifier

In [None]:
# Read the training data from the path relative to the notebook's working directory.
train_csv_path = 'data/titanic_train.csv'
titanic = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows of the loaded frame.
titanic.head()

In [None]:
# (row count, column count) of the loaded frame.
titanic.shape

In [None]:
# Per-column dtype and non-null count summary.
titanic.info()

In [None]:
# Number of missing values in each column.
titanic.isna().sum()

# Features

In [None]:
# Fill Embarked: keep known ports, substitute "S" wherever the value is missing.
embarked = titanic["Embarked"]
titanic["Embarked"] = embarked.where(embarked.notna(), "S")

In [None]:
# Encode Sex as an integer code: male -> 0, female -> 1.
sex_codes = {"male": 0, "female": 1}
titanic["Sex"] = titanic["Sex"].map(sex_codes)

In [None]:
# Fill Age
# Each missing age is replaced by the median age of the passengers that share
# the same SibSp, Parch and Pclass values; when that group has no known age,
# the median of the whole Age column is used instead.
index_NaN_age = list(titanic.index[titanic["Age"].isnull()])

for i in index_NaN_age:
    # Recomputed every iteration on purpose: earlier fills are part of the
    # column by the time later rows are imputed.
    age_med = titanic["Age"].median()
    same_group = ((titanic["SibSp"] == titanic.at[i, "SibSp"])
                  & (titanic["Parch"] == titanic.at[i, "Parch"])
                  & (titanic["Pclass"] == titanic.at[i, "Pclass"]))
    age_pred = titanic.loc[same_group, "Age"].median()
    # Assign with one label-based .at call on the frame itself. The previous
    # `titanic['Age'].iloc[i] = ...` was chained assignment, which may write to
    # a temporary copy, and it used the index label `i` as a position.
    titanic.at[i, "Age"] = age_pred if not np.isnan(age_pred) else age_med

In [None]:
# Make Title
def _title_from_name(full_name):
    """Return the text between the first comma and the next period, stripped."""
    after_comma = full_name.split(",")[1]
    return after_comma.split(".")[0].strip()


titanic_title = list(map(_title_from_name, titanic["Name"]))
titanic["Title"] = pd.Series(titanic_title)
titanic["Title"].head()

In [None]:
# Convert Title to categorical values
# Build a single lookup table: the common titles get their own code and every
# title in `rare_titles` shares the "Rare" code.
rare_titles = ['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don',
               'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_codes = {"Master": 0, "Miss": 1, "Ms": 1, "Mme": 1, "Mlle": 1, "Mrs": 1, "Mr": 2, "Rare": 3}
title_codes.update({rare: title_codes["Rare"] for rare in rare_titles})

# Titles missing from the table map to NaN, which makes astype(int) raise.
titanic["Title"] = titanic["Title"].map(title_codes).astype(int)

In [None]:
# Drop Name: the raw name column is no longer needed once Title exists.
titanic = titanic.drop(columns=["Name"])

In [None]:
# Make Family Size: siblings/spouses + parents/children + the passenger.
titanic["Fsize"] = titanic["SibSp"].add(titanic["Parch"]).add(1)

In [None]:
# Make Single, Small Family, Medium Family, Large Family Columns
# Each flag is 1 when Fsize falls in the bucket and 0 otherwise.
fsize = titanic["Fsize"]
titanic["Single"] = fsize.eq(1).astype("int64")
titanic["SmallF"] = fsize.eq(2).astype("int64")
titanic["MedF"] = fsize.between(3, 4).astype("int64")   # 3 <= Fsize <= 4
titanic["LargeF"] = fsize.ge(5).astype("int64")

In [None]:
# One Hot Encoder Title & Embarked
# Encode one column at a time, in this order, so the indicator columns are
# appended as Title_* first and Em_* second.
for _column, _prefix in (("Title", "Title"), ("Embarked", "Em")):
    titanic = pd.get_dummies(titanic, columns=[_column], prefix=_prefix)

In [None]:
# Fill Cabin
def _deck_letter(cabin):
    """Return the first character of a cabin value, or 'X' when it is missing."""
    return 'X' if pd.isnull(cabin) else cabin[0]


titanic["Cabin"] = pd.Series([_deck_letter(cabin) for cabin in titanic["Cabin"]])

In [None]:
# One Hot Encoder Cabin: replace the "Cabin" column with one indicator column
# per distinct value, each named with the "Cabin" prefix.
titanic = pd.get_dummies(titanic, columns = ["Cabin"],prefix="Cabin")

In [None]:
# Fill and Convert Ticket
def _ticket_prefix(raw_ticket):
    """Return the ticket's leading token, or "X" for an all-digit ticket.

    Dots and slashes are removed and surrounding whitespace stripped before
    the text is split on single spaces; the first piece is the prefix.
    """
    if raw_ticket.isdigit():
        return "X"
    cleaned = raw_ticket.replace(".", "").replace("/", "").strip()
    return cleaned.split(' ')[0]  # Take prefix


Ticket = [_ticket_prefix(raw_ticket) for raw_ticket in titanic.Ticket]

titanic["Ticket"] = Ticket
titanic["Ticket"].head()

In [None]:
# One Hot Encoder Ticket: replace the "Ticket" column with one indicator
# column per distinct value, each named with the "T" prefix.
titanic = pd.get_dummies(titanic, columns = ["Ticket"], prefix="T")

In [None]:
# One Hot Encoder Pclass
# Cast Pclass to a categorical dtype first, then expand it into Pc_* columns.
pclass_as_category = titanic.astype({"Pclass": "category"})
titanic = pd.get_dummies(pclass_as_category, columns=["Pclass"], prefix="Pc")

In [None]:
# Drop PassengerId: remove the identifier column from the feature frame.
titanic = titanic.drop(columns=["PassengerId"])

In [None]:
# Preview the frame after feature engineering.
titanic.head()

### Train & Test Split

In [None]:
X = titanic.drop(labels=["Survived"], axis=1)
y = titanic["Survived"]

# Split BEFORE scaling so the held-out rows do not contribute to the scaler's
# mean/variance (fitting the scaler on all rows leaks test-set statistics).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize: the scaler is fitted on the training rows only.
scaler = StandardScaler().fit(X_train)


def _scale(frame):
    """Apply the fitted scaler and keep the frame's column labels and index."""
    return pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)


# X stays available as the fully scaled feature matrix; all three frames keep
# their column labels so feature names can be looked up later.
X_train, X_test, X = _scale(X_train), _scale(X_test), _scale(X)

kfold = StratifiedKFold(n_splits=10)

### Modeling Single vs Bagging

In [None]:
num_trees = 100


def evaluate_single_vs_bagging(name, model, lead=""):
    """Fit `model` alone and inside a BaggingClassifier, printing both scores.

    For each variant the mean cross-validation accuracy on the training split
    (labelled "Train") and the accuracy on the held-out split (labelled
    "Test") are printed.

    Parameters
    ----------
    name : str
        Label used in the printed lines.
    model : estimator
        Unfitted scikit-learn classifier; it is fitted in place on X_train.
    lead : str
        Text printed before the first line (used to separate model sections).

    Returns
    -------
    (model, bagged) : the fitted stand-alone and bagged estimators.
    """
    model.fit(X_train, y_train)
    results = cross_val_score(model, X_train, y_train, cv=kfold)
    print(lead + name + " (stand alone) - Train : ", results.mean())
    # accuracy_score takes (y_true, y_pred).
    print(name + " (stand alone) - Test : ", metrics.accuracy_score(y_test, model.predict(X_test)))

    # The wrapped estimator is passed positionally: its keyword was renamed
    # from `base_estimator` to `estimator` across scikit-learn releases, and
    # the positional form is accepted by both.
    bagged = BaggingClassifier(model, n_estimators=num_trees).fit(X_train, y_train)
    results = cross_val_score(bagged, X_train, y_train, cv=kfold)
    print("\n" + name + " (Bagging) - Train : ", results.mean())
    print(name + " (Bagging) - Test : ", metrics.accuracy_score(y_test, bagged.predict(X_test)))
    return model, bagged


#Logreg
LR, bag_LR = evaluate_single_vs_bagging(
    "Logistic Regression", LogisticRegression(solver='lbfgs', max_iter=4000))

#KNN
KNN, bag_KNN = evaluate_single_vs_bagging("KNN", KNeighborsClassifier(), lead="\n")

#SVM
SVM, bag_SVM = evaluate_single_vs_bagging("SVM", SVC(gamma='scale'), lead="\n")

#Decision Tree
DT, bag_DT = evaluate_single_vs_bagging("Decision Tree", DecisionTreeClassifier(), lead="\n")

### Hyperparameter Tuning

#### Decision Tree - AdaBoost

In [None]:
DTC = DecisionTreeClassifier()
adaDTC = AdaBoostClassifier(DTC, random_state=7)

# The wrapped tree is exposed as `estimator` in current scikit-learn releases
# and as `base_estimator` in older ones; use whichever name this installation
# reports so the nested grid keys are valid on both.
base_key = "estimator" if "estimator" in adaDTC.get_params(deep=False) else "base_estimator"

# NOTE(review): "SAMME.R" is deprecated in recent scikit-learn releases -
# confirm it is still accepted by the installed version.
ada_param_grid = {base_key + "__criterion": ["gini", "entropy"],
                  base_key + "__splitter": ["best", "random"],
                  "algorithm": ["SAMME", "SAMME.R"],
                  "n_estimators": [1, 2],
                  "learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 1.5]}

gsadaDTC = GridSearchCV(adaDTC, param_grid=ada_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)

gsadaDTC.fit(X_train, y_train)

ada_best = gsadaDTC.best_estimator_

# Best score
gsadaDTC.best_score_

#### Random Forest Parameter Tuning

In [None]:
RFC = RandomForestClassifier()

## Search grid for optimal parameters
rf_param_grid = dict(
    max_depth=[None],
    max_features=[1, 3, 10],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[False],
    n_estimators=[100, 300],
    criterion=["gini"],
)

# Exhaustive search over the grid, scored by accuracy on the shared CV folds.
gsRFC = GridSearchCV(
    RFC,
    param_grid=rf_param_grid,
    scoring="accuracy",
    cv=kfold,
    n_jobs=4,
    verbose=1,
)
gsRFC.fit(X_train, y_train)

# Estimator refitted with the best parameter combination.
RFC_best = gsRFC.best_estimator_

# Best score
gsRFC.best_score_

#### ExtraTrees 

In [None]:
ExtC = ExtraTreesClassifier()

## Search grid for optimal parameters
ex_param_grid = dict(
    max_depth=[None],
    max_features=[1, 3, 10],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[False],
    n_estimators=[100, 300],
    criterion=["gini"],
)

# Exhaustive search over the grid, scored by accuracy on the shared CV folds.
gsExtC = GridSearchCV(
    ExtC,
    param_grid=ex_param_grid,
    scoring="accuracy",
    cv=kfold,
    n_jobs=4,
    verbose=1,
)
gsExtC.fit(X_train, y_train)

# Estimator refitted with the best parameter combination.
ExtC_best = gsExtC.best_estimator_

# Best score
gsExtC.best_score_

#### Gradient Boosting Tuning

In [None]:
GBC = GradientBoostingClassifier()

# 'loss' is deliberately left out of the grid: the only value searched was
# "deviance", which is the classifier's default loss and is no longer accepted
# under that name by newer scikit-learn releases (it was renamed "log_loss").
# Relying on the default keeps the same loss on every version.
gb_param_grid = {'n_estimators': [100, 200, 300],
                 'learning_rate': [0.1, 0.05, 0.01],
                 'max_depth': [4, 8],
                 'min_samples_leaf': [100, 150],
                 'max_features': [0.3, 0.1]
                 }

gsGBC = GridSearchCV(GBC, param_grid=gb_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)

gsGBC.fit(X_train, y_train)

GBC_best = gsGBC.best_estimator_

# Best score
gsGBC.best_score_

#### SVM Tuning

In [None]:
### SVC classifier
# probability=True so the fitted model exposes predict_proba.
SVMC = SVC(probability=True)
svc_param_grid = dict(
    kernel=['rbf'],
    gamma=[0.001, 0.01, 0.1, 1],
    C=[1, 10, 50, 100, 200, 300, 1000],
)

# Exhaustive search over the grid, scored by accuracy on the shared CV folds.
gsSVMC = GridSearchCV(
    SVMC,
    param_grid=svc_param_grid,
    scoring="accuracy",
    cv=kfold,
    n_jobs=4,
    verbose=1,
)
gsSVMC.fit(X_train, y_train)

# Estimator refitted with the best parameter combination.
SVMC_best = gsSVMC.best_estimator_

# Best score
gsSVMC.best_score_

### Plot Learning Curve

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation score curves for an estimator.

    Opens a new figure titled ``title`` (restricted to ``ylim`` when given),
    obtains scores from ``learning_curve`` using ``X``, ``y``, ``cv``,
    ``n_jobs`` and ``train_sizes``, and draws for each of the two score sets
    the mean along axis 1 as a line with a +/- one standard deviation band.

    Returns the ``matplotlib.pyplot`` module.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, std, colour, legend label) for each curve, training first.
    curves = [
        (fit_scores.mean(axis=1), fit_scores.std(axis=1), "r", "Training score"),
        (cv_scores.mean(axis=1), cv_scores.std(axis=1), "g", "Cross-validation score"),
    ]
    plt.grid()

    # Bands first, then lines, in the same order for both curves.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, legend_label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=legend_label)

    plt.legend(loc="best")
    return plt

# One learning-curve figure per tuned model, drawn on the training split with
# the shared CV folds. ("RF mearning curves" was a typo in the plot title.)
learning_curve_specs = [
    (gsadaDTC.best_estimator_, "AdaBoost learning curves"),
    (gsRFC.best_estimator_, "RF learning curves"),
    (gsExtC.best_estimator_, "Extra Trees learning curves"),
    (gsGBC.best_estimator_, "GradientBoosting learning curves"),
    (gsSVMC.best_estimator_, "SVC learning curves"),
]
for _estimator, _title in learning_curve_specs:
    g = plot_learning_curve(_estimator, _title, X_train, y_train, cv=kfold)

### Feature Importance of Tree-Based Models

In [None]:
nrows = 2
ncols = 2
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex="all", figsize=(15, 15))

names_classifiers = [("AdaBoosting", ada_best), ("RandomForest", RFC_best), ("ExtraTrees", ExtC_best), ("GradientBoosting", GBC_best)]

# Feature labels are read from `titanic` (minus the target) so they are
# available whether or not X_train still carries column labels.
feature_names = np.asarray(titanic.drop(labels=["Survived"], axis=1).columns)

# axes.ravel() walks the grid row by row, matching the order of the list above.
for ax, (name, classifier) in zip(axes.ravel(), names_classifiers):
    importances = classifier.feature_importances_
    # Indices of the 40 largest importances, largest first.
    indices = np.argsort(importances)[::-1][:40]
    g = sns.barplot(y=feature_names[indices], x=importances[indices], orient='h', ax=ax)
    g.set_xlabel("Relative importance", fontsize=12)
    g.set_ylabel("Features", fontsize=12)
    g.tick_params(labelsize=9)
    g.set_title(name + " feature importance")

### Voting

In [None]:
clfs = []
vote_estimators = [('rfc', RFC_best), ('extc', ExtC_best), ('svc', SVMC_best), ('adac', ada_best), ('gbc', GBC_best)]
VoteH = VotingClassifier(estimators=list(vote_estimators), voting='hard', n_jobs=4)
VoteS = VotingClassifier(estimators=list(vote_estimators), voting='soft', n_jobs=4)

for clf, label in zip([VoteH, VoteS], ['Ensemble Hard Voting', 'Ensemble Soft Voting']):
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    # Fit on the training split only. Fitting on (X, y) would include the
    # held-out rows, so the "Test Accuracy" below would be measured on data
    # the ensemble had already seen.
    md = clf.fit(X_train, y_train)
    clfs.append(md)
    # accuracy_score takes (y_true, y_pred).
    print("Test Accuracy: %0.2f " % (metrics.accuracy_score(y_test, clf.predict(X_test))))

### Stacking

In [None]:
num_trees = 100
verbose = True # to print the progress

clfs = [RFC_best,ExtC_best,SVMC_best,ada_best,GBC_best]

# Creating train and test sets for blending: one column per base model holding
# its predicted probability of the positive class.
dataset_blend_train = np.zeros((X_train.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_test.shape[0], len(clfs)))

# Report the fold count actually used by `kfold` instead of a hard-coded number.
print('%d-fold cross validation:\n' % kfold.get_n_splits())
for i, clf in enumerate(clfs):
    scores = cross_val_score(clf, X_train, y_train, cv=kfold, scoring='accuracy')
    print("##### Base Model %0.0f #####" % i)
    print("Train CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    # Meta-features for the training rows are OUT-OF-FOLD probabilities: each
    # row is scored by a copy of the model that did not see it during fitting.
    # Using predictions from a model fitted on those same rows would hand the
    # meta model over-confident inputs it never gets at test time.
    dataset_blend_train[:, i] = cross_val_predict(
        clf, X_train, y_train, cv=kfold, method="predict_proba")[:, 1]
    # Refit on the whole training split to score the held-out rows.
    clf.fit(X_train, y_train)
    print("Train Accuracy: %0.2f " % (metrics.accuracy_score(y_train, clf.predict(X_train))))
    dataset_blend_test[:, i] = clf.predict_proba(X_test)[:, 1]
    print("Test Accuracy: %0.2f " % (metrics.accuracy_score(y_test, clf.predict(X_test))))

print ("##### Meta Model #####")
# The meta model learns how to combine the base models' probabilities.
clf = LogisticRegression()
scores = cross_val_score(clf, dataset_blend_train, y_train, cv=kfold, scoring='accuracy')
clf.fit(dataset_blend_train, y_train)
print("Train CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
print("Train Accuracy: %0.2f " % (metrics.accuracy_score(y_train, clf.predict(dataset_blend_train))))
print("Test Accuracy: %0.2f " % (metrics.accuracy_score(y_test, clf.predict(dataset_blend_test))))