In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sn
import xgboost as xgb
from statistics import mode
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold


In [None]:
# load the training data and preview its first rows
df_train = pd.read_csv(filepath_or_buffer='../input/titanic/train.csv')
df_train.head(n=5)

In [None]:
# load the test data and preview its first rows
df_test = pd.read_csv(filepath_or_buffer='../input/titanic/test.csv')
df_test.head(n=5)

# Data Cleansing

In [None]:
# count the missing values in each column of df_train
df_train.isnull().sum()

In [None]:
# Age: fill the gaps with the median age
df_train["Age"] = df_train["Age"].fillna(df_train["Age"].median())

# Cabin: a missing cabin is recorded as the sentinel 0
df_train["Cabin"] = df_train["Cabin"].fillna(0)

# Embarked: fill the gaps with the most common port
df_train["Embarked"] = df_train["Embarked"].fillna(mode(df_train["Embarked"]))

In [None]:
# re-count the missing values in df_train after imputation
df_train.isnull().sum()

In [None]:
# count the missing values in each column of df_test
df_test.isnull().sum()

In [None]:
# Age: fill the gaps with the median age of the test set
df_test["Age"] = df_test["Age"].fillna(df_test["Age"].median())

# Cabin: a missing cabin is recorded as the sentinel 0
df_test["Cabin"] = df_test["Cabin"].fillna(0)

# Fare: fill the gaps with the mean fare of the test set
df_test["Fare"] = df_test["Fare"].fillna(df_test["Fare"].mean())

In [None]:
# re-count the missing values in df_test after imputation
df_test.isnull().sum()

# Feature Engineering

Now the task is to generate as many features as possible. Here is my plan:<br>
[1] Name. Generate two features: name length and title; the title will be transformed into dummy variables<br>
[2] SibSp and Parch. To me they are very similar, so I can create a new feature called family, equal to SibSp + Parch, or I can bin the two columns (0, 1, or 2+)<br>
[3] Ticket. Generate two features: ticket length and letter (whether the ticket number contains a letter or not)<br>
[4] Cabin. Generate two features: cabin letter (dummy variable) and cabin number (the number of cabins listed for the passenger)<br>
[5] Fare. Based on the cabin number, generate ave_fare = Fare / cabin number <br>
[6] Embarked. Transform into dummy variables <br>

In [None]:
# set a class to generate features
class data(object):
    """Exploratory feature builder for a Titanic passenger table.

    The constructor stores a private copy of ``dataset``; ``new()`` adds the
    engineered columns to that copy and returns it, so the caller's frame is
    never modified.
    """

    def __init__(self, dataset):
        # Copy so that feature generation never touches the caller's frame.
        self.dataset = dataset.copy()

    def new(self):
        """Add the engineered features and return the resulting frame.

        Reads the columns Sex, Name, SibSp, Parch, Ticket, Cabin and Fare.
        Added columns: name_length, title, family, ticket_length,
        ticket_letter, cabin_letter, cabin_number, ave_fare.
        Dropped columns: PassengerId, Name, Ticket, Cabin.

        Sex is encoded as female -> 0, male -> 1 (any other label becomes NaN).
        """
        frame = self.dataset

        # A single dict lookup avoids the chained-replace downcasting warning
        # and always yields a numeric column.
        frame['Sex'] = frame['Sex'].map({'female': 0, 'male': 1})

        frame['name_length'] = frame['Name'].str.len()
        # Raw string: '\.' is an invalid escape sequence in a normal literal.
        frame['title'] = frame['Name'].str.extract(r'([a-zA-Z]+\.)', expand=False)
        frame['family'] = frame['SibSp'] + frame['Parch']

        frame['ticket_length'] = frame['Ticket'].str.len()
        frame['ticket_letter'] = frame['Ticket'].str.contains('[A-Za-z]', regex=True).astype(int)

        # regex=True must be explicit: since pandas 2.0 str.replace treats the
        # pattern as a literal by default, which would leave the digits in place
        # and inflate cabin_number.
        frame['cabin_letter'] = (
            frame['Cabin']
            .str.replace('[0-9]+', '', regex=True)
            .str.replace(' ', '', regex=False)
        )
        # One letter per listed cabin, so the length is the number of cabins.
        # Non-string Cabin entries (e.g. the 0 sentinel) give NaN -> count as 1.
        frame['cabin_number'] = frame['cabin_letter'].str.len().fillna(1)
        frame['ave_fare'] = frame['Fare'] / frame['cabin_number']

        self.dataset = frame.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
        return self.dataset


In [None]:
# build the exploratory feature table and preview it (head only, not the whole frame)
new_train = data(df_train).new()
new_train.head()

# Exploratory Data Analysis (EDA)

In [None]:
# violinplot of Pclass
# violinplot has no `size` option (extra keywords are handed to matplotlib in
# current seaborn and fail there), so the figure size is set on the figure itself.
fig, ax = plt.subplots(figsize=(6, 6))
sn.violinplot(x="Survived", y="Pclass", data=new_train, ax=ax)
plt.show()

In [None]:
# violinplot of Sex
# violinplot has no `size` option (extra keywords are handed to matplotlib in
# current seaborn and fail there), so the figure size is set on the figure itself.
fig, ax = plt.subplots(figsize=(6, 6))
sn.violinplot(x="Survived", y="Sex", data=new_train, ax=ax)
plt.show()

In [None]:
# kdeplot of Age, one density curve per value of Survived
age_grid = sn.FacetGrid(new_train, hue="Survived", height=6)
age_grid = age_grid.map(sn.kdeplot, "Age")
age_grid.add_legend()

# I plan to create several bins for Age: 0-18, 18-30, 30-60, 60+

In [None]:
# SibSp, Parch, and family
sn.jointplot(x="SibSp", y="Parch", data=new_train, height=5)
plt.show()

# FacetGrid is sized with `height`; the old `size` spelling is no longer accepted.
sn.FacetGrid(new_train, hue="Survived", height=6) \
   .map(sn.kdeplot, "family") \
   .add_legend()

# bins for family: 0, 1-3, 4+

In [None]:
# histograms of Fare and ave_fare, followed by their kdeplots split by Survived
for fare_column in ("Fare", "ave_fare"):
    sn.histplot(new_train, x=fare_column)
    plt.show()

for fare_column in ("Fare", "ave_fare"):
    fare_grid = sn.FacetGrid(new_train, hue="Survived", height=6)
    fare_grid.map(sn.kdeplot, fare_column).add_legend()
    plt.show()

# a higher fare is associated with a higher probability of survival

In [None]:
# violinplot of Survived by port of embarkation
# violinplot has no `height` option (extra keywords are handed to matplotlib in
# current seaborn and fail there), so the figure size is set on the figure itself.
fig, ax = plt.subplots(figsize=(6, 6))
sn.violinplot(x="Embarked", y="Survived", data=new_train, ax=ax)
plt.show()

In [None]:
# violinplot of Survived for each title
# violinplot has no `height` option (extra keywords are handed to matplotlib in
# current seaborn and fail there), so the figure size is set on the figure itself.
fig, ax = plt.subplots(figsize=(6, 6))
sn.violinplot(x='Survived', y='title', data=new_train, ax=ax)
plt.show()

# five groups: Mr., Mrs., Miss., Master., other

In [None]:
# violinplot of name_length by Survived
# violinplot has no `height` option (extra keywords are handed to matplotlib in
# current seaborn and fail there), so the figure size is set on the figure itself.
fig, ax = plt.subplots(figsize=(6, 6))
sn.violinplot(x='Survived', y='name_length', data=new_train, ax=ax)
plt.show()

In [None]:
# violinplot of ticket_letter by Survived
# violinplot has no `height` option (extra keywords are handed to matplotlib in
# current seaborn and fail there), so the figure size is set on the figure itself.
fig, ax = plt.subplots(figsize=(6, 6))
sn.violinplot(x='Survived', y='ticket_letter', data=new_train, ax=ax)
plt.show()

# there is no significant difference, so the feature is not useful

In [None]:
# violinplot of ticket_length by Survived
# violinplot has no `height` option (extra keywords are handed to matplotlib in
# current seaborn and fail there), so the figure size is set on the figure itself.
fig, ax = plt.subplots(figsize=(6, 6))
sn.violinplot(x='Survived', y='ticket_length', data=new_train, ax=ax)
plt.show()

# there is no significant difference, so the feature is not useful

In [None]:
# violinplot of Survived for each cabin_letter value
# violinplot has no `height` option (extra keywords are handed to matplotlib in
# current seaborn and fail there), so the figure size is set on the figure itself.
fig, ax = plt.subplots(figsize=(6, 6))
sn.violinplot(x='Survived', y='cabin_letter', data=new_train, ax=ax)
plt.show()

# passengers in cabins B, C, D, E and F have a higher survival rate; those in A, G and other cabins do not

**Conclusion:**<br>
[1] Pclass: keep<br>
[2] Sex: keep<br>
[3] Age: bins (0-18, 18-30, 30-60, 60+)<br>
[4] SibSp: remove<br>
[5] Parch: remove<br>
[6] family: bins (0, 1-3, 4+)<br>
[7] Fare, ave_fare: keep one<br>
[8] Embarked: keep, dummy<br>
[9] title: bins (Mr., Mrs., Miss., Master., other)<br>
[10] name_length: keep<br>
[11] ticket (letter and length): remove<br>
[12] cabin_letter: bins (BCDEF, AG, other)

In [None]:
# optimize the class to generate features 1
class data2(object):
    """Feature builder, variant 1: binned Age, raw Fare, one-hot categoricals.

    The constructor stores a private copy of ``dataset``; ``new()`` transforms
    that copy and returns it, so the caller's frame is never modified.
    """

    def __init__(self, dataset):
        # Copy so that feature generation never touches the caller's frame.
        self.dataset = dataset.copy()

    @staticmethod
    def _cabin_group(letters):
        """Map the deck letters of a cabin entry to 'b1' (A/G), 'b2' (B-F) or 'other'."""
        if not isinstance(letters, str):
            return 'other'
        if 'A' in letters or 'G' in letters:
            return 'b1'
        if any(deck in letters for deck in 'BCDEF'):
            return 'b2'
        return 'other'

    def new(self):
        """Build the model-ready frame and return it.

        Reads Sex, Age, SibSp, Parch, Name, Cabin, Embarked and Pclass.
        Sex is encoded as female -> 0, male -> 1 (any other label becomes NaN).
        Age is cut into (0,18], (18,30], (30,60], (60,100]; family size
        (SibSp + Parch) into 0 / 1-3 / 4+. Title indicator columns are added
        for Master., Mrs., Miss., Mr. plus ``title_Other``. Embarked, Pclass,
        Age, family and cabin_letter are one-hot encoded; PassengerId, Name,
        SibSp, Parch, Ticket, Cabin and title are dropped.
        """
        frame = self.dataset

        # A single dict lookup avoids the chained-replace downcasting warning
        # and always yields a numeric column.
        frame['Sex'] = frame['Sex'].map({'female': 0, 'male': 1})
        frame['Age'] = pd.cut(frame['Age'], [0, 18, 30, 60, 100],
                              labels=['bin1', 'bin2', 'bin3', 'bin4'])
        # np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0.
        frame['family'] = pd.cut(frame['SibSp'] + frame['Parch'],
                                 [-0.5, 0.5, 3.5, np.inf],
                                 labels=['b1', 'b2', 'b3'])

        # Raw string: '\.' is an invalid escape sequence in a normal literal.
        frame['title'] = frame['Name'].str.extract(r'([a-zA-Z]+\.)', expand=False)
        main_titles = ['Master.', 'Mrs.', 'Miss.', 'Mr.']
        for titl in main_titles:
            frame[titl] = frame['title'].apply(
                lambda x, titl=titl: int(titl in x if isinstance(x, str) else False))
        # 1 when none of the main titles matched.
        frame['title_Other'] = (frame[main_titles].sum(axis=1) == 0).astype(int)

        frame['name_length'] = frame['Name'].str.len()

        # regex=True must be explicit: since pandas 2.0 str.replace treats the
        # pattern as a literal by default.
        frame['cabin_letter'] = (
            frame['Cabin']
            .str.replace('[0-9]+', '', regex=True)
            .str.replace(' ', '', regex=False)
            .apply(self._cabin_group)
        )

        frame = pd.get_dummies(frame, columns=['Embarked', 'Pclass', 'Age', 'family', 'cabin_letter'])
        self.dataset = frame.drop(
            columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'title'])
        return self.dataset

In [None]:
# optimize the class to generate features 2
class data3(object):
    """Feature builder, variant 2: mean-centred Age and Fare, one-hot categoricals.

    The constructor stores a private copy of ``dataset``; ``new()`` transforms
    that copy and returns it, so the caller's frame is never modified.
    """

    def __init__(self, dataset):
        # Copy so that feature generation never touches the caller's frame.
        self.dataset = dataset.copy()

    @staticmethod
    def _cabin_group(letters):
        """Map the deck letters of a cabin entry to 'b1' (A/G), 'b2' (B-F) or 'other'."""
        if not isinstance(letters, str):
            return 'other'
        if 'A' in letters or 'G' in letters:
            return 'b1'
        if any(deck in letters for deck in 'BCDEF'):
            return 'b2'
        return 'other'

    def new(self):
        """Build the model-ready frame and return it.

        Reads Sex, Age, Fare, SibSp, Parch, Name, Cabin, Embarked and Pclass.
        Sex is encoded as female -> 0, male -> 1 (any other label becomes NaN).
        Age and Fare are centred on the mean of the frame being transformed
        and stay continuous. Family size (SibSp + Parch) is binned into
        0 / 1-3 / 4+. Title indicator columns are added for Mrs., Miss., Mr.
        plus ``title_Other``. Embarked, Pclass, family and cabin_letter are
        one-hot encoded; PassengerId, Name, SibSp, Parch, Ticket, Cabin and
        title are dropped.
        """
        frame = self.dataset

        # A single dict lookup avoids the chained-replace downcasting warning
        # and always yields a numeric column.
        frame['Sex'] = frame['Sex'].map({'female': 0, 'male': 1})
        frame['Age'] = frame['Age'] - frame['Age'].mean()
        frame['Fare'] = frame['Fare'] - frame['Fare'].mean()
        # np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0.
        frame['family'] = pd.cut(frame['SibSp'] + frame['Parch'],
                                 [-0.5, 0.5, 3.5, np.inf],
                                 labels=['b1', 'b2', 'b3'])

        # Raw string: '\.' is an invalid escape sequence in a normal literal.
        frame['title'] = frame['Name'].str.extract(r'([a-zA-Z]+\.)', expand=False)
        main_titles = ['Mrs.', 'Miss.', 'Mr.']
        for titl in main_titles:
            frame[titl] = frame['title'].apply(
                lambda x, titl=titl: int(titl in x if isinstance(x, str) else False))
        # 1 when none of the main titles matched.
        frame['title_Other'] = (frame[main_titles].sum(axis=1) == 0).astype(int)

        frame['name_length'] = frame['Name'].str.len()

        # regex=True must be explicit: since pandas 2.0 str.replace treats the
        # pattern as a literal by default.
        frame['cabin_letter'] = (
            frame['Cabin']
            .str.replace('[0-9]+', '', regex=True)
            .str.replace(' ', '', regex=False)
            .apply(self._cabin_group)
        )

        # Age is deliberately NOT one-hot encoded here: it is a centred
        # continuous value, and encoding it would create one column per
        # distinct age, giving train and test frames different columns.
        frame = pd.get_dummies(frame, columns=['Embarked', 'Pclass', 'family', 'cabin_letter'])
        self.dataset = frame.drop(
            columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'title'])
        return self.dataset

# Model Selection

In [None]:
class model_selection:
    """Grid-search a set of estimators and tabulate their scores.

    Parameters
    ----------
    cv : passed as ``cv`` to ``GridSearchCV``.
    model_info : dict mapping a short model key to an estimator instance.
    parameters : dict mapping the same keys to the parameter grid to search.
    """

    def __init__(self, cv, model_info, parameters):
        self.cv = cv
        self.model_info = model_info
        self.parameters = parameters

    def grid_search(self, x, y):
        """Run one ``GridSearchCV`` per entry of ``model_info``.

        Returns a list with the ``best_params_`` dict of every model, in the
        iteration order of ``model_info``.
        """
        x = np.array(x)
        y = np.array(y)
        grid_result = []
        for model_key, estimator in self.model_info.items():
            search = GridSearchCV(estimator = estimator,
                                  param_grid = self.parameters[model_key],
                                  cv = self.cv,
                                  n_jobs = -1,
                                  verbose = 2)
            search.fit(x, y)
            grid_result.append(search.best_params_)
        return grid_result

    def scores(self, model_list, train_x, train_y):
        """Fit every model on the given data and report its score on that same data.

        NOTE: this is a training-set score (the model is scored on the data it
        was just fitted on), so it is an optimistic estimate.

        Returns a DataFrame with the columns 'models' (estimator class name)
        and 'socres' (value returned by ``model.score``). The misspelled
        column name is kept for backward compatibility.
        """
        model_names = []
        model_score = []
        for model in model_list:
            model.fit(train_x, train_y)
            # Store the class name, not the estimator object: fitted ensemble
            # estimators behave like sequences, which makes them fragile and
            # unreadable as DataFrame cell values.
            model_names.append(type(model).__name__)
            model_score.append(model.score(train_x, train_y))
        result = pd.DataFrame({'models':model_names, 'socres':model_score})
        return result
            

In [None]:
# Grid Search
# One estimator per key; `parameters` below holds the grid searched for each key.
model_info = {'rf':RandomForestClassifier(), 'xgb':xgb.XGBClassifier(), 'ert':ExtraTreesClassifier(),
              'lr':LogisticRegression(), 'knn':KNeighborsClassifier(), 'svc':SVC()}

# Random Forest
grid_rf = {
    "n_estimators": np.linspace(100,1000,5, dtype = int),
    "max_depth": [3,5,7],
    "max_features": [3,5,7,9,11],
    "min_samples_leaf": [3,5,7],
    "min_samples_split":[3,5,7],
    "random_state": [2020,2021]
}

# XGBoost
grid_xgb = {
    'booster': ['gbtree'],
    'objective': ['binary:logistic'],
    'subsample': [0.6,0.7,0.8],
    'colsample_bytree': [0.6,0.7,0.8],
    'eta': [0.05,0.1,0.2],
    'max_depth': [3,5],
    'seed': [2020, 2021],
    'eval_metric': ['logloss']
}

# Extremely Randomized Trees
grid_ert = {
    "n_estimators": np.linspace(100,900,5, dtype = int),
    "max_depth": [3,5,7],
    "max_features": [3,5,6,7],
    "min_samples_leaf": [3,5,7],
    "min_samples_split":[3,5,7],
    "random_state": [2020, 2021]
}

# Logistic Regression
# The default lbfgs solver cannot fit an L1 penalty, so every 'l1' candidate
# would fail and be scored NaN; liblinear supports both 'l1' and 'l2'.
grid_lr = {
    "penalty": ['l1', 'l2'],
    "solver": ['liblinear'],
    "C":[0.01, 0.05, 0.1, 0.25, 0.5],
    "random_state": [2020, 2021]
}

# k-Nearest Neighbours
grid_knn = {
    "n_neighbors": [5,7,10,15,20]
}

# Support Vector Classifier
grid_svc = {
    "C":[0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5],
    "gamma":[0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5],
    "random_state": [2020, 2021]
}

parameters = {'rf':grid_rf, 'xgb':grid_xgb, 'ert':grid_ert,
              'lr':grid_lr, 'knn':grid_knn, 'svc':grid_svc}

In [None]:
# Feature set 1 (data2): build the train/test matrices and grid-search every model on them
new_train = data2(df_train).new()
new_test = data2(df_test).new()

train_y = new_train['Survived']
train_x = new_train.drop(columns=['Survived'])
test_x = new_test

# grid_search returns one best-parameter dict per model, in model_info order
best_params_1 = model_selection(3, model_info, parameters).grid_search(train_x, train_y)
best_rf_1, best_xgb_1, best_ert_1, best_lr_1, best_knn_1, best_svc_1 = best_params_1

In [None]:
# create one model per algorithm from the best parameters found on feature set 1
model_rf, model_xgb, model_ert, model_lr, model_knn, model_svc = (
    RandomForestClassifier(**best_rf_1),
    xgb.XGBClassifier(**best_xgb_1),
    ExtraTreesClassifier(**best_ert_1),
    LogisticRegression(**best_lr_1),
    KNeighborsClassifier(**best_knn_1),
    SVC(**best_svc_1),
)

In [None]:
# fit each tuned model on feature set 1 and tabulate its score on that same data
model_list = [
    model_rf, model_xgb, model_ert,
    model_lr, model_knn, model_svc,
]
model_selection(cv=3, model_info=model_info, parameters=parameters).scores(model_list, train_x, train_y)

In [None]:
# Feature set 2 (data3): build the train/test matrices and grid-search every model on them
new_train_2 = data3(df_train).new()
new_test_2 = data3(df_test).new()

train_y_2 = new_train_2['Survived']
train_x_2 = new_train_2.drop(columns=['Survived'])
test_x_2 = new_test_2

# grid_search returns one best-parameter dict per model, in model_info order
best_params_2 = model_selection(3, model_info, parameters).grid_search(train_x_2, train_y_2)
best_rf_2, best_xgb_2, best_ert_2, best_lr_2, best_knn_2, best_svc_2 = best_params_2

In [None]:
# create one model per algorithm from the best parameters found on feature set 2
# (this rebinds the model_* names used earlier in the notebook)
model_rf, model_xgb, model_ert, model_lr, model_knn, model_svc = (
    RandomForestClassifier(**best_rf_2),
    xgb.XGBClassifier(**best_xgb_2),
    ExtraTreesClassifier(**best_ert_2),
    LogisticRegression(**best_lr_2),
    KNeighborsClassifier(**best_knn_2),
    SVC(**best_svc_2),
)

In [None]:
# fit each tuned model on feature set 2 and tabulate its score on that same data
model_list = [
    model_rf, model_xgb, model_ert,
    model_lr, model_knn, model_svc,
]
model_selection(cv=3, model_info=model_info, parameters=parameters).scores(model_list, train_x_2, train_y_2)

# Ensemble Generation

In [None]:
class ensemble:
    """Two-level stacking: base models feed their predictions to a target model.

    cv          -- number of KFold splits used to build the meta-features
    base_models -- list of first-level estimators (fitted in place)
    target_model -- second-level estimator trained on the base predictions
    """

    def __init__(self, cv, base_models, target_model):
        self.cv = cv
        self.models = base_models
        self.target = target_model

    def predict(self, train_x, train_y, test_x):
        """Fit the stack on (train_x, train_y) and return predictions for test_x.

        For every base model and fold, the model is fitted on the fold's
        training part; its predictions on the held-out part fill that model's
        column of the training meta-features, and its predictions on test_x
        are averaged over the folds to give the test meta-features. The target
        model is then fitted on the training meta-features and used to predict
        from the test meta-features.
        """
        train_x, train_y, test_x = (np.array(part) for part in (train_x, train_y, test_x))
        n_models = len(self.models)
        folds = KFold(n_splits=self.cv, shuffle=True, random_state=2021)

        oof_meta = np.zeros((train_x.shape[0], n_models))
        test_meta = np.zeros((test_x.shape[0], n_models))

        for col, base in enumerate(self.models):
            per_fold_test = np.zeros((test_x.shape[0], self.cv))
            for fold_no, (fit_idx, holdout_idx) in enumerate(folds.split(train_x)):
                base.fit(train_x[fit_idx, :], train_y[fit_idx])
                oof_meta[holdout_idx, col] = base.predict(train_x[holdout_idx, :])
                per_fold_test[:, fold_no] = base.predict(test_x)
            # average the per-fold test predictions of this base model
            test_meta[:, col] = per_fold_test.mean(axis=1)

        self.target.fit(oof_meta, train_y)
        return self.target.predict(test_meta)
  

In [None]:
# Base learners for the stack.
# The ensemble is trained on feature set 1 (train_x / test_x), so the base
# models are built from the parameters tuned on that feature set. The model_*
# names cannot be reused here: they were rebound to the feature-set-2
# parameters in a later cell.
base_models = [
    RandomForestClassifier(**best_rf_1),
    xgb.XGBClassifier(**best_xgb_1),
    ExtraTreesClassifier(**best_ert_1),
    LogisticRegression(**best_lr_1),
    KNeighborsClassifier(**best_knn_1),
    SVC(**best_svc_1),
]

# Second-level (meta) learner: an XGBoost classifier with default settings.
target_ert = xgb.XGBClassifier()

target_model = target_ert

In [None]:
# run the stacking ensemble with 5 folds on feature set 1 and predict the test set
stacking = ensemble(cv=5, base_models=base_models, target_model=target_model)
pred = stacking.predict(train_x, train_y, test_x)

In [None]:
# assemble the submission table (passenger id + predicted label) and save it
submission_columns = {'PassengerId': df_test['PassengerId'], 'Survived': pred}
res = pd.DataFrame(submission_columns)
res.to_csv('result_stacking.csv', index=False)