In [1]:
# This is the main notebook for the Titanic project. In this notebook we will
# preprocess the data for machine learning. Try different machine learning models
# and select the best one. Also we will try different combinations of features
# (or engineered features) to find the better set of features for machine learning.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit

import matplotlib.image as mpimg

from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

import lightgbm as lgb
from xgboost import XGBClassifier

from sklearn.ensemble import StackingClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier



In [3]:
path_to_train_data = 'train.csv'
path_to_test_data = 'test.csv'

train_total = pd.read_csv(path_to_train_data)
test_total = pd.read_csv(path_to_test_data)

In [4]:
# Creating a test and validation set

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=80)

for trainall_index, test_index in split.split(train_total, train_total["Survived"]):
    trainall = train_total.iloc[trainall_index]
    test = train_total.iloc[test_index]
    
trainall = trainall.reset_index(drop = True)
test = test.reset_index(drop = True)

for train_index, val_index in split.split(trainall, trainall["Survived"]):
    train = trainall.iloc[train_index]
    val = trainall.iloc[val_index]
    
train = train.reset_index(drop = True)
val = val.reset_index(drop = True)


In [5]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,321,0,3,"Dennis, Mr. Samuel",male,22.0,0,0,A/5 21172,7.25,,S
1,756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S
2,488,0,1,"Kent, Mr. Edward Austin",male,58.0,0,0,11771,29.7,B37,C
3,435,0,1,"Silvey, Mr. William Baird",male,50.0,1,0,13507,55.9,E44,S
4,731,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S


In [6]:
#############################################################################################
#############################################################################################
#############################################################################################
# Creating classes to preprocess the data

# Class for filling missing values
class fillmiss(BaseEstimator, TransformerMixin):
    def __init__(self, columns, strategy):
        self.columns = columns
        self.strategy = strategy
    def fit(self, X, y=None):
        for ind, item in enumerate(self.columns):
            misslen = sum(X[item].isnull())
            if misslen >= 0:
                if self.strategy[ind] == 'median':
                    self.med = X[item].median()
                if self.strategy[ind] == 'mode':
                    self.mod = X[item].mode()        
        return self # nothing else to do
    def transform(self, X):
        for ind, item in enumerate(self.columns):
            misslen = sum(X[item].isnull())
            if misslen > 0:
                if self.strategy[ind] == 'median':
                    X[item] = X[item].fillna(self.med)
                if self.strategy[ind] == 'mode':
                    X[item] = X[item].fillna(self.mod[0])
        return X

# Class for label encoding categorical variables

class enclab(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        from sklearn import preprocessing
        le = preprocessing.LabelEncoder()
        h = X
        for ind, item in enumerate(self.columns):
            loc = h.columns.get_loc(item)
            tc = le.fit_transform(h[item])
            h = h.drop(item,axis = 1)
            h.insert(loc, item, tc)
        return h

# Class for adding two attributes

class addattr(BaseEstimator, TransformerMixin):
    def __init__(self, columns, newcol):
        self.columns = columns
        self.newcol = newcol
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        X[self.newcol] = X[self.columns[0]] + X[self.columns[1]]
        return X

# Class for discretizing continuous data

class disccont(BaseEstimator, TransformerMixin):
    def __init__(self, columns, newcols , intervals):
        self.columns = columns
        self.newcols = newcols
        self.intervals = intervals
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        labelencoder = LabelEncoder()
        for ind,item in enumerate(self.columns):
            custom_bucket_array = np.array(self.intervals[ind])
            X[self.newcols[ind]] = pd.cut(X[item], custom_bucket_array, labels = np.arange(len(self.intervals[ind])-1))
        return X

# Class for creating a Master. column

class master(BaseEstimator, TransformerMixin):
    def __init__(self):
        self
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        X['master'] = 0
        for ind, item in enumerate(X['titles']):
            if (item == 'Master'):
                X.loc[ind,'master'] = 1
        return X

# Class for extracting titles from the Name column

class title(BaseEstimator, TransformerMixin):
    def __init__(self, column, titles):
        self.column = column
        self.titles = titles
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        leng = len(X[self.column])
        ser = pd.Series((-1*np.ones(leng)))
        for item in self.titles:
            for ind,row in enumerate(X[self.column]):
                pos = row.find(item)
                if pos != -1:
                    ser[ind] = item
        ser[ser == -1] = 'other'
        
        X['titles'] = ser
        return X


# Class for extracting lastnames from the Name column  

class lastname(BaseEstimator, TransformerMixin):
    def __init__(self):
        self
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        leng = len(X['Name'])
        serr = pd.Series((-1*np.ones(leng)))
        for ind,row in enumerate(X['Name']):
            pos = row.find(',')
            serr[ind] = row[:pos]
        X.insert(0,"lastname", serr)
        return X
    
# Class for Standard scaling a few columns in the dataframe

class stsc(BaseEstimator, TransformerMixin):
    def __init__(self,cols):
        self.cols = cols
    def fit(self, X, y=None):
        self.ss = StandardScaler()
        self.ss.fit(X[self.cols])
        return self # nothing else to do
    def transform(self, X):
        X[self.cols] = self.ss.transform(X[self.cols])
        return X

# Class for dropping a few columns in the dataframe

class dropcol(BaseEstimator, TransformerMixin):
    def __init__(self,coldrop):
        self.coldrop = coldrop
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        for item in self.coldrop:
            if item in X.columns:
                X = X.drop(item,axis=1)
        return X    

# Class for getting dummy variables

class getdummy(BaseEstimator, TransformerMixin):
    def __init__(self):
        self
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        X = pd.get_dummies(X, drop_first=True)
        return X

########################################################################################
########################################################################################
# Some Classes that were NOT useful:

# Extracting all the titles (this class by itself improves the results but
# compared to the title class above it gives worse results)

class titles(BaseEstimator, TransformerMixin):
    def __init__(self):
        self
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        tser = []
        for row in X['Name']:
            pos1 = row.find(',')
            pos2 = row.find('.')
            tser.append(row[pos1+2:pos2])
        X['titles'] = np.array(tser)
        return X

# Extracting deck from the Ticket column
    
class deck(BaseEstimator, TransformerMixin):
    def __init__(self):
        self
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        ser = []
        boo = X['Cabin'].isnull()
        for item in X[~boo]['Cabin']:
            ser.append(item[0])
        X['deck'] = 1
        ind1 = X.index[boo]
        ind2 = X.index[~boo]
        X.loc[ind1,'deck'] = None
        X.loc[ind2,'deck'] = ser
        return X

# This class creates a column that is 1 when the passengers
# have a nickname otherwise it is zero.

class nickname(BaseEstimator, TransformerMixin):
    def __init__(self):
        self
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        ser=[]
        par='('
        for item in X['Name']:
            if item.find(par) != -1:
                ser.append(1)
            else:
                ser.append(0)
        X['nickname'] = ser
        return X

In [7]:
# Pipelines


# Pipelines that had a positive effect on the machine learning process

fmiss = fillmiss(columns = ['Embarked', 'Age', 'Fare'], strategy = ['mode', 'median', 'median'])

el = enclab(['Sex','Embarked'])

aattr = addattr(columns = ['SibSp','Parch'],newcol = 'famem')

age60 = disccont(columns = ["Age"], newcols = ["age60"] , intervals = [[-1,60,200]])

age0 = disccont(columns = ["Age"], newcols = ["age0"] , intervals = [[-1,5,200]])

fare0 = disccont(columns = ["Fare"], newcols = ['fare0'] , intervals = [[-1,0.01,800]])

fare300 = disccont(columns = ["Fare"], newcols = ['fare300'] , intervals = [[-1,300,800]])

titlec = title(column = "Name", titles = ['Mr' , 'Mrs' , 'Miss' 
                                          , 'Dr', 'Master'])
mtr = master()

lname = lastname()

ss = stsc(cols= ['Age','Fare'])

drcol = dropcol(coldrop =['Survived','Name','PassengerId'])

gdummy = getdummy()


# pipelines that did not have any effect on the prediction score

aattr = addattr(columns = ['SibSp','Parch'],newcol = 'famem')

fared = disccont(columns = ["Fare"], newcols = ['fared'] , intervals = [[-1,0.01,25,50
                                                                         ,75,100,300,800]])

# pipelines that worsened the prediction score

nkn = nickname()

famem4 = disccont(columns = ["famem"], newcols = ["famem4"] , intervals = [[-1,3.5,12]])


    #tls compared to titlec worsens the prediction score.
tls = titles()

dck = deck()


# full pipeline

full_pipeline = Pipeline([('fillmiss',fmiss) ,('enclab',el)
                          , ('addattr', aattr)
#                          ,('famem4', famem4)
#                          , ('nkn',nkn)
#                          , ('fared',fared)
                          , ('age60', age60)
                          , ('fare0', fare0),('fare300', fare300)
                          , ('title',titlec)
#                          , ('tls',tls)
                          , ('master', mtr)
                          , ('lastname', lname)
#                          ,('deck',dck)
                          , ('ss',ss)
                          , ('drcol', drcol)
                          , ('dummy', gdummy)
                         ])


In [8]:

# Transforming the train and validation sets

trainm = full_pipeline.fit_transform(train.copy())
valm = full_pipeline.transform(val.copy())

# Transforming the trainall and test set

trainallm = full_pipeline.fit_transform(trainall.copy())
testm = full_pipeline.transform(test.copy())


# Finding the common columns between trainm and valm datasets

comcolval = list(set(trainm.columns)&set(valm.columns))

trainm = trainm[comcolval]

valm = valm[comcolval]

# Finding the common columns between trainallm and testm datasets

comcoltest = list(set(trainallm.columns)&set(testm.columns))

trainallm = trainallm[comcoltest]

testm = testm[comcoltest]


In [9]:
########################################################################################
########################################################################################
########################################################################################
## Machine Learning

# In this section we will try different machine learning models and pick the best one.
# The result is that Support Vector Classifier with linear kernel gives the best
# prediction score.
# Hyper-parameter tuning is not shown here but only the selected parameters are shown.

In [10]:
########################################################################################
########################################################################################
## Validation

In [11]:
########################################################################################
# Random Forests

rf = RandomForestClassifier(max_features = 'sqrt', n_estimators = 1000,
                               min_samples_split = 10, min_samples_leaf = 1,
                            max_depth = 20, random_state=50)

rf.fit(trainm.values,train['Survived'].values.ravel())

ypredrf = rf.predict(valm.values)

rf.score(valm.values,val['Survived'].values.ravel())

0.8111888111888111

In [12]:
########################################################################################
# Support Vector Classifier

svc = SVC(kernel='linear')

svc.fit(trainm.values,train['Survived'].values.ravel())

ypredsvc = svc.predict(valm.values)

svc.score(valm.values,val['Survived'].values.ravel())

0.8251748251748252

In [13]:
########################################################################################
# Gradient Boosting Classifier

grbc = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.05,
                                   max_depth=3, max_features='sqrt',
                                   min_samples_leaf=10, 
                                   loss='deviance', random_state = 5)

grbc.fit(trainm.values,train['Survived'].values.ravel())

ypredgrbc = grbc.predict(valm.values)

grbc.score(valm.values,val['Survived'].values.ravel())

0.7762237762237763

In [14]:
########################################################################################
# extreme Gradient Boosting Classifier

xg = XGBClassifier(colsample_bytree =0.2, gamma = 0.0468,reg_lambda=0.4,
                             learning_rate=0.05, max_depth=3, 
                              reg_alpha = 0.15,
                            n_estimators = 1000,subsample=0.525,
                             random_state =7, nthread = -1)

xg.fit(trainm.values,train['Survived'].values.ravel())

ypredxg = xg.predict(valm.values)

xg.score(valm.values,val['Survived'].values.ravel())

0.7902097902097902

In [15]:
########################################################################################
# Logisitc Regression

lr = LogisticRegression()

lr.fit(trainm.values,train['Survived'].values.ravel())

ypredlr = lr.predict(valm.values)

lr.score(valm.values,val['Survived'].values.ravel())

0.8041958041958042

In [16]:
########################################################################################
# Stacking

level0 = list()
level0.append(('svc', svc))
level0.append(('rf',  rf))
level0.append(('lr', lr))
level0.append(('gb', grbc))
level0.append(('xgb', xg))
#level0.append(('lr', modellr))

# define meta learner model
#level1 = LogisticRegression()

level1 = SVC(kernel = 'linear')

# define the stacking ensemble
stackmodel = StackingClassifier(estimators=level0, final_estimator=level1,
                                cv=5)

stackmodel.fit(trainm.values,train['Survived'].values.ravel())

ypredstack = stackmodel.predict(valm.values)

stackmodel.score(valm.values,val['Survived'].values.ravel())

0.8181818181818182

In [17]:
########################################################################################
# Averaging over different random splits of trainall into train and val.

# There are some fluctuations in the validation scores for different random
# splits of trainall therefore it is better to average over the score of different
# random splits of the trainall set into train and val sets.

rf = RandomForestClassifier(max_features = 'sqrt', n_estimators = 1000,
                               min_samples_split = 10, min_samples_leaf = 1,
                            max_depth = 20, random_state=50)

svc = SVC(kernel='linear')

grbc = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=10, 
                                   loss='deviance', random_state = 5)

xg = XGBClassifier(colsample_bytree =0.2, gamma = 0.047,reg_lambda=0.4,
                             learning_rate=0.05, max_depth=4, 
                              reg_alpha = 0.15,
                            n_estimators = 1000,subsample=0.525,
                             random_state =7, nthread = -1)

lr = LogisticRegression()




scores = []

model = svc # or any other model above like: rf, grbc, etc


for i in range(20):
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=i)

    for train_index, val_index in split.split(trainall, trainall["Survived"]):
        train = trainall.iloc[train_index]
        val = trainall.iloc[val_index]
    
    train = train.reset_index(drop = True)
    val = val.reset_index(drop = True)
    
    trainm = full_pipeline.fit_transform(train.copy())
    valm = full_pipeline.transform(val.copy())

    comcolval = list(set(trainm.columns)&set(valm.columns))
    trainm = trainm[comcolval]
    valm = valm[comcolval]
    
    model.fit(trainm.values,train['Survived'].values.ravel())
    scores.append(model.score(valm.values,val['Survived'].values.ravel()))

In [18]:
scores = np.array(scores)
scores.mean()

0.8461538461538461

In [19]:
########################################################################################
########################################################################################
## Testing

In [20]:

modelt = svc

modelt.fit(trainallm.values,trainall['Survived'].values.ravel())

ypredtest = modelt.predict(testm.values)

svc.score(testm.values,test['Survived'].values.ravel())

0.8547486033519553

In [21]:
########################################################################################
########################################################################################
### Predicitons for submission to Kaggle

In [22]:
train_totalm = full_pipeline.fit_transform(train_total.copy())
test_totalm = full_pipeline.transform(test_total.copy())


comcoltot = list(set(train_totalm.columns)&set(test_totalm.columns))

train_totalm = train_totalm[comcoltot]

test_totalm = test_totalm[comcoltot]

In [23]:
modelsvc = SVC(kernel = 'linear')

modelsvc.fit(train_totalm.values,train_total['Survived'].values.ravel())

ypredtotsvc = modelsvc.predict(test_totalm.values)


In [24]:
pred = pd.DataFrame()
pred['PassengerId'] = test_total['PassengerId']
pred['Survived'] = ypredtotsvc
pred.to_csv('submissionsvc.csv',index=False)

In [25]:
# Submitting the above file submissionsvc.csv to Kaggle gives an accuracy score of 79.425%
# which puts the submission in the top 10% on the public leaderboard.