In [8]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.5-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5


In [10]:
#My solution to the Kaggle "Titanic: Machine Learning from Disaster" challenge.
#We have to predict survival for passengers on board the RMS Titanic using various features describing the passengers.

#The stuff that we are going to need. 
import matplotlib.pyplot as plt
import pandas as pd    #Dataframe library to manipulate data
import numpy as np      
import re  #We'll be using regular expressions to extract the titles from people's names. Like Mr, Mrs, Count etc
#from sklearn.model_selection.cross_validate import KFold   #for k-fold cross-validation
#from sklearn.model_selection.cross_validate import cross_validate as cv      #cross-validation
from sklearn.linear_model import LogisticRegression  
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
import xgboost 
from sklearn.model_selection import GridSearchCV   #Support for Hyper-parameter Tuning


#Kaggle provides us with two files: train.csv for training our classifier and test.csv
#for generating submissions. We read the data into two separate pandas dataframes.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

#Keep an independent copy of the untouched test data for later use.
#(.copy() instead of a [:] slice, so the copy cannot share data with `test`.)
test_orig = test.copy()

#In order to avoid duplicating code by applying the same operations to both dataframes
#we combine the train and test dataframes into one. We split them again, by position, at training time.
seperator = train.shape[0] #number of training rows == position at which the test rows start
frames = [train, test]
#ignore_index gives the combined frame a unique 0..n-1 index; without it the row labels of
#train and test overlap, which makes label-based selection ambiguous.
titanic = pd.concat(frames, ignore_index=True)

#cabin has too many missing values and was judged not important for survival, so we drop it.
titanic = titanic.drop(columns=["cabin"])

#Sensible imputation of missing fare values: use the median fare of a group of similar passengers.
#NOTE(review): the filter (3rd class, embarked at S, age >= 55) presumably describes the
#passenger(s) whose fare is missing - confirm against the data.
#Only the fare column is selected before taking the median (which skips NaN on its own).
#A frame-wide dropna() would also discard every test row, because "survived" is NaN there.
similar_passengers = (titanic["pclass"] == 3) & (titanic["embarked"] == "S") & (titanic["age"] >= 55)
median_fare = titanic.loc[similar_passengers, "fare"].median()
if pd.isna(median_fare):
    #No comparable passenger with a known fare: fall back to the overall median
    #so that no NaN fare survives into the model input.
    median_fare = titanic["fare"].median()
titanic["fare"] = titanic["fare"].fillna(median_fare)

def get_title(name):
    """
    Extract the title (Mr, Mrs, Countess, ...) from a passenger name.

    A title is taken to be the first run of ASCII letters that is preceded
    by a space and immediately followed by a period.

    Takes a name string as input and returns the title string as output;
    returns an empty string when the name contains no such pattern.
    """
    # Raw string literal: in a normal string "\." is an invalid escape sequence
    # that Python only tolerates with a warning.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""


#Derive each passenger's title from the "name" column of the combined frame.
titanic["title"] = titanic["name"].map(get_title)

#Condense the title into smaller, and more meaningful categories.
#Lookup table that condenses the raw titles into a few broader categories.
title_Dictionary = {
    "Capt":     "Officer",
    "Col":      "Officer",
    "Major":    "Officer",
    "Jonkheer": "Royal",
    "Don":      "Royal",
    "Sir":      "Royal",
    "Dr":       "Officer",
    "Rev":      "Officer",
    "Countess": "Royal",
    "Dona":     "Royal",
    "Mme":      "Mrs",
    "Mlle":     "Miss",
    "Ms":       "Mrs",
    "Mr":       "Mr",
    "Mrs":      "Mrs",
    "Miss":     "Miss",
    "Master":   "Master",
    "Lady":     "Royal",
}

def titlemap(x):
    """
    Map a raw title to its condensed category using title_Dictionary.

    Takes a title string as input and returns the category string.
    A title that is not listed in title_Dictionary (including the empty
    string produced for names without a title) is returned unchanged
    instead of raising KeyError.
    """
    return title_Dictionary.get(x, x)


#Replace every raw title by its condensed category.
titanic["title"] = titanic["title"].map(titlemap)

#Fill in the missing age values by categorising the passengers and imputing each missing age
#with the median age of the passenger's category.
#We could replace every missing age by the overall median, but that would rob our dataset of precious variance,
#which is important in training our classifier to perform better.

#Age used for imputation, keyed by (gender, pclass, condensed title).
_AGE_BY_GROUP = {
    ('female', 1, 'Miss'):    30,
    ('female', 1, 'Mrs'):     45,
    ('female', 1, 'Officer'): 49,
    ('female', 1, 'Royal'):   39,
    ('female', 2, 'Miss'):    20,
    ('female', 2, 'Mrs'):     30,
    ('female', 3, 'Miss'):    18,
    ('female', 3, 'Mrs'):     31,
    ('male', 1, 'Master'):    6,
    ('male', 1, 'Mr'):        41.5,
    ('male', 1, 'Officer'):   52,
    ('male', 1, 'Royal'):     40,
    ('male', 2, 'Master'):    2,
    ('male', 2, 'Mr'):        30,
    ('male', 2, 'Officer'):   41.5,
    ('male', 3, 'Master'):    6,
    ('male', 3, 'Mr'):        26,
}


def fillages(row):
    """
    Return the age to store for one passenger row.

    row must support lookup of 'age', 'gender', 'pclass' and 'title'.
    A known age is returned unchanged. A missing (NaN) age is replaced by
    the value listed in _AGE_BY_GROUP for the passenger's
    (gender, pclass, title) group.

    Returns NaN when the group is not listed, so the result is always
    numeric rather than None.
    """
    if not(np.isnan(row['age'])):
        return row['age']

    title = row['title']
    # The condensed titles call the nobility category 'Royal'; 'Royalty' is
    # the spelling this function used to test for and is still accepted.
    if title == 'Royalty':
        title = 'Royal'

    return _AGE_BY_GROUP.get((row['gender'], row['pclass'], title), np.nan)
        
#Run fillages once per passenger row and store the result as the new age column.
titanic["age"] = titanic.apply(fillages, axis="columns")

#A rare title is taken to indicate a higher chance of survival.
#We denote rare titles by 1 and the common ones by 0.
def isRare(title):
    """Return 0 for the common titles Mr, Mrs, Master and Miss, and 1 for anything else."""
    common_titles = ("Mr", "Mrs", "Master", "Miss")
    return 0 if title in common_titles else 1

#Turn the title into a binary flag: 1 for a rare title, 0 for a common one.
titanic["title"] = titanic["title"].map(isRare)

#Family = siblings/spouses plus parents/children on board, as a single variable.
titanic["Family"] = titanic["sibsp"] + titanic["parch"]

#Being a child improves your chances of survival.
#Child is 1 for age <= 18 and 0 otherwise (a missing age compares False and stays 0).
titanic["Child"] = (titanic["age"] <= 18).astype("int64")

#gender is non-numeric data which can't be handled by our classifier:
#encode male as 0 and female as 1, then make the column an integer column.
for gender_label, gender_code in (("male", 0), ("female", 1)):
    titanic.loc[titanic["gender"] == gender_label, "gender"] = gender_code
titanic["gender"] = titanic["gender"].astype(int)

#embarked is non-numeric data too. We represent it with two indicator columns,
#"Q" and "S"; a row with neither flag set embarked somewhere else (or nowhere known).
for port in ("Q", "S"):
    titanic[port] = (titanic["embarked"] == port).astype("int64")

#Earlier experiment that also used "age" (score noted next to it: 0.7655):
# predictors = ["age", "Q", "S", "fare", "pclass", "gender", "Family", "title", "Child"]

#The predictors that we are going to use
predictors = ["Q", "S", "fare", "pclass", "gender", "Family", "title", "Child"]

#Break the combined data set back into train and test data by row position:
#the first `seperator` rows are the training rows, the remainder the test rows.
target = titanic["survived"].iloc[:seperator]
train = titanic[predictors].iloc[:seperator]
test = titanic[predictors].iloc[seperator:]


#Build an ensemble of classifiers. Hyper-parameters chosen through cross validation.
#Both models get a fixed seed: SVC(probability=True) calibrates its probabilities with an
#internal, randomly shuffled cross-validation, so without a seed the submission can change
#from one run to the next.
SEED = 42
XGB_WEIGHT = 0.75   #weight of the XGBoost probabilities in the ensemble
SVM_WEIGHT = 0.25   #weight of the SVM probabilities in the ensemble
THRESHOLD = 0.5     #blended probability at or above which we predict "survived"

xgb = xgboost.XGBClassifier(learning_rate=0.05, n_estimators=500, random_state=SEED)
svmc = svm.SVC(C=5, probability=True, random_state=SEED)

#fit the data
xgb.fit(train, target)
svmc.fit(train, target)

#predict_proba returns one column per class; column 1 belongs to the second class,
#which is "survived" for the 0/1 target used here.
xgb_preds = xgb.predict_proba(test)[:, 1]
svmc_preds = svmc.predict_proba(test)[:, 1]

#Assign different weightages to the classifiers
ensemble_preds = xgb_preds*XGB_WEIGHT + svmc_preds*SVM_WEIGHT

#Turn the blended probabilities into hard 0/1 predictions (vectorised, no Python loop).
#ensemble_preds keeps its float dtype and ends up holding 0.0 / 1.0.
ensemble_preds = (ensemble_preds >= THRESHOLD).astype(ensemble_preds.dtype)
results = ensemble_preds.astype(int)

#Generate the final submission file.
submission = pd.DataFrame({"passengerid": test_orig["passengerid"], "survived": results})
submission.to_csv("data/kaggle1.csv", index=False)


