# Titanic Competition

## Imports and config

In [1]:
# essentials
import numpy as np
import scipy
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import random

# scipy
from scipy import stats
from scipy.stats import kstest, boxcox, skew, norm, boxcox_normmax, yeojohnson
from scipy.special import boxcox1p

# models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

# misc
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore")
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000

## Data loading

In [2]:
def read_data():
    train = pd.read_csv("/kaggle/input/titanic/train.csv")
    test = pd.read_csv("/kaggle/input/titanic/test.csv")
    return train, test

train, test = read_data()

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train.shape

(891, 12)

In [5]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
test.shape

(418, 11)

No columns present in one data set and not in the other which is convinient.

## EDA

In [7]:
def drop_ID(X):
    """
    Drop the ID column since it is unique and so useless for ML
    
    Args:
        X (pandas dataframe) : dataframe whose ID col we want to trop
    
    Returns:
        X (pandas dataframe) : dataframe with ID's dropped
    """
    X.drop(['PassengerId'], axis = 1, inplace = True)
    return X

train = drop_ID(train)
test = drop_ID(test)
train.shape, test.shape

((891, 11), (418, 10))

## Prepare for cleaning

In [8]:
def drop_target(train, target_name):
    """
    Drop the target from the train data
    
    Args:
        train (pandas dataframe) : train data
        target_name (string) : name of target
        
    Returns:
        train (pandas dataframe) : train data with target dropped
        y_train (pandas series) : target
    """
    y_train = train[target_name].reset_index(drop=True)
    train = train.drop([target_name], axis=1) # drop the target
    return y_train, train

target_name = "Survived"
target, train = drop_target(train, target_name)
target.shape, train.shape

((891,), (891, 10))

In [9]:
def combine_train_and_test(train, test):
    """
    Prepare datasets for cleaning by dropping target and combining into one dataset
    
    Args:
        train (pandas dataframe) : train dataset
        test (pandas dataframe) : test dataset
    
    Returns:
        X (pandas dataframe) : combined data
    """
    X = pd.concat([train, test]).reset_index(drop=True) # combine train and test
    
    return X

X = combine_train_and_test(train, test)
X.shape

(1309, 10)

In [10]:
def split_train_and_test(X, target):
    """
    Extract the train and test sets from the combined data
    
    Args:
        X (pandas dataframe) : dataframe to split up
        target (pandas series) : target data
        
    Return:
        X_test (pandas dataframe) : test data
        X_train (pandas dataframe) : train data
    """
    X_train = X.iloc[:len(target), :]
    X_test = X.iloc[len(target):, :]
    return X_train, X_test

## Missing values

In [11]:
def percent_missing(data):
    """
    Characterise missing data in a dataframe
    
    Args:
        df (pandas dataframe) : dataframe we want to inspect
    
    Returns:
        miss_df()
    """
    miss_df = ((data == 0).sum()).to_frame()
    miss_df = miss_df.rename(columns = {0: 'zeros'})
    miss_df.index.name = 'Feature'
    miss_df['np.nan'] = (data.isnull()).sum()
    miss_df['None'] = (data == "None").sum()
    miss_df['total'] = miss_df['zeros'] + miss_df['np.nan'] + miss_df['None']
    miss_df['Percent'] = 100*miss_df['total']/len(data)
    miss_df['Type'] = [data[i].dtype for i in miss_df.index ]
    return miss_df.sort_values(ascending = False, by = 'Percent')

miss_all = percent_missing(X)
miss_all

Unnamed: 0_level_0,zeros,np.nan,None,total,Percent,Type
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cabin,0,1014,0,1014,77.463713,object
Parch,1002,0,0,1002,76.546982,int64
SibSp,891,0,0,891,68.067227,int64
Age,0,263,0,263,20.091673,float64
Fare,17,1,0,18,1.375095,float64
Embarked,0,2,0,2,0.152788,object
Pclass,0,0,0,0,0.0,int64
Name,0,0,0,0,0.0,object
Sex,0,0,0,0,0.0,object
Ticket,0,0,0,0,0.0,object


In [12]:
def handle_missing(X):    
    # drop useless columns
    X.drop(["Ticket", "Name", "Cabin"], axis = 1, inplace = True)

    # fill median after group transform
    X['Age'] = X.groupby(['Pclass','Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
    X['Fare'] = X.groupby(['Pclass','Sex'])['Fare'].transform(lambda x: x.fillna(x.median()))
    X["Embarked"] = X.groupby(["Pclass", "Sex"])["Embarked"].transform(lambda x: x.fillna(x.mode()[0]))

    # encode categorical
    X['Sex'].replace({'male':0, 'female':1}, inplace=True)
    X['Embarked'].replace({'S':0, 'C':1, 'Q':2}, inplace=True)
    
    return X

X = handle_missing(X)

In [13]:
miss_all = percent_missing(X)
miss_all

Unnamed: 0_level_0,zeros,np.nan,None,total,Percent,Type
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Parch,1002,0,0,1002,76.546982,int64
Embarked,914,0,0,914,69.824293,int64
SibSp,891,0,0,891,68.067227,int64
Sex,843,0,0,843,64.400306,int64
Fare,17,0,0,17,1.298701,float64
Pclass,0,0,0,0,0.0,int64
Age,0,0,0,0,0.0,float64


In [14]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.0,1,0,7.25,0
1,1,1,38.0,1,0,71.2833,1
2,3,1,26.0,0,0,7.925,0
3,1,1,35.0,1,0,53.1,0
4,3,0,35.0,0,0,8.05,0


## Modelling

In [15]:
# decouple the train and test data
train, test = split_train_and_test(X, target)
train.shape, test.shape, target.shape

((891, 7), (418, 7), (891,))

In [16]:
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=42)

In [17]:
def score(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    prediction = model.predict(X_val)
    return accuracy_score(y_val, prediction)

In [18]:
#Lets some models
model1 = LogisticRegression(solver='liblinear', random_state=42)
model2 = GradientBoostingClassifier()
model3 = RandomForestClassifier()
model4 = SGDClassifier()
model5 = SVC()

models = [model1, model2, model3, model4, model5]
for i, model in enumerate(models):
    print("Model ", i,":", model)
    print("ACC: ", score(model, X_train, y_train, X_val, y_val))

Model  0 : LogisticRegression(random_state=42, solver='liblinear')
ACC:  0.8044692737430168
Model  1 : GradientBoostingClassifier()
ACC:  0.8156424581005587
Model  2 : RandomForestClassifier()
ACC:  0.8156424581005587
Model  3 : SGDClassifier()
ACC:  0.7653631284916201
Model  4 : SVC()
ACC:  0.6536312849162011


In [19]:
model = GradientBoostingClassifier(min_samples_split=20, min_samples_leaf=60, max_depth=3, max_features=7)
score(model, X_train, y_train, X_val, y_val)

0.8100558659217877

In [20]:
predictions = model2.predict(test)
test = pd.read_csv('/kaggle/input/titanic/test.csv')
output = pd.DataFrame({'PassengerId': test["PassengerId"], 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [21]:
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
