# Titanic Competition

## Imports and config

In [26]:
# essentials
import numpy as np
import scipy
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import random

# scipy
from scipy.stats import kstest, boxcox, skew, norm, boxcox_normmax, yeojohnson
from scipy.special import boxcox1p

# models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV, LassoCV, ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# misc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

pd.set_option('display.max_columns', None)

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore")
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000

## Data loading

In [27]:
def read_data():
    train = pd.read_csv("/kaggle/input/titanic/train.csv")
    test = pd.read_csv("/kaggle/input/titanic/test.csv")
    return train, test

train, test = read_data()

In [28]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [29]:
train.shape

(891, 12)

In [30]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [31]:
test.shape

(418, 11)

No columns present in one data set and not in the other which is convinient.

## EDA

In [32]:
def drop_ID(X):
    """
    Drop the ID column since it is unique and so useless for ML
    
    Args:
        X (pandas dataframe) : dataframe whose ID col we want to trop
    
    Returns:
        X (pandas dataframe) : dataframe with ID's dropped
    """
    X.drop(['PassengerId'], axis = 1, inplace = True)
    return X

train = drop_ID(train)
test = drop_ID(test)
train.shape, test.shape

((891, 11), (418, 10))

In [33]:
def drop_target(train, target_name):
    """
    Drop the target from the train data
    
    Args:
        train (pandas dataframe) : train data
        target_name (string) : name of target
        
    Returns:
        train (pandas dataframe) : train data with target dropped
        y_train (pandas series) : target
    """
    y_train = train[target_name].reset_index(drop=True)
    train = train.drop([target_name], axis=1) # drop the target
    return y_train, train

target_name = "Survived"
target, train = drop_target(train, target_name)
target.shape, train.shape

((891,), (891, 10))

In [34]:
def combine_train_and_test(train, test):
    """
    Prepare datasets for cleaning by dropping target and combining into one dataset
    
    Args:
        train (pandas dataframe) : train dataset
        test (pandas dataframe) : test dataset
    
    Returns:
        X (pandas dataframe) : combined data
    """
    X = pd.concat([train, test]).reset_index(drop=True) # combine train and test
    
    return X

X = combine_train_and_test(train, test)
X.shape

(1309, 10)

In [35]:
def percent_missing(data):
    """
    Characterise missing data in a dataframe
    
    Args:
        df (pandas dataframe) : dataframe we want to inspect
    
    Returns:
        miss_df()
    """
    miss_df = ((data == 0).sum()).to_frame()
    miss_df = miss_df.rename(columns = {0: 'zeros'})
    miss_df.index.name = 'Feature'
    miss_df['np.nan'] = (data.isnull()).sum()
    miss_df['None'] = (data == "None").sum()
    miss_df['total'] = miss_df['zeros'] + miss_df['np.nan'] + miss_df['None']
    miss_df['Percent'] = 100*miss_df['total']/len(data)
    miss_df['Type'] = [data[i].dtype for i in miss_df.index ]
    return miss_df.sort_values(ascending = False, by = 'Percent')

#miss_all = percent_missing(X)
#miss_all

In [36]:
def handle_missing(X):    
    # drop useless columns
    X.drop(["Ticket", "Name", "Cabin"], axis = 1, inplace = True)

    # fill with mode
    X["Embarked"].fillna(X["Embarked"].mode(), inplace = True)
    X["Fare"].fillna(X["Fare"].mode(), inplace = True)

    # encode categorical
    X['Sex'].replace({'male':0, 'female':1}, inplace=True)
    X['Embarked'].replace({'S':0, 'C':1, 'Q':2}, inplace=True)
    
    return X

train = handle_missing(train)

In [37]:
miss_all = percent_missing(train)
miss_all

Unnamed: 0_level_0,zeros,np.nan,None,total,Percent,Type
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Parch,678,0,0,678,76.094276,int64
Embarked,644,2,0,646,72.502806,float64
SibSp,608,0,0,608,68.237935,int64
Sex,577,0,0,577,64.758698,int64
Age,0,177,0,177,19.86532,float64
Fare,15,0,0,15,1.683502,float64
Pclass,0,0,0,0,0.0,int64
