# Imports 

In [57]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.model_selection import train_test_split

# Load Data

In [43]:
# Read the two Titanic CSV files in one pass: the first path becomes
# `data_raw` (training data), the second becomes `data_val` (hold-out data).
data_raw, data_val = (
    pd.read_csv(csv_path)
    for csv_path in (
        './../../dataset/titanic/train.csv',
        './../../dataset/titanic/test.csv',
    )
)

In [44]:
# Work on a deep copy so that `data_raw` keeps the values exactly as loaded.
data1 = data_raw.copy(deep=True)
# Collect both frames in one list so each cleaning step can loop over them.
# The list stores references, so edits made through it are visible via
# `data1` and `data_val` as well.
data_cleaner = [data1, data_val]
# DataFrame.info() writes its report itself and returns None, so call it
# directly; wrapping it in print() adds a stray "None" line to the output.
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


In [45]:
# Show five random rows of the raw training data. The fixed seed makes the
# displayed rows the same on every run of the notebook.
data_raw.sample(5, random_state=0)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
326,327,0,3,"Nysveen, Mr. Johan Hansen",male,61.0,0,0,345364,6.2375,,S
197,198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42.0,0,1,4579,8.4042,,S
696,697,0,3,"Kelly, Mr. James",male,44.0,0,0,363592,8.05,,S
40,41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40.0,1,0,7546,9.475,,S
470,471,0,3,"Keefe, Mr. Arthur",male,,0,0,323592,7.25,,S


In [46]:
# Show five random rows of the working copy. The fixed seed makes the
# displayed rows the same on every run of the notebook.
data1.sample(5, random_state=0)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
119,120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2.0,4,2,347082,31.275,,S
663,664,0,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.4958,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C
587,588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60.0,1,1,13567,79.2,B41,C
176,177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S


# The 4 C's: Correcting, Completing, Creating and Converting

- Correcting: delete or fix unreasonable values, e.g. age = 800
- Completing: impute or delete null values
- Creating: use existing features to create new features
- Converting: turn object datatypes into categorical dummy variables

In [47]:
# Report the number of missing values per column, first for the training
# frame and then for the hold-out frame, each followed by a dashed rule.
rule = '-' * 20
for heading, frame in (
    ('Train data with null values:\n', data1),
    ('Test data with null values:\n', data_val),
):
    print(heading, frame.isna().sum())
    print(rule)

Train data with null values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
--------------------
Test data with null values:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
--------------------


## Completing Data

In [48]:
# mode() hands back a Series rather than a scalar; print both the value
# and its type to make that visible.
embarked_mode = data_raw['Embarked'].mode()
print(embarked_mode)
print(type(embarked_mode))

0    S
dtype: object
<class 'pandas.core.series.Series'>


In [49]:
# Impute missing values in every frame of `data_cleaner`. Each frame is
# filled with statistics computed on that same frame.
for dataset in data_cleaner:
    # Assign the filled column back to the frame. Calling
    # `dataset[col].fillna(..., inplace=True)` mutates an intermediate Series:
    # recent pandas versions warn about it, and with Copy-on-Write enabled the
    # frame is left unchanged.

    # Complete Age with the median
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

    # Complete Embarked with the mode (mode() returns a Series, hence [0])
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

    # Complete Fare with the median
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

# Delete columns that are not used from the training frame only.
# inplace=True is required here: `data_cleaner` holds a reference to this
# exact DataFrame object, and rebinding `data1` would leave the list pointing
# at the old frame. errors='ignore' keeps the cell re-runnable once the
# columns are already gone.
data1.drop(['PassengerId', 'Ticket', 'Cabin'], axis=1, inplace=True, errors='ignore')

print('Train data with null values:\n', data1.isnull().sum())
print('-'*20)
print('Test data with null values:\n', data_val.isnull().sum())

Train data with null values:
 Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
--------------------
Test data with null values:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


## Creating: Feature Engineering

In [50]:
# Minimum number of occurrences in the training frame for a title to keep its
# own category; rarer titles are merged into 'Misc'.
stat_min = 10

for dataset in data_cleaner:
    # Create discrete variable FamilySize = #Siblings/Spouses + #Parents/Children + 1 (the passenger)
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # Create IsAlone variable: 1 (alone) / 0 (FamilySize > 1).
    # A single .loc[rows, column] assignment writes to the frame itself; the
    # chained form `dataset['IsAlone'].loc[...] = 0` may write to a temporary
    # object and triggers SettingWithCopyWarning.
    dataset['IsAlone'] = 1
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

    # Create Title variable: split by ', ' and then by '.', e.g.:
    # Sawyer, Mr. Frederick Charles -> ', ' => [0]: Sawyer [1]: Mr. Frederick Charles
    # Mr. Frederick Charles -> '.' => [0]: Mr [1]: Frederick Charles
    dataset['Title'] = dataset['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

    # Create the FareBin feature (distributing the values in 4 quartiles)
    # NOTE(review): bin edges are computed per frame, so the intervals of the
    # training and hold-out frames are not guaranteed to coincide.
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)

    # Create the AgeBin feature (5 equal-width bins over the integer ages)
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)

# Group rare titles. The frequency table is built once, from the training
# frame, and the resulting set of common titles is applied to every frame so
# that all frames end up with the same Title categories. Titles that are rare
# in (or absent from) the training frame become 'Misc'.
title_names = (data1['Title'].value_counts() < stat_min)  # True => rare title
common_titles = title_names.index[~title_names]
for dataset in data_cleaner:
    dataset['Title'] = dataset['Title'].where(dataset['Title'].isin(common_titles), 'Misc')

print(data1['Title'].value_counts())
print('_'*30)
# info() prints its own report and returns None, so it is not wrapped in print()
data1.info()
print('_'*30)
data_val.info()

Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64
______________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Name          891 non-null object
Sex           891 non-null object
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Embarked      891 non-null object
FamilySize    891 non-null int64
IsAlone       891 non-null int64
Title         891 non-null object
FareBin       891 non-null category
AgeBin        891 non-null category
dtypes: category(2), float64(2), int64(6), object(4)
memory usage: 85.5+ KB
None
______________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 16 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name   

In [51]:
# Show ten random rows including the engineered columns. The fixed seed makes
# the displayed rows the same on every run of the notebook.
data1.sample(10, random_state=0)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,FareBin,AgeBin
578,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,28.0,1,0,14.4583,C,2,0,Mrs,"(14.454, 31.0]","(16.0, 32.0]"
549,1,2,"Davies, Master. John Morgan Jr",male,8.0,1,1,36.75,S,3,0,Master,"(31.0, 512.329]","(-0.08, 16.0]"
350,0,3,"Odahl, Mr. Nils Martin",male,23.0,0,0,9.225,S,1,1,Mr,"(7.91, 14.454]","(16.0, 32.0]"
749,0,3,"Connaghton, Mr. Michael",male,31.0,0,0,7.75,Q,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
399,1,2,"Trout, Mrs. William H (Jessie L)",female,28.0,0,0,12.65,S,1,1,Mrs,"(7.91, 14.454]","(16.0, 32.0]"
217,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42.0,1,0,27.0,S,2,0,Mr,"(14.454, 31.0]","(32.0, 48.0]"
175,0,3,"Klasen, Mr. Klas Albin",male,18.0,1,1,7.8542,S,3,0,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
327,1,2,"Ball, Mrs. (Ada E Hall)",female,36.0,0,0,13.0,S,1,1,Mrs,"(7.91, 14.454]","(32.0, 48.0]"
731,0,3,"Hassan, Mr. Houssein G N",male,11.0,0,0,18.7875,C,1,1,Mr,"(14.454, 31.0]","(-0.08, 16.0]"
196,0,3,"Mernagh, Mr. Robert",male,28.0,0,0,7.75,Q,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"


## Converting: Convert objects to category using Encoders

In [52]:
# Label-encode the string-valued features. Each column gets a fresh encoder
# that is fitted on the labels of all frames in `data_cleaner` together, so a
# given label is mapped to the same integer in every frame. Re-fitting one
# encoder frame by frame would number the labels independently per frame.
for column in ['Sex', 'Embarked', 'Title']:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([dataset[column] for dataset in data_cleaner], ignore_index=True))
    for dataset in data_cleaner:
        dataset[column + '_Code'] = encoder.transform(dataset[column])

# The bin columns hold intervals that were cut separately for each frame, so
# they are encoded within their own frame.
for dataset in data_cleaner:
    for column in ['AgeBin', 'FareBin']:
        dataset[column + '_Code'] = LabelEncoder().fit_transform(dataset[column])

# Target variable
target = ['Survived']

# Feature selection: original (un-encoded) feature names
data1_x = ['Sex','Pclass', 'Embarked', 'Title','SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone']

# Label-encoded counterparts used for algorithm calculation
data1_x_calc = ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code','SibSp', 'Parch', 'Age', 'Fare']

# Original X Y data
data1_xy = target + data1_x
print('Original XY: ', data1_xy)

Original XY:  ['Survived', 'Sex', 'Pclass', 'Embarked', 'Title', 'SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone']


In [53]:
# Feature names for the binned variant: the continuous Age and Fare columns
# are replaced by their bin codes.
data1_x_bin = [
    'Sex_Code',
    'Pclass',
    'Embarked_Code',
    'Title_Code',
    'FamilySize',
    'AgeBin_Code',
    'FareBin_Code',
]
# Target column first, then the features
data1_xy_bin = [*target, *data1_x_bin]
print('Bin XY: ', data1_xy_bin)

Bin XY:  ['Survived', 'Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'AgeBin_Code', 'FareBin_Code']


In [54]:
# One-hot encode the original feature columns of the training frame
selected_features = data1[data1_x]
data1_dummy = pd.get_dummies(selected_features)

# Names of the resulting columns, and the same list with the target in front
data1_x_dummy = list(data1_dummy.columns)
data1_xy_dummy = [*target, *data1_x_dummy]
print('Dummy XY: ', data1_xy_dummy)

Dummy XY:  ['Survived', 'Pclass', 'SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Misc', 'Title_Miss', 'Title_Mr', 'Title_Mrs']


In [56]:
# Display the first five rows of the one-hot encoded frame
data1_dummy.head(5)

Unnamed: 0,Pclass,SibSp,Parch,Age,Fare,FamilySize,IsAlone,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
0,3,1,0,22.0,7.25,2,0,0,1,0,0,1,0,0,0,1,0
1,1,1,0,38.0,71.2833,2,0,1,0,1,0,0,0,0,0,0,1
2,3,0,0,26.0,7.925,1,1,1,0,0,0,1,0,0,1,0,0
3,1,1,0,35.0,53.1,2,0,1,0,0,0,1,0,0,0,0,1
4,3,0,0,35.0,8.05,1,1,0,1,0,0,1,0,0,0,1,0


# Train/Test Split

In [59]:
# Split each of the three feature representations 75% / 25%.
# Every split uses the same options and the same target column.
split_kwargs = dict(test_size=0.25, random_state=0)
survived_labels = data1[target]

# Label-encoded features
train1_X, test1_X, train1_y, test1_y = train_test_split(
    data1[data1_x_calc], survived_labels, **split_kwargs
)

# Binned features
train1_X_bin, test1_X_bin, train1_y_bin, test1_y_bin = train_test_split(
    data1[data1_x_bin], survived_labels, **split_kwargs
)

# One-hot encoded features
train1_X_dummy, test1_X_dummy, train1_y_dummy, test1_y_dummy = train_test_split(
    data1_dummy[data1_x_dummy], survived_labels, **split_kwargs
)