In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# locations of the raw Titanic CSV files, relative to the notebook's working directory
TRAIN_CSV = "./titanic/train.csv"
TEST_CSV = "./titanic/test.csv"

# load both files untouched; later cells work on copies of these frames
trainData = pd.read_csv(TRAIN_CSV)
testData = pd.read_csv(TEST_CSV)

In [3]:
# Work on independent copies so the frames loaded from disk stay available
# for reference (DataFrame.copy() is a deep copy by default).
trainCopy, testCopy = trainData.copy(), testData.copy()

# both working copies, so later cells can apply the same step to each in a loop
datasets = [trainCopy, testCopy]

In [4]:
# count the missing values in every column of each working copy
missing_in_train = trainCopy.isnull().sum()
missing_in_test = testCopy.isnull().sum()

# a single print call; sep="\n" puts each piece on its own line,
# exactly as one print call per piece would
print(
    "Sums of incomplete TRAINING values:",
    missing_in_train,
    "------------------------------",
    "Sums of incomplete TEST values:",
    missing_in_test,
    sep="\n",
)

Sums of incomplete TRAINING values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
------------------------------
Sums of incomplete TEST values:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [5]:
# Name has no missing values, so derive a "Title" feature from it.
# The title is the text between the first ", " and the following ".".
for dataset in datasets:
    # everything after the first ", " (second column of the expanded split)
    after_comma = dataset['Name'].str.split(", ", expand=True)[1]
    # keep only the part before the first "."
    dataset['Title'] = after_comma.str.split(".", expand=True)[0]

In [6]:
# frequency of every extracted title, per frame
train_title_counts = trainCopy['Title'].value_counts()
test_title_counts = testCopy['Title'].value_counts()

# sep="\n" reproduces the layout of one print call per piece
print(
    "Titles in TRAIN:",
    train_title_counts,
    "-------------------",
    "Titles in TEST:",
    test_title_counts,
    sep="\n",
)

Titles in TRAIN:
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: Title, dtype: int64
-------------------
Titles in TEST:
Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Ms          1
Dr          1
Dona        1
Name: Title, dtype: int64


In [7]:
# Consolidate the Title feature and drop the columns that are not used as features.
#
# This cell is safe to re-run: Title is only re-derived while Name is still
# present, the replacements are idempotent, and already-dropped columns are ignored.

# titles with fewer than 10 instances are pooled into a single "Rare" category
RARE_TITLES = ['Lady', 'the Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
               'Sir', 'Jonkheer', 'Dona']

# foreign / alternative spellings mapped to their English equivalent
TITLE_SYNONYMS = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

# Because a number of decks housed first-, second-, and third-class passengers,
# we doubt the significance of the Cabin feature and will not attempt to complete it.
# Ticket and PassengerId also appear to be insignificant; Name is no longer
# needed once Title has been extracted.
UNUSED_COLUMNS = ['Name', 'Cabin', 'Ticket', 'PassengerId']

for dataset in datasets:
    # Name is dropped at the end of this loop, so it is missing on a second run;
    # in that case keep the Title column that already exists.
    if 'Name' in dataset.columns:
        dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

    dataset['Title'] = dataset['Title'].replace(RARE_TITLES, 'Rare').replace(TITLE_SYNONYMS)

    # Drop in place: trainCopy / testCopy and the entries of `datasets` must keep
    # referring to the same objects. errors='ignore' tolerates columns already removed.
    dataset.drop(columns=UNUSED_COLUMNS, errors='ignore', inplace=True)

In [8]:
# title frequencies after consolidation, one report per frame
title_reports = (
    ("Titles in TRAIN:", trainCopy),
    ("Titles in TEST:", testCopy),
)

for position, (heading, frame) in enumerate(title_reports):
    # separator line between consecutive reports, not before the first one
    if position > 0:
        print("-------------------")
    print(heading)
    print(frame['Title'].value_counts())

Titles in TRAIN:
Mr        517
Miss      185
Mrs       126
Master     40
Rare       23
Name: Title, dtype: int64
-------------------
Titles in TEST:
Mr        240
Miss       79
Mrs        72
Master     21
Rare        6
Name: Title, dtype: int64


In [9]:
# Now that we have the Title attribute, complete the remaining features:
#   Age      -> median age of the passengers sharing the same Title
#   Embarked -> most frequent port
#   Fare     -> median fare
# Each frame is completed from its own statistics.
for dataset in datasets:
    # transform('median') returns one value per row, aligned with the frame's own
    # index. groupby(...).apply(...) returns a group-keyed MultiIndex on
    # pandas >= 2.0, which cannot be assigned back to the column.
    median_age_for_title = dataset.groupby('Title')['Age'].transform('median')
    dataset['Age'] = dataset['Age'].fillna(median_age_for_title)

    # Assign the filled column back instead of dataset[col].fillna(..., inplace=True):
    # the chained in-place form is deprecated and does not modify the frame under
    # pandas Copy-on-Write.
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

In [10]:
# feature engineering
#
# FamilySize          = siblings/spouses (SibSp) + parents/children (Parch) + 1 for the passenger
# AgeClassInteraction = age multiplied by passenger class
for dataset in datasets:
    relatives_aboard = dataset['SibSp'] + dataset['Parch']
    dataset['FamilySize'] = relatives_aboard + 1
    dataset['AgeClassInteraction'] = dataset['Age'].mul(dataset['Pclass'])

In [11]:
# data discretization and categorical encoding
#
# Bin edges and label vocabularies are shared by the training and test frames,
# so a given code stands for the same interval / category in both. Fitting them
# separately per frame would let the same code mean different things.
label = LabelEncoder()

# Learn the bin edges from the training frame only:
#   Fare -> 4 quantile bins, Age (whole years) -> 5 equal-width bins
train_age_years = trainCopy['Age'].astype(int)
fare_edges = pd.qcut(trainCopy['Fare'], 4, retbins=True)[1]
age_edges = pd.cut(train_age_years, 5, retbins=True)[1]

for dataset in datasets:
    # Clip to the range seen in training so that out-of-range values fall into
    # the outermost bins instead of becoming NaN (a no-op for the training frame).
    fare = dataset['Fare'].clip(fare_edges[0], fare_edges[-1])
    age_years = dataset['Age'].astype(int).clip(train_age_years.min(), train_age_years.max())

    # include_lowest=True mirrors qcut, whose first bin contains the minimum
    dataset['FareBin'] = pd.cut(fare, bins=fare_edges, include_lowest=True)
    # Age bins using cut: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
    dataset['AgeBin'] = pd.cut(age_years, bins=age_edges)

# Fit each encoder on the categories of BOTH frames, then encode each frame with it.
for column in ('Sex', 'Embarked', 'Title'):
    label.fit(pd.concat([frame[column] for frame in datasets]))
    for dataset in datasets:
        dataset[column + '_Code'] = label.transform(dataset[column])

# The bins are ordered categoricals; their category position is the ordinal code.
for dataset in datasets:
    dataset['AgeBin_Code'] = dataset['AgeBin'].cat.codes.astype('int64')
    dataset['FareBin_Code'] = dataset['FareBin'].cat.codes.astype('int64')

In [12]:
# show the column labels of the training frame as it stands at this point in the notebook
trainCopy.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked', 'Title', 'FamilySize', 'AgeClassInteraction', 'FareBin',
       'AgeBin', 'Sex_Code', 'Embarked_Code', 'Title_Code', 'AgeBin_Code',
       'FareBin_Code'],
      dtype='object')

In [13]:
# names of the columns fed to the model
x = [
    'Sex_Code',
    'Pclass',
    'Embarked_Code',
    'Title_Code',
    'SibSp',
    'Parch',
    'FamilySize',
    'Age',
    'FareBin_Code',
    'AgeClassInteraction',
]

# NOTE(review): the features are passed unscaled; k-nearest-neighbours is
# distance-based, so wide-ranged columns such as Age and AgeClassInteraction
# may dominate the distance -- consider standardising them.
train_features = trainCopy[x]
train_target = trainCopy['Survived']

# 10-neighbour classifier with brute-force neighbour search, fitted on the full training frame
kNeighbors = KNeighborsClassifier(n_neighbors=10, algorithm='brute').fit(train_features, train_target)

In [14]:
# 10 random shuffle splits with a fixed seed; each split trains on 60% of the
# rows and tests on 30%, so 10% of the rows are used by neither part.
cv_split = model_selection.ShuffleSplit(
    n_splits=10,
    test_size=.3,
    train_size=.6,
    random_state=0,
)

# no scoring argument is given, so the estimator's default scorer is used
cv_results = model_selection.cross_validate(
    kNeighbors,
    trainCopy[x],
    trainCopy['Survived'],
    cv=cv_split,
)

# NOTE(review): this displays the BEST of the 10 split scores, which is an
# optimistic figure; the mean (and spread) over the splits is the usual summary.
split_scores = cv_results['test_score']
split_scores.max()

0.7611940298507462