In [12]:
import numpy as np 
import pandas as pd 
import scipy as sp
import sklearn 

import random
import time

# Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier # conda install -c conda-forge xgboost

#Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

# All three CSV files are expected in the same folder as this notebook.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission_example = pd.read_csv('gender_submission.csv')

# Working copy of the training data (DataFrame.copy is deep by default).
data1 = train.copy()

# Frames that the cleaning / feature-engineering loops iterate over.
# NOTE(review): the list holds the working copy AND the original `train` frame, so those loops
# mutate `train` too, while `test` is never cleaned -- confirm this is intended.
data_cleaner = [data1, train]

Now that we're familiar with the data we need to first clean it.

The 4 C's:
Correcting - remove or fix broken data, e.g. if an age were 800 somewhere. It doesn't look like this dataset has any.

Completing - filling null values. Many algorithms can't handle missing values, so we need to fill them in. We need to impute missing values, especially for Age. We might need to revisit this process if we find that filling with the mean (or similar) isn't working well. What I'm reading suggests we should use the median for Age, drop the 'Cabin' column, and use the mode to impute 'Embarked'.

Creating - feature engineering: building new features from the existing columns.

Converting - changing over dates or data types that don't work well. 

In [19]:
# train.info()
# train.describe()
# train.sample(10)

# going to work on having prettier print functions in this notebook.

# print('Train columns with null values:\n', data1.isnull().sum())
# print("-"*10)

# print('Test columns with nulls:\n', test.isnull().sum())
# print("-"*10)

# It looks like the ratio of missing Age and Cabin values is about the same across the train and test sets,
# which suggests the split between the two was random.
# We need to fix these two columns, along with Embarked, if we hope to model correctly.
# In the future I would make several different versions of these dataframes,
# testing different imputation methods to see which one works best.

In [16]:
# Filling the data

for dataset in data_cleaner: # do em both at once
    # Assign the filled column back rather than calling fillna(inplace=True) on dataset[col]:
    # the chained in-place form raises a FutureWarning on pandas 2.x and has no effect on the
    # frame once Copy-on-Write is active.

    # Fill missing age with this dataset's median age
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

    # Fill missing embarkation port with this dataset's most frequent port
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

    # Fill missing fare with this dataset's median fare
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

# PassengerId and Ticket are dropped because they are just identifiers;
# Cabin is dropped because it has too many nulls.
drop_column = ['PassengerId', 'Cabin', 'Ticket'] # make a list it's easier

# The drop must stay in place: data_cleaner holds a reference to this same object, so rebinding
# data1 to a new frame would leave the later loops working on the old one.
# errors='ignore' lets the cell be re-run after the columns are already gone.
data1.drop(columns=drop_column, inplace=True, errors='ignore')



# Time to create some features for every dataset in data_cleaner.

for dataset in data_cleaner:
    # Family size per person: siblings/spouses + parents/children + the passenger themselves.
    # Rationale: families may have worked together to survive and been prioritized in lifeboats.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # Binary flag: 1 if the passenger is alone, 0 otherwise.
    # The write goes through dataset.loc[rows, col]; the chained form
    # dataset['IsAlone'].loc[rows] = 0 assigns into a temporary Series, which triggers
    # SettingWithCopyWarning and does not update the frame under Copy-on-Write.
    dataset['IsAlone'] = 1
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

    # Names embed a title ("Mr", "Miss", "Master", ...): take the text after ", " and before the first ".".
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

    # Bin fares into 4 quantile-based bins (qcut).
    # Ref: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html
    # Ref: https://stackoverflow.com/questions/30211923/what-is-the-difference-between-pandas-qcut-and-pandas-cut
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)

    # Bin ages (truncated to whole years) into 5 equal-width bins (cut).
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)
    
    

# Following guidance from other notebooks: rare titles are collapsed into a single 'Misc' bucket.
# Example of a rare one: https://en.wikipedia.org/wiki/Jonkheer

# Minimum number of occurrences for a title to keep its own category.
# Ten is used because of: http://nicholasjjackson.com/2012/03/08/sample-size-is-10-a-magic-number/
stat_min = 10

# Boolean Series indexed by title name: True where the title occurs fewer than stat_min times.
title_names = data1['Title'].value_counts() < stat_min

# Replace every rare title with 'Misc' in one vectorized step (data1 only).
rare_titles = title_names[title_names].index
data1['Title'] = data1['Title'].mask(data1['Title'].isin(rare_titles), 'Misc')
print(data1['Title'].value_counts())

We need to convert categorical data to numeric codes (dummy variables) for mathematical analysis. We're going to encode using the built-in sklearn and pandas tools, nothing fancy.

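I know that scikit-learn has a newer class called ColumnTransformer (in `sklearn.compose`) that applies encoders such as OneHotEncoder to selected columns and is better suited to encoding features than LabelEncoder, which is meant for target labels, but I haven't learned how to use it yet.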

https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621

Let's try to use this next time:
https://towardsdatascience.com/columntransformer-in-scikit-for-labelencoding-and-onehotencoding-in-machine-learning-c6255952731b


In [18]:
# Single LabelEncoder instance; the encoding loop calls fit_transform on it for each column,
# so it is refit every time and only keeps the classes of the last column it saw.
label = LabelEncoder()

In [None]:
# Source column -> name of the integer-coded column derived from it (encoded in this order).
code_columns = {
    'Sex': 'Sex_Code',
    'Embarked': 'Embarked_Code',
    'Title': 'Title_Code',
    'AgeBin': 'AgeBin_Code',
    'FareBin': 'FareBin_Code',
}

# Label-encode each categorical column of each dataset.
# NOTE(review): the encoder is refit per dataset and per column, so the same category is not
# guaranteed to get the same integer code in different frames -- confirm that is acceptable.
for dataset in data_cleaner:
    for source_col, coded_col in code_columns.items():
        dataset[coded_col] = label.fit_transform(dataset[source_col])

# Target outcome (the y variable), kept as a list so it can be concatenated with feature lists.
Target = ['Survived']

# X variables for feature selection, under their readable names (pretty names for charts).
data1_x = [
    'Sex', 'Pclass', 'Embarked', 'Title', 'SibSp',
    'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone',
]

# The coded columns actually used for calculations.
data1_x_calc = [
    'Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code',
    'SibSp', 'Parch', 'Age', 'Fare',
]

# Target followed by the readable feature names.
data1_xy = [*Target, *data1_x]

# X variables for the original data with binned features, so no continuous variables remain.
data1_x_bin = [
    'Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code',
    'FamilySize', 'AgeBin_Code', 'FareBin_Code',
]