In [33]:
import numpy as np 
import pandas as pd 
import scipy as sp
import sklearn 

import random
import time

# Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier # conda install -c conda-forge xgboost

#Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

In [34]:
# The CSV files are read via paths relative to the working directory
# (the same folder as this notebook).
submission_example, train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('gender_submission.csv', 'train.csv', 'test.csv')
)

In [35]:
# Work on a deep copy of the training frame so experiments do not have to
# start from the freshly loaded `train` object.
data1 = train.copy(deep=True)

# Frames that go through the shared cleaning / feature-engineering loops below.
# `test` must be in this list: it has missing Age and Fare values of its own,
# and it needs the same engineered columns as the training data.
# `train` is kept in the list so that it keeps receiving the same treatment
# as before.
data_cleaner = [data1, train, test]

In [36]:
# train.info()
# train.describe()
# train.sample(10)

Now that we're familiar with the data, we first need to clean it.

The 4 C's:
Correcting - fixing or removing broken data, for example an age of 800 somewhere. It doesn't look like this dataset has any such values.

Completing - filling null values. Many algorithms don't know how to deal with nulls, so we need to fix them. We need to impute missing values, especially for 'Age'. We might need to change this process if we realize that filling with the mean (or some other statistic) isn't working well. What I'm reading suggests we should use the median for 'Age', drop the 'Cabin' column, and use the mode to impute 'Embarked'.

Creating - feature engineering.

Converting - changing dates or other data types that don't work well into formats that do.

In [37]:
# Print the per-column null counts for the working training copy and for the
# test set, each followed by a divider line.
# (Prettier print helpers are still on the to-do list for this notebook.)

divider = "-" * 10
null_reports = (
    ('Train columns with null values:\n', data1),
    ('Test columns with nulls:\n', test),
)
for heading, frame in null_reports:
    print(heading, frame.isnull().sum())
    print(divider)

# Author's reading of the output: Age and Cabin appear to be missing at a
# similar rate in train and test, which is consistent with a random split
# between the two -- TODO confirm against the row counts.
# Age and Cabin need to be handled, along with Embarked, before modelling.
# Future work: build several versions of these dataframes with different
# imputation methods and compare which one works best.

Train columns with null values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
----------
Test columns with nulls:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
----------


In [38]:
# Completing: impute missing values in every frame listed in data_cleaner,
# then drop the unused columns from the working copy.

for dataset in data_cleaner:  # handle every frame in one pass
    # Assign the filled column back to the frame. Calling
    # dataset[col].fillna(..., inplace=True) acts on an intermediate Series;
    # recent pandas versions warn about it, and under Copy-on-Write it leaves
    # the frame unchanged.

    # Missing Age -> median Age of that same frame.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

    # Missing Embarked -> most frequent value; mode() returns a Series, so
    # take its first entry.
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

    # Missing Fare -> median Fare of that same frame.
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

# PassengerId and Ticket are treated as identifiers with no modelling value,
# and Cabin has too many nulls to be useful, so all three are dropped from the
# working copy only.
drop_column = ['PassengerId', 'Cabin', 'Ticket']
# inplace=True is deliberate: data_cleaner holds a reference to this same
# data1 object, and rebinding data1 to a new frame would detach it from that
# list. errors='ignore' lets the cell be re-run after the columns are gone.
data1.drop(columns=drop_column, inplace=True, errors='ignore')



In [None]:
# Creating: engineer the same features on every frame in data_cleaner.

for dataset in data_cleaner:
    # FamilySize = siblings/spouses + parents/children + 1 (the passenger).
    # Rationale: families may have worked together to survive and may have
    # been prioritised for lifeboats.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # IsAlone is a binary column: 1 when the passenger has no family aboard,
    # 0 otherwise. Start everyone at 1, then clear the flag for FamilySize > 1.
    dataset['IsAlone'] = 1
    # A single .loc[row_mask, column] assignment writes into the frame itself.
    # The chained form dataset['IsAlone'].loc[mask] = 0 assigns through an
    # intermediate Series, which triggers SettingWithCopyWarning and has no
    # effect under Copy-on-Write.
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0
    
    