In [1]:
import pandas as pd
import numpy as np
import scipy as sp 

import acquire
from prepare import split, impute, encode, scale_minmax, prepare

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = acquire.get_titanic_data()

In [3]:
df.drop(columns=['passenger_id','embarked','deck'], inplace=True)
df.fillna(np.nan, inplace=True)

How many missing values are there?

In [4]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
class            0
embark_town      2
alone            0
dtype: int64

In [5]:
len(df)

891

- **Age** There are 177/891 observations missing an age value. I'm not sure if it would be safe to impute here with a single value. I may need to split into child and adult before imputing, but I will likely go with median if I impute at all, and either way. i.e. am I going to fit the impute function on entire set or on sub-groups. 
- **Task:** Create a impute median function

- **Embark_town** I see only 2 in embark_town, so I can safely impute here...let's see the best method:

In [6]:
df.embark_town.value_counts(dropna=False)

Southampton    644
Cherbourg      168
Queenstown      77
NaN              2
Name: embark_town, dtype: int64

**Task**: We should use the 'mode' or most frequent value to impute here, given that the vast majority come from Southamption

Split data: 

In [7]:
train, test = train_test_split(df, train_size=.80, stratify=df.survived)

Build impute function to impute the mode:

In [8]:
def impute_mode(train, test, column_list):
    imputer = SimpleImputer(strategy='most_frequent')
    train[column_list] = imputer.fit_transform(train[column_list])
    test[column_list] = imputer.transform(test[column_list])
    return train, test

Run function on embark_town

In [9]:
train, test = impute_mode(train, test, column_list = ['embark_town'])

Verify missing values were filled

In [10]:
print(train.embark_town.isnull().sum() + test.embark_town.isnull().sum())

0


Build the impute_median function

In [11]:
def impute_median(train, test, column_list):
    imputer = SimpleImputer(strategy='median')
    train[column_list] = imputer.fit_transform(train[column_list])
    test[column_list] = imputer.transform(test[column_list])
    return train, test

We can merge these 2 functions into a single function where strategy is an argument of the function

In [12]:
def impute(train, test, my_strategy, column_list):
    imputer = SimpleImputer(strategy=my_strategy)
    train[column_list] = imputer.fit_transform(train[column_list])
    test[column_list] = imputer.transform(test[column_list])
    return train, test

In [13]:
train, test = impute(train, test, my_strategy = 'most_frequent', column_list = ['embark_town'])

In [14]:
t1,t2 = impute(train, test, my_strategy = 'median', column_list = ['age'])
t1.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
class          0
embark_town    0
alone          0
dtype: int64

In [15]:
def encode(train, test, col_name):
    
    encoded_values = sorted(list(train[col_name].unique()))
    
    # Integer Encoding
    int_encoder = LabelEncoder()
    train.encoded = int_encoder.fit_transform(train[col_name])
    test.encoded = int_encoder.transform(test[col_name])
    
    # create 2D np arrays of the encoded variable (in train and test)
    train_array = np.array(train.encoded).reshape(len(train.encoded),1)
    test_array = np.array(test.encoded).reshape(len(test.encoded),1)

    # One Hot Encoding
    ohe = OneHotEncoder(sparse=False, categories='auto')
    train_ohe = ohe.fit_transform(train_array)
    test_ohe = ohe.transform(test_array)
    
    # Turn the array of new values into a data frame with columns names being the values
    # and index matching that of train/test
    # then merge the new dataframe with the existing train/test dataframe
    train_encoded = pd.DataFrame(data=train_ohe,
                            columns=encoded_values, index=train.index)
    train = train.join(train_encoded)
    
    test_encoded = pd.DataFrame(data=test_ohe,
                               columns=encoded_values, index=test.index)
    test = test.join(test_encoded)
    
    return train, test

In [16]:
# train, test = encode(train, test, 'class')
train, test = encode(train, test, 'embark_town')

In [17]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,Cherbourg,Queenstown,Southampton
288,1,2,male,42.0,0,0,13.0,Second,Southampton,1,0.0,0.0,1.0
706,1,2,female,45.0,0,0,13.5,Second,Southampton,1,0.0,0.0,1.0
342,0,2,male,28.0,0,0,13.0,Second,Southampton,1,0.0,0.0,1.0
491,0,3,male,21.0,0,0,7.25,Third,Southampton,1,0.0,0.0,1.0
403,0,3,male,28.0,1,0,15.85,Third,Southampton,0,0.0,0.0,1.0


In [18]:
def scale_minmax(train, test, column_list):
    scaler = MinMaxScaler()
    column_list_scaled = [col + '_scaled' for col in column_list]
    train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]), 
                                columns = column_list_scaled, 
                                index = train.index)
    train = train.join(train_scaled)

    test_scaled = pd.DataFrame(scaler.transform(test[column_list]), 
                                columns = column_list_scaled, 
                                index = test.index)
    test = test.join(test_scaled)
    
    return train, test

In [19]:
columns_to_scale = ['age', 'fare']
train, test = scale_minmax(train, test, columns_to_scale)

In [20]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,Cherbourg,Queenstown,Southampton,age_scaled,fare_scaled
288,1,2,male,42.0,0,0,13.0,Second,Southampton,1,0.0,0.0,1.0,0.565099,0.025374
706,1,2,female,45.0,0,0,13.5,Second,Southampton,1,0.0,0.0,1.0,0.605871,0.02635
342,0,2,male,28.0,0,0,13.0,Second,Southampton,1,0.0,0.0,1.0,0.37483,0.025374
491,0,3,male,21.0,0,0,7.25,Third,Southampton,1,0.0,0.0,1.0,0.279696,0.014151
403,0,3,male,28.0,1,0,15.85,Third,Southampton,0,0.0,0.0,1.0,0.37483,0.030937


In [21]:
df, train, test, imputer, int_encoder, ohe, scaler = \
    prepare(df, drop_cols = ['passenger_id','embarked','deck'], 
            target = 'survived', train_prop=.80, seed=123, 
            impute_cols = ['embark_town'], impute_strategy='most_frequent',
            encode_col = 'embark_town', scale_cols = ['age','fare'])

KeyError: "['passenger_id' 'embarked' 'deck'] not found in axis"

## Hypothesis testing

### T-Test
#### Is age a driver of survival?
Use a t-test to compare the age of those who survived vs. those who did not.
Is there a significant difference? That is, is the average age of those who survived significantly different from those who did not? 

In [22]:
sp.stats.ttest_ind(
    train[train.survived == 1].age.dropna(),
    train[train.survived == 0].age.dropna())

Ttest_indResult(statistic=-1.9041575516042408, pvalue=0.05729363006322641)

#### Are the results of my t-test affected if I use the scaled values?

In [23]:
sp.stats.ttest_ind(
    train[train.survived == 1].age_scaled.dropna(),
    train[train.survived == 0].age_scaled.dropna())

Ttest_indResult(statistic=-1.9041575516042444, pvalue=0.05729363006322589)

#### Should I fill missing values in Age? 

Use the t-test to help decide...

When filling missing values in age with the median...does it make a difference? 
Should I fill that many missing values with a single value?
By comparing the results of the t-test we can see it does make a difference. 
This t-test turns out to not be significant due to the elevation of ages of some number of children being given an age of the median of all ages (32 years old +/-)


In [24]:
train, test = impute(train, test, my_strategy='median', column_list = ['age'])

In [25]:
sp.stats.ttest_ind(
    train[train.survived == 1].age.dropna(),
    train[train.survived == 0].age.dropna())

Ttest_indResult(statistic=-1.9041575516042408, pvalue=0.05729363006322641)

### Chi-Squared: Comparing 2 categorical variables

#### Is the location of embarkment a driver of survival? 

In [26]:
observed = pd.crosstab(train.embark_town, train.survived)
observed

survived,0,1
embark_town,Unnamed: 1_level_1,Unnamed: 2_level_1
Cherbourg,61,76
Queenstown,41,22
Southampton,337,175


In [27]:
chi2, p, degf, expected = sp.stats.chi2_contingency(observed)
print('chi2: ', chi2)
print('p-value: ', p)
print('degrees of freedom', degf)
print('expected values\n', expected)

chi2:  21.07355701451707
p-value:  2.6542097321482478e-05
degrees of freedom 2
expected values
 [[ 84.47050562  52.52949438]
 [ 38.84410112  24.15589888]
 [315.68539326 196.31460674]]


Use the function you defined in acquire.py to load the titanic data set.
Remove the deck (too many nulls) and embarked (directly correlated with embark_town) columns
Fill missing values with np.nan
Split data into test and train
Create a new column, is_child
Handle the missing values in embark_town by using the SimpleImputer
Handle the missing values in age by using the SimpleImputer
Use LabelEncoder to transform the sex column to integer encoded
Use OneHotEncoder to transform the sex column to one hot encoded
Scale age and fare using MinMaxScaler
Create a function named prep_titanic that accepts the untransformed titanic data, and returns the data with the transformations above applied.

In [28]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,Cherbourg,Queenstown,Southampton,age_scaled,fare_scaled
288,1,2,male,42.0,0,0,13.0,Second,Southampton,1,0.0,0.0,1.0,0.565099,0.025374
706,1,2,female,45.0,0,0,13.5,Second,Southampton,1,0.0,0.0,1.0,0.605871,0.02635
342,0,2,male,28.0,0,0,13.0,Second,Southampton,1,0.0,0.0,1.0,0.37483,0.025374
491,0,3,male,21.0,0,0,7.25,Third,Southampton,1,0.0,0.0,1.0,0.279696,0.014151
403,0,3,male,28.0,1,0,15.85,Third,Southampton,0,0.0,0.0,1.0,0.37483,0.030937


In [32]:
train[train.sibsp >= 2]['age'].value_counts(bins=5)

(0.947, 11.4]    20
(21.8, 32.2]     16
(11.4, 21.8]     12
(32.2, 42.6]      3
(42.6, 53.0]      2
Name: age, dtype: int64

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

plt.hist(train[train.parch > 2]['age'])
plt.show()

plt.hist(train[train.sibsp >= 2]['age'])
plt.show()