In [1]:
import pandas as pd
import numpy as np
import sklearn.impute
import sklearn.model_selection
import sklearn.preprocessing

import acquire

In [2]:
df = acquire.get_titanic_data()
print('%d rows and %d columns' % df.shape)
df.head()

891 rows and 13 columns


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


- are class and pclass the same?
- are emarked and emabark_town the same?
- are the 1s and 0s in survived booleans? alone?
- dataframe index vs passenger_id?
- what does the distribution of fair look like?
- In deck, is None null? missing?

These are questions we'll save for exploration:

- what's the relationships between sibsp and alone?
- what's the relationship between survived and alone?

In [3]:
df.isna().mean()

passenger_id    0.000000
survived        0.000000
pclass          0.000000
sex             0.000000
age             0.198653
sibsp           0.000000
parch           0.000000
fare            0.000000
embarked        0.002245
class           0.000000
deck            0.772166
embark_town     0.002245
alone           0.000000
dtype: float64

In [4]:
df = df.drop(columns='deck')

In [5]:
pd.crosstab(df.pclass, df['class'])

class,First,Second,Third
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,216,0,0
2,0,184,0
3,0,0,491


takeaway: let's just use one of them

In [6]:
df = df.drop(columns='class')

In [7]:
pd.crosstab(df.embark_town, df.embarked)

embarked,C,Q,S
embark_town,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cherbourg,168,0,0
Queenstown,0,77,0
Southampton,0,0,644


In [8]:
df = df.drop(columns='embarked')

In [9]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,Southampton,1


Remaining data issues:

- age has a bunch of missing values
- pclass is class encoded
- emark_town has a couple missing values
- embark_town is a string, how do we represent this?

In [10]:
train, test = sklearn.model_selection.train_test_split(df, random_state=123, train_size=.8)

In [11]:
train

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
329,329,1,1,female,16.0,0,1,57.9792,Cherbourg,0
749,749,0,3,male,31.0,0,0,7.7500,Queenstown,1
203,203,0,3,male,45.5,0,0,7.2250,Cherbourg,1
421,421,0,3,male,21.0,0,0,7.7333,Queenstown,1
97,97,1,1,male,23.0,0,1,63.3583,Cherbourg,0
...,...,...,...,...,...,...,...,...,...,...
98,98,1,2,female,34.0,0,1,23.0000,Southampton,0
322,322,1,2,female,30.0,0,0,12.3500,Queenstown,1
382,382,0,3,male,32.0,0,0,7.9250,Southampton,1
365,365,0,3,male,30.0,0,0,7.2500,Southampton,1


In [12]:
# How many values are in each subgroup?
train.groupby(['pclass', 'sex', 'embark_town']).size()

pclass  sex     embark_town
1       female  Cherbourg       31
                Queenstown       1
                Southampton     36
        male    Cherbourg       34
                Queenstown       1
                Southampton     61
2       female  Cherbourg        7
                Queenstown       1
                Southampton     54
        male    Cherbourg        8
                Queenstown       1
                Southampton     78
3       female  Cherbourg       16
                Queenstown      31
                Southampton     72
        male    Cherbourg       32
                Queenstown      32
                Southampton    214
dtype: int64

In [13]:
# applying a custom aggregation function
# how many null values are in these subgroups?
train.groupby(['pclass', 'sex', 'embark_town']).agg(lambda s: s.isna().sum())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,passenger_id,survived,age,sibsp,parch,fare,alone
pclass,sex,embark_town,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,female,Cherbourg,0,0,2.0,0,0,0.0,0
1,female,Queenstown,0,0,0.0,0,0,0.0,0
1,female,Southampton,0,0,4.0,0,0,0.0,0
1,male,Cherbourg,0,0,6.0,0,0,0.0,0
1,male,Queenstown,0,0,0.0,0,0,0.0,0
1,male,Southampton,0,0,8.0,0,0,0.0,0
2,female,Cherbourg,0,0,0.0,0,0,0.0,0
2,female,Queenstown,0,0,0.0,0,0,0.0,0
2,female,Southampton,0,0,1.0,0,0,0.0,0
2,male,Cherbourg,0,0,1.0,0,0,0.0,0


In [14]:
train.age.isna().sum()

148

**impute** - to fill in missing values

Strategies for imputing:

- fill with 0
- fill with the average
- fill with the median
- fill with subgroup mean
- build a model to predict missing values

In [15]:
# fill with 0
# train.age = train.age.fillna(0)

For filling with the overall average, there's two steps:

1. Find the average (from the *training* data)
2. Fill the missing values in train and test

Two ways to make this happen:

1. "manually" with pandas
2. scikit-learn

In [16]:
# manually with pandas

avg_age = train.age.mean()
train.age = train.age.fillna(avg_age)
test.age = test.age.fillna(avg_age)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [17]:
# Using sklearn

# 1. make the thing
imputer = sklearn.impute.SimpleImputer(strategy='mean')

# 2. fit the thing
imputer.fit(train[['age']])

# 3. use the thing
train.age = imputer.transform(train[['age']])
test.age = imputer.transform(test[['age']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [18]:
train.embark_town.isna().sum()

2

In [19]:
train.embark_town.value_counts()

Southampton    515
Cherbourg      128
Queenstown      67
Name: embark_town, dtype: int64

In [20]:
train.embark_town = train.embark_town.fillna('Southampton')
test.embark_town = test.embark_town.fillna('Southampton')

**encoding** -- turning a string into a number

two strategies:

- associate each unique value with a number -- label encoding
- one-hot encoding: turn each unique value into a seperate column with either 1 or 0
    - curse of dimensionality

When to use one or the other?

- use the label encoder when the categories have an inherit order
- use one-hot encoding when there is no order

In [21]:
encoder = sklearn.preprocessing.OneHotEncoder()

encoder.fit(train[['embark_town']])

# the one hot encoder gives us a special data structure called a sparse matrix
# (a matrix with more 0s than other values)
# .todense to convert from sparse matrix to plain old 2d numpy
m = encoder.transform(train[['embark_town']]).todense()
m

matrix([[1., 0., 0.],
        [0., 1., 0.],
        [1., 0., 0.],
        ...,
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 1., 0.]])

In [22]:
encoder.categories_

[array(['Cherbourg', 'Queenstown', 'Southampton'], dtype=object)]

In [23]:
# use pd.concat when the indexes are the same
pd.concat([
    train.embark_town,
    pd.DataFrame(m, columns=encoder.categories_[0], index=train.index)
], axis=1)

Unnamed: 0,embark_town,Cherbourg,Queenstown,Southampton
329,Cherbourg,1.0,0.0,0.0
749,Queenstown,0.0,1.0,0.0
203,Cherbourg,1.0,0.0,0.0
421,Queenstown,0.0,1.0,0.0
97,Cherbourg,1.0,0.0,0.0
...,...,...,...,...
98,Southampton,0.0,0.0,1.0
322,Queenstown,0.0,1.0,0.0
382,Southampton,0.0,0.0,1.0
365,Southampton,0.0,0.0,1.0


In [24]:
# there will only be a single 1 in all of the produced columns
(pd.DataFrame(m, columns=encoder.categories_[0]).sum(axis=1) == 1).all()

True

In [25]:
# bringing it all together, we'll one hot encode embark_town
# and then add those one-hot encoded columns back to our training and test dataframes
encoder = sklearn.preprocessing.OneHotEncoder()
encoder.fit(train[['embark_town']])

# nice columns for display
cols = ['embark_town_' + c for c in encoder.categories_[0]]

m = encoder.transform(train[['embark_town']]).todense()
train = pd.concat([
    train,
    pd.DataFrame(m, columns=cols, index=train.index)
], axis=1).drop(columns='embark_town')

m = encoder.transform(test[['embark_town']]).todense()
test = pd.concat([
    test,
    pd.DataFrame(m, columns=cols, index=test.index)
], axis=1).drop(columns='embark_town')

In [28]:
def drop_columns(df):
    return df.drop(columns=[
        'deck',     # too many missing values
        'class',    # same as pclass
        'embarked', # same as embarked_town
    ])

def impute_age(train, test):
    imputer = sklearn.impute.SimpleImputer(strategy='mean')
    imputer.fit(train[['age']])
    train.age = imputer.transform(train[['age']])
    test.age = imputer.transform(test[['age']])
    return train, test

def impute_embark_town(train, test):
    train.embark_town = train.embark_town.fillna('Southampton')
    test.embark_town = test.embark_town.fillna('Southampton')
    return train, test

def encode_embark_town(train, test):
    encoder = sklearn.preprocessing.OneHotEncoder()
    encoder.fit(train[['embark_town']])
    # nice columns for display
    cols = ['embark_town_' + c for c in encoder.categories_[0]]

    m = encoder.transform(train[['embark_town']]).todense()
    train = pd.concat([
        train,
        pd.DataFrame(m, columns=cols, index=train.index)
    ], axis=1).drop(columns='embark_town')
    
    m = encoder.transform(test[['embark_town']]).todense()
    test = pd.concat([
        test,
        pd.DataFrame(m, columns=cols, index=test.index)
    ], axis=1).drop(columns='embark_town')

    return train, test

def prep_titanic_data(df):
    df = drop_columns(df)
    train, test = sklearn.model_selection.train_test_split(df, train_size=.8, random_state=123)
    train, test = impute_age(train, test)
    train, test = impute_embark_town(train, test)
    train, test = encode_embark_town(train, test)
    
    return train, test

In [29]:
df = acquire.get_titanic_data()
train, test = prep_titanic_data(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
