In [129]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


df_titanic = sns.load_dataset('titanic')
df_titanic.rename(columns={'fare': 'target'}, inplace=True)
df_titanic_num = df_titanic.select_dtypes(include=[np.number])

def generateMissingValues(df, missingRate):
    df = df.mask(np.random.random(df.shape) < missingRate)
    return df

# Imputation and Basic Pipelines

## Imputation

Until this point, if we are missing data in our dataset, we have simply dropped the rows with missing data. However, this is not always the best approach. In real world data analysis, we often need to deal with missing data without just losing everything. For example, if your company does a survey of customers to collect data, it is likely that at least some of the customers will not answer all the questions. Just dropping an entire row of data if one question is missing can be costly - getting this data takes effort and costs money, so we want to use it as much as possible.

Imputation is the process of replacing missing data with substituted values, or deleting it. There are several approaches, from dead simple to complex and advanced. 

### Simple Imputation

The tool that we can use to do imputation is the sklearn `SimpleImputer`. This works fairly simply, it takes in a dataset and replaces all missing values with a specified value. 

### Load Dataset

We'll load some data, and generate some missing values to play with. 

In [130]:
df = sns.load_dataset("titanic")
df.rename(columns={'fare': 'target'}, inplace=True)
y = df["target"]
df = generateMissingValues(df.drop(columns=["target"]), 0.1)
df["target"] = y
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,class,who,adult_male,deck,embark_town,alive,alone,target
0,0.0,3.0,male,22.0,,0.0,S,Third,man,True,,Southampton,no,False,7.25
1,1.0,1.0,female,38.0,,0.0,,First,woman,,,Cherbourg,yes,False,71.2833
2,1.0,3.0,female,26.0,0.0,0.0,S,,woman,False,,Southampton,yes,True,7.925
3,1.0,1.0,female,35.0,1.0,0.0,S,First,,False,C,Southampton,yes,False,53.1
4,0.0,3.0,male,35.0,0.0,0.0,S,Third,man,True,,Southampton,no,True,8.05


### Removal

The most simple version of imputation is to just remove the rows with missing data. 

In [131]:
df_removed = df.dropna()
print(df_removed.shape)
df_removed.head()

(37, 15)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,class,who,adult_male,deck,embark_town,alive,alone,target
10,1.0,3.0,female,4.0,1.0,1.0,S,Third,child,False,G,Southampton,yes,False,16.7
62,0.0,1.0,male,45.0,1.0,0.0,S,First,man,True,C,Southampton,no,False,83.475
75,0.0,3.0,male,25.0,0.0,0.0,S,Third,man,True,F,Southampton,no,True,7.65
88,1.0,1.0,female,23.0,3.0,2.0,S,First,woman,False,C,Southampton,yes,False,263.0
96,0.0,1.0,male,71.0,0.0,0.0,C,First,man,True,A,Cherbourg,no,True,34.6542


### Numerical Imputation

When we have numerical data, we can replace missing values with the mean, median, or mode of the column.

In [132]:
numeric_features = df.select_dtypes(include=[np.number]).columns
numeric_features = df[numeric_features]
print(numeric_features.shape)
print(numeric_features.isna().sum())
numeric_features.head()

(891, 6)
survived     97
pclass       91
age         247
sibsp        92
parch        85
target        0
dtype: int64


Unnamed: 0,survived,pclass,age,sibsp,parch,target
0,0.0,3.0,22.0,,0.0,7.25
1,1.0,1.0,38.0,,0.0,71.2833
2,1.0,3.0,26.0,0.0,0.0,7.925
3,1.0,1.0,35.0,1.0,0.0,53.1
4,0.0,3.0,35.0,0.0,0.0,8.05


In [133]:
imputer = SimpleImputer(strategy="median")
median_df = imputer.fit_transform(numeric_features)
median_df = pd.DataFrame(median_df, columns=numeric_features.columns)
print(median_df.shape)
print(median_df.isna().sum())
median_df.head()

(891, 6)
survived    0
pclass      0
age         0
sibsp       0
parch       0
target      0
dtype: int64


Unnamed: 0,survived,pclass,age,sibsp,parch,target
0,0.0,3.0,22.0,0.0,0.0,7.25
1,1.0,1.0,38.0,0.0,0.0,71.2833
2,1.0,3.0,26.0,0.0,0.0,7.925
3,1.0,1.0,35.0,1.0,0.0,53.1
4,0.0,3.0,35.0,0.0,0.0,8.05


#### Mean vs Median

For most numerical values, we use either the mean or the median if we are inserting a value, at least for simple imputation. The distribution of a column can offer some guidance on which to use. 

One common example of this is with income/wealth related data, where we have a distribution that tends to be very skewed. If using skewed data, the median can often be a better choice than the mean for imputation, as the mean can be substantially impacted by some very large or very small values. This is very situation dependent, and you should always consider the specifics of your data.

### Category Imputation

When we have categorical data, we can replace missing values with the most common category, or a new category that represents missing data.

In [134]:
categorical_features = df.select_dtypes(include=[np.object]).columns
categorical_features = df[categorical_features]
print(categorical_features.shape)
print(categorical_features.isna().sum())
categorical_features.head()

(891, 7)
sex             80
embarked        89
who             81
adult_male      84
embark_town     88
alive          104
alone           91
dtype: int64


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features = df.select_dtypes(include=[np.object]).columns


Unnamed: 0,sex,embarked,who,adult_male,embark_town,alive,alone
0,male,S,man,True,Southampton,no,False
1,female,,woman,,Cherbourg,yes,False
2,female,S,woman,False,Southampton,yes,True
3,female,S,,False,Southampton,yes,False
4,male,S,man,True,Southampton,no,True


In [135]:
cat_imputer = SimpleImputer(strategy="most_frequent")
most_frequent_df = cat_imputer.fit_transform(categorical_features)
most_frequent_df = pd.DataFrame(most_frequent_df, columns=categorical_features.columns)
print(most_frequent_df.shape)
print(most_frequent_df.isna().sum())
most_frequent_df.head()

(891, 7)
sex            0
embarked       0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64


Unnamed: 0,sex,embarked,who,adult_male,embark_town,alive,alone
0,male,S,man,True,Southampton,no,False
1,female,S,woman,True,Cherbourg,yes,False
2,female,S,woman,False,Southampton,yes,True
3,female,S,man,False,Southampton,yes,False
4,male,S,man,True,Southampton,no,True


### Imputation Considerations


#### Removing vs Imputing

One common consideration if a feature has many missing values is whether to remove the feature or impute the missing values. If a feature has many missing values, it may not be useful for analysis. However, if the feature is important, it may be better to impute the missing values. There is a trade off here that will depend on the specific dataset and analysis. As a rule of thumb, if a feature has more than 15% missing values, it may be better to remove the feature.

In [136]:
missing_percent = df.isnull().mean() * 100
print(missing_percent)


survived       10.886644
pclass         10.213244
sex             8.978676
age            27.721661
sibsp          10.325477
parch           9.539843
embarked        9.988777
class           9.652076
who             9.090909
adult_male      9.427609
deck           79.236813
embark_town     9.876543
alive          11.672278
alone          10.213244
target          0.000000
dtype: float64


## Smarter Imputation

There are more advanced methods for imputation, such as using another machine learning models to predict the missing values. The idea here doesn't change, we have values that are missing, and we want to replace them with something that allows us to keep the value and use it in our analysis. The main thing that differs is that the simple imputation uses some pretty basic logic to replace the missing values, while the more advanced methods use more complex analysis to try to generate a better replacement for the missing value.

## Pipelines

Pipelines are a very useful tool because they make our lives much easier when doing some of these data preparation steps. A pipeline is a series of steps that are executed in order. This is useful because it allows us to do a series of steps in a single command, and it also allows us to do the same steps on multiple datasets. We will create a pipeline that contains multiple steps, then instead of using a model's .fit() method to train the model on our processed data, we'll use the .fit() method on the pipeline to train the model on our data after all those steps we define are done. We basically can add many of the preparation steps to the pipeline, then use it just as we'd use a model, and the pipeline will automatically scale, impute, or do whatever else we've defined in the pipeline.

![Pipeline](../images/pipe.png "Pipeline")

### Sklearn Pipelines

The pipeline tool that we use in sklearn to do this is the `Pipeline` tool. This tool allows us to define a series of steps, then use the pipeline as if it were a model. The creation and setup is fairly simple, we list the steps in a list of tuples, where the first element of the tuple is the name of the step, and the second element is the step itself. We then pass this list to the `Pipeline` tool, and we can use the pipeline as if it were a model.

In [137]:
y = np.array(numeric_features["target"]).reshape(-1, 1)
X = np.array(numeric_features.drop(columns=["target"]))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numeric_features.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,target
0,0.0,3.0,22.0,,0.0,7.25
1,1.0,1.0,38.0,,0.0,71.2833
2,1.0,3.0,26.0,0.0,0.0,7.925
3,1.0,1.0,35.0,1.0,0.0,53.1
4,0.0,3.0,35.0,0.0,0.0,8.05


In [138]:
pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler()),
    ("reg", LinearRegression())
])

pipe.fit(X_train, y_train)
print("Training Score: ", pipe.score(X_train, y_train))
print("Testing Score: ", pipe.score(X_test, y_test))

Training Score:  0.3270433428544871
Testing Score:  0.3943159948300071


### Column Transformer Introduction

One limitation of the pipeline tool is that it just does what it does to whatever you feed it. This is fine if you have a dataset that is all numerical, but if you have a dataset where we need to do encoding on some values and do scaling on some others, this isn't so simple. The `ColumnTransformer` tool is a tool that allows us to do different things to different columns. We can use this to do encoding on some columns and scaling on others, then pass this to the pipeline tool to do all of these steps in a single command.

The column transformer can be a little annoying to construct syntax-wise, but the idea is very simple - it is a pipeline that can define different routes for different columns in our data. 

![Column Transformer](../images/column-transformer.png "Column Transformer")

#### Sklearn Column Transformer

Using the `ColumnTransformer` tool is very similar to using the `Pipeline` tool. We define a list of tuples, where the first element of the tuple is the name of the step, and the second element is the step itself. We then pass this list to the `ColumnTransformer` tool, and we can use the column transformer as if it were a model.

<b>Note:</b> here we are using the column names, so I won't make the datasets into arrays. The pipeline, or the transformers inside of it, will do that for us automatically. 

In [139]:
y = df["target"]
X = df.drop(columns=["target"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

df.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,class,who,adult_male,deck,embark_town,alive,alone,target
0,0.0,3.0,male,22.0,,0.0,S,Third,man,True,,Southampton,no,False,7.25
1,1.0,1.0,female,38.0,,0.0,,First,woman,,,Cherbourg,yes,False,71.2833
2,1.0,3.0,female,26.0,0.0,0.0,S,,woman,False,,Southampton,yes,True,7.925
3,1.0,1.0,female,35.0,1.0,0.0,S,First,,False,C,Southampton,yes,False,53.1
4,0.0,3.0,male,35.0,0.0,0.0,S,Third,man,True,,Southampton,no,True,8.05


##### Splitting Preparation

In this dataset we have a mix of numerical and categorical data. We will use the `ColumnTransformer` to do different things to different columns. We can define "lanes" of preparation that needs to take place, and then place each column into the correct lane. We need to:
<ul>
<li> Numerical data - impute missing values with the median, then scale the data. </li>
<li> Categorical data - impute missing values with the most common category, then encode the data. </li>
</ul>

Each of these can be their own pipeline, they will do their process just like a normal pipeline, they'll just be applied to a subset of the original columns. The column transformer definition will have 3 parts - the name of the step, the step itself, and the columns that the step will be applied to.

In [140]:
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

ct = ColumnTransformer([
    ('num', num_pipe, ['age', 'pclass', 'sibsp', 'parch']),
    ('cat', cat_pipe, ['sex', 'class', 'embarked', 'who', 'deck', 'embark_town', 'alive', 'alone'])
])

final_pipe = Pipeline([
    ('preprocessing', ct),
    ('reg', LinearRegression())
])

#### Using the Column Transformer

Once the column transformer and its pipelines are created, they again work just like a normal model. 

In [141]:
final_pipe.fit(X_train, y_train)
print("Training Score: ", final_pipe.score(X_train, y_train))
print("Testing Score: ", final_pipe.score(X_test, y_test))

Training Score:  0.4322266446294176
Testing Score:  0.46465003035713437


## Using Pipelines in Practice

In real applications, these pipelines make our lives far easier. Pipelines and column transformers can be stacked, mixed, and combined to make a set of steps that can do almost anything we desire. Most commonly, we may have different steps for different columns, and we have a few parallel pipelines that do different things to different columns. For example, we may do imputation differently on different sets of numerical columns and do encoding differently on different sets of categorical columns. Then we may combine all 4+ sets of steps using the column transformer. 

## Exercise

Predict the price of the diamond using all the features in the dataset.

In [142]:
df_dia = sns.load_dataset('diamonds')
price = df_dia["price"]
df_dia = generateMissingValues(df_dia.drop(columns=["price"]), 0.05)
df_dia["price"] = price
df_dia.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,334
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335


In [143]:
dia_numeric = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])
dia_categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])
dia_ct = ColumnTransformer([
    ('num', dia_numeric, ['carat', 'depth', 'table', 'x', 'y', 'z']),
    ('cat', dia_categorical, ['cut', 'color', 'clarity'])
])
dia_pipe = Pipeline([
    ('preprocessing', dia_ct),
    ('reg', LinearRegression())
])

y = df_dia["price"]
X = df_dia.drop(columns=["price"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

dia_pipe.fit(X_train, y_train)
print("Training Score: ", dia_pipe.score(X_train, y_train))
print("Testing Score: ", dia_pipe.score(X_test, y_test))

Training Score:  0.866618395289798
Testing Score:  0.8751900780675863
