In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# Load the seaborn Titanic data with the fare column relabelled as the regression target.
df_titanic = sns.load_dataset('titanic').rename(columns={'fare': 'target'})
# Numeric-only subset of the same frame.
df_titanic_num = df_titanic.select_dtypes(include=np.number)

def generateMissingValues(df, missingRate, random_state=None):
    """Return a copy of ``df`` in which randomly chosen cells are set to missing.

    One uniform draw in [0, 1) is made per cell; a cell is masked when its draw
    falls below ``missingRate``. The input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to knock values out of.
    missingRate : float
        Threshold compared against each uniform draw, i.e. the expected
        fraction of cells that become missing.
    random_state : int, optional
        Seed for a dedicated NumPy generator so the mask is reproducible.
        When None (the default) the draws come from NumPy's global random
        state, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Same shape, index and columns as ``df`` with the selected cells masked.
    """
    if random_state is None:
        # Default path: global NumPy RNG, so np.random.seed() still controls it.
        draws = np.random.random(df.shape)
    else:
        draws = np.random.default_rng(random_state).random(df.shape)
    return df.mask(draws < missingRate)

# Imputation and Basic Pipelines

## Imputation

Until this point, if we are missing data in our dataset, we have simply dropped the rows with missing data. However, this is not always the best approach. In real world data analysis, we often need to deal with missing data without just losing everything. For example, if your company does a survey of customers to collect data, it is likely that at least some of the customers will not answer all the questions. Just dropping an entire row of data if one question is missing can be costly - getting this data takes effort and costs money, so we want to use it as much as possible.

Imputation is the process of replacing missing data with substituted values, or deleting it. There are several approaches, from dead simple to complex and advanced. 

### Simple Imputation

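The tool that we can use to do imputation is the sklearn `SimpleImputer`. It works fairly simply: it takes in a dataset and replaces all missing values in each column with a value determined by the chosen strategy (for example the column's mean, median, or most frequent value, or a constant that we specify).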

### Load Dataset

We'll load some data, and generate some missing values to play with. 

In [46]:
# Reload the raw Titanic data with the fare column relabelled as the target.
df = sns.load_dataset("titanic").rename(columns={'fare': 'target'})
# Pull the target out first so that only the feature columns get values knocked out.
y = df.pop("target")
df = generateMissingValues(df, 0.1)
# Reattach the untouched target as the last column.
df["target"] = y
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,class,who,adult_male,deck,embark_town,alive,alone,target
0,0.0,3.0,male,22.0,1.0,0.0,S,Third,man,True,,Southampton,no,,7.25
1,1.0,1.0,female,,1.0,0.0,,First,woman,False,C,Cherbourg,yes,False,71.2833
2,1.0,3.0,female,26.0,0.0,0.0,S,Third,woman,False,,,yes,True,7.925
3,1.0,1.0,female,35.0,1.0,0.0,S,First,,False,C,Southampton,,False,53.1
4,0.0,,male,35.0,0.0,0.0,S,Third,man,True,,Southampton,no,True,8.05


### Removal

The simplest version of imputation is to just remove the rows with missing data. 

In [47]:
# Keep only the rows that have a value in every single column.
df_removed = df.dropna(axis="index", how="any")
print(df_removed.shape)
df_removed.head()

(40, 15)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,class,who,adult_male,deck,embark_town,alive,alone,target
27,0.0,1.0,male,19.0,3.0,2.0,S,First,man,True,C,Southampton,no,False,263.0
54,0.0,1.0,male,65.0,0.0,1.0,C,First,man,True,B,Cherbourg,no,False,61.9792
102,0.0,1.0,male,21.0,0.0,1.0,S,First,man,True,D,Southampton,no,False,77.2875
139,0.0,1.0,male,24.0,0.0,0.0,C,First,man,True,B,Cherbourg,no,True,79.2
151,1.0,1.0,female,22.0,1.0,0.0,S,First,woman,False,C,Southampton,yes,False,66.6


### Numerical Imputation

When we have numerical data, we can replace missing values with the mean, median, or mode of the column.

In [48]:
# Take the numeric columns in one step and report how many values each one is missing.
numeric_features = df.select_dtypes(include=np.number)
print(numeric_features.shape)
print(numeric_features.isnull().sum())
numeric_features.head()

(891, 6)
survived    102
pclass       65
age         244
sibsp        87
parch        84
target        0
dtype: int64


Unnamed: 0,survived,pclass,age,sibsp,parch,target
0,0.0,3.0,22.0,1.0,0.0,7.25
1,1.0,1.0,,1.0,0.0,71.2833
2,1.0,3.0,26.0,0.0,0.0,7.925
3,1.0,1.0,35.0,1.0,0.0,53.1
4,0.0,,35.0,0.0,0.0,8.05


In [49]:
# Replace every missing numeric value with the median of its column.
imputer = SimpleImputer(strategy="median")
# The imputer's result is wrapped back into a frame so the column names survive.
median_df = pd.DataFrame(
    imputer.fit_transform(numeric_features),
    columns=numeric_features.columns,
)
print(median_df.shape)
print(median_df.isnull().sum())
median_df.head()

(891, 6)
survived    0
pclass      0
age         0
sibsp       0
parch       0
target      0
dtype: int64


Unnamed: 0,survived,pclass,age,sibsp,parch,target
0,0.0,3.0,22.0,1.0,0.0,7.25
1,1.0,1.0,28.0,1.0,0.0,71.2833
2,1.0,3.0,26.0,0.0,0.0,7.925
3,1.0,1.0,35.0,1.0,0.0,53.1
4,0.0,3.0,35.0,0.0,0.0,8.05


#### Mean vs Median

For most numerical values, we use either the mean or the median if we are inserting a value, at least for simple imputation. The distribution of a column can offer some guidance on which to use. 

One common example of this is with income/wealth related data, where we have a distribution that tends to be very skewed. If using skewed data, the median can often be a better choice than the mean for imputation, as the mean can be substantially impacted by some very large or very small values. This is very situation dependent, and you should always consider the specifics of your data.

### Category Imputation

When we have categorical data, we can replace missing values with the most common category, or a new category that represents missing data.

In [50]:
# Select the object-dtype columns. The builtin `object` is used instead of `np.object`:
# that alias was deprecated in NumPy 1.20 and later removed, so it fails on current NumPy.
# NOTE: columns with the pandas `category` dtype are not matched by this selector.
categorical_features = df.select_dtypes(include=[object])
print(categorical_features.shape)
print(categorical_features.isna().sum())
categorical_features.head()

(891, 7)
sex             96
embarked        96
who            101
adult_male      89
embark_town    101
alive           88
alone          108
dtype: int64


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features = df.select_dtypes(include=[np.object]).columns


Unnamed: 0,sex,embarked,who,adult_male,embark_town,alive,alone
0,male,S,man,True,Southampton,no,
1,female,,woman,False,Cherbourg,yes,False
2,female,S,woman,False,,yes,True
3,female,S,,False,Southampton,,False
4,male,S,man,True,Southampton,no,True


In [51]:
# Replace every missing categorical value with the most common value of its column.
cat_imputer = SimpleImputer(strategy="most_frequent")
# The imputer's result is wrapped back into a frame so the column names survive.
most_frequent_df = pd.DataFrame(
    cat_imputer.fit_transform(categorical_features),
    columns=categorical_features.columns,
)
print(most_frequent_df.shape)
print(most_frequent_df.isnull().sum())
most_frequent_df.head()

(891, 7)
sex            0
embarked       0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64


Unnamed: 0,sex,embarked,who,adult_male,embark_town,alive,alone
0,male,S,man,True,Southampton,no,True
1,female,S,woman,False,Cherbourg,yes,False
2,female,S,woman,False,Southampton,yes,True
3,female,S,man,False,Southampton,no,False
4,male,S,man,True,Southampton,no,True


### Imputation Considerations


#### Removing vs Imputing

One common consideration if a feature has many missing values is whether to remove the feature or impute the missing values. If a feature has many missing values, it may not be useful for analysis. However, if the feature is important, it may be better to impute the missing values. There is a trade off here that will depend on the specific dataset and analysis. As a rule of thumb, if a feature has more than 15% missing values, it may be better to remove the feature.

In [52]:
# Share of missing cells in each column, expressed as a percentage.
missing_percent = df.isna().mean().mul(100)
print(missing_percent)


survived       11.447811
pclass          7.295174
sex            10.774411
age            27.384961
sibsp           9.764310
parch           9.427609
embarked       10.774411
class           9.203143
who            11.335578
adult_male      9.988777
deck           79.349046
embark_town    11.335578
alive           9.876543
alone          12.121212
target          0.000000
dtype: float64


## Smarter Imputation

There are more advanced methods for imputation, such as using another machine learning model to predict the missing values. The idea here doesn't change: we have values that are missing, and we want to replace them with something that allows us to keep the value and use it in our analysis. The main thing that differs is that simple imputation uses some pretty basic logic to replace the missing values, while the more advanced methods use more complex analysis to try to generate a better replacement for the missing value.

## Pipelines

Pipelines are a very useful tool because they make our lives much easier when doing some of these data preparation steps. A pipeline is a series of steps that are executed in order. This is useful because it allows us to do a series of steps in a single command, and it also allows us to do the same steps on multiple datasets. We will create a pipeline that contains multiple steps, then instead of using a model's .fit() method to train the model on our processed data, we'll use the .fit() method on the pipeline to train the model on our data after all those steps we define are done. We basically can add many of the preparation steps to the pipeline, then use it just as we'd use a model, and the pipeline will automatically scale, impute, or do whatever else we've defined in the pipeline.

![Pipeline](../images/pipe.png "Pipeline")

### Sklearn Pipelines

The pipeline tool that we use in sklearn to do this is the `Pipeline` tool. This tool allows us to define a series of steps, then use the pipeline as if it were a model. The creation and setup is fairly simple, we list the steps in a list of tuples, where the first element of the tuple is the name of the step, and the second element is the step itself. We then pass this list to the `Pipeline` tool, and we can use the pipeline as if it were a model.

In [53]:
# Target as an (n, 1) column vector; every other numeric column is a feature.
y = numeric_features["target"].to_numpy().reshape(-1, 1)
X = numeric_features.drop(columns="target").to_numpy()

# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

numeric_features.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,target
0,0.0,3.0,22.0,1.0,0.0,7.25
1,1.0,1.0,,1.0,0.0,71.2833
2,1.0,3.0,26.0,0.0,0.0,7.925
3,1.0,1.0,35.0,1.0,0.0,53.1
4,0.0,,35.0,0.0,0.0,8.05


In [54]:
# Median imputation, min-max scaling and a linear model chained into one estimator.
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler()),
    ("reg", LinearRegression()),
])

# Fitting the pipeline fits every step in order; scoring pushes data through the same steps.
pipe.fit(X=X_train, y=y_train)
print("Training Score: ", pipe.score(X=X_train, y=y_train))
print("Testing Score: ", pipe.score(X=X_test, y=y_test))

Training Score:  0.3591033766387626
Testing Score:  0.2752265648568497


### Column Transformer Introduction

One limitation of the pipeline tool is that it just does what it does to whatever you feed it. This is fine if you have a dataset that is all numerical, but if you have a dataset where we need to do encoding on some values and do scaling on some others, this isn't so simple. The `ColumnTransformer` tool is a tool that allows us to do different things to different columns. We can use this to do encoding on some columns and scaling on others, then pass this to the pipeline tool to do all of these steps in a single command.

The column transformer can be a little annoying to construct syntax-wise, but the idea is very simple - it is a pipeline that can define different routes for different columns in our data. 

![Column Transformer](../images/column-transformer.png "Column Transformer")

#### Sklearn Column Transformer

Using the `ColumnTransformer` tool is very similar to using the `Pipeline` tool. We define a list of tuples, where the first element of the tuple is the name of the step, the second element is the step itself, and the third element is the list of columns that the step is applied to. We then pass this list to the `ColumnTransformer` tool, and once it is placed in a pipeline together with a model, we can use the whole thing as if it were a model.

<b>Note:</b> here we are using the column names, so I won't make the datasets into arrays. The pipeline, or the transformers inside of it, will do that for us automatically. 

In [55]:
# Keep X and y as pandas objects so that columns can later be selected by name.
X = df.drop(columns="target")
y = df["target"]

# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

df.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,class,who,adult_male,deck,embark_town,alive,alone,target
0,0.0,3.0,male,22.0,1.0,0.0,S,Third,man,True,,Southampton,no,,7.25
1,1.0,1.0,female,,1.0,0.0,,First,woman,False,C,Cherbourg,yes,False,71.2833
2,1.0,3.0,female,26.0,0.0,0.0,S,Third,woman,False,,,yes,True,7.925
3,1.0,1.0,female,35.0,1.0,0.0,S,First,,False,C,Southampton,,False,53.1
4,0.0,,male,35.0,0.0,0.0,S,Third,man,True,,Southampton,no,True,8.05


#### Splitting Preparation

In this dataset we have a mix of numerical and categorical data. We will use the `ColumnTransformer` to do different things to different columns. We can define "lanes" of preparation that needs to take place, and then place each column into the correct lane. We need to:
<ul>
<li> Numerical data - impute missing values with the median, then scale the data. </li>
<li> Categorical data - impute missing values with the most common category, then encode the data. </li>
</ul>

Each of these can be their own pipeline, they will do their process just like a normal pipeline, they'll just be applied to a subset of the original columns. The column transformer definition will have 3 parts - the name of the step, the step itself, and the columns that the step will be applied to.

#### Column Transformer and Pipeline Structure

The `ColumnTransformer` and `Pipeline` tools can be assembled into a nested structure that can be arbitrarily complex. In real world scenarios where data is being gathered from multiple sources, is relatively dirty, and widely varied, this can allow developers to build a long chain of data preparation steps that can handle almost any scenario. For our purposes, we can stick to a much simpler template that will handle virtually every scenario we may see:
<ul>
<li> A pipeline for numeric data that imputes missing values and scales the data. </li>
<li> A pipeline for categorical data that imputes missing values and encodes the data. </li>
<li> A column transformer that combines the two and assigns each column to the appropriate pipeline. </li>
<li> A final pipeline that combines the column transformer with a predictive model. This is what is fit to and used going forward. </li>
<li> <i> If there's an odd case such as some numerical values need mean imputing, and others need median, we can mirror one of the original pipelines and add another row in the column transformer to split the data correctly. </i> </li>
</ul>

In [56]:
# Numeric lane: fill gaps with the column median, then min-max scale.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])
# Categorical lane: fill gaps with the most common category, then one-hot encode.
# handle_unknown='ignore' encodes a category that was absent when fitting as all zeros;
# the default ('error') would make transform/predict/score raise on such a value.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each listed column to its lane. Columns that appear in neither list are
# dropped, because ColumnTransformer's default is remainder='drop'.
ct = ColumnTransformer([
    ('num', num_pipe, ['age', 'pclass', 'sibsp', 'parch']),
    ('cat', cat_pipe, ['sex', 'class', 'embarked', 'who', 'deck', 'embark_town', 'alive', 'alone'])
])

# Outer pipeline: preprocessing followed by the regressor. This is the object to fit and score.
final_pipe = Pipeline([
    ('preprocessing', ct),
    ('reg', LinearRegression())
])

#### Using the Column Transformer

Once the column transformer and its pipelines are created, they again work just like a normal model. 

In [57]:
# Fitting the outer pipeline fits the preprocessing lanes and the regressor in one call.
final_pipe.fit(X=X_train, y=y_train)
print("Training Score: ", final_pipe.score(X=X_train, y=y_train))
print("Testing Score: ", final_pipe.score(X=X_test, y=y_test))

Training Score:  0.1988796116145607
Testing Score:  -0.255962426979945


## Using Pipelines in Practice

In real applications, these pipelines make our lives far easier. Pipelines and column transformers can be stacked, mixed, and combined to make a set of steps that can do almost anything we desire. Most commonly, we may have different steps for different columns, and we have a few parallel pipelines that do different things to different columns. For example, we may do imputation differently on different sets of numerical columns and do encoding differently on different sets of categorical columns. Then we may combine all 4+ sets of steps using the column transformer. 

The main advantage of this is that we can do all of our data preparation in a single command, and we can use the same steps on multiple datasets. This is very useful in real world data analysis, where we may have data coming in batches, and we need to do the same steps on each batch before making a prediction. Imagine a regression model that predicts an interest rate to assign based on risk when someone applies for a mortgage - we will have applications flowing in all the time, and the process will need to scale the incomes, one-hot encode the property type, etc... on each one. Having all of those preparation steps "built into" the model using pipelines allows this to happen automatically, with no additional work needed.

## Exercise

Predict the price of the diamond using all the features in the dataset.

<b>Note:</b> depending on your understanding of diamonds, some of the categories look like they may have some implicit order to them. Maybe capturing that in label encoding makes sense, maybe it doesn't. This would be a call that someone creating a model along with someone who knows diamonds would consider together, then test the results either way to evaluate. The simplest way to force order in label encoding is to use a dictionary to map the categories to numbers. 

```python
label_encoding_dict = {
    "cut": {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5},
    "color": {"J": 1, "I": 2, "H": 3, "G": 4, "F": 5, "E": 6, "D": 7},
    "clarity": {"I1": 1, "SI2": 2, "SI1": 3, "VS2": 4, "VS1": 5, "VVS2": 6, "VVS1": 7, "IF": 8}
}

df_dia.replace(label_encoding_dict, inplace=True)
```
In doing this, we would also think about the actual numerical values. If the range between them isn't one-by-one, we could change that. For example, maybe an "ideal" cut is better than a 5, we could score it a 7, so it would be "more better" than the other values. This is something that the diamond expert would need to weigh in on. When doing label encoding, you will want to scale these values as well, as they have a range that could be anything. 

In [58]:
# Load the diamonds data and set the price aside so it is never masked.
df_dia = sns.load_dataset('diamonds')
price = df_dia.pop("price")
# Knock out roughly 5% of the feature cells, then reattach the price as the last column.
df_dia = generateMissingValues(df_dia, 0.05)
df_dia["price"] = price
df_dia.sample(10)



Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
44810,0.5,Very Good,G,VS1,61.0,59.0,5.09,5.14,3.12,1624
51461,0.7,Premium,E,SI2,62.1,57.0,5.68,5.6,3.5,2376
14312,1.01,Fair,D,VS2,59.1,68.0,6.56,6.44,3.84,5797
22587,1.25,Premium,E,VS2,61.5,58.0,6.98,6.91,4.27,10640
36497,0.48,Ideal,I,VS1,62.2,54.0,5.03,5.06,3.14,944
50106,0.62,Ideal,E,VS2,62.0,54.5,5.46,5.48,3.39,2208
35592,0.31,Ideal,H,VVS1,62.2,54.0,4.39,4.36,2.72,907
33020,0.33,Ideal,E,VVS2,61.2,56.0,4.44,4.48,2.73,814
4790,1.0,Good,E,SI1,57.6,65.0,6.47,6.44,3.72,3696
26872,2.31,Very Good,I,SI1,62.5,55.0,8.43,8.5,5.29,16801


Training Score:  0.8703625760895936
Testing Score:  0.870026035204831


### More Exercise - For Funsies

Now that you have a trained model, let's predict some diamond prices. For this we'll need a few steps:
<ul>
<li> Create some data to predict on. </li>
    <ul>
    <li> We need to create a table with the same columns as the training data, other than the price. </li>
    <li> We need to create some rows of data to predict on, so make up some fake diamonds! Note that the categorical data needs to be from the classes in the original. </li>
    <li> The fake generation can also be done randomly, or with something like np.random.choice() to select from the original data. This is a fun practice exercise itself! </li>
    </ul>
<li> Use the generateMissingValues function that is near the top of this notebook to generate some missing values in the fake data. </li>
<li> Use the pipeline's predict function call to predict the price of the diamonds. </li>
<li> Capture the predictions and add them to the incoming data from your price-free diamonds, as the predicted_price column. </li>
</ul>

<b>If you happen to get an implausible price, such as a negative price, that may well be a valid prediction - why do you think that happened?</b>

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.25,Ideal,H,VS1,60.3,,5.17,4.4,2.69
1,0.5,Premium,G,VS2,59.5,57.0,,6.6,4.14
2,0.79,Premium,D,VS2,62.7,54.1,8.47,,3.63
3,1.11,Good,J,I1,62.1,57.0,5.62,6.93,2.59
4,1.02,Very Good,H,VVS2,62.3,57.0,6.1,6.93,4.55
