# Feature Engineering Using Scikit-Learn

### Do You Want to Build a Snowman?

Let's prep some data for a model that predicts the height of a snowman!

<img src = "./images/olaf.jpeg">

#### Load Packages

In [94]:
# data analysis stack
import numpy as np
import pandas as pd

# data visualization stack
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

# machine-learning stack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    RobustScaler,
    MinMaxScaler,
    KBinsDiscretizer,
    PolynomialFeatures
)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# miscellaneous
import warnings
warnings.filterwarnings("ignore")

#### Load Data

In [95]:
# Toy training set: 14 rows with four feature columns and the target
# `height_snowman_cm`. None marks a missing entry (present in temp and dinner).
data = {
    'temp': [
        -3, 5, 0, 7, 3, -1, 1,
        None, -6, 3, 0, -1, None, -2,
    ],
    'lunch': [
        'soup', 'sandwich', 'soup', 'burger', 'sandwich', 'soup', 'cereal',
        'salad', 'sandwich', 'burger', 'soup', 'cereal', 'burger', 'soup',
    ],
    'dinner': [
        'pizza', 'pizza', 'noodles', None, 'fishsticks', 'pizza', None,
        'fishsticks', 'noodles', 'pizza', None, 'pizza', 'fishsticks', 'pizza',
    ],
    'precipitation': [
        'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'no',
        'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no',
    ],
    'height_snowman_cm': [
        100, 0, 75, 0, 20, 25, 0,
        35, 170, 0, 85, 85, 45, 0,
    ],
}

# One DataFrame column per dict key, in insertion order.
df_train = pd.DataFrame(data)
df_train

Unnamed: 0,temp,lunch,dinner,precipitation,height_snowman_cm
0,-3.0,soup,pizza,yes,100
1,5.0,sandwich,pizza,no,0
2,0.0,soup,noodles,yes,75
3,7.0,burger,,yes,0
4,3.0,sandwich,fishsticks,yes,20
5,-1.0,soup,pizza,yes,25
6,1.0,cereal,,no,0
7,,salad,fishsticks,yes,35
8,-6.0,sandwich,noodles,yes,170
9,3.0,burger,pizza,no,0


#### Exercise
Transform the data above using scikit-learn tools so that it is suitable for modeling:
+ **Separate the DataFrame `df_train` into `X_train` and `y_train`** 
   +  Our target variable is `height_snowman_cm`
+ **Preprocess `X_train`**:
  + Identify which variables are **binary**, **categorical** and  **numeric**
  + Check which variables have **missing values**
    + **Impute missing value** as needed using appropriate strategy
  + Determine if categorical variables have **non-numeric values**
    + **Encode categorical variables** using techniques such as one-hot encoding
  + Determine if numeric variables are on different scale
    + **Scale numeric variables**
+ **Create `X_train_fe`**:
    + Once the preprocessing steps are completed, compile the transformed columns into a new DataFrame called `X_train_fe`. 


In [96]:
# Separate the target from the features: y_train holds the snowman height,
# X_train keeps every remaining column of df_train.
y_train = df_train.loc[:, 'height_snowman_cm']
X_train = df_train.drop('height_snowman_cm', axis=1)
X_train

Unnamed: 0,temp,lunch,dinner,precipitation
0,-3.0,soup,pizza,yes
1,5.0,sandwich,pizza,no
2,0.0,soup,noodles,yes
3,7.0,burger,,yes
4,3.0,sandwich,fishsticks,yes
5,-1.0,soup,pizza,yes
6,1.0,cereal,,no
7,,salad,fishsticks,yes
8,-6.0,sandwich,noodles,yes
9,3.0,burger,pizza,no


In [97]:
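# Not executed. Note: train_test_split returns (X_train, X_test, y_train, y_test)
# in that order, so the unpacking would need to be:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)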

In [98]:
df_train.shape
# 14 rows (entries) and 5 columns: 4 features plus the target height_snowman_cm

(14, 5)

In [99]:
# Show the training frame again: four feature columns plus the height_snowman_cm target.
df_train

Unnamed: 0,temp,lunch,dinner,precipitation,height_snowman_cm
0,-3.0,soup,pizza,yes,100
1,5.0,sandwich,pizza,no,0
2,0.0,soup,noodles,yes,75
3,7.0,burger,,yes,0
4,3.0,sandwich,fishsticks,yes,20
5,-1.0,soup,pizza,yes,25
6,1.0,cereal,,no,0
7,,salad,fishsticks,yes,35
8,-6.0,sandwich,noodles,yes,170
9,3.0,burger,pizza,no,0


In [100]:
# Distinct values observed in the `lunch` column.
df_train['lunch'].unique()

array(['soup', 'sandwich', 'burger', 'cereal', 'salad'], dtype=object)

In [101]:
# Distinct values observed in the `dinner` column (missing entries are included).
df_train['dinner'].unique()

array(['pizza', 'noodles', None, 'fishsticks'], dtype=object)

In [102]:
# Distinct values observed in the `precipitation` column.
df_train['precipitation'].unique()

array(['yes', 'no'], dtype=object)

In [103]:
# Distinct values observed in the `temp` column (missing entries are included).
df_train['temp'].unique()

array([-3.,  5.,  0.,  7.,  3., -1.,  1., nan, -6., -2.])

In [104]:
# Categorical: lunch, dinner
# Binary: precipitation
# Numerical: temp


In [105]:
# Count the missing values in each column.
df_train.isna().sum()

temp                 2
lunch                0
dinner               3
precipitation        0
height_snowman_cm    0
dtype: int64

In [106]:
# temp has 2 null values
# dinner has 3 null values

In [107]:
# Imputing missing values for temp. It's a numerical column, so use the mean
# of the observed values, which is 0.5 (pandas skips the missing entries).
X_train['temp'].mean()

np.float64(0.5)

In [108]:
# Mean-impute the numeric `temp` column. set_output(transform='pandas') makes
# transform return a DataFrame instead of a NumPy array.
temp_imputer = SimpleImputer(strategy='mean')
temp_imputer.set_output(transform='pandas')
temp_imputer_df_train = temp_imputer.fit(X_train[['temp']]).transform(X_train[['temp']])
# Confirm that no missing values remain after imputation.
temp_imputer_df_train.isna().sum()

temp    0
dtype: int64

In [None]:
# dinner is categorical, so its missing entries will be filled with the most
# frequent category (pizza). Count every value, missing ones included, to check.
X_train.dinner.value_counts(dropna=False)

dinner
pizza         6
None          3
fishsticks    3
noodles       2
Name: count, dtype: int64

In [119]:
# Mode-impute `dinner`. Its missing entries are stored as None in this object
# column, hence missing_values=None rather than the default NaN marker.
dinner_imputer = SimpleImputer(missing_values=None, strategy='most_frequent')
dinner_imputer.set_output(transform='pandas')
dinner_imputer_df_train = dinner_imputer.fit(X_train[['dinner']]).transform(X_train[['dinner']])
# Confirm that no missing values remain after imputation.
dinner_imputer_df_train.isna().sum()

dinner    0
dtype: int64

In [111]:
# Inspect the imputed dinner column; the previously missing entries now read 'pizza'.
dinner_imputer_df_train

Unnamed: 0,dinner
0,pizza
1,pizza
2,noodles
3,pizza
4,fishsticks
5,pizza
6,pizza
7,fishsticks
8,noodles
9,pizza


In [None]:
# One-hot encode `lunch` and the binary `precipitation` column together.
# drop='first' omits the first category of each column; sparse_output=False
# combined with set_output(transform='pandas') yields a dense DataFrame.
subset_categorical = ['lunch', 'precipitation']
subset_encoder = OneHotEncoder(sparse_output=False, drop='first')
subset_encoder.set_output(transform='pandas')
subset_encoder.fit(X_train[subset_categorical])
# Categories learned for each encoded column.
subset_encoder.categories_


[array(['burger', 'cereal', 'salad', 'sandwich', 'soup'], dtype=object),
 array(['no', 'yes'], dtype=object)]

In [121]:
# Names of the columns the encoder outputs; the first category of each input
# column is absent because the encoder was built with drop='first'.
subset_encoder.get_feature_names_out()

array(['lunch_cereal', 'lunch_salad', 'lunch_sandwich', 'lunch_soup',
       'precipitation_yes'], dtype=object)

In [122]:
# Apply the fitted encoder to the lunch / precipitation columns.
subset_encoded_train = subset_encoder.transform(X_train.loc[:, subset_categorical])
subset_encoded_train

Unnamed: 0,lunch_cereal,lunch_salad,lunch_sandwich,lunch_soup,precipitation_yes
0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,1.0
3,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,1.0
5,0.0,0.0,0.0,1.0,1.0
6,1.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,0.0,1.0
8,0.0,0.0,1.0,0.0,1.0
9,0.0,0.0,0.0,0.0,0.0


In [126]:
# One-hot encode `dinner`, fitting on the imputed column so the encoder never
# sees a missing value. drop='first' omits the first category.
dinner_encoder = OneHotEncoder(sparse_output=False, drop='first')
dinner_encoder.set_output(transform='pandas')
dinner_encoder.fit(dinner_imputer_df_train[['dinner']])
# Categories learned for dinner.
dinner_encoder.categories_

[array(['fishsticks', 'noodles', 'pizza'], dtype=object)]

In [129]:
# Apply the fitted encoder to the imputed dinner column.
dinner_encoded_train = dinner_encoder.transform(dinner_imputer_df_train.loc[:, ['dinner']])
dinner_encoded_train

Unnamed: 0,dinner_noodles,dinner_pizza
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,0.0,1.0
4,0.0,0.0
5,0.0,1.0
6,0.0,1.0
7,0.0,0.0
8,1.0,0.0
9,0.0,1.0


In [130]:
# Review the encoded lunch / precipitation columns before assembling the final frame.
subset_encoded_train

Unnamed: 0,lunch_cereal,lunch_salad,lunch_sandwich,lunch_soup,precipitation_yes
0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,1.0
3,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,1.0
5,0.0,0.0,0.0,1.0,1.0
6,1.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,0.0,1.0
8,0.0,0.0,1.0,0.0,1.0
9,0.0,0.0,0.0,0.0,0.0


In [131]:
# Review the imputed temp column before assembling the final frame.
temp_imputer_df_train

Unnamed: 0,temp
0,-3.0
1,5.0
2,0.0
3,7.0
4,3.0
5,-1.0
6,1.0
7,0.5
8,-6.0
9,3.0


In [117]:
# Original (untransformed) training frame, shown for comparison with the engineered columns.
df_train

Unnamed: 0,temp,lunch,dinner,precipitation,height_snowman_cm
0,-3.0,soup,pizza,yes,100
1,5.0,sandwich,pizza,no,0
2,0.0,soup,noodles,yes,75
3,7.0,burger,,yes,0
4,3.0,sandwich,fishsticks,yes,20
5,-1.0,soup,pizza,yes,25
6,1.0,cereal,,no,0
7,,salad,fishsticks,yes,35
8,-6.0,sandwich,noodles,yes,170
9,3.0,burger,pizza,no,0


In [135]:
# Assemble the engineered feature matrix named `X_train_fe`, as the exercise
# asks: imputed temp, encoded dinner and encoded lunch/precipitation columns
# placed side by side. pd.concat(axis=1) aligns the frames on their row index.
X_train_fe = pd.concat(
    [temp_imputer_df_train, dinner_encoded_train, subset_encoded_train],
    axis=1,
)

# Sanity checks. Misaligned indices would make the outer-join concat add rows
# and introduce NaNs, so verify the row count is unchanged and nothing is missing.
assert len(X_train_fe) == len(temp_imputer_df_train)
assert not X_train_fe.isna().any().any()

# NOTE(review): temp is imputed but not scaled here, although the exercise
# lists scaling numeric variables as a preprocessing step.

# Keep the earlier name available for any cell that still refers to it.
df_train_fe = X_train_fe
X_train_fe

Unnamed: 0,temp,dinner_noodles,dinner_pizza,lunch_cereal,lunch_salad,lunch_sandwich,lunch_soup,precipitation_yes
0,-3.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
1,5.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
3,7.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,3.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
5,-1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
6,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
7,0.5,0.0,0.0,0.0,1.0,0.0,0.0,1.0
8,-6.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
9,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
