# Data Preprocessing
Handle missing values, encode categoricals and scale features.

In [1]:

import pandas as pd
from sklearn.impute import SimpleImputer  # https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
from sklearn.preprocessing import OneHotEncoder  # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
from sklearn.preprocessing import StandardScaler, MinMaxScaler  # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from sklearn.compose import ColumnTransformer  # https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
from sklearn.pipeline import Pipeline  # https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html


In [2]:

# Load the raw table. The last column is left out of preprocessing
# (presumably it is the prediction target -- confirm against the dataset).
df = pd.read_csv('../data/house_prices.csv')
features = df.iloc[:, :-1]

# Route feature columns by dtype instead of assuming every column is numeric:
# numeric/bool columns are imputed and scaled, everything else (e.g. strings)
# is treated as categorical. select_dtypes keeps the original column order.
num_features = features.select_dtypes(include=['number', 'bool']).columns
cat_features = features.select_dtypes(exclude=['number', 'bool']).columns

# Numeric branch: fill gaps with the column median, then standardize.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode. handle_unknown='ignore' encodes levels unseen during fit as all
# zeros instead of raising at transform time.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# A branch whose column selection is empty is skipped by ColumnTransformer, so
# an all-numeric table yields exactly the scaled numeric columns.
# sparse_threshold=0 forces a dense ndarray even when one-hot columns exist.
preprocessor = ColumnTransformer(
    [
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ],
    sparse_threshold=0,
)
preprocessed = preprocessor.fit_transform(df)

# Preview the first five transformed rows (numeric columns first, then any
# one-hot columns).
preprocessed[:5]


array([[-1.34584062, -1.03274708,  2.2987294 ,  0.13847038, -1.29692062,
        -0.69390585,  0.09128125,  0.81322727],
       [ 0.09943212, -0.84653693,  1.88566917,  0.31005175, -1.040968  ,
        -2.05324327,  1.0209649 , -0.3394965 ],
       [ 1.29460853, -1.58351741, -0.8919957 ,  0.46642202, -0.19383109,
         0.93945847,  1.62750294, -0.6013311 ],
       [-2.89366289,  1.06117721, -1.53040308, -0.47697408,  0.52949315,
        -2.39344398, -2.37626618,  1.28708642],
       [-0.35915543, -0.61821494,  0.86101297, -0.74514579, -0.07295679,
         1.74747041,  2.05084179, -0.60203537]])

## Exercises
- Try `MinMaxScaler` instead of `StandardScaler` in the numeric pipeline, then compare the first five preprocessed rows with the output above.
