In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydataset
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# Scaling

Scaling changes the range of features in our dataset.

0. tldr

    - Use a min-max scaler before you do modeling.
    - Generally prefer unscaled data, except in modeling.
    - Learn parameters for scaling from the training split.

1. Scaling - when, where, what, why, and how

    - why
        - some model types can be thrown off by different feature scales
        - improves the performance of most models' implementations
        - visualize the combination of 2 variables with different scales
        - a better interpretation of the data (e.g. log scaling)
        - combining features
    - when
        - data prep / exploration
        - pipeline: prep
        - lifecycle: prep/exploration
        - when one of the conditions above is met. Otherwise, it's better to work with the original units
    - where
        - the training dataset
        - usually just the independent variables
        - indep vars are scaled independently, i.e. the scaling of one feature doesn't affect the scaling of another
        - scale whatever goes into the model
    - how
        - `sklearn.preprocessing` -- requires 2d array
        - make the thing, fit the thing, use the thing
        - `.fit` to learn parameters, `.transform` to apply the scaling
        - keep scaled data in separate dataframes and/or columns

## Why Scale? A Motivating Example

In [None]:
# The ice cream data comes pre-split into train and test CSVs hosted in a gist.
TRAIN_URL = 'https://gist.githubusercontent.com/zgulde/66989745314d2c68ab62fae13743f094/raw/71635c6281b5e2a36e3eb4578cab277eb09743ec/train.csv'
TEST_URL = 'https://gist.githubusercontent.com/zgulde/66989745314d2c68ab62fae13743f094/raw/71635c6281b5e2a36e3eb4578cab277eb09743ec/test.csv'

train = pd.read_csv(TRAIN_URL)
test = pd.read_csv(TEST_URL)

# Report (rows x columns) for each split, then preview the training data.
print('train shape: %d x %d' % train.shape)
print('test shape: %d x %d' % test.shape)
train.head()

In [None]:
# Features are the two numeric columns; the target is the ice cream flavor.
feature_cols = ['pints', 'n_sprinkles']

X_train = train[feature_cols]
X_test = test[feature_cols]
y_train = train.flavor
y_test = test.flavor

In [None]:
# Baseline: fit a 3-nearest-neighbors classifier on the features in their
# original units and report accuracy on the test split.
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Learn each feature's min and max from the training split only, then apply
# that same transformation to the test split. Re-fitting the scaler on the
# test data would scale the two splits with different parameters and leak
# information from the test set into preprocessing.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Re-fit the same classifier on the scaled features and report its accuracy
# on the scaled test split, for comparison with the unscaled score.
model.fit(X_train_scaled, y_train).score(X_test_scaled, y_test)

What's going on?

### Another Example

In [None]:
# Load the 'sat.act' example dataset from pydataset and preview the first rows.
df = pydataset.data('sat.act')
df.head(n=5)

In [None]:
# Average test scores by gender, in original units. ACT and SAT are plotted
# on the same axis even though they are measured on very different scales.
score_means_by_gender = df[['gender', 'ACT', 'SATV', 'SATQ']].groupby('gender').mean()
score_means_by_gender.plot.bar(figsize=(11, 6), ec='black', width=.9)

In [None]:
# StandardScaler is already imported in the notebook's first cell, so it is
# not re-imported here.
cols = ['education', 'age', 'ACT', 'SATQ', 'SATV']
scaler = StandardScaler()

# Scale a copy rather than overwriting `df`, so the original-unit data stays
# available and re-running this cell always starts from the same input.
df_scaled = df.copy()
df_scaled[cols] = scaler.fit_transform(df[cols])

# Same grouped bar chart as before, now with every score expressed as a z-score.
df_scaled[['gender', 'ACT', 'SATV', 'SATQ']].groupby('gender').mean().plot.bar(figsize=(11, 6), ec='black', width=.95)

## Linear Scaling

- Units are changed, but the relative distances between points are preserved.

- MinMax: everything between 0 and 1

    $$ x' = \frac{x - \text{min}(x)}{\text{max}(x) - \text{min}(x)} $$

- Standard: a zscore, standard deviations from the mean, **center** + **scale**

    $$ x' = \frac{x - \bar{x}}{s_x} $$

    - **centering**: subtracting the mean
    - **scaling**: dividing by the standard deviation

- Robust: centers on the median and scales by the IQR, so it is robust to outliers while still preserving them

    $$ x' = \frac{x - \text{med}(x)}{\text{IQR}_x} $$

In [None]:
# Two toy features: x1 is evenly spaced, x2 contains extreme outliers.
scaling_example = pd.DataFrame({
    'x1': np.arange(1, 11),
    'x2': [-100, -1, 0, 1, 2, 3, 4, 5, 100, 1000],
})

# Apply each linear scaler to the raw columns and store the results next to
# them under scaler-specific column names so they can be compared directly.
raw_cols = ['x1', 'x2']
scalers_and_targets = [
    (MinMaxScaler(), ['x1_minmax', 'x2_minmax']),
    (StandardScaler(), ['x1_standard', 'x2_standard']),
    (RobustScaler(), ['x1_robust', 'x2_robust']),
]
for scaler, scaled_cols in scalers_and_targets:
    scaling_example[scaled_cols] = scaler.fit_transform(scaling_example[raw_cols])

In [None]:
# Display the columns in alphabetical order so each feature's raw and scaled
# versions appear side by side.
scaling_example.loc[:, sorted(scaling_example.columns)]

## Non-linear Scaling

- The distance between points is **not** preserved, but order is
- Not as common as linear scalers
- In sklearn: power transformation: box-cox, yeo-johnson; quantile transformation
- Log

    $$ x' = \log_b{x} $$

    $$ b^{x'} = x $$

    Sometimes you can just set the x/y scale w/ matplotlib instead of
    actually transforming the data

In [None]:
# Seed the global RNG so the generated data is reproducible.
np.random.seed(1)
n = 100

# x1 is standard normal; x2 is 10 raised to (x1 plus half-scaled normal noise),
# so x2 spans several orders of magnitude.
x1_values = np.random.randn(n)
noise = np.random.randn(n) * .5

df = pd.DataFrame({'x1': x1_values})
df['x2'] = 10 ** (df['x1'] + noise)

fig, ax = plt.subplots(figsize=(16, 6))
ax.scatter(df['x1'], df['x2'])

## Further Reading

[Visual Demos](https://stats-demos.zach.wiki/)