# Box-Cox transformation: preprocessing.PowerTransformer(method='box-cox')
* sklearn help: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
* concept: https://blog.minitab.com/blog/applying-statistics-in-quality-projects/how-could-you-benefit-from-a-box-cox-transformation

* author: Prasert Kanawattanachai
* e-mail: prasert.k@chula.ac.th
* Chulalongkorn Business School, Thailand

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
# Show which library versions this notebook is running against.
print(
    f'pandas  version = {pd.__version__}',
    f'numpy   version = {np.__version__}',
    f'seaborn version = {sns.__version__}',
    sep='\n',
)

# Boston House Prices
https://www.kaggle.com/vikrishnan/boston-house-prices

* CRIM     per capita crime rate by town
* ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
* INDUS    proportion of non-retail business acres per town
* CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
* NOX      nitric oxides concentration (parts per 10 million)
* RM       average number of rooms per dwelling
* AGE      proportion of owner-occupied units built prior to 1940
* DIS      weighted distances to five Boston employment centres
* RAD      index of accessibility to radial highways
* TAX      full-value property-tax rate per \$10,000
* PTRATIO  pupil-teacher ratio by town
* B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
* LSTAT    % lower status of the population
* MEDV     median value of owner-occupied homes in \$1000s

In [None]:
# Load the Boston house-price CSV directly from GitHub (requires network access).
df=pd.read_csv('https://github.com/prasertcbs/basic-dataset/raw/master/boston_house_price.csv')
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summarize df: column names, non-null counts and dtypes.
df.info()

In [None]:
# Draw one histogram per numeric column of the raw data. The trailing ';'
# keeps the returned array of Axes out of the notebook cell output.
df.hist(figsize=(15, 10));

## sklearn preprocessing.PowerTransformer()

In [None]:
from sklearn import preprocessing

In [None]:
# List the column labels of df.
df.columns

In [None]:
# Box-Cox is defined for strictly positive values only. standardize=False
# returns the raw power-transformed values instead of additionally scaling
# them to zero mean and unit variance.
pt = preprocessing.PowerTransformer(method='box-cox', standardize=False) # supports strictly positive values only
# ZN and CHAS are left out of this list; they appear only in the Yeo-Johnson
# alternative below. CHAS is a 0/1 dummy, so it cannot go through Box-Cox;
# presumably ZN contains zeros as well -- verify against the data.
cols=['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

# Alternative kept for reference: Yeo-Johnson also accepts zero and negative
# values, so every column can be transformed.
# pt = preprocessing.PowerTransformer(method='yeo-johnson', standardize=False) # supports 0, negative and positive values
# cols = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# Estimate the transformation on the selected columns and apply it in one step;
# the result is used below as a 2-D array with one column per entry in cols.
mat = pt.fit_transform(df[cols])
# Peek at the first five transformed rows.
mat[:5]

In [None]:
# Name each transformed column after its source column, prefixed with 'bc_'.
bc_cols = [f'bc_{name}' for name in cols]
bc_cols

In [None]:
# Append the transformed columns to the original frame.
# concat(axis='columns') aligns rows on index labels, so the frame built from
# mat must carry df's index. Without it the new frame gets a fresh 0..n-1
# index, and any df with a different index (e.g. after filtering or dropping
# rows) would end up with mismatched, NaN-padded rows.
ds = pd.concat(
    [df, pd.DataFrame(mat, columns=bc_cols, index=df.index)],
    axis='columns',
)
# Preview the combined frame (original columns followed by the bc_ columns).
ds.head()

In [None]:
# Histograms of the original (untransformed) columns selected in cols.
# layout=(2, -1) fixes two rows and lets pandas infer the number of columns,
# so the grid still fits if cols changes length (it is 2x6 for 12 columns).
# The trailing ';' keeps the returned array of Axes out of the cell output.
ds[cols].hist(layout=(2, -1), figsize=(15, 4));

In [None]:
# Histograms of the power-transformed columns, for comparison with the originals.
# layout=(2, -1) fixes two rows and lets pandas infer the number of columns,
# so the grid still fits if bc_cols changes length (it is 2x6 for 12 columns).
# The trailing ';' keeps the returned array of Axes out of the cell output.
ds[bc_cols].hist(layout=(2, -1), figsize=(15, 4), color='orange', alpha=.5);