In this notebook, I take the training and test sets created in the previous notebook and apply mathematical transformations to them.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import scipy.stats as stats

%matplotlib inline

In [None]:
# Read the four train/test splits written by the previous notebook.
# index_col=[0] makes the first CSV column the DataFrame index.
X_train_OCE, X_test_OCE, X_train_MCE, X_test_MCE = (
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in (
        'X_train_OCE.csv',
        'X_test_OCE.csv',
        'X_train_MCE.csv',
        'X_test_MCE.csv',
    )
)

In [None]:
# Raise the column display limit to 100 so the summary below is not truncated
# for wide frames; the option stays in effect for the rest of the session.
pd.set_option('display.max_columns', 100)
# Summary statistics of the OCE training set.
X_train_OCE.describe()

In [None]:
# Summary statistics of the MCE training set, for comparison with the OCE one.
X_train_MCE.describe()

I have dealt with missing values, outliers, rare categories and encoding. Now, I will tackle separate classes of variables to:
 - transform those that need it.
 - discretize others.

In [None]:
# Columns treated as numeric from the start of the project, in their original order.
numeric_cols_from_start = [
    'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
    'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
]

In [None]:
# Plot the numeric variables in chunks; this is the first chunk.
X_train_OCE[[
    'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
]].hist(bins=20, figsize=(20, 20))
plt.show()
# Variables from this chunk that would benefit from normalization:
# 1stFlrSF, LotFrontage, BsmtUnfSF, LotArea, OverallCond, OverallQual, TotalBsmtSF

In [None]:
# Second chunk of numeric variables.
X_train_OCE[[
    'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
    'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
]].hist(bins=20, figsize=(20, 20))
plt.show()
# Picked from this chunk: GrLivArea, TotRmsAbvGrd

In [None]:
# Third and last chunk of numeric variables.
X_train_OCE[[
    'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal', 'MoSold', 'YrSold',
]].hist(bins=20, figsize=(20, 20))
plt.show()
# Picked from this chunk: GarageArea, MoSold

In [None]:
# Variables chosen for transformation, collected from the notes under the
# three histogram cells:
#   chunk 1: 1stFlrSF, LotFrontage, BsmtUnfSF, LotArea, OverallCond, OverallQual, TotalBsmtSF
#   chunk 2: GrLivArea, TotRmsAbvGrd
#   chunk 3: GarageArea, MoSold
to_trans = [
    'LotArea',
    'LotFrontage',
    '1stFlrSF',
    'BsmtUnfSF',
    'OverallCond',
    'OverallQual',
    'TotalBsmtSF',
    'GrLivArea',
    'TotRmsAbvGrd',
    'GarageArea',
    'MoSold',
]

In [None]:
from feature_engine import variable_transformers as vt

# Try a power transformation (x -> x ** exp) on the selected variables.
# No `exp` is passed, so the library default applies
# (0.5, i.e. square root, per the feature_engine docs - confirm for the installed version).
# The variables come from `to_trans` rather than a second hand-copied list, so the
# transformer, the rounding step and the diagnostic plots always cover the same columns.
rt = vt.PowerTransformer(variables=to_trans)
rt.fit(X_train_OCE)

In [None]:
# Apply the fitted transformer to the OCE training set.
# NOTE: this overwrites X_train_OCE, so re-running the cell would transform
# the already-transformed data again; run it once per kernel session.
X_train_OCE = rt.transform(X_train_OCE)

In [None]:
# Keep four decimal places in the transformed columns of the OCE training set.
X_train_OCE[to_trans] = X_train_OCE[to_trans].round(decimals=4)

In [None]:
# Re-fit the same transformer on the MCE training set and transform it in one go.
X_train_MCE = rt.fit(X_train_MCE).transform(X_train_MCE)

In [None]:
def diagnostic_plots(df, variable):
    """Show a histogram and a normal Q-Q plot of one column, side by side.

    Gives a quick visual check of how close the column's distribution is
    to a normal one.

    Parameters
    ----------
    df : table indexable by column label (DataFrames in this notebook)
    variable : label of the column to plot

    Returns
    -------
    None. The figure is displayed with plt.show().
    """
    _, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: histogram of the column.
    df[variable].hist(bins=30, ax=hist_ax)
    hist_ax.set_title(variable)

    # Right panel: Q-Q plot against the normal distribution; the title is set
    # afterwards so it replaces the one probplot writes.
    stats.probplot(df[variable], dist="norm", plot=qq_ax)
    qq_ax.set_title(variable)

    # Display the figure before the function returns.
    plt.show()

In [None]:
# Inspect every transformed variable of the OCE training set.
for transformed_col in to_trans:
    diagnostic_plots(df=X_train_OCE, variable=transformed_col)

In [None]:
# Transform the test sets only.
# Both training sets were already transformed in the cells right after each
# rt.fit(...) call; passing them through `rt` again here would apply the power
# a second time, leaving train and test on different scales (and discarding the
# rounding applied to X_train_OCE).
# NOTE: like the other transform cells this overwrites its inputs, so run it
# once per kernel session.
X_test_OCE = rt.transform(X_test_OCE)
X_test_MCE = rt.transform(X_test_MCE)

In [None]:
# Persist the transformed splits for the next notebook.
for frame, csv_path in (
    (X_train_OCE, 'X_train_OCE_4.csv'),
    (X_test_OCE, 'X_test_OCE_4.csv'),
    (X_train_MCE, 'X_train_MCE_4.csv'),
    (X_test_MCE, 'X_test_MCE_4.csv'),
):
    frame.to_csv(csv_path)

In [None]:
# Count missing values per column in the MCE test set, largest counts first.
(
    X_test_MCE
    .isnull()
    .sum()
    .sort_values(ascending=False)
)