# Processing Numerical Data

In [16]:
# Plotting and data libraries
import matplotlib.pyplot as plt
import seaborn as sns
# apply the ggplot style to matplotlib figures
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display the value of every expression statement in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Handle to the running IPython shell; used below to check which extensions are loaded
from IPython import get_ipython
ipython = get_ipython()

# Render matplotlib figures inside the notebook, in SVG format
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# autoreload extension: skip loading when it is already loaded (e.g. cell re-run)
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# mode 2: reload imported modules before executing each cell, so edits to
# local modules are picked up without restarting the kernel
%autoreload 2

## Load our t-shirts
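Uh-oh. utils.py is no longer in this directory. How do we get to it? Simple: make utils importable and import it. The cells below do this by adding its parent directory to `sys.path`; see the website for a 'how-to' on making utils a package.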

In [17]:
# utils lives one directory above the notebook's working directory, so that
# parent directory must be on the module search path before utils is imported.
import os
import sys

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# guard against duplicate entries when this cell is executed more than once
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [18]:
# utils is expected to be importable from PROJECT_ROOT, which was appended to sys.path above
import utils as ut
# build the t-shirt order table (presumably a pandas DataFrame: later cells
# call .dtypes / .nunique() on it) and display it
df = ut.generate_tshirt_order()
df

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,106.874778,small,black,Michael Morgan,14
1,121.341363,small,red,Joyce Robinson,14
2,124.555691,small,blue,Patrick Wells,15
3,102.067367,small,green,Lorene Ocampo,17
4,76.726329,small,blue,Gregory Moran,8
...,...,...,...,...,...
295,153.036868,large,red,Rosemary Conte,15
296,168.330535,large,red,Joel White,11
297,180.081728,large,blue,Renea Baxter,8
298,181.614071,large,blue,Dennis Steinmetz,8


## Use dtypes to see the kinds of processing we will need

In [19]:
# dtype of each column: object marks the non-numeric (text) columns,
# float64 / int64 mark the numeric ones
df.dtypes

weight           float64
t_shirt_size      object
t_shirt_color     object
name              object
Age                int64
dtype: object

In [20]:
# number of distinct values in each column
df.nunique()

weight           300
t_shirt_size       3
t_shirt_color      5
name             300
Age               10
dtype: int64

## weight and Age are numerical, so let's go ahead and scale them
First using min-max scaling<br>
and then using standardization<br><br>

In [34]:
def transform(df, numerical_features, scaler):
    '''
    Return a scaled copy of df; the original dataframe is never modified.

    df: pandas DataFrame that contains the columns to scale
    numerical_features: a column name, or a list of column names, to scale.
        An empty list means "scale nothing" and yields an unchanged copy.
    scaler: object exposing fit_transform(X), e.g. a scikit-learn scaler.
        It is fitted on the selected columns as a side effect of this call.

    returns: a new DataFrame in which the selected columns are replaced by
        the scaler output and every other column is carried over as-is
    raises: KeyError (from pandas) if a requested column is not in df
    '''
    # a bare column name would select a 1-D Series, while scalers expect a
    # 2-D table; wrap it so a single column is still passed as a DataFrame
    if isinstance(numerical_features, str):
        numerical_features = [numerical_features]

    df1 = df.copy()

    # nothing to scale: skip the scaler instead of fitting it on zero columns
    if len(numerical_features) == 0:
        return df1

    df1[numerical_features] = scaler.fit_transform(df1[numerical_features])
    return df1

### min max scaler

In [41]:
from sklearn.preprocessing import MinMaxScaler

# Only the numerical columns are handed to the scaler: every column it
# operates on has to be numerical, so the text columns are left out.
df1 = transform(
    df=df,
    numerical_features=['weight','Age'],
    scaler=MinMaxScaler(),
)
df1

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,0.233271,small,black,Michael Morgan,0.666667
1,0.310511,small,red,Joyce Robinson,0.666667
2,0.327673,small,blue,Patrick Wells,0.777778
3,0.207603,small,green,Lorene Ocampo,1.000000
4,0.072300,small,blue,Gregory Moran,0.000000
...,...,...,...,...,...
295,0.479742,large,red,Rosemary Conte,0.777778
296,0.561399,large,red,Joel White,0.333333
297,0.624141,large,blue,Renea Baxter,0.000000
298,0.632323,large,blue,Dennis Steinmetz,0.000000


In [42]:
# summary statistics of the numeric columns; after min-max scaling each
# scaled column should have min 0 and max 1
df1.describe()

Unnamed: 0,weight,Age
count,300.0,300.0
mean,0.415189,0.493704
std,0.205968,0.320774
min,0.0,0.0
25%,0.251617,0.222222
50%,0.387131,0.555556
75%,0.573036,0.777778
max,1.0,1.0


### standard scaler (subtract mean/divide by std)

In [39]:
from sklearn.preprocessing import StandardScaler

# Standardize the same two numerical columns: subtract each column's mean
# and divide by its standard deviation.
df2 = transform(
    df=df,
    numerical_features=['weight','Age'],
    scaler=StandardScaler(),
)
df2

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,-0.884711,small,black,Michael Morgan,0.540106
1,-0.509071,small,red,Joyce Robinson,0.540106
2,-0.425608,small,blue,Patrick Wells,0.887069
3,-1.009540,small,green,Lorene Ocampo,1.580996
4,-1.667547,small,blue,Gregory Moran,-1.541673
...,...,...,...,...,...
295,0.313936,large,red,Rosemary Conte,0.887069
296,0.711052,large,red,Joel White,-0.500784
297,1.016184,large,blue,Renea Baxter,-1.541673
298,1.055972,large,blue,Dennis Steinmetz,-1.541673


In [43]:
# after standardization the mean is ~0 (only floating-point noise remains).
# std prints as 1.001671 rather than exactly 1 because describe() reports the
# sample std (ddof=1) while StandardScaler divides by the population std
# (ddof=0); with 300 rows the ratio is sqrt(300/299) ~= 1.001671
df2.describe()

Unnamed: 0,weight,Age
count,300.0,300.0
mean,-4.834096e-16,-1.887379e-17
std,1.001671,1.001671
min,-2.01916,-1.541673
25%,-0.795486,-0.8477467
50%,-0.1364508,0.1931428
75%,0.767649,0.8870692
max,2.844073,1.580996
