# Processing Numerical Data

In [45]:
import matplotlib.pyplot as plt
import seaborn as sns
# Use matplotlib's built-in 'ggplot' style sheet for every figure in this notebook
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display all cell outputs: show the value of every top-level expression
# in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Handle to the running IPython shell; used below to check which
# extensions are already loaded
from IPython import get_ipython
ipython = get_ipython()

# Render matplotlib figures inline in the notebook, using the SVG format
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# autoreload extension: load it only when it has not been loaded yet,
# so that re-running this cell does not try to load it a second time
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# Mode 2: re-import modules before running code, so edits made to imported
# .py files (e.g. utils) are picked up without restarting the kernel
%autoreload 2

## Load our t-shirts
Uh-oh. `utils.py` is no longer in this directory. How do we get to it? Simple: make `utils` a package and import it. See '41_create_an_importable_package.pdf' under week 4 on the course website.

In [46]:
# `utils` lives one directory above this notebook, so the directory that
# contains it has to be on sys.path before `import utils` can succeed.
import os
import sys

# Absolute path of the parent of the current working directory
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Guard against duplicates so that re-running this cell adds the path only once
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [47]:
# `utils` is importable here only because its parent directory was
# appended to sys.path in the previous cell
import utils as ut
# Generate the t-shirt order data and display it
df = ut.generate_tshirt_order()
df

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,78.030017,small,green,Joseph Sims,13
1,102.979420,small,black,Dale Ohmen,12
2,110.402883,small,black,Norma Swirsky,12
3,96.922537,small,black,Sherice Lane,16
4,109.718481,small,red,Katie Solomons,8
...,...,...,...,...,...
295,208.220984,large,orange,Latonya Robinson,11
296,170.343806,large,green,Lynn Mendez,9
297,151.321078,large,black,Niesha Turner,17
298,140.950670,large,orange,Savannah Pipkin,17


## Use dtypes to see the kinds of processing we will need

In [48]:
# Show the dtype of each column to tell numerical columns from object (string) ones
df.dtypes

weight           float64
t_shirt_size      object
t_shirt_color     object
name              object
Age                int64
dtype: object

In [20]:
# Count the number of distinct values in each column
df.nunique()

weight           300
t_shirt_size       3
t_shirt_color      5
name             300
Age               10
dtype: int64

## `weight` and `Age` are numerical, so let's go ahead and scale them
First using min-max scaling<br>
And then using standardization<br><br>

In [34]:
def transform(df, numerical_features, scaler):
    '''
    Return a scaled copy of a dataframe, leaving the original untouched.

    df: the source pandas DataFrame (it is copied, never modified)
    numerical_features: list of column names to scale
    scaler: object exposing fit_transform (e.g. a scikit-learn scaler);
            fit_transform is called on it with the selected columns

    returns: a new DataFrame in which the numerical_features columns hold
             the scaler's output and every other column is unchanged
    '''
    scaled_df = df.copy()
    # fit and apply the scaler on the selected columns of the copy only
    scaled_values = scaler.fit_transform(scaled_df[numerical_features])
    scaled_df[numerical_features] = scaled_values
    return scaled_df

### min max scaler

In [49]:
from sklearn.preprocessing import MinMaxScaler

# The scaler requires all columns that it operates on to be numerical,
# so only 'weight' and 'Age' are passed; the other columns are left as they are.
df1 = transform(df, ['weight','Age'], MinMaxScaler())
df1

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,0.101368,small,green,Joseph Sims,0.555556
1,0.223195,small,black,Dale Ohmen,0.444444
2,0.259443,small,black,Norma Swirsky,0.444444
3,0.193619,small,black,Sherice Lane,0.888889
4,0.256101,small,red,Katie Solomons,0.000000
...,...,...,...,...,...
295,0.737083,large,orange,Latonya Robinson,0.333333
296,0.552131,large,green,Lynn Mendez,0.111111
297,0.459244,large,black,Niesha Turner,1.000000
298,0.408606,large,orange,Savannah Pipkin,1.000000


In [50]:
# Summary statistics after min-max scaling: each scaled column should have min 0 and max 1
df1.describe()

Unnamed: 0,weight,Age
count,300.0,300.0
mean,0.401849,0.510741
std,0.197323,0.328165
min,0.0,0.0
25%,0.240458,0.222222
50%,0.393741,0.555556
75%,0.543706,0.777778
max,1.0,1.0


### standard scaler (subtract mean/divide by std)

In [51]:
from sklearn.preprocessing import StandardScaler

# Standardize the same two numerical columns (subtract the mean, divide by the std)
df2 = transform(df, ['weight','Age'], StandardScaler())
df2

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,-1.525328,small,green,Joseph Sims,0.136790
1,-0.906901,small,black,Dale Ohmen,-0.202359
2,-0.722893,small,black,Norma Swirsky,-0.202359
3,-1.057034,small,black,Sherice Lane,1.154237
4,-0.739858,small,red,Katie Solomons,-1.558955
...,...,...,...,...,...
295,1.701751,large,orange,Latonya Robinson,-0.541508
296,0.762879,large,green,Lynn Mendez,-1.219806
297,0.291357,large,black,Niesha Turner,1.493386
298,0.034303,large,orange,Savannah Pipkin,1.493386


In [52]:
# Summary statistics after standardization: the mean is ~0 (floating-point noise).
# The std is reported as ~1.0017 rather than exactly 1 because describe() uses the
# sample std (ddof=1) while StandardScaler divides by the population std (ddof=0).
df2.describe()

Unnamed: 0,weight,Age
count,300.0,300.0
mean,2.942091e-17,3.123427e-16
std,1.001671,1.001671
min,-2.039905,-1.558955
25%,-0.8192663,-0.8806567
50%,-0.04115702,0.1367901
75%,0.7201105,0.8150879
max,3.036394,1.493386
