# 5. Encoding Strings

## Summary of Commands

In [19]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from mymetrics import root_mean_squared_log_error

# Load the housing sample and split it into a feature matrix and a target vector.
hs = pd.read_csv('data/housing_sample.csv')
X = hs[['YearBuilt', 'GrLivArea', 'GarageArea', 'LotFrontage']].values
# pop() removes SalePrice from `hs`, so later cells see the predictors only.
y = hs.pop('SalePrice').values

# Seed the shuffle so the folds -- and therefore every score in the results
# table -- are the same on each "Restart & Run All".
kf = KFold(n_splits=5, shuffle=True, random_state=123)

# Pipeline: fill missing values -> standardize -> ridge regression.
si = SimpleImputer(strategy='mean')
ss = StandardScaler()
ridge = Ridge()

steps = [('impute', si), ('standardize', ss), ('ridge', ridge)]
pipe = Pipeline(steps)

# Keys follow the '<step name>__<parameter>' convention of Pipeline.
# np.logspace(-5, 5) yields 50 values, so the grid has 2 * 50 = 100 candidates.
grid = {'impute__strategy': ['mean', 'median'],
        'ridge__alpha': np.logspace(-5, 5)}
# Ask for train scores explicitly: newer scikit-learn versions do not compute
# them by default, and the results table is expected to contain them.
gs = GridSearchCV(estimator=pipe, param_grid=grid,
                  cv=kf, scoring=root_mean_squared_log_error,
                  return_train_score=True)
gs.fit(X, y)
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_impute__strategy,param_ridge__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.002045,0.000088,0.000690,0.000099,mean,1e-05,"{'impute__strategy': 'mean', 'ridge__alpha': 1...",-0.236353,-0.226634,-0.223048,-0.201107,-0.241703,-0.225769,0.014016,78,-0.222036,-0.230278,-0.226459,-0.228973,-0.217325,-0.225014,0.004764
1,0.002071,0.000199,0.000617,0.000020,mean,1.59986e-05,"{'impute__strategy': 'mean', 'ridge__alpha': 1...",-0.236353,-0.226634,-0.223048,-0.201107,-0.241703,-0.225769,0.014016,77,-0.222036,-0.230278,-0.226459,-0.228973,-0.217325,-0.225014,0.004764
2,0.002172,0.000283,0.000651,0.000040,mean,2.55955e-05,"{'impute__strategy': 'mean', 'ridge__alpha': 2...",-0.236353,-0.226634,-0.223048,-0.201107,-0.241703,-0.225769,0.014016,76,-0.222036,-0.230278,-0.226459,-0.228973,-0.217325,-0.225014,0.004764
3,0.001858,0.000062,0.000612,0.000047,mean,4.09492e-05,"{'impute__strategy': 'mean', 'ridge__alpha': 4...",-0.236353,-0.226634,-0.223048,-0.201107,-0.241703,-0.225769,0.014016,75,-0.222036,-0.230278,-0.226459,-0.228973,-0.217325,-0.225014,0.004764
4,0.001879,0.000101,0.000607,0.000016,mean,6.55129e-05,"{'impute__strategy': 'mean', 'ridge__alpha': 6...",-0.236353,-0.226634,-0.223048,-0.201107,-0.241703,-0.225769,0.014016,74,-0.222036,-0.230278,-0.226459,-0.228973,-0.217325,-0.225014,0.004764
5,0.001999,0.000243,0.000656,0.000088,mean,0.000104811,"{'impute__strategy': 'mean', 'ridge__alpha': 0...",-0.236353,-0.226634,-0.223048,-0.201107,-0.241703,-0.225769,0.014016,73,-0.222036,-0.230278,-0.226459,-0.228973,-0.217325,-0.225014,0.004764
6,0.002124,0.000303,0.000670,0.000135,mean,0.000167683,"{'impute__strategy': 'mean', 'ridge__alpha': 0...",-0.236353,-0.226634,-0.223048,-0.201107,-0.241703,-0.225769,0.014016,72,-0.222036,-0.230278,-0.226459,-0.228973,-0.217325,-0.225014,0.004764
7,0.002019,0.000165,0.000657,0.000117,mean,0.00026827,"{'impute__strategy': 'mean', 'ridge__alpha': 0...",-0.236353,-0.226634,-0.223048,-0.201107,-0.241703,-0.225769,0.014016,71,-0.222036,-0.230278,-0.226459,-0.228973,-0.217325,-0.225014,0.004764
8,0.002028,0.000263,0.000639,0.000061,mean,0.000429193,"{'impute__strategy': 'mean', 'ridge__alpha': 0...",-0.236353,-0.226634,-0.223048,-0.201107,-0.241703,-0.225769,0.014016,70,-0.222036,-0.230278,-0.226459,-0.228973,-0.217325,-0.225014,0.004764
9,0.001988,0.000252,0.000639,0.000043,mean,0.000686649,"{'impute__strategy': 'mean', 'ridge__alpha': 0...",-0.236353,-0.226634,-0.223048,-0.201107,-0.241703,-0.225769,0.014016,69,-0.222036,-0.230278,-0.226459,-0.228973,-0.217325,-0.225014,0.004764


## String vs Numeric columns
Thus far, we haven't looked at any of the string columns. This is because string columns are not allowed to be passed into machine learning estimators in scikit-learn. They must be encoded to numeric first. Two common ways of encoding strings to numeric are one-hot and ordinal. 

### One-hot encoding
One-hot encoding is useful whenever there is no inherent order of the string values, such as we see in the neighborhood column. The `OneHotEncoder` transformer is found in the `preprocessing` module.

In [2]:
# Preview the first five rows of the housing data.
hs.head()

Unnamed: 0,Neighborhood,Exterior1st,YearBuilt,LotFrontage,GrLivArea,GarageArea
0,CollgCr,VinylSd,2003,65.0,1710,548
1,Other,Other,1976,80.0,1262,460
2,CollgCr,VinylSd,2001,68.0,1786,608
3,Other,Other,1915,60.0,1717,642
4,Other,VinylSd,2000,84.0,2198,836


### Big upgrade to `OneHotEncoder` in 0.20
The `OneHotEncoder` was given a huge upgrade in version 0.20. Previously it was only able to handle numeric values.

### Using `OneHotEncoder`
It transforms each column into several new columns, where the number of new columns is equal to the number of unique values in the original column. Each row will contain all 0's except for a single column, which will be 1, to represent that string. Let's see an example of how it works with dummy data. Note that we must instantiate it with `sparse` set to `False` in order to visualize the array on our screens.

In [3]:
# Dummy single-column data set: one car make per row.
car_makes = ['chevy', 'ford', 'chevy', 'chrysler', 'mercedes']
a = np.array([[make] for make in car_makes])
a

array([['chevy'],
       ['ford'],
       ['chevy'],
       ['chrysler'],
       ['mercedes']], dtype='<U8')

In [4]:
from sklearn.preprocessing import OneHotEncoder
# sparse=False makes the encoder return a regular NumPy array that prints readably.
# NOTE(review): recent scikit-learn releases rename this argument to
# `sparse_output` -- confirm against the installed version.
ohe = OneHotEncoder(sparse=False)
# Learn the categories present in `a` and encode it in a single call.
ohe.fit_transform(a)

array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.]])

scikit-learn provides the method `get_feature_names` to get the name of each column.

In [5]:
# Names of the generated one-hot columns, in the same order as the output columns.
# NOTE(review): newer scikit-learn versions expose this as
# `get_feature_names_out` -- confirm against the installed version.
ohe.get_feature_names()

array(['x0_chevy', 'x0_chrysler', 'x0_ford', 'x0_mercedes'], dtype=object)

### Values not in the training set
If you wish to use this same encoding to transform another column, an error will be raised if one of the values did not appear in the training set as happens below with 'toyota'.

In [6]:
# Second dummy column; 'toyota' does not occur in the data the encoder was fit on.
new_makes = ['ford', 'chrysler', 'toyota']
b = np.array([[make] for make in new_makes])
b

array([['ford'],
       ['chrysler'],
       ['toyota']], dtype='<U8')

In [7]:
# Transforming a value that was not seen during fit raises a ValueError.
# The error is the point of this cell, so catch it and show the message
# instead of letting it halt "Restart & Run All".
try:
    ohe.transform(b)
except ValueError as err:
    print('ValueError:', err)

ValueError: Found unknown categories ['toyota'] in column 0 during transform

### Handle Unknown
As a workaround, you can instantiate `OneHotEncoder` by setting `handle_unknown` to 'ignore'. The unknown value will be encoded as a row of all 0's.

In [8]:
from sklearn.preprocessing import OneHotEncoder
# handle_unknown='ignore': values not seen during fit no longer raise an
# error when transformed.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
# Fit on the original dummy data; the encoding of `a` itself is unchanged.
ohe.fit_transform(a)

array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.]])

The last row of `b`, which holds the unknown value 'toyota', is encoded as all 0's.

In [9]:
# 'toyota' was not present when the encoder was fit, so its row is all zeros.
ohe.transform(b)

array([[0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.]])

## One-hot encode the housing dataset
Let's use the `OneHotEncoder` on both of the string columns in our dataset. Both columns contain missing values, so we first need to fill them in, and we do so with the string 'MISSING'. We use a pipeline with the `SimpleImputer` to fill in the missing values, followed by the `OneHotEncoder` to encode the result.

In [10]:
# Count the missing values in each column.
hs.isna().sum()

Neighborhood     14
Exterior1st      43
YearBuilt         0
LotFrontage     259
GrLivArea         0
GarageArea        0
dtype: int64

In [11]:
# Keep only the two string columns and convert them to a NumPy array.
string_columns = ['Neighborhood', 'Exterior1st']
X = hs[string_columns].values
# Show the first five rows.
X[:5]

array([['CollgCr', 'VinylSd'],
       ['Other', 'Other'],
       ['CollgCr', 'VinylSd'],
       ['Other', 'Other'],
       ['Other', 'VinylSd']], dtype=object)

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Step 1: replace every missing value with the literal string 'MISSING'.
si = SimpleImputer(strategy='constant', fill_value='MISSING')
# Step 2: one-hot encode the imputed strings, returning a regular NumPy array.
# NOTE(review): recent scikit-learn releases rename `sparse` to
# `sparse_output` -- confirm against the installed version.
ohe = OneHotEncoder(sparse=False)
steps = [('impute', si), ('encode', ohe)]

pipe = Pipeline(steps)
# Fit both steps on X and return the encoded array.
X1 = pipe.fit_transform(X)
X1[:5]

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 1.]])

In [13]:
# Look up the fitted encoder step by its name and list its output column names.
# NOTE(review): newer scikit-learn versions expose this as
# `get_feature_names_out` -- confirm against the installed version.
pipe.named_steps['encode'].get_feature_names()

array(['x0_CollgCr', 'x0_Edwards', 'x0_MISSING', 'x0_NAmes', 'x0_OldTown',
       'x0_Other', 'x1_HdBoard', 'x1_MISSING', 'x1_Other', 'x1_VinylSd'],
      dtype=object)

## Exercise
Read in the housing_original.csv file to get the full housing dataset with many more variables. Select a subset of the string variables to encode. Use a pipeline to fill missing values if need be.

## Extra - OrdinalEncoder
A different encoding strategy is to use a single column and encode each string with a different number beginning at 0. This makes sense if the strings have a natural ordering such as descriptive feedback ratings (very bad, bad, neutral, good, very good) or the color of a diamond. If there is no natural ordering then this method would arbitrarily rank one string greater than another.

Below is a simple example with some dummy data.

In [14]:
# Dummy single-column data set of feedback ratings, stored with object dtype.
ratings = ['good', 'very good', 'good', 'bad', 'very bad']
a = np.array([[rating] for rating in ratings], dtype='object')
a

array([['good'],
       ['very good'],
       ['good'],
       ['bad'],
       ['very bad']], dtype=object)

In [17]:
from sklearn.preprocessing import OrdinalEncoder

# Ratings listed from worst to best; a rating's position in this list is the
# number it gets encoded to.
rating_order = ['very bad', 'bad', 'neutral', 'good', 'very good']
oe = OrdinalEncoder(categories=[rating_order])
oe.fit_transform(a)

array([[3.],
       [4.],
       [3.],
       [1.],
       [0.]])