<a href='https://ai.meng.duke.edu'><img align="left" style="padding-top:10px;" src="https://storage.googleapis.com/aipi_datasets/Duke-AIPI-Logo.png" alt="Duke AIPI logo"></a>

# Encoding categorical variables
In this example we use data from a health insurance company about its customers and each customer's annual medical expenses. The objective is to develop a model that can predict a customer's medical expenses from demographic information about that customer.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.formula.api import ols

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Run this before any other code cell
# This downloads the csv data files into the same directory where you have saved this notebook

import urllib.request
from pathlib import Path
import os
# Downloads are saved relative to the current working directory
path = Path()

# Map each local file name to the URL it is fetched from
files = {'insurance_modified.csv':'https://storage.googleapis.com/aipi_datasets/insurance_modified.csv'}

# Fetch only the files that are not already on disk
for name, url in files.items():
    filename = path / name
    if filename.exists():
        continue
    urllib.request.urlretrieve(url, filename)

In [None]:
# Read in the data and preview the first few rows
data = pd.read_csv('insurance_modified.csv')
data.head()

In [None]:
# Separate the target column ('charges') from the feature columns
y = data['charges']
X = data.drop(columns='charges')

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
print("Shape of X_train, y_train:", X_train.shape, y_train.shape)
print("Shape of X_test, y_test:", X_test.shape, y_test.shape)

## Encode training set
### Ordinal encode 'sex', 'smoker', 'age_group'

In [None]:
def ordinal_encode(X,cols):
    '''
    Ordinal encodes the given columns, fitting a new encoder on the data passed in

    Inputs:
        X(pd.DataFrame): dataset containing the columns to encode (normally the training set)
        cols(list): list of columns to ordinal encode

    Returns:
        X(pd.DataFrame): copy of the input with the columns in cols replaced by their codes
        enc(sklearn.preprocessing._encoders.OrdinalEncoder): the fitted encoder, which can
            also be used to transform the test data
    '''
    # Work on a copy so the caller's dataframe is not modified as a side effect
    X = X.copy()
    # Categories that were not seen during fitting are encoded as -1 at transform time
    # instead of raising an error
    # NOTE: with the default categories='auto' the codes follow the sorted order of the
    # unique values in each column - check this matches the intended order of ordinal features
    enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    # Fit the encoder on the data and transform it in one step
    X[cols] = enc.fit_transform(X[cols])
    return X,enc

In [None]:
# The binary variables (sex, smoker) and the ordinal variable (age_group) get ordinal codes
ordinal_cols = ['sex', 'smoker', 'age_group']

# Encode a copy of the training features so that X_train itself stays unmodified
X_train_encoded, ordinal_enc = ordinal_encode(X_train.copy(), ordinal_cols)
X_train_encoded.head()

### One-hot encode 'region' and 'children'

In [None]:
def onehot_encode(X,cols):
    '''
    One-hot encodes the given columns, fitting a new encoder on the data passed in

    Inputs:
        X(pd.DataFrame): dataset containing the columns to encode (normally the training set)
        cols(list): list of columns to one-hot encode

    Returns:
        X(pd.DataFrame): the input with the columns in cols dropped and the one-hot
            columns appended
        onehot_enc(sklearn.preprocessing._encoders.OneHotEncoder): the fitted encoder, which
            can also be used to transform the test data
    '''
    # Treat new categories as a new 'unknown' category (all onehot columns are 0)
    onehot_enc = OneHotEncoder(handle_unknown='ignore')
    # Fit encoder on training data
    onehot_enc.fit(X[cols])
    # Get the names of the new columns created.  get_feature_names() was removed in
    # scikit-learn 1.2, so use get_feature_names_out() and only fall back on old versions
    if hasattr(onehot_enc,'get_feature_names_out'):
        colnames = list(onehot_enc.get_feature_names_out(cols))
    else:
        colnames = list(onehot_enc.get_feature_names(input_features=cols))
    # Transform the data (the encoder returns a sparse matrix, so convert it to a dense array)
    onehot_vals = onehot_enc.transform(X[cols]).toarray()
    # Put transformed data into dataframe, keeping the original index so the rows line up
    enc_df = pd.DataFrame(onehot_vals,columns=colnames,index=X.index)
    # Add onehot columns back onto original dataframe and drop the original columns
    X = pd.concat([X,enc_df],axis=1).drop(cols,axis=1)
    return X,onehot_enc

In [None]:
# 'region' and 'children' are the features that get one-hot encoded
onehotcols = ['region', 'children']
X_train_encoded, onehot_enc = onehot_encode(X=X_train_encoded, cols=onehotcols)

X_train_encoded.head()

## Encode test set
### Your turn
Complete the function `encode_test_set()` below, which takes as input the test set, the list of columns to ordinal encode `ordinal_cols`, the list of columns to one-hot encode `onehot_cols`, the fitted ordinal encoder and the fitted one-hot encoder. The function should use the ordinal encoder to encode the columns listed in `ordinal_cols` and the one-hot encoder to encode the columns listed in `onehot_cols`, and then return the test dataset with those columns encoded. Note: don't forget to drop the original columns in `onehot_cols` once you have one-hot encoded them.

In [None]:
def encode_test_set(X,ordinal_cols,onehot_cols,ordinal_encoder,onehot_encoder):
    '''
    Encodes the test set using ordinal and one-hot encoding

    Inputs:
        X(pd.DataFrame): test set inputs
        ordinal_cols(list): list of columns to ordinal encode
        onehot_cols(list): list of columns to one-hot encode
        ordinal_encoder(sklearn.preprocessing._encoders.OrdinalEncoder): ordinal encoder fit on the training data
        onehot_encoder(sklearn.preprocessing._encoders.OneHotEncoder): onehot encoder fit on the training data

    Returns:
        X(pd.DataFrame): test set with categorical features encoded
    '''
    ### BEGIN SOLUTION ###
    # Work on a copy so the caller's dataframe is not modified as a side effect
    X = X.copy()

    # Ordinal encode with the already-fitted encoder (transform only - the encoder must
    # not be refit on the test data)
    X[ordinal_cols] = ordinal_encoder.transform(X[ordinal_cols])

    # Get the names of the one-hot columns.  get_feature_names() was removed in
    # scikit-learn 1.2, so use get_feature_names_out() and only fall back on old versions
    if hasattr(onehot_encoder,'get_feature_names_out'):
        colnames = list(onehot_encoder.get_feature_names_out(onehot_cols))
    else:
        colnames = list(onehot_encoder.get_feature_names(input_features=onehot_cols))

    # One-hot encode with the already-fitted encoder
    onehot_vals = onehot_encoder.transform(X[onehot_cols])
    # The encoder may return a sparse matrix; convert it to a dense array if so
    if hasattr(onehot_vals,'toarray'):
        onehot_vals = onehot_vals.toarray()

    # Put transformed data into dataframe, keeping the original index so the rows line up
    enc_df = pd.DataFrame(onehot_vals,columns=colnames,index=X.index)
    # Add onehot columns onto the dataframe and drop the original columns
    X = pd.concat([X,enc_df],axis=1).drop(onehot_cols,axis=1)
    return X
    ### END SOLUTION ###

In [None]:
# Encode a copy of the test features using the encoders that were fit on the training set
X_test_encoded = encode_test_set(
    X_test.copy(),
    ordinal_cols,
    onehotcols,
    ordinal_enc,
    onehot_enc,
)
display(X_test_encoded.head())

assert X_test_encoded.shape == (268, 14)

### Run model

In [None]:
# Fit a linear regression model on the encoded training data
model = LinearRegression()
model.fit(X_train_encoded,y_train)

# Evaluate on the held-out test set: predictions are made from X_test_encoded and
# scored against y_test, so the reported R-squared is a test-set metric
testpreds = model.predict(X_test_encoded)
r2 = r2_score(y_test,testpreds)
print("The model's R-squared value on the test set is {:.3f}".format(r2))

assert np.round(r2,2)==0.80