In [1]:
## Importing necessary libaries

import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

In [2]:
## 1. Using the pandas library to read the csv data file and create a data-frames called insurance

## Defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'data-448-bucket-callaghan'
bucket = s3.Bucket(bucket_name)

## Defining the csv file
file_key = 'insurance.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

insurance = pd.read_csv(file_content_stream)

insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
## 2. Changing sex, smoker and region from labels to dummy variables.

## Changing labels to numbers
insurance['sex'] = np.where(insurance['sex'] == 'female', 0, 1)
insurance['smoker'] = np.where(insurance['smoker'] == 'no', 0, 1)

## Extracting region dummies
region_dummies = pd.get_dummies(insurance['region']).iloc[:, 0:3]

## Appending dummies
insurance = pd.concat([insurance, region_dummies], axis = 1)

insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northeast,northwest,southeast
0,19,0,27.9,0,1,southwest,16884.924,0,0,0
1,18,1,33.77,1,0,southeast,1725.5523,0,0,1
2,28,1,33.0,3,0,southeast,4449.462,0,0,1
3,33,1,22.705,0,0,northwest,21984.47061,0,1,0
4,32,1,28.88,0,0,northwest,3866.8552,0,1,0


In [4]:
## 3.  Engineer the interactions/features from Chapter 4 lecture notes (the ones from the decision tree)

## Feature engineering:
insurance['interaction_1'] = np.where((insurance['smoker'] == 0) & (insurance['age'] <= 32.5), 1, 0)

insurance['interaction_2'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 32.5) & (insurance['age'] <= 44.5), 1, 0)

insurance['interaction_3'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 44.5) & (insurance['age'] < 51.5), 1, 0)

insurance['interaction_4'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 51.5), 1, 0)

In [5]:
## 4. Based on the feature selection analysis shown in Chapter 4, it seems that age, bmi, children, smoker, and interaction 4 
## are the top 5 important variables. Using the top variables as input variables and charges as the target variable to split the 
## data into three datasets: train (80%), validation (10%) and test (10%)

## Defining input and target variables
X = insurance[['age','bmi', 'children', 'smoker', 'interaction_4']]
Y = insurance['charges']

## Splitting the data
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.2)
X_val, X_test, Y_val, Y_test = train_test_split(X_val, Y_val, test_size = 0.5)

## Changing the scale of the inputs
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.fit_transform(X_val)
X_test = scaler.fit_transform(X_test)

In [7]:
## 5. Using train data-frame and the top 5 features to perform a hyper-tuning job on the random forest model using the 
## GridSearchCV function and the following dictionary:

RF_param_grid = {'n_estimators': [100, 300, 500], 'min_samples_split': [10, 15], 'min_samples_leaf': [5, 7], 'max_depth': [3, 5, 7]}

## Running GridSearchCV with three folds
rf_grid_search = GridSearchCV(RandomForestRegressor(), RF_param_grid, cv = 3, scoring = 'neg_mean_squared_error', 
                             n_jobs = -1).fit(X_train, Y_train)

## Printing the best hyper-parameters
print('Best hyper-parameter combination:', rf_grid_search.best_params_)

## Extracting the optimal model
rf_md = rf_grid_search.best_estimator_

## Using the optimal model to predict the charges on the validation and test set
rf_val_preds = rf_md.predict(X_val)
rf_test_preds = rf_md.predict(X_test)

## Computing the mean squared error of the predictions 
rf_val_mse = mean_squared_error(Y_val, rf_val_preds)
rf_test_mse = mean_squared_error(Y_test, rf_test_preds)

print('\nMean Square Error on the validation set:', round(rf_val_mse))
print('Mean Square Error on the testing set:', round(rf_test_mse))

Best hyper-parameter combination: {'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 15, 'n_estimators': 100}

Mean Square Error on the validation set: 14338492
Mean Square Error on the testing set: 23572859


In [8]:
## 6. Using train data-frame and the top 5 features to perform a hyper-tuning job on the support vector machine model using the 
## GridSearchCV function and the following dictionary:

SVM_param_grid = {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [0.01, 0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}

## Running GridSearchCV with three folds
svm_grid_search = GridSearchCV(SVR(), SVM_param_grid, cv = 3, scoring = 'neg_mean_squared_error', 
                             n_jobs = -1).fit(X_train, Y_train)

## Printing the best hyper-parameters
print('Best hyper-parameter combination:',svm_grid_search.best_params_)

## Extracting the optimal model
svm_md = svm_grid_search.best_estimator_

## Using the optimal model to predict the charges on the validation and test set
svm_val_preds = svm_md.predict(X_val)
svm_test_preds = svm_md.predict(X_test)

## Computing the mean squared error of the predictions 
svm_val_mse = mean_squared_error(Y_val, svm_val_preds)
svm_test_mse = mean_squared_error(Y_test, svm_test_preds)

print('\nMean Square Error on the validation set:', round(svm_val_mse))
print('Mean Square Error on the testing set:', round(svm_test_mse))

Best hyper-parameter combination: {'C': 10, 'gamma': 1, 'kernel': 'poly'}

Mean Square Error on the validation set: 80785897
Mean Square Error on the testing set: 102938451


In [9]:
## 7. Using the predictions on the validation data-frame from parts 5 & 6 to build an ensemble model (using the random forest model)

## Building the ensemble data set
X_ensemble = pd.concat([pd.DataFrame(rf_val_preds), pd.DataFrame(svm_val_preds)], axis = 1)

## Performing a hyper-parameter tuning job on the ensemble model (using the same set of hyper-parameters from part 5)
ensemble_grid_search = GridSearchCV(RandomForestRegressor(), RF_param_grid, cv = 3, scoring = 'neg_mean_squared_error', 
                             n_jobs = -1).fit(X_ensemble, Y_val)

## Printing the best hyper-parameters
print('Best hyper-parameter combination:', ensemble_grid_search.best_params_)

## Identifying the model that produces the minimum squared error
rf_md_ensemble = ensemble_grid_search.best_estimator_

## Using the ensemble model to predict charges on the test data-frame
rf_ensemble_preds = rf_md_ensemble.predict(pd.concat([pd.DataFrame(rf_test_preds), pd.DataFrame(svm_test_preds)], axis = 1))

## Computing the mean squared error of the predictions on the test data-frame
rf_ensemble_mse = mean_squared_error(Y_test, rf_ensemble_preds)
print('\nMean Square Error on the testing set using the ensemble model:', round(rf_ensemble_mse))

Best hyper-parameter combination: {'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 100}

Mean Square Error on the testing set using the ensemble model: 27002154


## 8. Results:

Based on the results from parts 5, 6, and 7, we would use the Random Forest Classifier model with the following hyper-parameters: 'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 15, 'n_estimators': 100. This model had a slightly better MSE than the ensemble model.