In [1]:
## Importing necessary libaries

import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import cost_function as cf
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, make_scorer

In [2]:
## 1. Using the pandas library to read the csv data files and create three data-frames called train,
## validation and test, respectively.

## Defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'data-448-bucket-callaghan'
bucket = s3.Bucket(bucket_name)

## Defining the csv file
file_key = 'turnover_train.csv'
file_key2 = 'turnover_val.csv'
file_key3 = 'turnover_test.csv'

bucket_object = bucket.Object(file_key)
bucket_object2 = bucket.Object(file_key2)
bucket_object3 = bucket.Object(file_key3)

file_object = bucket_object.get()
file_object2 = bucket_object2.get()
file_object3 = bucket_object3.get()

file_content_stream = file_object.get('Body')
file_content_stream2 = file_object2.get('Body')
file_content_stream3 = file_object3.get('Body')

train = pd.read_csv(file_content_stream)
val = pd.read_csv(file_content_stream2)
test = pd.read_csv(file_content_stream3)

train.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary,left
0,0.92,0.95,6,239,4,0,0,sales,medium,0
1,0.88,0.89,4,254,5,0,0,sales,low,1
2,0.66,0.93,5,253,5,0,0,product_mng,low,1
3,0.46,0.45,2,172,2,1,0,management,low,0
4,0.88,0.75,5,152,3,0,0,hr,high,0


In [3]:
## 2. Changing sales and salary from labels to dummy variables

## Sales
train = pd.concat([train.drop(columns = ['sales']), pd.get_dummies(train['sales'])], axis = 1)
val = pd.concat([val.drop(columns = ['sales']), pd.get_dummies(val['sales'])], axis = 1)
test = pd.concat([test.drop(columns = ['sales']), pd.get_dummies(test['sales'])], axis = 1)

## Salary
train = pd.concat([train.drop(columns = ['salary']), pd.get_dummies(train['salary'])], axis = 1)
val = pd.concat([val.drop(columns = ['salary']), pd.get_dummies(val['salary'])], axis = 1)
test = pd.concat([test.drop(columns = ['salary']), pd.get_dummies(test['salary'])], axis = 1)

In [4]:
## 3. Engineering the interactions/features from In-Class assignment 9 (the ones from the decision tree)

train['Interaction_1'] = np.where((train['satisfaction_level'] <= 0.465) & (train['number_project'] <= 2.5) & 
                                     (train['last_evaluation'] <= 0.575), 1, 0)
val['Interaction_1'] = np.where((val['satisfaction_level'] <= 0.465) & (val['number_project'] <= 2.5) & 
                                     (val['last_evaluation'] <= 0.575), 1, 0)
test['Interaction_1'] = np.where((test['satisfaction_level'] <= 0.465) & (test['number_project'] <= 2.5) & 
                                     (test['last_evaluation'] <= 0.575), 1, 0)

train['Interaction_2'] = np.where((train['satisfaction_level'] <= 0.465) & (train['number_project'] > 2.5) & 
                                     (train['satisfaction_level'] <= 0.115), 1, 0)
val['Interaction_2'] = np.where((val['satisfaction_level'] <= 0.465) & (val['number_project'] > 2.5) & 
                                     (val['satisfaction_level'] <= 0.115), 1, 0)
test['Interaction_2'] = np.where((test['satisfaction_level'] <= 0.465) & (test['number_project'] > 2.5) & 
                                     (test['satisfaction_level'] <= 0.115), 1, 0)

train['Interaction_3'] = np.where((train['satisfaction_level'] > 0.465) & (train['time_spend_company'] <= 4.5) & 
                                     (train['average_montly_hours'] <= 290.5), 1, 0)
val['Interaction_3'] = np.where((val['satisfaction_level'] > 0.465) & (val['time_spend_company'] <= 4.5) & 
                                     (val['average_montly_hours'] <= 290.5), 1, 0)
test['Interaction_3'] = np.where((test['satisfaction_level'] > 0.465) & (test['time_spend_company'] <= 4.5) & 
                                     (test['average_montly_hours'] <= 290.5), 1, 0)

In [5]:
## Defining input and target variables

X_train = train[['Interaction_3', 'Interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
Y_train = train['left']

X_val = val[['Interaction_3', 'Interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
Y_val = val['left']

X_test = test[['Interaction_3', 'Interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
Y_test = test['left']

## Scaling the input data
scaler = MinMaxScaler(feature_range = (0, 1))
X_train = scaler.fit_transform(X_train)
X_val = scaler.fit_transform(X_val)
X_test = scaler.fit_transform(X_test)

In [6]:
## 4. Using train data-frame and the top 5 features to perform a hyper-tuning job on the Random Forest model using the 
## GridSearchCV function 

## Defining the parameter dictionary
rf_param_grid = {'n_estimators': [100, 300, 500], 'min_samples_split': [10, 15], 'min_samples_leaf': [5, 7], 'max_depth' : [3, 5, 7]}

## Defining the custom scorer
my_score_function = make_scorer(cf.cost_function, greater_is_better = True, needs_proba = True)

## Running GridSearchCV with 3 folds and the customized cost function
rf_grid_search = GridSearchCV(RandomForestClassifier(), rf_param_grid, cv = 3, scoring = my_score_function, n_jobs = -1).fit(X_train, Y_train)

## Extracting the best hyper-parameters
rf_md = rf_grid_search.best_estimator_

In [7]:
## Using the optimal model to predict the likelihood of left on the validation and test data-frames
rf_val_pred = rf_md.predict_proba(X_val)[:, 1]
rf_test_pred = rf_md.predict_proba(X_test)[:, 1]

## Finding the optimal cutoff value by comparing the likelihoods of left in validation and the actual left values in 
## the validation
opt_cutoff = cf.cost_function_cutoff(Y_val, rf_val_pred)

## Use this cutoff to change the likelihoods of left in the test data-frame to label
rf_labels = np.where(rf_test_pred < opt_cutoff, 0, 1)

## Compute the cost of this prediction on the test data-frame
X = confusion_matrix(Y_test, rf_labels)
print(X)
print('\nThe cost of the RF model is:', -1500*X[1,0] - 1000*X[0, 1] + 500*X[1,1])

[[1130   13]
 [  28  329]]

The cost of the RF model is: 109500


In [8]:
## 5. Using train data-frame and the top 5 features to perform a hyper-tuning job on the SVM model using the 
## GridSearchCV function 

## Defining the parameter dictionary
SVM_param_grid = {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [0.01, 0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}

## Running GridSearchCV with 3 folds and the customized cost function
svm_grid_search = GridSearchCV(SVC(probability = True), SVM_param_grid, cv = 3, scoring = my_score_function, n_jobs = -1).fit(X_train, Y_train)

## Extracting the best hyper-parameters
svm_md = svm_grid_search.best_estimator_

In [9]:
## Using the optimal model to predict the likelihood of left on the validation and test data-frames
svm_val_pred = svm_md.predict_proba(X_val)[:, 1]
svm_test_pred = svm_md.predict_proba(X_test)[:, 1]

## Finding the optimal cutoff value by comparing the likelihoods of left in validation and the actual left values in 
## the validation
opt_cutoff = cf.cost_function_cutoff(Y_val, svm_val_pred)

## Use this cutoff to change the likelihoods of left in the test data-frame to label
svm_labels = np.where(svm_test_pred < opt_cutoff, 0, 1)

## Compute the cost of this prediction on the test data-frame
X = confusion_matrix(Y_test, svm_labels)
print(X)
print('\nThe cost of the SVC model is:', -1500*X[1,0] - 1000*X[0, 1] + 500*X[1,1])

[[1121   22]
 [  31  326]]

The cost of the SVC model is: 94500


In [None]:
## Based on the results from parts 5, 6, and 7, we would you use the Random Forest model to predict left 
## because of the better cost function result.