In [1]:
pip install XGBoost

Collecting XGBoost
  Downloading xgboost-1.5.2-py3-none-manylinux2014_x86_64.whl (173.6 MB)
     |████████████████████████████████| 173.6 MB 10 kB/s                      | 39.9 MB 8.1 MB/s eta 0:00:17
Installing collected packages: XGBoost
Successfully installed XGBoost-1.5.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
## Importing necessary libaries

import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import cost_function as cf
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
## 1. Using the pandas library to read the csv data file and create a data-frame called turnover

## Defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'data-448-bucket-callaghan'
bucket = s3.Bucket(bucket_name)

## Defining the csv file
file_key = 'turnover.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

turnover = pd.read_csv(file_content_stream)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [4]:
## 2. Changing sales and salary from labels to dummy variables

## Sales
turnover = pd.concat([turnover.drop(columns = ['sales']), pd.get_dummies(turnover['sales'])], axis = 1)

## Salary
turnover = pd.concat([turnover.drop(columns = ['salary']), pd.get_dummies(turnover['salary'])], axis = 1)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [5]:
## 3. Engineering the interactions/features from In-Class assignment 9 (the ones from the decision tree)

turnover['Interaction_1'] = np.where((turnover['satisfaction_level'] <= 0.465) & (turnover['number_project'] <= 2.5) & 
                                     (turnover['last_evaluation'] <= 0.575), 1, 0)

turnover['Interaction_2'] = np.where((turnover['satisfaction_level'] <= 0.465) & (turnover['number_project'] > 2.5) & 
                                     (turnover['satisfaction_level'] <= 0.115), 1, 0)

turnover['Interaction_3'] = np.where((turnover['satisfaction_level'] > 0.465) & (turnover['time_spend_company'] <= 4.5) & 
                                     (turnover['average_montly_hours'] <= 290.5), 1, 0)

In [6]:
## 4. Using satisfaction level, last evaluation, number project, average montly hours, time spend company, Work accident, 
## promotion last 5years, sales (dummy variables), and salary (dummy variables) and interactions/features (from part 3) 
## as the input variables and left as the target variable to split the data into two data-frames 

## Defining the input and target variables
X = turnover.drop(columns = ['left'])
Y = turnover['left']

## Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [7]:
## 5. Using train data-frame and the top 5 features to perform a hyper-tuning job on the Random Forest model using the 
## GridSearchCV function 

## Defining input variables
X_train = X_train[['Interaction_3', 'Interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
X_test = X_test[['Interaction_3', 'Interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]

## Defining the parameter dictionary
rf_param_grid = {'n_estimators': [100, 300, 500], 'min_samples_split': [10, 15], 
                 'min_samples_leaf': [5, 7], 'max_depth' : [3, 5, 7]}

## Defining the customized scoring function
my_scorer = make_scorer(cf.cost_function, greater_is_better = True, needs_proba = True)

## Running GridSearchCV with 3 folds
rf_grid_search = GridSearchCV(RandomForestClassifier(), rf_param_grid, cv = 3, scoring = my_scorer, n_jobs = -1).fit(X_train, Y_train)

## Extracting the best hyper-parameters
rf_grid_search.best_params_

{'max_depth': 7,
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'n_estimators': 500}

In [8]:
## Building a RF model with the best hyper-parameter combination from grid search
rf_md = rf_grid_search.best_estimator_

## Predicting on the test data-frame
rf_preds = rf_md.predict_proba(X_test)[:, 1]

## Finding the optimal cutoff value by comparing the likelihoods of left in test and the actual left values in the test
optimal_cutoff = cf.cost_function_cutoff(Y_test, rf_preds)

## Turning likelihoods into labels
rf_labels = np.where(rf_preds < optimal_cutoff, 0, 1)

## Computing the confusion matrix
X = confusion_matrix(Y_test, rf_labels)
print(X)
print('The cost of the RF model:', (-1500 * X[1, 0]) - (1000 * X[0, 1]) + (500 * X[1, 1]))

[[2258   28]
 [  69  645]]
The cost of the RF model: 191000


In [9]:
## 6. Using train data-frame and the top 5 features to perform a hyper-tuning job on the XGB model using the 
## GridSearchCV function 

## Defining the parameter dictionary
XGBoost_param_grid = {'n_estimators': [500], 'max_depth': [3, 5, 7], 'min_child_weight': [5, 7], 'learning_rate': [0.01], 
                      'gamma': [0.3, 0.1], 'subsample': [0.8, 1], 'colsample_bytree': [1]}

## Running GridSearchCV with 3 folds
xgb_grid_search = GridSearchCV(XGBClassifier(use_label_encoder = False, eval_metric = 'logloss'), XGBoost_param_grid, 
                               cv = 3, scoring = my_scorer, n_jobs = -1).fit(X_train, Y_train)

## Extracting the best hyper-parameters
xgb_grid_search.best_params_

{'colsample_bytree': 1,
 'gamma': 0.3,
 'learning_rate': 0.01,
 'max_depth': 7,
 'min_child_weight': 7,
 'n_estimators': 500,
 'subsample': 1}

In [10]:
## Building a XGB model with the best hyper-parameter combination from grid search
xgb_md = xgb_grid_search.best_estimator_

## Predicting on the test data-frame
xgb_preds = xgb_md.predict_proba(X_test)[:, 1]

## Finding the optimal cutoff value by comparing the likelihoods of left in test and the actual left values in the test
optimal_cutoff = cf.cost_function_cutoff(Y_test, xgb_preds)

## Turning likelihoods into labels
xgb_labels = np.where(xgb_preds < optimal_cutoff, 0, 1)

## Computing the confusion matrix
X = confusion_matrix(Y_test, xgb_labels)
print(X)
print('The cost of the XGB model:', (-1500 * X[1, 0]) - (1000 * X[0, 1]) + (500 * X[1, 1]))

[[2256   30]
 [  70  644]]
The cost of the XGB model: 187000


## Results: Based on the results from parts 5, and 6, we would use the Random Forest model with the optimal hyper-parameters to predict left because it has the optimal cost value on the test predictions.