In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
import precision_recall_cutoff as prc
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

## Naming the bucket and the csv file stored in it
bucket_name = 'data-448-bucket-callaghan'
file_key = 'turnover.csv'

## Connecting to S3 and pointing at the bucket
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Requesting the csv object and taking the body of the response as a stream
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## 1. Reading the csv stream with pandas into a data-frame called turnover
turnover = pd.read_csv(file_content_stream)

## Showing the first rows as the cell output
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
## 2. Changing sales and salary from labels to dummy variables

## Each label column is dropped and its dummy variables are appended on the right
## (sales is processed first, then salary)
for label_column in ['sales', 'salary']:
    dummy_columns = pd.get_dummies(turnover[label_column])
    turnover = pd.concat([turnover.drop(columns = [label_column]), dummy_columns], axis = 1)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [3]:
## 3. Engineering the interactions/features from In-Class assignment 9 (the ones from the decision tree)

## Split conditions that are shared by more than one interaction
low_satisfaction = turnover['satisfaction_level'] <= 0.465
short_tenure = turnover['time_spend_company'] <= 4.5

## Interaction_1: low satisfaction, at most 2.5 projects and last evaluation of at most 0.575
few_projects_low_eval = (turnover['number_project'] <= 2.5) & (turnover['last_evaluation'] <= 0.575)
turnover['Interaction_1'] = np.where(low_satisfaction & few_projects_low_eval, 1, 0)

## Interaction_2: low satisfaction, more than 2.5 projects and satisfaction of at most 0.115
many_projects = turnover['number_project'] > 2.5
very_low_satisfaction = turnover['satisfaction_level'] <= 0.115
turnover['Interaction_2'] = np.where(low_satisfaction & many_projects & very_low_satisfaction, 1, 0)

## Interaction_3: satisfaction above 0.465, at most 4.5 years in the company and at most 290.5 monthly hours
high_satisfaction = turnover['satisfaction_level'] > 0.465
moderate_hours = turnover['average_montly_hours'] <= 290.5
turnover['Interaction_3'] = np.where(high_satisfaction & short_tenure & moderate_hours, 1, 0)

In [4]:
## 4. Using satisfaction level, last evaluation, number project, average montly hours, time spend company, Work accident, 
## promotion last 5years, sales (dummy variables), and salary (dummy variables) and interactions/features (from part 3) 
## as the input variables and left as the target variable to split the data into two data-frames 

## Defining the input and target variables
## 'Unnamed: 0' is not one of the input variables listed above (in the displayed rows it just repeats the 
## row number, so it looks like the index saved with the csv file); it is removed so that it cannot be 
## picked up as a feature. errors = 'ignore' keeps this line working if the column is not there.
X = turnover.drop(columns = ['left']).drop(columns = ['Unnamed: 0'], errors = 'ignore')
Y = turnover['left']

## Splitting the data (80% train, 20% test, stratified on the target)
## random_state is fixed so that the same split is obtained every time the notebook is run
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 42)

# ## Scaling the input features (left disabled)
# ## NOTE: if this is turned back on, the scaler must be fitted on the train data only and then 
# ## applied to the test data with transform (not fit_transform)
# scaler = MinMaxScaler()
# X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
# X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

In [5]:
## 5. Using train data-frame to run the RFECV algorithm with step = 1, min features to select = 4, cv = 3 and 
## base algorithm random forest (with 500 trees and the maximum depth of each tree equal to 3)

## Defining the base algorithm
rf_base = RandomForestClassifier(n_estimators = 500, max_depth = 3)

## Building the RFECV and fitting it on the train data
rf_rfe = RFECV(estimator = rf_base, step = 1, min_features_to_select = 4, cv = 3)
rf_rfe.fit(X_train, Y_train)

## Printing features to select (support_ is True for the columns that were kept)
print(X_train.columns[rf_rfe.support_])

Index(['satisfaction_level', 'Interaction_1', 'Interaction_2',
       'Interaction_3'],
      dtype='object')


In [6]:
## Building a random forest (with 500 trees and the maximum depth of each tree equal to 3) with the selected features 
## from RFECV algorithm

## Defining the new input data
## The columns are read from the fitted RFECV object instead of being typed in by hand, so that they 
## always match the features selected in part 5, even if the selection changes when the notebook is re-run
selected_features = X_train.columns[rf_rfe.support_]
X_train_1 = X_train[selected_features]
X_test_1 = X_test[selected_features]

## Building the model
rf_md = RandomForestClassifier(max_depth = 3, n_estimators = 500).fit(X_train_1, Y_train)

## Predicting on the test set (second column = predicted likelihood of left = 1)
rf_preds = rf_md.predict_proba(X_test_1)[:, 1]

## Extracting the predicted labels
rf_labels = prc.precision_recall_cutoff(Y_test, rf_preds)

## Computing the classification report
print(classification_report(Y_test, rf_labels))

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      2286
           1       0.88      0.92      0.90       714

    accuracy                           0.95      3000
   macro avg       0.93      0.94      0.93      3000
weighted avg       0.95      0.95      0.95      3000



In [7]:
## 6. Using train data-frame and the selected features from part (5) to perform a hyper-tuning job on the random 
## forest model. Using the GridSearchCV function and the following dictionary:

rf_param_grid = {
    'n_estimators': [100, 300, 500],
    'min_samples_split': [10, 15],
    'min_samples_leaf': [5, 6, 7],
    'max_depth': [3, 4, 5, 7],
}

## Performing grid search with five folds, scoring each combination with f1
rf_grid_search = GridSearchCV(estimator = RandomForestClassifier(), param_grid = rf_param_grid, 
                              scoring = 'f1', cv = 5)
rf_grid_search.fit(X_train_1, Y_train)

## Extracting the best model
rf_grid_search.best_estimator_

RandomForestClassifier(max_depth=4, min_samples_leaf=5, min_samples_split=10)

In [8]:
## Building a random forest model with the best hyperparameter combination
## The combination is taken from best_params_ of the grid search in part 6, so every hyperparameter 
## (including n_estimators) is the one the search selected and nothing has to be copied by hand
rf_md = RandomForestClassifier(**rf_grid_search.best_params_).fit(X_train_1, Y_train)

## Predicting on the test set (second column = predicted likelihood of left = 1)
rf_preds = rf_md.predict_proba(X_test_1)[:, 1]

## Extracting the predicted labels
rf_labels = prc.precision_recall_cutoff(Y_test, rf_preds)

## Computing the classification report
print(classification_report(Y_test, rf_labels))

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      2286
           1       0.88      0.91      0.90       714

    accuracy                           0.95      3000
   macro avg       0.92      0.94      0.93      3000
weighted avg       0.95      0.95      0.95      3000



In [None]:
## Using the results from parts 5 and 6, we would use either of the Random Forest Classifier models 
## to predict left (equal performance)