In [1]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import precision_recall_cutoff as prc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
import precision_recall_cutoff

# Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'data-445-wagner'
bucket = s3.Bucket(bucket_name)

# Defining the file to be read from s3 bucket
file_key = "turnover.csv"

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the csv file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
#Changing to dummy variables
turnover = pd.concat([turnover.drop(columns = ['sales']), pd.get_dummies(turnover['sales'])], axis = 1)
turnover = pd.concat([turnover, pd.get_dummies(turnover['salary'])], axis = 1)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,...,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,low,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,medium,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,low,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,low,0,...,0,0,0,0,1,0,0,0,1,0


In [3]:
#Creating features
turnover['Interaction_1'] = np.where((turnover['satisfaction_level'] <= 0.465) & (turnover['number_project'] <= 2.5) & (turnover['last_evaluation'] <= 0.575), 1, 0)
turnover['Interaction_2'] = np.where((turnover['satisfaction_level'] <= 0.465) & (turnover['number_project'] >= 2.5) & (turnover['satisfaction_level'] <= 0.115), 1, 0)
turnover['Interaction_3'] = np.where((turnover['satisfaction_level'] >= 0.465) & (turnover['time_spend_company'] <= 4.5) & (turnover['average_montly_hours'] <= 290.5), 1, 0)

In [4]:
#Defining Varibales
X = turnover.drop(columns = ['left', 'salary'])
Y = turnover['left']

#Splitting data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [11]:
#Variable selection only using train dataset
#Defining the list to store the results
results = list()

for i in range (0,10):
    #splitting data
    X_training, X_testing, Y_training, Y_testing = train_test_split(X_train, Y_train, test_size = 0.2, stratify = Y_train)
    
    #Building the random forest model
    RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_training, Y_training)
    
    #Extracting feature importances
    results.append(RF.feature_importances_)

#Changing to data frame
results = pd.DataFrame(results)
results.columns = X.columns
results


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,IT,RandD,accounting,...,product_mng,sales,support,technical,high,low,medium,Interaction_1,Interaction_2,Interaction_3
0,0.166045,0.032936,0.100791,0.059433,0.079687,0.007902,0.00048,6.5e-05,0.000285,2.7e-05,...,1e-05,7e-05,6.9e-05,0.000134,0.002641,0.005124,0.001172,0.209549,0.113638,0.219483
1,0.186464,0.038763,0.098078,0.061683,0.069206,0.007797,0.000908,1.7e-05,0.000357,1.1e-05,...,3.6e-05,2.8e-05,7.1e-05,0.000173,0.008892,0.005067,0.0012,0.174828,0.104801,0.240844
2,0.19265,0.038152,0.098598,0.05457,0.07114,0.00764,0.00065,5.9e-05,0.000274,3.5e-05,...,5e-05,4.1e-05,3.4e-05,6.3e-05,0.00442,0.007597,0.001293,0.200245,0.114459,0.207626
3,0.172846,0.03745,0.092389,0.060451,0.070778,0.00941,0.000894,4e-05,0.000225,1.7e-05,...,2.6e-05,0.000117,6e-05,6.6e-05,0.003824,0.004675,0.000693,0.198879,0.106966,0.239656
4,0.158882,0.038014,0.098159,0.060536,0.070582,0.008123,0.001178,4.8e-05,0.000316,2.7e-05,...,3.7e-05,4.8e-05,8.6e-05,8e-05,0.003156,0.007557,0.001088,0.200121,0.122076,0.229506
5,0.158516,0.038388,0.106369,0.058861,0.07128,0.007658,0.000475,5.3e-05,0.000263,3.5e-05,...,4.8e-05,7.6e-05,3.1e-05,0.000137,0.003976,0.008463,0.00142,0.206215,0.109742,0.227594
6,0.166011,0.03477,0.108652,0.050963,0.067799,0.008568,0.000919,3.4e-05,0.000242,2.2e-05,...,2.4e-05,4.5e-05,7.8e-05,6.6e-05,0.003202,0.004984,0.00091,0.194641,0.102504,0.255161
7,0.173147,0.039006,0.1024,0.058484,0.061509,0.008882,0.000812,6.7e-05,0.000405,3.8e-05,...,2.9e-05,7.1e-05,0.000135,4.4e-05,0.003687,0.00766,0.000865,0.186629,0.113231,0.242463
8,0.169004,0.041393,0.102698,0.055566,0.060313,0.008973,0.000867,0.000215,0.000457,1e-05,...,3e-05,6.3e-05,5.1e-05,0.000115,0.003121,0.004032,0.000816,0.210022,0.108914,0.233043
9,0.167777,0.029326,0.095546,0.063719,0.063621,0.010665,0.001063,4.3e-05,0.000112,4e-05,...,2.9e-05,6.5e-05,4.5e-05,0.000102,0.003414,0.004976,0.001043,0.201861,0.109736,0.246449


In [12]:
#computing averages and sorting importances
results = pd.DataFrame(results.apply(np.mean, axis = 0))
results = pd.DataFrame({'Feature': results.index, 'Importance': results[0].values})
results = results.sort_values(by = 'Importance', ascending = False)
results

Unnamed: 0,Feature,Importance
22,Interaction_3,0.234182
20,Interaction_1,0.198299
0,satisfaction_level,0.171134
21,Interaction_2,0.110607
2,number_project,0.100368
4,time_spend_company,0.068592
3,average_montly_hours,0.058427
1,last_evaluation,0.03682
5,Work_accident,0.008562
18,low,0.006014


In [17]:
X_train_1 = X_train[['Interaction_3', 'Interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company']]
X_test_1 = X_test[['Interaction_3', 'Interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company']]

#Building model
RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train_1, Y_train)
    
RF_pred = RF.predict_proba(X_test_1)[:, 1]

#Predicting labels
RF_labels =prc.precision_recall_cutoff(Y_test, RF_pred)

#Computing classification report
print(classification_report(Y_test, RF_labels))


              precision    recall  f1-score   support

           0       0.97      0.96      0.96      2286
           1       0.88      0.89      0.89       714

    accuracy                           0.95      3000
   macro avg       0.92      0.93      0.93      3000
weighted avg       0.95      0.95      0.95      3000



In [18]:
X_train_2 = X_train[['Interaction_3', 'Interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company', 'average_montly_hours']]
X_test_2 = X_test[['Interaction_3', 'Interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company', 'average_montly_hours']]

#Building model
RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train_2, Y_train)
    
RF_pred = RF.predict_proba(X_test_2)[:, 1]

#Predicting labels
RF_labels =prc.precision_recall_cutoff(Y_test, RF_pred)

#Computing classification report
print(classification_report(Y_test, RF_labels))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2286
           1       0.90      0.91      0.91       714

    accuracy                           0.95      3000
   macro avg       0.94      0.94      0.94      3000
weighted avg       0.96      0.95      0.96      3000



In [None]:
#I would go with the second model as there is a slight overall improvement in the accuracy totals