In [134]:
# import libraries
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [135]:
# Load the raw dataset from disk into a DataFrame (path is relative to the notebook).
df = pd.read_csv(filepath_or_buffer='csvFiles/WAGE.csv')

# Data Exploration

In [136]:
# Data exploration: count how many rows fall into each value of the
# target column 'left' (employees who left vs. those who did not).
df.loc[:, 'left'].value_counts()

0    11428
1     3571
Name: left, dtype: int64

In [137]:
# Mean of every numeric feature, computed separately for each value of 'left'.
# numeric_only=True skips the string columns ('sales', 'salary'); pandas >= 2.0
# raises a TypeError on them instead of silently dropping them.
df.groupby('left').mean(numeric_only=True)

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
left,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.66681,0.715473,3.786664,199.060203,3.380032,0.175009,0.026251
1,0.440098,0.718113,3.855503,207.41921,3.876505,0.047326,0.005321


In [138]:
# Mean of every numeric column per department (the department column is still
# named 'sales' at this point in the notebook).
# numeric_only=True skips the string column 'salary'; pandas >= 2.0 raises a
# TypeError on it instead of silently dropping it.
df.groupby('sales').mean(numeric_only=True)

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
sales,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
IT,0.618142,0.71683,3.816626,202.215974,3.468623,0.133659,0.222494,0.002445
RandD,0.619822,0.712122,3.853875,200.800508,3.367217,0.170267,0.153748,0.034307
accounting,0.582151,0.717718,3.825293,201.162973,3.522816,0.125163,0.265971,0.018253
hr,0.598809,0.70885,3.654939,198.684709,3.355886,0.120433,0.290934,0.020298
management,0.621349,0.724,3.860317,201.249206,4.303175,0.163492,0.144444,0.109524
marketing,0.618601,0.715886,3.687646,199.385781,3.56993,0.160839,0.236597,0.050117
product_mng,0.619634,0.714756,3.807095,199.965632,3.47561,0.146341,0.219512,0.0
sales,0.614447,0.709717,3.776329,200.911353,3.534058,0.141787,0.244928,0.024155
support,0.6183,0.723109,3.803948,200.758188,3.393001,0.154778,0.248991,0.008973
technical,0.607897,0.721099,3.877941,202.497426,3.411397,0.140074,0.25625,0.010294


# Data Preprocessing

In [139]:
# Data preprocessing: show the column names, then the first few records.
col_names = list(df.columns)
print("Column names:", col_names, sep="\n")
print("\nSample data:")
df.head(5)

Column names:
['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'left', 'promotion_last_5years', 'sales', 'salary']

Sample data:


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [140]:
# The 'sales' column holds department names (IT, hr, accounting, ...),
# so rename it to 'department'.
df = df.rename({'sales': 'department'}, axis='columns')

In [141]:
# Display the dtype of every column. Columns reported as `object`
# (department, salary) hold strings and are one-hot encoded further down
# before modelling.
df.dtypes

satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
left                       int64
promotion_last_5years      int64
department                object
salary                    object
dtype: object

In [142]:
# Flag, for each column, whether at least one value is missing.
df.isna().any(axis=0)

satisfaction_level       False
last_evaluation          False
number_project           False
average_montly_hours     False
time_spend_company       False
Work_accident            False
left                     False
promotion_last_5years    False
department               False
salary                   False
dtype: bool

# Developing the model

In [143]:
# One-hot encode the categorical columns and append the indicator columns to df.
# The original string columns are kept here; they are removed in the next cell.
cat_vars = ['department', 'salary']
for var in cat_vars:
    # Indicator columns are named '<var>_<category>', e.g. 'salary_low'.
    df = df.join(pd.get_dummies(df[var], prefix=var))

In [144]:
# Remove the original string columns now that their one-hot indicators exist.
# Dropping by name (rather than by position 8 and 9, in place) means an
# accidental re-run fails loudly with a KeyError instead of silently removing
# whichever columns have shifted into those positions.
df = df.drop(columns=['department', 'salary'])
df.columns.values

array(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'left', 'promotion_last_5years', 'department_IT',
       'department_RandD', 'department_accounting', 'department_hr',
       'department_management', 'department_marketing',
       'department_product_mng', 'department_sales', 'department_support',
       'department_technical', 'salary_high', 'salary_low',
       'salary_medium'], dtype=object)

In [149]:
# Keep the full column list, then separate the target ('left') from the predictors.
df_vars = list(df.columns)
y = df.loc[:, 'left']
X = df.drop(columns=['left'])

In [153]:
# Standardize every feature (zero mean, unit variance) so that PCA is not
# dominated by large-valued columns such as average_montly_hours.
# StandardScaler is imported in the notebook's import cell at the top.
# Note: X is rebound from a DataFrame to the array returned by fit_transform.
X = StandardScaler().fit_transform(X)

In [157]:
# Project the standardized features onto their first two principal components.
# random_state pins the randomized SVD solver that scikit-learn may select for
# data of this shape, so the components are reproducible across runs.
pca = PCA(n_components=2, random_state=42)
principalComponents = pca.fit_transform(X)
# Reuse df's index so the concat below aligns rows by label even if df is ever
# filtered or re-indexed upstream (with the default index this is a no-op).
principalDf = pd.DataFrame(data=principalComponents, columns=['PC1', 'PC2'], index=df.index)
# Attach the target column next to the two components.
finalDf = pd.concat([principalDf, df[['left']]], axis=1)
finalDf

Unnamed: 0,PC1,PC2,left
0,-2.074657,-1.315119,1
1,2.095799,1.402640,1
2,2.192715,2.299177,1
3,-0.731634,1.870783,1
4,-2.078103,-1.318646,1
...,...,...,...
14994,-2.081007,-1.196139,1
14995,-2.140631,-1.342650,1
14996,-2.155700,-1.375989,1
14997,-0.453976,3.248224,1


In [158]:
# Train a random forest on the two principal components and report
# precision/recall/F1 on a 30% hold-out set.
# A fixed seed makes both the split and the forest reproducible on re-run.
RANDOM_STATE = 42

X = finalDf.drop(columns=['left'])
y = finalDf['left']
# NOTE(review): the scaler and PCA above were fit on the full dataset before
# this split, so the hold-out score may be slightly optimistic. Fitting them
# on X_train only (e.g. via a Pipeline) would remove that leakage.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)
RFC = RandomForestClassifier(random_state=RANDOM_STATE)
RFC.fit(X_train, y_train)
predictions = RFC.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      3412
           1       0.88      0.88      0.88      1088

    accuracy                           0.94      4500
   macro avg       0.92      0.92      0.92      4500
weighted avg       0.94      0.94      0.94      4500

