# QN 1: Predicting Employee Exit
### We will be working on Employee Data. This is employee attrition data. The 'left' named column represents exit from the company or not.

### The data consists of categorical & number data. Thus, needs data preprocessing. Make use of preprocessing techniques that you have learnt & build model to predict 'left' named column



In [47]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import datasets
from sklearn.metrics import accuracy_score

In [48]:
# Reading Dataset
data = pd.read_csv("Data1.csv")

In [49]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [50]:
data.rename(columns={'sales':'department'}, inplace=True)

In [51]:
# Data exploration
data.drop(0,inplace=True)

In [52]:
data.isnull().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
department               0
salary                   0
dtype: int64

In [53]:
data.shape

(14998, 10)

In [54]:
# finding the correlation of "left" with other columns
data.corr()['left']

satisfaction_level      -0.388316
last_evaluation          0.006698
number_project           0.023966
average_montly_hours     0.071402
time_spend_company       0.144879
Work_accident           -0.154590
left                     1.000000
promotion_last_5years   -0.061777
Name: left, dtype: float64

In [55]:
data.dtypes

satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
left                       int64
promotion_last_5years      int64
department                object
salary                    object
dtype: object

In [56]:
set(data.department)

{'IT',
 'RandD',
 'accounting',
 'hr',
 'management',
 'marketing',
 'product_mng',
 'sales',
 'support',
 'technical'}

In [57]:
c={'IT':0,'RandD':1,'accounting':2,'hr':3,'management':4,'marketing':5,'product_mng':6,'sales':7,'support':8,'technical':9}
data['department']=data['department'].map(c)

In [58]:
set(data.salary)

{'high', 'low', 'medium'}

In [59]:
d={'high':3, 'low':1, 'medium':2}
data['salary']=data['salary'].map(d)

In [60]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
1,0.8,0.86,5,262,6,0,1,0,7,2
2,0.11,0.88,7,272,4,0,1,0,7,2
3,0.72,0.87,5,223,5,0,1,0,7,1
4,0.37,0.52,2,159,3,0,1,0,7,1
5,0.41,0.5,2,153,3,0,1,0,7,1


In [61]:
data.corr()['left']

satisfaction_level      -0.388316
last_evaluation          0.006698
number_project           0.023966
average_montly_hours     0.071402
time_spend_company       0.144879
Work_accident           -0.154590
left                     1.000000
promotion_last_5years   -0.061777
department               0.032062
salary                  -0.157808
Name: left, dtype: float64

In [62]:
data.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'department', 'salary'],
      dtype='object')

In [63]:
emp_data=data[['satisfaction_level', 'last_evaluation', 'number_project','average_montly_hours', 'time_spend_company', 'Work_accident','promotion_last_5years', 'department', 'salary']]
emp_data

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,department,salary
1,0.80,0.86,5,262,6,0,0,7,2
2,0.11,0.88,7,272,4,0,0,7,2
3,0.72,0.87,5,223,5,0,0,7,1
4,0.37,0.52,2,159,3,0,0,7,1
5,0.41,0.50,2,153,3,0,0,7,1
...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,0,8,1
14995,0.37,0.48,2,160,3,0,0,8,1
14996,0.37,0.53,2,143,3,0,0,8,1
14997,0.11,0.96,6,280,4,0,0,8,1


In [64]:
from sklearn.preprocessing import MinMaxScaler

In [65]:
MinMax= MinMaxScaler()

In [66]:
MinMax.fit(emp_data)

In [67]:
feature_data = MinMax.transform(emp_data)

In [68]:
feature_data

array([[0.78021978, 0.78125   , 0.6       , ..., 0.        , 0.77777778,
        0.5       ],
       [0.02197802, 0.8125    , 1.        , ..., 0.        , 0.77777778,
        0.5       ],
       [0.69230769, 0.796875  , 0.6       , ..., 0.        , 0.77777778,
        0.        ],
       ...,
       [0.30769231, 0.265625  , 0.        , ..., 0.        , 0.88888889,
        0.        ],
       [0.02197802, 0.9375    , 0.8       , ..., 0.        , 0.88888889,
        0.        ],
       [0.30769231, 0.25      , 0.        , ..., 0.        , 0.88888889,
        0.        ]])

In [69]:
target_data = data.left
target_data

1        1
2        1
3        1
4        1
5        1
        ..
14994    1
14995    1
14996    1
14997    1
14998    1
Name: left, Length: 14998, dtype: int64

In [70]:
# help(train_test_split)

In [71]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(feature_data,target_data)

In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [73]:
model=LogisticRegression(class_weight='balanced')
model1=RandomForestClassifier()

In [74]:
model.fit(train_X,train_Y)

In [87]:
model1.fit(train_X,train_Y)

## Using Logistic Regression

In [76]:

model.score(test_X,test_Y)

0.7552

In [77]:
from sklearn.metrics import recall_score,precision_score, f1_score, classification_report

In [78]:
pred_y=model.predict(test_X)

In [79]:
precision_score(y_pred=pred_y, y_true=test_Y)

0.49339819318971506

In [86]:
f1_score(y_pred=pred_y, y_true=test_Y)

0.6073567151411462

## Using RandomForestClassifier

In [81]:

model1.score(test_X,test_Y)

0.988

In [82]:
from sklearn.metrics import recall_score,precision_score, f1_score, classification_report

In [83]:
pred_Y = model1.predict(test_X)

In [84]:
precision_score(y_pred=pred_Y, y_true=test_Y)

0.9908045977011494

In [85]:
f1_score(y_pred=pred_Y, y_true=test_Y)

0.9745618993781797

In [37]:
print (classification_report(y_pred=pred_Y, y_true=test_Y))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2836
           1       0.99      0.97      0.98       914

    accuracy                           0.99      3750
   macro avg       0.99      0.98      0.99      3750
weighted avg       0.99      0.99      0.99      3750

