In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv("sample.csv")

In [3]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [5]:
columns = ['DailyRate', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'HourlyRate', 'MonthlyRate',
        'Over18', 'RelationshipSatisfaction', 'StandardHours']
df.drop(columns, inplace=True, axis=1)

In [6]:
df.isnull().sum()

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EnvironmentSatisfaction    0
Gender                     0
JobInvolvement             0
JobLevel                   0
JobRole                    0
JobSatisfaction            0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
OverTime                   0
PercentSalaryHike          0
PerformanceRating          0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
WorkLifeBalance            0
YearsAtCompany             0
YearsInCurrentRole         0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

In [7]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,...,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,2,Female,3,2,...,11,3,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,3,Male,2,2,...,23,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,4,Male,2,1,...,15,3,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,4,Female,3,1,...,11,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,1,Male,3,1,...,12,3,1,6,3,3,2,2,2,2


In [8]:
labeled_columns = ['Attrition', 'BusinessTravel', 'Department',
                      'Gender', 'JobRole', 'MaritalStatus', 'OverTime']

In [9]:
from sklearn import preprocessing
data_encoded = df.copy(deep=True)
#Use Scikit-learn label encoding to encode character data
lab_enc = preprocessing.LabelEncoder()
for col in labeled_columns:
        data_encoded[col] = lab_enc.fit_transform(df[col])
        le_name_mapping = dict(zip(lab_enc.classes_, lab_enc.transform(lab_enc.classes_)))
        print('Feature', col)
        print('mapping', le_name_mapping)

Feature Attrition
mapping {'No': 0, 'Yes': 1}
Feature BusinessTravel
mapping {'Non-Travel': 0, 'Travel_Frequently': 1, 'Travel_Rarely': 2}
Feature Department
mapping {'Human Resources': 0, 'Research & Development': 1, 'Sales': 2}
Feature Gender
mapping {'Female': 0, 'Male': 1}
Feature JobRole
mapping {'Healthcare Representative': 0, 'Human Resources': 1, 'Laboratory Technician': 2, 'Manager': 3, 'Manufacturing Director': 4, 'Research Director': 5, 'Research Scientist': 6, 'Sales Executive': 7, 'Sales Representative': 8}
Feature MaritalStatus
mapping {'Divorced': 0, 'Married': 1, 'Single': 2}
Feature OverTime
mapping {'No': 0, 'Yes': 1}


In [10]:
independent_data = data_encoded.drop(['Attrition'], axis=1)

In [11]:
dependent_data = data_encoded[['Attrition']]

In [12]:
independent_data.columns

Index(['Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education',
       'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobLevel',
       'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [13]:
col_values = list(independent_data.columns.values)

In [14]:
from sklearn.feature_selection import mutual_info_classif

feature_scores = mutual_info_classif(independent_data, dependent_data)
for score, fname in sorted(zip(feature_scores, col_values), reverse=True)[:20]:
    print(fname, score)


  return f(*args, **kwargs)


OverTime 0.034993894989774565
MonthlyIncome 0.03376486773006904
TotalWorkingYears 0.030481405464108047
StockOptionLevel 0.02729798640265635
YearsWithCurrManager 0.024985048636224638
JobLevel 0.023840532763818034
Age 0.02277225101347735
JobSatisfaction 0.02138260359478439
YearsInCurrentRole 0.02046071427264695
JobRole 0.019885782549507125
MaritalStatus 0.019119980603063302
NumCompaniesWorked 0.015156749917419665
YearsAtCompany 0.01432187275868424
YearsSinceLastPromotion 0.007745882537487647
Department 0.007553294896030849
JobInvolvement 0.005189456266561976
TrainingTimesLastYear 0.0015861617825405627
BusinessTravel 0.0010226062000580072
Education 0.0009949277572069981
WorkLifeBalance 0.000814528784395252


In [15]:
from sklearn.feature_selection import chi2
feature_scores = chi2(independent_data, dependent_data)[0]
for score, fname in sorted(zip(feature_scores, col_values), reverse=True)[:20]:
    print(fname, score)

MonthlyIncome 127922.29369381821
TotalWorkingYears 230.72161773754925
YearsAtCompany 142.10005430324915
YearsInCurrentRole 117.5225958913567
YearsWithCurrManager 110.6715338985734
Age 84.15527681001525
OverTime 63.84506671452294
DistanceFromHome 63.77214163101213
StockOptionLevel 25.26882603175403
JobLevel 24.93924234571862
MaritalStatus 18.745657458341153
JobRole 9.004448467467538
YearsSinceLastPromotion 7.601723473243281
JobSatisfaction 7.011946634881222
EnvironmentSatisfaction 6.890594338387591
NumCompaniesWorked 6.438654443594589
JobInvolvement 4.60561606667209
TrainingTimesLastYear 3.0857961647837113
Department 1.329297319538817
WorkLifeBalance 1.0855429627263784


In [16]:
selected_data = data_encoded[['MonthlyIncome','EnvironmentSatisfaction','PercentSalaryHike','Age', 'YearsSinceLastPromotion','JobRole','PerformanceRating', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole','JobSatisfaction','YearsWithCurrManager','WorkLifeBalance', 'JobInvolvement',
                     'JobLevel',  'Attrition']]

In [17]:
selected_data.head()

Unnamed: 0,MonthlyIncome,EnvironmentSatisfaction,PercentSalaryHike,Age,YearsSinceLastPromotion,JobRole,PerformanceRating,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,JobSatisfaction,YearsWithCurrManager,WorkLifeBalance,JobInvolvement,JobLevel,Attrition
0,5993,2,11,41,0,7,3,8,6,4,4,5,1,3,2,1
1,5130,3,23,49,1,6,4,10,10,7,2,7,3,2,2,0
2,2090,4,15,37,0,2,3,7,0,0,3,0,3,2,1,1
3,2909,4,11,33,3,6,3,8,8,7,3,0,3,3,1,0
4,3468,1,12,27,2,2,3,6,2,2,2,2,3,3,1,0


In [18]:
independent_data = selected_data.drop(['Attrition'], axis=1)
dependent_data = selected_data[['Attrition']]

In [19]:
X = independent_data
y = dependent_data

In [20]:
independent_data.head()

Unnamed: 0,MonthlyIncome,EnvironmentSatisfaction,PercentSalaryHike,Age,YearsSinceLastPromotion,JobRole,PerformanceRating,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,JobSatisfaction,YearsWithCurrManager,WorkLifeBalance,JobInvolvement,JobLevel
0,5993,2,11,41,0,7,3,8,6,4,4,5,1,3,2
1,5130,3,23,49,1,6,4,10,10,7,2,7,3,2,2
2,2090,4,15,37,0,2,3,7,0,0,3,0,3,2,1
3,2909,4,11,33,3,6,3,8,8,7,3,0,3,3,1
4,3468,1,12,27,2,2,3,6,2,2,2,2,3,3,1


In [21]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)

StandardScaler()

In [22]:
standardized_data = scaler.transform(X)
print(standardized_data)

[[-0.10834951 -0.66053067 -1.1505541  ... -2.49382042  0.37967213
  -0.05778755]
 [-0.29171859  0.25462493  2.12930601 ...  0.33809616 -1.02616674
  -0.05778755]
 [-0.93765369  1.16978053 -0.0572674  ...  0.33809616 -1.02616674
  -0.96148639]
 ...
 [-0.07669019 -0.66053067  1.30934098 ...  0.33809616  1.78551099
  -0.05778755]
 [-0.23647414  1.16978053 -0.33058907 ... -1.07786213 -1.02616674
  -0.05778755]
 [-0.44597809 -0.66053067 -0.87723243 ...  1.75405446  1.78551099
  -0.05778755]]


In [23]:
X = standardized_data
y = dependent_data

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [25]:
print(X.shape,X_train.shape,X_test.shape,y.shape,y_train.shape,y_test.shape)

(1470, 15) (1176, 15) (294, 15) (1470, 1) (1176, 1) (294, 1)


In [26]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test)

  model.fit(X_train, y_train)


0.8129251700680272

In [27]:
y_pred = model.predict(X_test)

In [28]:
from sklearn.metrics import confusion_matrix,classification_report
print ("Confusion Matrix: \n {} " .format(confusion_matrix(y_test,y_pred))),"\n"
print ("Classification_report : \n {} ".format(classification_report(y_test,y_pred))),"\n"

Confusion Matrix: 
 [[232   4]
 [ 51   7]] 
Classification_report : 
               precision    recall  f1-score   support

           0       0.82      0.98      0.89       236
           1       0.64      0.12      0.20        58

    accuracy                           0.81       294
   macro avg       0.73      0.55      0.55       294
weighted avg       0.78      0.81      0.76       294
 


(None, '\n')

In [29]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

#accuracy score
classifier.score(X_test, y_test)

0.7074829931972789

In [30]:
from sklearn.linear_model import LogisticRegression
Linear = LogisticRegression()
Linear.fit(X_train, y_train)
Linear.score(X_test, y_test)


  return f(*args, **kwargs)


0.8129251700680272

In [31]:
y_pred = model.predict(X_test)


In [32]:
from sklearn.metrics import confusion_matrix,classification_report


In [33]:
print ("Confusion Matrix: \n {} " .format(confusion_matrix(y_test,y_pred))),"\n"
print ("Classification_report : \n {} ".format(classification_report(y_test,y_pred))),"\n"

Confusion Matrix: 
 [[232   4]
 [ 51   7]] 
Classification_report : 
               precision    recall  f1-score   support

           0       0.82      0.98      0.89       236
           1       0.64      0.12      0.20        58

    accuracy                           0.81       294
   macro avg       0.73      0.55      0.55       294
weighted avg       0.78      0.81      0.76       294
 


(None, '\n')

In [34]:
import numpy as np
input_data = (5993,2,11,41,0,7,3,8,6,4,4,5,1,3,2)
#changing the input data into numpy array
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
std_data = scaler.transform(input_data_reshaped)


prediction = model.predict(std_data) 
print(prediction)

if (prediction[0] == 0):
    print('Employee likely to leave')
else:
    print('Employee not leaving')


[1]
Employee not leaving


In [35]:
import joblib
joblib.dump(model,'model.pkl')

['model.pkl']