In [11]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [12]:
attrdata = pd.read_csv("Table_1.csv")
attrdata.head()

Unnamed: 0,table id,name,phone number,Location,Emp. Group,Function,Gender,Tenure,Tenure Grp.,Experience (YY.MM),Marital Status,Age in YY.,Hiring Source,Promoted/Non Promoted,Job Role Match,Stay/Left
0,1,sid,9876544345,Pune,B2,Operation,Male,0.0,< =1,6.08,Single,27.12,Direct,Non Promoted,Yes,Left
1,2,sid,9876544345,Noida,B7,Support,Male,0.0,< =1,13.0,Marr.,38.08,Direct,Promoted,No,Stay
2,3,sid,9876544345,Bangalore,B3,Operation,Male,0.01,< =1,16.05,Marr.,36.04,Direct,Promoted,Yes,Stay
3,4,sid,9876544345,Noida,B2,Operation,Male,0.01,< =1,6.06,Marr.,32.07,Direct,Promoted,Yes,Stay
4,5,sid,9876544345,Lucknow,B2,Operation,Male,0.0,< =1,7.0,Marr.,32.05,Direct,Non Promoted,Yes,Stay


### Preprocessing

In [13]:
attrdata.drop(0,inplace=True)
attrdata.dropna(axis=0,inplace=True)
attrdata.shape

(895, 16)

In [14]:
gender_dict = attrdata["Gender "].value_counts()

promoted_dict = attrdata["Promoted/Non Promoted"].value_counts()

Marital_dict = attrdata["Marital Status"].value_counts()

location_dict = attrdata["Location"].value_counts()

location_dict_new = {
    'Chennai':       7,
    'Noida':         6,
    'Bangalore':     5,
    'Hyderabad':     4,
    'Pune':          3,
    'Madurai':       2,
    'Lucknow':       1,
    'other place':         0,
}

Emp_dict = attrdata["Emp. Group"].value_counts()

Emp_dict['other group'] = 1

Emp_dict_new = {
    'B1': 4,
    'B2': 3,
    'B3': 2,
    'other group': 1,
}

In [15]:
def location(x):
    if str(x) in location_dict_new.keys():
        return location_dict_new[str(x)]
    else:
        return location_dict_new['other place']

def Mar(x):
    if str(x) in Marital_dict.keys() and Marital_dict[str(x)] > 100:
        return str(x)
    else:
        return 'other status'

def Gen(x):
    if x in gender_dict.keys():
        return str(x)
    else:
        return 'other'
    
def Promoted(x):
    if x == 'Promoted':
        return int(1)
    else:
        return int(0)

def Job(x):
    if x == 'Yes':
        return int(1)
    else:
        return int(0)

def emp(x):
    if str(x) in Emp_dict_new.keys():
        return str(x)
    else:
        return 'other group'
    

In [16]:
attrdata['New Location'] = attrdata["Location"].apply(location)

attrdata['New Marital'] = attrdata["Marital Status"].apply(Mar)
    
attrdata['New Promotion'] = attrdata["Promoted/Non Promoted"].apply(Promoted)
    
attrdata['New EMP'] = attrdata["Emp. Group"].apply(emp)
    
attrdata['New Job Role Match'] = attrdata["Job Role Match"].apply(Job)

attrdata['New Gender'] = attrdata["Gender "].apply(Gen)
 
attrdata.head()

Unnamed: 0,table id,name,phone number,Location,Emp. Group,Function,Gender,Tenure,Tenure Grp.,Experience (YY.MM),...,Hiring Source,Promoted/Non Promoted,Job Role Match,Stay/Left,New Location,New Marital,New Promotion,New EMP,New Job Role Match,New Gender
1,2,sid,9876544345,Noida,B7,Support,Male,0.0,< =1,13.0,...,Direct,Promoted,No,Stay,6,Marr.,1,other group,0,Male
2,3,sid,9876544345,Bangalore,B3,Operation,Male,0.01,< =1,16.05,...,Direct,Promoted,Yes,Stay,5,Marr.,1,B3,1,Male
3,4,sid,9876544345,Noida,B2,Operation,Male,0.01,< =1,6.06,...,Direct,Promoted,Yes,Stay,6,Marr.,1,B2,1,Male
4,5,sid,9876544345,Lucknow,B2,Operation,Male,0.0,< =1,7.0,...,Direct,Non Promoted,Yes,Stay,1,Marr.,0,B2,1,Male
5,6,sid,9876544345,Bangalore,B3,Operation,Male,0.01,< =1,9.01,...,Direct,Promoted,No,Stay,5,Marr.,1,B3,0,Male


In [17]:
gen = pd.get_dummies(attrdata["Function"])

hr = pd.get_dummies(attrdata["Hiring Source"])

empe = pd.get_dummies(attrdata["New EMP"])

Mr = pd.get_dummies(attrdata["New Marital"])

gend = pd.get_dummies(attrdata["New Gender"])

tengrp = pd.get_dummies(attrdata["Tenure Grp."])

dataset = pd.concat([attrdata, hr, Mr, empe, tengrp, gen, gend], axis = 1)

dataset.head()

Unnamed: 0,table id,name,phone number,Location,Emp. Group,Function,Gender,Tenure,Tenure Grp.,Experience (YY.MM),...,B3,other group,< =1,> 1 & < =3,Operation,Sales,Support,Female,Male,other
1,2,sid,9876544345,Noida,B7,Support,Male,0.0,< =1,13.0,...,0,1,1,0,0,0,1,0,1,0
2,3,sid,9876544345,Bangalore,B3,Operation,Male,0.01,< =1,16.05,...,1,0,1,0,1,0,0,0,1,0
3,4,sid,9876544345,Noida,B2,Operation,Male,0.01,< =1,6.06,...,0,0,1,0,1,0,0,0,1,0
4,5,sid,9876544345,Lucknow,B2,Operation,Male,0.0,< =1,7.0,...,0,0,1,0,1,0,0,0,1,0
5,6,sid,9876544345,Bangalore,B3,Operation,Male,0.01,< =1,9.01,...,1,0,1,0,1,0,0,0,1,0


In [8]:
dataset.drop(["table id", "name", "Marital Status","Promoted/Non Promoted","Function","Emp. Group","Job Role Match","Location"
              ,"Hiring Source","Gender ", 'Tenure', 'New Gender', 'New Marital', 'New EMP'],axis=1,inplace=True)

dataset1 = dataset.drop(['Tenure Grp.', 'phone number'], axis = 1)

dataset1

Unnamed: 0,Experience (YY.MM),Age in YY.,Stay/Left,New Location,New Promotion,New Job Role Match,Agency,Direct,Employee Referral,Marr.,...,B3,other group,< =1,> 1 & < =3,Operation,Sales,Support,Female,Male,other
1,13.00,38.08,Stay,6,1,0,0,1,0,1,...,0,1,1,0,0,0,1,0,1,0
2,16.05,36.04,Stay,5,1,1,0,1,0,1,...,1,0,1,0,1,0,0,0,1,0
3,6.06,32.07,Stay,6,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,1,0
4,7.00,32.05,Stay,1,0,1,0,1,0,1,...,0,0,1,0,1,0,0,0,1,0
5,9.01,39.09,Stay,5,1,0,0,1,0,1,...,1,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896,0.03,24.06,Stay,0,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,1,0
897,0.03,21.09,Stay,0,0,1,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0
898,0.03,22.02,Left,0,0,1,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0
900,0.03,22.02,Stay,0,0,1,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0


## Building models

In [9]:
dataset1.to_csv("processed table.csv")
dataset = pd.read_csv("processed table.csv")
dataset = pd.DataFrame(dataset)

y = dataset["Stay/Left"]
X = dataset.drop("Stay/Left",axis=1)

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=4)

lr=LogisticRegression(C = 0.1, random_state = 42, solver = 'liblinear')
dt=DecisionTreeClassifier()
rm=RandomForestClassifier()
gnb=GaussianNB()

for a, b in zip([lr, dt, rm, gnb], ["Logistic Regression", "Decision Tree", "Random Forest", "Naive Bayes"]):
    a.fit(X_train,y_train)
    prediction = a.predict(X_train)
    y_pred = a.predict(X_test)
    score1 = accuracy_score(y_train,prediction)
    score = accuracy_score(y_test,y_pred)
    msg1="[%s] training data accuracy is : %f" % (b,score1)
    msg2="[%s] test data accuracy is : %f" % (b,score)
    print(msg1)
    print(msg2)

[Logistic Regression] training data accuracy is : 0.891061
[Logistic Regression] test data accuracy is : 0.877095
[Decision Tree] training data accuracy is : 1.000000
[Decision Tree] test data accuracy is : 0.854749
[Random Forest] training data accuracy is : 1.000000
[Random Forest] test data accuracy is : 0.882682
[Naive Bayes] training data accuracy is : 0.870112
[Naive Bayes] test data accuracy is : 0.826816


In [10]:
model = lr.fit(X_train,y_train)

filename = 'finalized_model.pickle'
pickle.dump(model, open(filename, 'wb'))