In [1]:
#Import the required libaries.
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style("whitegrid")
import itertools
import matplotlib.pyplot as plt
%matplotlib inline

#Misc to ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Mount Drive
from google.colab import drive
drive.mount('/drive/')

ModuleNotFoundError: No module named 'google'

In [None]:
#Change directory 
import os 
os.chdir('/drive/My Drive/data/HR')

In [None]:
#Read the csv using the pandas to df
df = pd.read_csv('HR_data.csv')

In [None]:
#Display first 5 rows
df.head()

In [None]:
#Checking shape
df.shape

In [None]:
#Checking for nan values
df.isnull().sum()

In [None]:
#Cheking dtype of each attribute
df.dtypes

In [None]:
df.columns

In [None]:
#Renaming the column names
df = df.rename(columns={'Candidate.Ref':'Candidate_ref', 'DOJ.Extended':'DOJ_extended', 'Duration.to.accept.offer':'Accept_duration',
       'Notice.period':'Notice_period', 'Offered.band':'Offered_band', 'Pecent.hike.expected.in.CTC':'Percent_hike_expected',
       'Percent.hike.offered.in.CTC':'Percent_hike_offered', 'Percent.difference.CTC':'Percent_difference',
       'Joining.Bonus':'Joining_Bonus', 'Candidate.relocate.actual':'Relocated','Candidate.Source':'Source','Rex.in.Yrs':'Rex'})

In [None]:
df.columns

In [None]:
#Rearranging Columns names
df = df[['SLNO', 'Candidate_ref', 'Accept_duration','Notice_period','Percent_hike_expected',
         'Percent_hike_offered', 'Percent_difference','Rex','Age','DOJ_extended','Offered_band'
         ,'Joining_Bonus', 'Relocated', 'Gender','Source', 'LOB', 'Location','Status']]

In [None]:
df.head()

In [None]:
#Copying the df to df1
df1 = df.copy()

In [None]:
#Dropping SLNO which is just like an Index
df1.drop('SLNO',1,inplace=True)

In [None]:
#Review the first 5 rows
df1.head()

In [None]:
df1.columns

In [None]:
#Dropping the "Candidate_ref" because it is just a unique id given to candidate like random genrated index ID
df1.drop('Candidate_ref',1,inplace=True)

In [None]:
#Head
df1.head()

In [None]:
#Independented Varibales
x = df1.iloc[:,:-1]
x.head()

In [None]:
#Dependented Varibales
y = df1.iloc[:,-1]
y.head()

In [None]:
#Check unique attributes
df1['Status'].unique()

In [None]:
#Encoding the dependented variable to binary > Joined to '0' Not Joined to '1'
y.replace(['Joined','Not Joined'],[0,1],inplace=True)

In [None]:
xd = pd.get_dummies(x,drop_first=True)

In [None]:
xd.keys()

In [None]:
#Importing the imblearn
import imblearn
from imblearn.over_sampling import SMOTE #Importing SMOTE

In [None]:
x_resam,y_resam = SMOTE(k_neighbors=5).fit_resample(xd,y)

In [None]:
#Random seed
np.random.seed(1001)

In [None]:
#Importing train & test split
from sklearn.model_selection import train_test_split

In [None]:
#Spliting to train,test
x_train,x_test,y_train,y_test = train_test_split(x_resam,y_resam,test_size=0.2)

In [None]:
#Checking shape of train,test
x_train.shape,x_test.shape,y_train.shape,y_test.shape

## Decision Tree

In [None]:
#Importing DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
#tree_1 as DecisionTreeClassifier
tree_1 = DecisionTreeClassifier(criterion='gini',max_depth=None,max_features=17)

In [None]:
#Fitting
tree_1.fit(x_train,y_train)

In [None]:
#Checking max_depth
tree_1.tree_.max_depth

In [None]:
#Checking important features
imp_feature = tree_1.tree_.compute_feature_importances()

In [None]:
#Plotting important features
plt.figure(figsize=(20,10))
pd.Series(imp_feature,index=xd.columns).sort_values().plot(kind='barh')
plt.show()

In [None]:
#Checking score of train and test
tree_1.score(x_train,y_train),tree_1.score(x_test,y_test)

In [None]:
#Predicting train values
y_train_pred = tree_1.predict(x_train)

In [None]:
#Predicting test values
y_test_pred = tree_1.predict(x_test)

In [None]:
#Taking prob of positive of test
y_test_prob = tree_1.predict_proba(x_test)[:,1]

In [None]:
#Importing a custom module innomatics
import innomatics

In [None]:
#classification metric
innomatics.classification_metric(y_test,y_test_pred,y_test_prob,label=['Joined','Not Joined'],n=50,verbose=True)

## Purning


In [None]:
def purning(x_train,y_train,x_test,y_test,criterion,max_depth):
  """
    A Simple purning function for plotting the scores of train and test using
    different criterion like "gini","entropy".

    !Caution import required libaries before using this function.
    
    Required: DecisionTreeClassifier
  """
  #Purning
  train_acc = []
  test_acc = []
  depth = np.arange(2,max_depth)
  for i in depth:
    model_purning = DecisionTreeClassifier(criterion=criterion,max_depth=i)
    model_purning.fit(x_train,y_train)
    train_a = model_purning.score(x_train,y_train)
    test_a = model_purning.score(x_test,y_test)
    train_acc.append(train_a)
    test_acc.append(test_a)

  plt.figure(figsize=(10,6))
  plt.plot(depth,train_acc)
  plt.plot(depth,test_acc)
  plt.xlabel('Depth')
  plt.ylabel('Score')
  plt.legend(['train','test'])
  plt.show()

In [None]:
#purning with gini criterion
purning(x_train,y_train,x_test,y_test,'gini',31)

In [None]:
#purning with entropy criterion
purning(x_train,y_train,x_test,y_test,'entropy',31)

**From above plots selecting max depth as the 12**

In [None]:
#Now selecting the model with above value 12 and max_feature of 17(from previous notebook)
model_pur = DecisionTreeClassifier(criterion='gini',max_depth=11,max_features=17)
model_pur.fit(x_train,y_train)
yp_test_pred = model_pur.predict(x_test)
yp_test_prob = model_pur.predict_proba(x_test)[:,1]

In [None]:
innomatics.classification_metric(y_test,yp_test_pred,yp_test_prob,
                                 label=['Joined','Not Joined'],n=10,verbose=True)

In [None]:
#Assuming Same criteria for DecisionTree also
from sklearn.metrics import confusion_matrix, classification_report
def FP(x_train,y_train,cri,ratio=3):
  """
    A Function for filtering the giving criteria which is ratio between
    the FP's to FN's should be 3 times.

    cri: can be criterion for classsifer like 'gini' are 'entropy'.

    ratio: ratio between the FP's to FN (default its 3).

    You can select the max_depth and max_features which is index.

    Note: No Combinations are applied in this function. max_depth = max_features.

  """
  FP = []
  FN = []
  max_depth = np.arange(2,27)
  for m in max_depth:
    model_tune = DecisionTreeClassifier(criterion=cri,max_depth=m,max_features=m,random_state=0)
    model_tune.fit(x_train,y_train)
    y_pred = model_tune.predict(x_train)
    tn, fp, fn, tp = confusion_matrix(y_train,y_pred).ravel()
    FP.append(fp)
    FN.append(fn)
  MAX = pd.DataFrame(index=max_depth)
  MAX['FP'] = FP
  MAX['FN'] = FN
  MAX['Ratio'] = np.array(FP)/np.array(FN)
  return MAX[MAX["Ratio"]>ratio]


In [None]:
#Ratio for gini
FP(x_train,y_train,'gini')

In [None]:
#Ration for entropy
FP(x_train,y_train,'entropy')

We can observe the near the max_depth of 5 the FP's to FN's ratio is quite ok its is near to 3 with entropy criterion.

In [None]:
#Model with max_depth 5 and max_feature 5
model_para = DecisionTreeClassifier(criterion='entropy',max_depth=5,max_features=5,random_state=0)
model_para.fit(x_train,y_train)

In [None]:
innomatics.classification_metric(y_train,model_para.predict(x_train),model_para.predict_proba(x_train)[:,1]
                                 ,label=['Joined','Not Joined'],n=5,verbose=True)

In [None]:
#Extracting important features
imp_features = model_para.tree_.compute_feature_importances()

In [None]:
#Plotting
plt.figure(figsize=(15,10))
pd.Series(imp_features,index=xd.columns).sort_values().plot(kind='barh')
plt.show()

In [None]:
#Getting tree
from sklearn.tree import export_graphviz
import pydotplus
export_graphviz(model_para,out_file='final.odt',
                feature_names=xd.columns,filled=True)
graph = pydotplus.graph_from_dot_file('final.odt')
graph.write_pdf('final.pdf')
graph.write_jpg('final.jpg')

In [None]:
#Importing mpimg for images
import matplotlib.image as mpimg

In [None]:
#Read and displaying the saved tree
tree = mpimg.imread('final.jpg')
plt.figure(figsize=(20,20))
plt.title('Final Tree',fontsize=20)
plt.imshow(tree)
plt.grid()