In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# dataset stored on Drive can be read like a local file.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
######## Importing the modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

## Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report


In [None]:
# Read the HR attrition workbook from the mounted Drive folder.
attrition_workbook = "/content/drive/MyDrive/Batch123/HR Attrition.xlsx"
data = pd.read_excel(attrition_workbook)

In [None]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,College,Life Sciences,1,Medium,Female,...,Low,80,0,8,0,Bad,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,Below_college,Life Sciences,2,High,Male,...,Very High,80,1,10,3,Better,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,College,Other,4,Very high,Male,...,Medium,80,0,7,3,Better,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,Master,Life Sciences,5,Very high,Female,...,High,80,0,8,3,Better,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,Below_college,Medical,7,Low,Male,...,Very High,80,1,6,3,Better,2,2,2,2


In [None]:
# Report the dimensions of the raw data (rows, then columns).
n_records, n_columns = data.shape
print("The number of records are {} and the number of columns {} in the data ".format(n_records, n_columns))

The number of records are 1470 and the number of columns 31 in the data 


In [None]:
# Inspect the dtype of every column to tell numeric from string (object) features.
data.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
Department                  object
DistanceFromHome             int64
Education                   object
EducationField              object
EmployeeNumber               int64
EnvironmentSatisfaction     object
Gender                      object
JobInvolvement              object
JobLevel                     int64
JobRole                     object
JobSatisfaction             object
MaritalStatus               object
MonthlyIncome                int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating           object
RelationshipSatisfaction    object
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears            int64
TrainingTimesLastYear        int64
WorkLifeBalance             object
YearsAtCompany               int64
YearsInCurrentRole  

In [None]:
#### Train Test Split
# Hold out 10% of the rows for testing. The split is stratified on the target
# because the classes are imbalanced: with such a small test fraction an
# unstratified split can leave the minority "Yes" class under-represented in
# one of the partitions. The target is kept as a one-column DataFrame so that
# later cells see the same type as before.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='Attrition'),
    data[['Attrition']],
    test_size=0.1,
    random_state=123,
    stratify=data['Attrition'],
)

In [None]:
# Columns stored as strings (dtype object) are treated as categorical features.
object_columns = X_train.select_dtypes(include='object').columns
cat_cols = object_columns.tolist()
print(cat_cols)


['BusinessTravel', 'Department', 'Education', 'EducationField', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'Over18', 'OverTime', 'PerformanceRating', 'RelationshipSatisfaction', 'WorkLifeBalance']


In [None]:
# StockOptionLevel and JobLevel are stored as integers but are handled as
# categorical features in this notebook. Append each name only if it is not
# already present, so re-running this cell does not put duplicate column names
# into cat_cols (duplicates would select the same column twice later on).
for level_column in ('StockOptionLevel', 'JobLevel'):
    if level_column not in cat_cols:
        cat_cols.append(level_column)
cat_cols

['BusinessTravel',
 'Department',
 'Education',
 'EducationField',
 'EnvironmentSatisfaction',
 'Gender',
 'JobInvolvement',
 'JobRole',
 'JobSatisfaction',
 'MaritalStatus',
 'Over18',
 'OverTime',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'WorkLifeBalance',
 'StockOptionLevel',
 'JobLevel']

In [None]:
######### Type conversions
# Cast every categorical feature to the pandas 'category' dtype.
# New frames are built with DataFrame.astype instead of assigning columns back
# into the objects returned by train_test_split; this avoids chained-assignment
# warnings on frames derived from `data`. The dict also tolerates a cat_cols
# list that contains a name more than once.
category_dtypes = {column: 'category' for column in cat_cols}
X_train = X_train.astype(category_dtypes)
X_test = X_test.astype(category_dtypes)
print(X_train.dtypes)

Age                            int64
BusinessTravel              category
Department                  category
DistanceFromHome               int64
Education                   category
EducationField              category
EmployeeNumber                 int64
EnvironmentSatisfaction     category
Gender                      category
JobInvolvement              category
JobLevel                    category
JobRole                     category
JobSatisfaction             category
MaritalStatus               category
MonthlyIncome                  int64
NumCompaniesWorked             int64
Over18                      category
OverTime                    category
PercentSalaryHike              int64
PerformanceRating           category
RelationshipSatisfaction    category
StandardHours                  int64
StockOptionLevel            category
TotalWorkingYears              int64
TrainingTimesLastYear          int64
WorkLifeBalance             category
YearsAtCompany                 int64
Y

In [None]:
### Drop the Employee Number
# Remove the EmployeeNumber column from the feature sets.
# The drop returns a new frame (no inplace mutation), and errors='ignore' makes
# the cell safe to re-run: the original inplace drop raised KeyError on a
# second run because the column was already gone.
X_train = X_train.drop(columns=['EmployeeNumber'], errors='ignore')
X_test = X_test.drop(columns=['EmployeeNumber'], errors='ignore')

In [None]:
######## Check for missing values
# Count the NaN entries in each training column.
X_train.isna().sum()

Age                         0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
Gender                      0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [None]:
### The training data has no missing values, but the test data might,
### so an imputer is prepared for each feature type.
si_cat = SimpleImputer(strategy="most_frequent")  # categorical features: fill with the most frequent value
si_num = SimpleImputer(strategy="mean")  # numeric features: fill with the column mean

In [None]:
# Separate the categorical and numeric features of the training partition;
# the two groups are preprocessed differently.
X_train_cat = X_train[cat_cols]
X_train_num = X_train.drop(columns=cat_cols)

### Same separation for the test partition
X_test_cat = X_test[cat_cols]
X_test_num = X_test.drop(columns=cat_cols)

In [None]:
# Fit both imputers on the training partition and rebuild labelled DataFrames.
# No index is passed to the constructor, so (assuming the imputers return
# plain arrays, scikit-learn's default) the rows are renumbered from 0.
train_num_imputed = si_num.fit_transform(X_train_num)
train_cat_imputed = si_cat.fit_transform(X_train_cat)
X_train_num = pd.DataFrame(train_num_imputed, columns=X_train_num.columns)
X_train_cat = pd.DataFrame(train_cat_imputed, columns=X_train_cat.columns)

In [None]:
# Apply the imputers fitted on the training partition to the test partition
# (transform only, so no statistics are learned from the test data).
test_num_imputed = si_num.transform(X_test_num)
test_cat_imputed = si_cat.transform(X_test_cat)
X_test_num = pd.DataFrame(test_num_imputed, columns=X_test_num.columns)
X_test_cat = pd.DataFrame(test_cat_imputed, columns=X_test_cat.columns)

In [None]:
### Standardization of the numeric data
# The scaler is fitted on the training partition only; the test partition is
# transformed with the statistics learned from the training data.
std = StandardScaler()
train_scaled = std.fit_transform(X_train_num)
test_scaled = std.transform(X_test_num)
X_train_num = pd.DataFrame(train_scaled, columns=X_train_num.columns)
X_test_num = pd.DataFrame(test_scaled, columns=X_test_num.columns)

In [None]:
# Summarise the imputed categorical features: count, number of distinct values,
# most frequent value and its frequency for every column.
X_train_cat.describe(include='all')

Unnamed: 0,BusinessTravel,Department,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobRole,JobSatisfaction,MaritalStatus,Over18,OverTime,PerformanceRating,RelationshipSatisfaction,WorkLifeBalance,StockOptionLevel,JobLevel
count,1323,1323,1323,1323,1323,1323,1323,1323,1323,1323,1323,1323,1323,1323,1323,1323,1323
unique,3,3,5,6,4,2,4,9,4,3,1,2,2,4,4,4,5
top,Travel_Rarely,Research & Development,Bachelor,Life Sciences,High,Male,High,Sales Executive,Very High,Married,Y,No,Excellent,High,Better,1,1
freq,932,867,511,540,416,798,783,295,414,607,1323,937,1121,414,795,552,496


In [None]:
## One-hot encoding of categorical data
# handle_unknown='ignore': per the scikit-learn documentation, a category that
# was not seen while fitting is encoded as all zeros instead of raising an error.
ohe=OneHotEncoder(handle_unknown='ignore')

In [None]:
# Learn the category levels from the training partition and expand every
# categorical feature into indicator columns named by the encoder.
train_encoded = ohe.fit_transform(X_train_cat)
X_train_cat = pd.DataFrame(train_encoded.todense(), columns=ohe.get_feature_names_out())

In [None]:
# Encode the test partition with the encoder fitted on the training data, so
# both partitions end up with the same indicator columns.
test_encoded = ohe.transform(X_test_cat)
X_test_cat = pd.DataFrame(test_encoded.todense(), columns=ohe.get_feature_names_out())

In [None]:
######### Combining Numeric and Categorical Data
# Place the scaled numeric features and the one-hot indicators side by side.
train_parts = [X_train_num, X_train_cat]
Train = pd.concat(train_parts, axis="columns")

In [None]:
# Sanity check: (rows, features) of the final training matrix.
print(Train.shape)

(1323, 77)


In [None]:
# Assemble the test matrix the same way as the training matrix:
# scaled numeric features followed by the one-hot indicators.
test_parts = [X_test_num, X_test_cat]
Test = pd.concat(test_parts, axis="columns")

In [None]:
# Sanity check: the test matrix should have the same number of features as Train.
print(Test.shape)

(147, 77)


In [None]:
### Store the target labels with the categorical dtype in both partitions
y_train, y_test = (labels.astype('category') for labels in (y_train, y_test))

In [None]:
##########Models
# Model 1: support vector classifier with an RBF kernel.
# class_weight='balanced' is used because the "Yes" class is much rarer than "No".
mod = SVC(kernel='rbf', C=2, class_weight='balanced')
# y_train is a one-column DataFrame; flatten it to a 1-D array so that
# scikit-learn does not emit a DataConversionWarning about a column-vector y.
mod.fit(Train, np.ravel(y_train))

######Predictions
preds_train = mod.predict(Train)
preds_test = mod.predict(Test)

  y = column_or_1d(y, warn=True)


In [None]:
# SVC confusion matrix on the TRAINING data (rows: true class, columns: predicted
# class, following scikit-learn's convention). Training-set figures are optimistic.
confusion_matrix(y_train,preds_train)

array([[1053,   50],
       [   4,  216]])

In [None]:
#######Model2
# Model 2: decision tree with balanced class weights.
# random_state is fixed so that the fitted tree, and therefore its test
# predictions, are reproducible from one run of the notebook to the next.
mod_DT = DecisionTreeClassifier(class_weight='balanced', random_state=123)
# Flatten the one-column target DataFrame to the 1-D shape scikit-learn expects.
mod_DT.fit(Train, np.ravel(y_train))

######Predictions
preds_train_DT = mod_DT.predict(Train)
preds_test_DT = mod_DT.predict(Test)

In [None]:
# Decision-tree confusion matrix on the TRAINING data (rows: true class,
# columns: predicted class). NOTE(review): this is measured on the data the
# tree was fitted on, so it says nothing about generalisation.
confusion_matrix(y_train,preds_train_DT)

array([[1103,    0],
       [   0,  220]])

In [None]:
######Model3
# Model 3: logistic regression with balanced class weights; max_iter is raised
# to 500 to give the solver more iterations to converge.
mod_Log = LogisticRegression(class_weight='balanced', max_iter=500)

# y_train is a one-column DataFrame; flatten it to a 1-D array so that
# scikit-learn does not emit a DataConversionWarning about a column-vector y.
mod_Log.fit(Train, np.ravel(y_train))

######Predictions
preds_train_LG = mod_Log.predict(Train)
preds_test_LG = mod_Log.predict(Test)


  y = column_or_1d(y, warn=True)


In [None]:
# Logistic-regression confusion matrix on the TRAINING data
# (rows: true class, columns: predicted class).
confusion_matrix(y_train,preds_train_LG)

array([[881, 222],
       [ 39, 181]])

In [None]:
##########Model4
# Model 4: k-nearest-neighbours classifier using the 3 closest training rows.
mod_knn = KNeighborsClassifier(n_neighbors=3)
# y_train is a one-column DataFrame; flatten it to a 1-D array so that
# scikit-learn does not emit a DataConversionWarning about a column-vector y.
mod_knn.fit(Train, np.ravel(y_train))

######Predictions
preds_train_knn = mod_knn.predict(Train)
preds_test_knn = mod_knn.predict(Test)


  return self._fit(X, y)


In [None]:
# KNN confusion matrix on the TRAINING data
# (rows: true class, columns: predicted class).
confusion_matrix(y_train,preds_train_knn)

array([[1088,   15],
       [ 133,   87]])

In [None]:
# KNN confusion matrix on the held-out TEST data
# (rows: true class, columns: predicted class).
confusion_matrix(y_test,preds_test_knn)

array([[126,   4],
       [ 13,   4]])

In [None]:
# Gather the predictions of all four models side by side, one column per model:
# data_res holds the training-set predictions, tes_res the test-set predictions.
train_predictions = {
    'SVM': preds_train,
    'DT': preds_train_DT,
    'LR': preds_train_LG,
    'KNN': preds_train_knn,
}
test_predictions = {
    'SVM': preds_test,
    'DT': preds_test_DT,
    'LR': preds_test_LG,
    'KNN': preds_test_knn,
}
data_res = pd.DataFrame(train_predictions)
tes_res = pd.DataFrame(test_predictions)

In [None]:
# Preview the per-model training predictions.
data_res.head()

Unnamed: 0,SVM,DT,LR,KNN
0,No,No,No,No
1,No,No,No,No
2,No,No,No,No
3,No,No,No,No
4,Yes,Yes,Yes,No


In [None]:
# Display the training labels. Note that they keep the row index of the original
# data, whereas data_res above is indexed from 0.
y_train

Unnamed: 0,Attrition
1088,No
1151,No
472,No
1282,No
857,Yes
...,...
1041,No
1122,No
1346,No
1406,No


In [None]:
from sklearn.ensemble import GradientBoostingClassifier