                                                __Ensemble Algorithm Implementation__

In [2]:
import pickle   # importing pickle for saving and loading machine learning models
import pandas as pd  # importing pandas for analyzing, cleaning, exploring, and manipulating data
from sklearn.model_selection import train_test_split  # importing train_test_split for spliting the data into training and testing
from preprocessor import *  # importing * for import all functions at once
from imblearn.over_sampling import SMOTE  # importing SMOTE for Balancing the Data
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("HR.csv")

In [4]:
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [5]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [7]:
df.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [8]:
# Splitting dataset

X = df.drop("Attrition", axis = 1)    # All input features except "Attrition"
y = df['Attrition'].map({"No":0, "Yes":1})

In [9]:
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size = 0.25, random_state = 42)

In [10]:
X

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,...,1,80,0,8,0,1,6,4,0,5
1,49,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,...,4,80,1,10,3,3,10,7,1,7
2,37,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,...,2,80,0,7,3,3,0,0,0,0
3,33,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,...,3,80,0,8,3,3,8,7,3,0
4,27,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,3,...,3,80,1,17,3,3,5,2,0,3
1466,39,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,4,...,1,80,1,9,5,3,7,7,1,7
1467,27,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,2,...,2,80,1,6,0,3,6,2,0,3
1468,49,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,4,...,4,80,0,17,3,2,9,6,0,8


In [11]:
# Load a preprocessor object from a pickle file
with open("DataPreprocessingPipeline.pkl", "rb") as f:
    preprocessor = pickle.load(f)

In [12]:
preprocessor

**Transforming data into pipeline**

In [13]:
processed_X_train = preprocessor.fit_transform(X_train)

In [15]:
processed_X_train[0]      # checking first rows of processed_X_train

array([ 0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        , -0.85215857, -0.50845524, -0.28590556,
        0.70372409, -0.37459995,  0.13887164, -0.01429605, -0.61511095,
       -0.64246019, -0.60313721, -0.36054841, -0.5693691 , -0.34285312,
        3.        ,  4.        ,  3.        ,  1.        ,  1.        ,
       14.        ,  2.        ,  0.        ,  2.        ,  3.        ,
        3.        ])

# Balancing Data

In [16]:
from sklearn.utils.class_weight import compute_sample_weight
sample_weights = compute_sample_weight(class_weight="balanced",y=y_train)  # You need to define this function

# Model Building

In [17]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(processed_X_train, y_train, sample_weight = sample_weights)

# Validating the model

In [18]:
processed_X_test = preprocessor.transform(X_test)

In [19]:
from sklearn.metrics import accuracy_score,classification_report,f1_score
y_predict=dt.predict(processed_X_test)    # predicting the model
acc=accuracy_score(y_test,y_predict)
acc

0.7771739130434783

In [20]:
f1_score(y_test,y_predict)    # Checking F1_Score

0.24074074074074073

In [21]:
print(classification_report(y_test,y_predict))              # it will give precision,recall,f1 scores and accuracy

              precision    recall  f1-score   support

           0       0.89      0.85      0.87       320
           1       0.22      0.27      0.24        48

    accuracy                           0.78       368
   macro avg       0.55      0.56      0.56       368
weighted avg       0.80      0.78      0.79       368



In [22]:
y_predict_train=dt.predict(processed_X_train)    # predicting the model
acc=accuracy_score(y_train,y_predict_train)
acc

1.0

# Hyperparameter Tuning

In [23]:
from sklearn.model_selection import GridSearchCV

In [25]:
#creating dictionary--> key value pair of hyperparameters having key as parameter and values as its values
params = {
    "criterion":("gini", "entropy"), #quality of split
    "splitter":("best", "random"), # searches the features for a split
    "max_depth":(list(range(1, 10))), #depth of tree range from 1 to 19
    "min_samples_split":[2, 3, 4,5,6,7],    #the minimum number of samples required to split internal node
    "min_samples_leaf":list(range(1, 10)),#minimum number of samples required to be at a leaf node,we are passing list which is range from 1 to 19
}


tree_clf = DecisionTreeClassifier()                # object creation for decision tree with random state 3
tree_cv = GridSearchCV(tree_clf, params, scoring="f1", n_jobs=-1, verbose=2, cv=5)
#passing model to gridsearchCV ,
#tree_clf-->model
#params---->hyperparametes(dictionary we created)
#scoring--->performance matrix to check performance
#n_jobs---->Number of jobs to run in parallel,-1 means using all processors.
#verbose=Controls the verbosity: the higher, the more messages.
#>1 : the computation time for each fold and parameter candidate is displayed;
#>2 : the score is also displayed;
#>3 : the fold and candidate parameter indexes are also displayed together with the starting time of the computation.
#cv------> number of flods




tree_cv.fit(processed_X_train,y_train,sample_weight=sample_weights)    # training data on gridsearch cv
best_params = tree_cv.best_params_    # it will give you best parameters
print(f"Best paramters: {best_params})")   # printing  best parameters



Fitting 5 folds for each of 1944 candidates, totalling 9720 fits
Best paramters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 7, 'min_samples_split': 6, 'splitter': 'random'})


In [29]:
tree_cv.best_params_    # getting best parameters from cv

{'criterion': 'gini',
 'max_depth': 3,
 'min_samples_leaf': 7,
 'min_samples_split': 6,
 'splitter': 'random'}

In [30]:
# passing best parameter to decision tree
dt1=DecisionTreeClassifier(criterion='gini',max_depth=3,min_samples_leaf= 1,min_samples_split=2,splitter='best')

In [31]:
# training model with best parameter
dt1.fit(processed_X_train,y_train,sample_weight=sample_weights)

In [32]:
y_hat1=dt1.predict(processed_X_test) # predicting the model
y_hat1

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,

In [33]:
accuracy_score(y_test,y_hat1)     # Checking accuracy of model

0.8043478260869565

In [34]:
y_hat2=dt1.predict(processed_X_train)

In [42]:
f1_score(y_train,y_hat2)

0.5377777777777778

In [36]:
print(classification_report(y_test,y_hat1)) # it will give precision,recall,f1 scores and accuracy

              precision    recall  f1-score   support

           0       0.91      0.86      0.88       320
           1       0.32      0.44      0.37        48

    accuracy                           0.80       368
   macro avg       0.61      0.65      0.63       368
weighted avg       0.83      0.80      0.82       368



**Bagging**

In [37]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier

KNN = KNeighborsClassifier(n_neighbors = 7)

model_bagg = BaggingClassifier(estimator = KNN, n_estimators = 20)

model_bagg.fit(processed_X_train, y_train)
y_hat_bagg = model_bagg.predict(processed_X_test)

In [39]:
accuracy_score(y_test,y_hat_bagg) 

0.8695652173913043

In [43]:
f1_score(y_test,y_hat_bagg)

0.1111111111111111

**Random Forest**

In [44]:
from sklearn.ensemble import RandomForestClassifier   # importing randomforest

rf_clf = RandomForestClassifier() # Assigning RandomForest CLassifier into variable
rf_clf.fit(processed_X_train,y_train,sample_weight=sample_weights)   # training the data

In [45]:
y_predict=rf_clf.predict(processed_X_test)   # testing the model

In [46]:
accuracy_score(y_test,y_predict) # Checking the accuracy

0.8804347826086957

In [47]:
f1_score(y_test,y_predict)    # Checking the F1_Score

0.15384615384615385

In [48]:
print(classification_report(y_test,y_predict))   # it will give precision,recall,f1 scores and accuracy

              precision    recall  f1-score   support

           0       0.88      1.00      0.94       320
           1       1.00      0.08      0.15        48

    accuracy                           0.88       368
   macro avg       0.94      0.54      0.54       368
weighted avg       0.89      0.88      0.83       368



**Gradient Boosting**

In [49]:
from sklearn.ensemble import GradientBoostingClassifier  # Importing GradientBoostingClassifier
gbm=GradientBoostingClassifier() ## object creation
gbm.fit(processed_X_train,y_train,sample_weight=sample_weights) ## fitting the data

In [50]:
processed_x_test = preprocessor.transform(X_test)   # Transform the test data using the preprocessor

In [51]:
y_gbm=gbm.predict(processed_X_test)     # Getting predictions from model
y_gbm

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,

In [52]:
print(accuracy_score(y_test,y_gbm))  

0.8532608695652174


In [53]:
print(f1_score(y_test,y_gbm))   # Checking F1 Score

0.5


In [54]:
print(classification_report(y_test,y_gbm))   # Displaying Classification Report

              precision    recall  f1-score   support

           0       0.93      0.90      0.91       320
           1       0.45      0.56      0.50        48

    accuracy                           0.85       368
   macro avg       0.69      0.73      0.71       368
weighted avg       0.87      0.85      0.86       368



**XGBoost**

In [55]:
%pip install xgboost

Collecting xgboostNote: you may need to restart the kernel to use updated packages.

  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.8/150.0 MB 8.5 MB/s eta 0:00:18
    --------------------------------------- 2.6/150.0 MB 8.4 MB/s eta 0:00:18
   - -------------------------------------- 4.7/150.0 MB 8.9 MB/s eta 0:00:17
   - -------------------------------------- 7.3/150.0 MB 9.9 MB/s eta 0:00:15
   -- ------------------------------------- 9.7/150.0 MB 10.4 MB/s eta 0:00:14
   --- ------------------------------------ 12.6/150.0 MB 11.0 MB/s eta 0:00:13
   ---- ----------------------------------- 15.2/150.0 MB 11.1 MB/s eta 0:00:13
   ---- ----------------------------------- 17.6/150.0 MB 11.2 MB/s eta 0:00:12
   ----- ---------------------------------- 19.9/150.0 MB 11.1 MB/s eta 0:00:12
   -


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [57]:
## model creation
from xgboost import XGBClassifier#importing the model library
xgb_r=XGBClassifier() ## object creation
xgb_r.fit(processed_X_train,y_train,sample_weight=sample_weights)# fitting the data
y_hat=xgb_r.predict(processed_x_test)#predicting the price

In [58]:
print(accuracy_score(y_hat,y_test))   # Checking Accuracy Score

0.8777173913043478


In [59]:
print(f1_score(y_test,y_hat))   # Checking F1Score

0.4


In [60]:
print(classification_report(y_test,y_hat))  # Checking Classification report

              precision    recall  f1-score   support

           0       0.90      0.96      0.93       320
           1       0.56      0.31      0.40        48

    accuracy                           0.88       368
   macro avg       0.73      0.64      0.67       368
weighted avg       0.86      0.88      0.86       368

