Importing The Necessary Libraries

In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

Loading The Dataset

In [66]:
# Load the dataset
data = pd.read_csv('/WA_Fn-UseC_-HR-Employee-Attrition.csv')

Displaying The Head Of The Dataset (To See What we are Working With)

In [67]:
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


Feature Selection

In [68]:
# Select relevant features
features = [
    'PerformanceRating', 'MonthlyRate', 'StandardHours', 'JobInvolvement',
    'EducationField', 'JobSatisfaction', 'Education', 'YearsAtCompany', 'OverTime'
]
X = data[features]
y = data['Attrition']  # Assuming 'Attrition' is the target variable we want to predict promotion or not

 Handling Missing Values

In [69]:
# Replace missing values with the most frequent value for categorical features and the mean for numerical features
numeric_features = ['PerformanceRating', 'MonthlyRate', 'StandardHours', 'JobInvolvement', 'JobSatisfaction', 'Education', 'YearsAtCompany']
categorical_features = ['EducationField', 'OverTime']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first'))])

Preprocessing

In [70]:
# Preprocess categorical and numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

Splitting The Dataset

In [71]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Initializing and Training the Gradient Boosting Machine

In [72]:
# Initialize the GBM model
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Pipeline for GBM
gbm_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('gbm', gbm)])

# Train GBM model
gbm_pipeline.fit(X_train, y_train)

Generating Predictions From GBM

In [73]:
# Generate GBM predictions
gbm_predictions_train = gbm_pipeline.predict_proba(X_train)[:, 1].reshape(-1, 1)
gbm_predictions_test = gbm_pipeline.predict_proba(X_test)[:, 1].reshape(-1, 1)

 Initializing and Training The Neural Network

In [74]:
# Neural Network using GBM outputs
nn = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42)

# Train the NN model on GBM outputs
nn.fit(gbm_predictions_train, y_train)

Making Final Predictions Using The Hybrid Model

In [75]:
# Final predictions using the hybrid model
final_predictions = nn.predict(gbm_predictions_test)

In [76]:
# Evaluate the model
accuracy = accuracy_score(y_test, final_predictions)
report = classification_report(y_test, final_predictions)
print(f"Hybrid Model Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)

Hybrid Model Accuracy: 0.81
Classification Report:
               precision    recall  f1-score   support

          No       0.89      0.89      0.89       255
         Yes       0.29      0.28      0.29        39

    accuracy                           0.81       294
   macro avg       0.59      0.59      0.59       294
weighted avg       0.81      0.81      0.81       294

