In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score


## Load and Inspect the Dataset

This step loads the dataset and quickly checks:
- The number of rows and columns  
- Data types and structure  
- Missing values  
- Class distribution of the target variable (**Attrition**)

This helps understand the data and detect class imbalance early.
.


In [2]:
df=pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition[1].csv")
print(df.shape)
print(df.info())
print(df.isnull().sum())
print(df['Attrition'].value_counts())

(1470, 35)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel     

## Get Categorical Columns

List all categorical columns to prepare them for encoding.


In [3]:
cate_columns=df.select_dtypes(include=["object"]).columns.tolist()
print(cate_columns)

['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']


## Explore Unique Values of Categorical Columns

Check the unique values in each categorical column to understand categories and plan encoding.


In [4]:
print(df["BusinessTravel"].unique())
print(df["Department"].unique())
print(df["EducationField"].unique())
print(df["Gender"].unique())
print(df["JobRole"].unique())
print(df["Over18"].unique())
print(df["OverTime"].unique())
print(df["MaritalStatus"].unique())


['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
['Sales' 'Research & Development' 'Human Resources']
['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
['Female' 'Male']
['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']
['Y']
['Yes' 'No']
['Single' 'Married' 'Divorced']


## Data Preprocessing and Train-Test Split

- Encode binary categorical columns (`Gender`, `OverTime`, `Attrition`)  
- Apply one-hot encoding to multi-category columns  
- Drop unnecessary columns (`Over18`)  
- Scale features using `StandardScaler`  
- Split data into training and testing sets (75% train, 25% test)  


In [5]:
df['Gender']=df['Gender'].map({"Male":1,"Female":0})
df['OverTime']=df['OverTime'].map({"Yes":1,"No":0})
df['Attrition']=df['Attrition'].map({"Yes":1,"No":0})

df=pd.get_dummies(df,columns=['BusinessTravel', 'Department', 'EducationField','JobRole','MaritalStatus'],drop_first=True)

df = df.drop('Over18', axis=1)
x=df.drop('Attrition',axis=1)
y=df['Attrition']

scaler=StandardScaler()
x_scaled=scaler.fit_transform(x)

x_train,x_test,y_train,y_test=train_test_split(x_scaled,y,test_size=0.25,random_state=42)

## Hyperparameter Tuning with GridSearchCV

Define a grid of hyperparameters for the Random Forest model and use GridSearchCV with 5-fold cross-validation to find the best combination. This helps improve model performance by selecting optimal settings.


In [7]:

# Hyperparameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],         # original: 'entropy'
    'n_estimators': [100, 200, 300, 400],    # original: 300
    'max_depth': [5, 10, 15, None],          # original: 10
    'min_samples_leaf': [1, 2, 4],           # original: 1
    'min_samples_split': [2, 5, 10],         # original: 5
    'max_features': ['sqrt', 'log2', None],  # original: 'log2'
    'bootstrap': [True, False],              # original: True
    'class_weight': [None, 'balanced'],      # original: None
    'oob_score': [True, False]               # original: True
}
# GridSearchCV
grid = GridSearchCV( param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(x_train, y_train)

print("Best Params:", grid.best_params_)
print("Best CV Score:", grid.best_score_)


'\n# Hyperparameter grid\nparam_grid = {\n    \'model__criterion\': [\'gini\', \'entropy\'],         # original: \'entropy\'\n    \'model__n_estimators\': [100, 200, 300, 400],    # original: 300\n    \'model__max_depth\': [5, 10, 15, None],          # original: 10\n    \'model__min_samples_leaf\': [1, 2, 4],           # original: 1\n    \'model__min_samples_split\': [2, 5, 10],         # original: 5\n    \'model__max_features\': [\'sqrt\', \'log2\', None],  # original: \'log2\'\n    \'model__bootstrap\': [True, False],              # original: True\n    \'model__class_weight\': [None, \'balanced\'],      # original: None\n    \'model__oob_score\': [True, False]               # original: True\n}\n# GridSearchCV\ngrid = GridSearchCV(pipeline, param_grid, cv=5, scoring=\'accuracy\', n_jobs=-1)\ngrid.fit(x_train, y_train)\n\nprint("Best Params:", grid.best_params_)\nprint("Best CV Score:", grid.best_score_)\n'

## Train Random Forest and Evaluate

- Train a Random Forest classifier with selected hyperparameters and class weighting for imbalance  
- Make predictions using a probability threshold of 0.42  
- Evaluate model performance with accuracy, confusion matrix, and classification report


In [6]:
model = RandomForestClassifier(
    criterion='gini',
    n_estimators=150,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features="sqrt",
    class_weight={0:1, 1:4},
    bootstrap=True,
    random_state=42,
    n_jobs=-1
)

model.fit(x_train, y_train)

y_prob = model.predict_proba(x_test)[:, 1]
y_pred = (y_prob >= 0.42).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8125
Confusion Matrix:
 [[274  46]
 [ 23  25]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.86      0.89       320
           1       0.35      0.52      0.42        48

    accuracy                           0.81       368
   macro avg       0.64      0.69      0.65       368
weighted avg       0.85      0.81      0.83       368



## Check Train-Test Accuracy and Gap

- Compute training and testing accuracy  
- Calculate the gap between train and test accuracy to assess overfitting or underfitting


In [7]:

print("Train accuracy:", model.score(x_train,y_train))
print("Test accuracy:", model.score(x_test, y_test))
gap = model.score(x_train, y_train) - model.score(x_test, y_test)
print(f"Gap: {gap:.2f}")

Train accuracy: 0.9083484573502723
Test accuracy: 0.8478260869565217
Gap: 0.06


## Cross-Validation with F1 Score

Perform 5-fold cross-validation on the training set using F1 score to evaluate model stability and performance across different folds.


In [8]:
scores = cross_val_score(model, x_train, y_train, cv=5, scoring='f1')
print("CV F1 scores:", scores)
print("Mean F1:", scores.mean())


CV F1 scores: [0.45901639 0.53731343 0.40677966 0.45070423 0.5       ]
Mean F1: 0.47076274252950123


### Sample Prediction â€“ Employee Likely to Quit

This cell demonstrates predicting attrition for a single employee using the trained Random Forest model.

- We define a sample employee with features that increase attrition risk (e.g., low satisfaction, high overtime, short tenure).  
- The sample is converted to a DataFrame, aligned with training features, and scaled using the same `StandardScaler`.  
- The model predicts the probability of leaving and the final class (1 = likely to quit, 0 = likely to stay).

**Example Output:**


In [17]:
sample_quit_employee = {
    'Age': 28,
    'DailyRate': 200,
    'DistanceFromHome': 20,
    'Education': 2,
    'EnvironmentSatisfaction': 1,  # low satisfaction
    'Gender': 0,                    # Female
    'HourlyRate': 30,
    'JobInvolvement': 1,            # low involvement
    'JobLevel': 1,
    'JobSatisfaction': 1,           # very low satisfaction
    'MonthlyIncome': 3000,
    'MonthlyRate': 12000,
    'NumCompaniesWorked': 4,
    'OverTime': 1,                   # yes
    'PercentSalaryHike': 10,
    'PerformanceRating': 3,
    'RelationshipSatisfaction': 1,  # low
    'StandardHours': 40,
    'StockOptionLevel': 0,
    'TotalWorkingYears': 3,
    'TrainingTimesLastYear': 1,
    'WorkLifeBalance': 1,            # poor work-life
    'YearsAtCompany': 1,
    'YearsInCurrentRole': 1,
    'YearsSinceLastPromotion': 0,
    'YearsWithCurrManager': 1,
    # One-hot columns
    'BusinessTravel_Travel_Frequently': 1,
    'BusinessTravel_Travel_Rarely': 0,
    'Department_Research & Development': 0,
    'Department_Sales': 1,
    'EducationField_Life Sciences': 0,
    'EducationField_Marketing': 0,
    'EducationField_Medical': 0,
    'EducationField_Other': 1,
    'EducationField_Technical Degree': 0,
    'EducationField_Human Resources': 0,
    'JobRole_Healthcare Representative': 1,
    'JobRole_Research Scientist': 0,
    'JobRole_Laboratory Technician': 0,
    'JobRole_Manufacturing Director': 0,
    'JobRole_Manager': 0,
    'JobRole_Sales Executive': 0,
    'JobRole_Sales Representative': 1,
    'JobRole_Research Director': 0,
    'JobRole_Human Resources': 0,
    'MaritalStatus_Married': 0,
    'MaritalStatus_Single': 1
}

# Convert to DataFrame and align columns
sample_quit_df = pd.DataFrame([sample_quit_employee])
sample_quit_df = sample_quit_df.reindex(columns=x.columns, fill_value=0)

# Scale and predict
sample_quit_scaled = scaler.transform(sample_quit_df)
sample_quit_prob = model.predict_proba(sample_quit_scaled)[:, 1]
sample_quit_pred = (sample_quit_prob >= 0.42).astype(int)

# Print result
if sample_quit_pred[0] == 1:
    print("This employee will likely quit.")
else:
    print("This employee will likely stay.")
print(f"Predicted probability of leaving: {sample_quit_prob[0]:.2f}")


This employee will likely quit.
Predicted probability of leaving: 0.70
