In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib

In [3]:
df = pd.read_csv("./sample_data/WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
df.shape

(1470, 35)

In [5]:
features = [
    "Age",
    "MonthlyIncome",
    "JobLevel",
    "JobSatisfaction",
    "WorkLifeBalance",
    "YearsAtCompany",
    "OverTime",
    "DistanceFromHome"
]

target = "Attrition"

df = df[features + [target]]
df.head()


Unnamed: 0,Age,MonthlyIncome,JobLevel,JobSatisfaction,WorkLifeBalance,YearsAtCompany,OverTime,DistanceFromHome,Attrition
0,41,5993,2,4,1,6,Yes,1,Yes
1,49,5130,2,2,3,10,No,8,No
2,37,2090,1,3,3,0,Yes,2,Yes
3,33,2909,1,3,3,8,Yes,3,No
4,27,3468,1,2,3,2,No,2,No


In [6]:
le = LabelEncoder()

df["OverTime"] = le.fit_transform(df["OverTime"])
df["Attrition"] = le.fit_transform(df["Attrition"])

df.head()


Unnamed: 0,Age,MonthlyIncome,JobLevel,JobSatisfaction,WorkLifeBalance,YearsAtCompany,OverTime,DistanceFromHome,Attrition
0,41,5993,2,4,1,6,1,1,1
1,49,5130,2,2,3,10,0,8,0
2,37,2090,1,3,3,0,1,2,1
3,33,2909,1,3,3,8,1,3,0
4,27,3468,1,2,3,2,0,2,0


In [7]:
X = df.drop("Attrition", axis=1)
y = df["Attrition"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [8]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [9]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

y_pred_lr = log_reg.predict(X_test_scaled)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.8605442176870748
              precision    recall  f1-score   support

           0       0.87      0.99      0.92       255
           1       0.25      0.03      0.05        39

    accuracy                           0.86       294
   macro avg       0.56      0.51      0.49       294
weighted avg       0.79      0.86      0.81       294



In [10]:
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))


Decision Tree Accuracy: 0.8435374149659864
              precision    recall  f1-score   support

           0       0.87      0.96      0.91       255
           1       0.23      0.08      0.12        39

    accuracy                           0.84       294
   macro avg       0.55      0.52      0.51       294
weighted avg       0.79      0.84      0.81       294



In [11]:
joblib.dump(log_reg, "logistic_model.pkl")
joblib.dump(dt, "decision_tree_model.pkl")
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

In [12]:
# Example: one employee's data
sample_employee = {
    "Age": 35,
    "MonthlyIncome": 6000,
    "JobLevel": 2,
    "JobSatisfaction": 3,
    "WorkLifeBalance": 3,
    "YearsAtCompany": 5,
    "OverTime": 1,  # Yes = 1, No = 0
    "DistanceFromHome": 10
}

sample_df = pd.DataFrame([sample_employee])

# Scale for logistic regression
sample_scaled = scaler.transform(sample_df)

# Predictions
lr_prediction = log_reg.predict(sample_scaled)
dt_prediction = dt.predict(sample_df)

print("Logistic Regression Prediction:", lr_prediction)
print("Decision Tree Prediction:", dt_prediction)


Logistic Regression Prediction: [0]
Decision Tree Prediction: [0]


In [13]:
import os
os.listdir()

['.config',
 'logistic_model.pkl',
 'decision_tree_model.pkl',
 'sample_data',
 'scaler.pkl']