In [1]:
try:
    import pandas as pd
    import numpy as np
    
    import matplotlib.pyplot as plt
    import plotly.express as px
    import seaborn as sns
    import plotly.graph_objects as go
    import plotly.figure_factory as ff

    from sklearn.model_selection import train_test_split
    from sklearn import svm
    from sklearn import metrics
    from sklearn import preprocessing
    from sklearn.ensemble import RandomForestRegressor
    
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
    
    from os import path
    from sys import exit
    import math
except:
    !pip install ipywidgets matplotlib numpy pandas plotly scikit-learn seaborn scipy ipykernel jupyterlab kagglehub dash scikit-learn

# Fetch dataset

In [2]:
class DatasetLoader:
    """Load the HR attrition CSV from a local file, GitHub, or Kaggle.

    Each ``load_from_*`` method is best-effort: it returns whatever the
    underlying reader returns on success and ``None`` when that source
    raises an ordinary error, so a caller can fall through to the next
    source. Interrupts and interpreter exits are not swallowed.
    """

    def __init__(self):
        # Identifier passed to kagglehub when loading from Kaggle.
        self.kaggle_url = "thedevastator/employee-attrition-and-factors"

        # NOTE(review): the doubled ".csv.csv" extension presumably matches
        # the published file name - confirm before "fixing" it.
        self.file_name = "HR_Analytics.csv.csv"
        self.dir_name = "dataset"

        # Local copy is looked up one directory above the working directory.
        self.file_path = path.join("..", self.dir_name, self.file_name)
        self.github_url = f'https://raw.githubusercontent.com/AbrarShakhi/employee-attrition-predictor/main/{self.dir_name}/{self.file_name}'

    def load_from_kaggle(self):
        """Return the dataset loaded through kagglehub, or ``None`` on failure."""
        try:
            return kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, self.kaggle_url, self.file_name)
        except Exception:
            # Best-effort source: report failure as None rather than raising.
            return None

    def load_from_github(self):
        """Return the CSV read from ``self.github_url``, or ``None`` on failure."""
        try:
            return pd.read_csv(self.github_url)
        except Exception:
            return None

    def load_from_local(self):
        """Return the CSV read from ``self.file_path``, or ``None`` on failure."""
        try:
            return pd.read_csv(self.file_path)
        except Exception:
            return None

# Try the sources in order of preference - local file, then GitHub, then
# Kaggle - and keep the first one that yields a DataFrame.
loader = DatasetLoader()
for fetch in (loader.load_from_local, loader.load_from_github, loader.load_from_kaggle):
    df = fetch()
    if df is not None:
        break
else:
    # Every source returned None: nothing to analyse, so stop here.
    print("Unable to find dataset..")
    exit(1)

df.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


# Dataset analysis

In [3]:
target_column = "Attrition"

# Discard rows with missing values first, so constant columns are judged on
# the rows that are actually kept.
df = df.dropna()

# Columns with a single distinct value carry no information for a model.
columns = [name for name in df.columns if len(df[name].unique()) == 1]
df = df.drop(columns=columns)

# EmployeeNumber is removed explicitly, independent of the constant-column check.
df = df.drop(columns=["EmployeeNumber"])

print(columns)
df.head(2)

['EmployeeCount', 'Over18', 'StandardHours']


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7


In [4]:
# Encode the target as 1 (left the company) / 0 (stayed).
# Only map while the labels are still non-numeric: Series.map turns values
# missing from the dict into NaN, so re-running this cell on the already
# encoded 0/1 column would wipe out every label (NaN -> category code -1).
if not pd.api.types.is_numeric_dtype(df[target_column]):
    df[target_column] = df[target_column].map({'Yes': 1, 'No': 0})

# Replace every column by its integer category code.
# NOTE(review): this also rank-encodes numeric columns and gives nominal
# columns an arbitrary ordinal code; with no object/category columns left, a
# later pd.get_dummies(X) call presumably has nothing to expand - confirm
# that this is the intended encoding.
for col in df.columns:
    df[col] = df[col].astype("category").cat.codes

In [5]:
# Labels: the encoded attrition column.
y = df['Attrition']

# Features: every other column, passed through get_dummies with the first
# level of each expanded column dropped.
X = pd.get_dummies(df.drop(columns='Attrition'), drop_first=True)

# Split dataset

In [6]:
# Seed shared by the split and by the models trained below.
rseed = 42

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions.
split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=rseed)
X_train, X_test, y_train, y_test = split

# Preprocessing

In [7]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
# (Alternative kept for experiments: preprocessing.MinMaxScaler().)
scaler = preprocessing.StandardScaler()
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

# Defining train_predict function
It trains the `model` on the training split and predicts `y` for the test split.

**params:** model

**returns:** y_prediction

In [8]:
def train_predict(model):
    """Fit ``model`` on the training split and predict the test split.

    Reads the notebook-level ``X_train``, ``y_train`` and ``X_test``.

    params: model - object exposing ``fit(X, y)`` and ``predict(X)``
    returns: whatever ``model.predict(X_test)`` returns
    """
    model.fit(X_train, y_train)
    return model.predict(X_test)

# Defining show_result function
It takes the `kernel` name as a string and `y_pred`, and prints the `accuracy`, `confusion matrix`, `precision`, `recall`, `f1-score`, and `support`.

**params:** kernel name as str, y_pred

**returns:** None

In [9]:
def show_result(kernel, y_pred):
    """Print accuracy, confusion matrix and classification report.

    The predictions are compared against the notebook-level ``y_test``.

    params: kernel - label inserted into each printed heading
            y_pred - predictions to evaluate
    returns: None
    """
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print(f"Accuracy Score {kernel}:", accuracy)

    # The two remaining reports share one layout: a blank line, a heading,
    # then the computed report.
    reports = (
        ("Confusion Matrix", metrics.confusion_matrix),
        ("Classification Report", metrics.classification_report),
    )
    for heading, build in reports:
        print(f"\n{heading} {kernel}:")
        print(build(y_test, y_pred))

# Using RBF

In [10]:
# SVC with the RBF kernel, C=17.0 and gamma='scale', seeded with rseed.
rbf_params = dict(kernel='rbf', C=17.0, gamma='scale', random_state=rseed)
model = svm.SVC(**rbf_params)

y_pred = train_predict(model)
show_result("rbf", y_pred)

Accuracy Score rbf: 0.8537414965986394

Confusion Matrix rbf:
[[235  12]
 [ 31  16]]

Classification Report rbf:
              precision    recall  f1-score   support

           0       0.88      0.95      0.92       247
           1       0.57      0.34      0.43        47

    accuracy                           0.85       294
   macro avg       0.73      0.65      0.67       294
weighted avg       0.83      0.85      0.84       294



# Using Poly

In [11]:
# SVC with a degree-3 polynomial kernel, seeded with rseed.
poly_params = dict(kernel='poly', degree=3, random_state=rseed)
model = svm.SVC(**poly_params)

y_pred = train_predict(model)
show_result("poly", y_pred)

Accuracy Score poly: 0.8503401360544217

Confusion Matrix poly:
[[245   2]
 [ 42   5]]

Classification Report poly:
              precision    recall  f1-score   support

           0       0.85      0.99      0.92       247
           1       0.71      0.11      0.19        47

    accuracy                           0.85       294
   macro avg       0.78      0.55      0.55       294
weighted avg       0.83      0.85      0.80       294



# Using Linear With l1 penalty

In [12]:
# Linear SVM with an L1 penalty and squared-hinge loss; dual=False is set
# explicitly, and max_iter is set very high.
l1_params = dict(
    penalty='l1',
    loss='squared_hinge',
    dual=False,
    max_iter=1000000000,
    random_state=rseed,
)
model = svm.LinearSVC(**l1_params)

y_pred = train_predict(model)
show_result("linear l1", y_pred)

Accuracy Score linear l1: 0.8877551020408163

Confusion Matrix linear l1:
[[244   3]
 [ 30  17]]

Classification Report linear l1:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94       247
           1       0.85      0.36      0.51        47

    accuracy                           0.89       294
   macro avg       0.87      0.67      0.72       294
weighted avg       0.88      0.89      0.87       294



# Using Linear With l2 penalty

In [13]:
# Linear SVM with an L2 penalty and squared-hinge loss; `dual` is left at the
# library default, and max_iter is set very high.
l2_params = dict(
    penalty='l2',
    loss='squared_hinge',
    max_iter=1000000000,
    random_state=rseed,
)
model = svm.LinearSVC(**l2_params)

y_pred = train_predict(model)
show_result("linear l2", y_pred)

Accuracy Score linear l2: 0.8877551020408163

Confusion Matrix linear l2:
[[244   3]
 [ 30  17]]

Classification Report linear l2:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94       247
           1       0.85      0.36      0.51        47

    accuracy                           0.89       294
   macro avg       0.87      0.67      0.72       294
weighted avg       0.88      0.89      0.87       294

