In [38]:
import os
import numpy as np
import pandas as pd

os.chdir('../core')

In [43]:
DATA_DIR = "../data"
FILE_NAME = "train.csv"

df = pd.read_csv(os.path.join(DATA_DIR, FILE_NAME))
df = df.drop(columns=["Employee ID"])
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df["Attrition"] = df["Attrition"].map({"Stayed": 1, "Left": 0})
print(df.shape)

df.head(5)

(59598, 23)


Unnamed: 0,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,Overtime,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,29,Female,10,Education,4262,Fair,Medium,High,0,Yes,...,0,Senior,Small,52,No,No,No,Poor,High,1
1,18,Male,3,Finance,9032,Good,Very High,High,3,Yes,...,3,Entry,Medium,50,No,No,No,Good,Medium,0
2,30,Female,9,Healthcare,8610,Poor,Medium,Average,2,No,...,4,Entry,Large,62,No,No,No,Fair,Low,0
3,46,Male,2,Education,5071,Fair,High,Average,1,Yes,...,0,Entry,Small,39,No,No,No,Good,High,0
4,37,Male,17,Technology,10037,Fair,High,Average,0,No,...,1,Mid,Medium,42,No,No,No,Good,Low,0


In [44]:
df.isnull().sum()

Age                         0
Gender                      0
Years at Company            0
Job Role                    0
Monthly Income              0
Work-Life Balance           0
Job Satisfaction            0
Performance Rating          0
Number of Promotions        0
Overtime                    0
Distance from Home          0
Education Level             0
Marital Status              0
Number of Dependents        0
Job Level                   0
Company Size                0
Company Tenure              0
Remote Work                 0
Leadership Opportunities    0
Innovation Opportunities    0
Company Reputation          0
Employee Recognition        0
Attrition                   0
dtype: int64

In [45]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.label_encoders = {}
    
    def fit(self, X, y=None):
        for column in X.columns:
            le = LabelEncoder()
            le.fit(X[column])
            self.label_encoders[column] = le
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        for column in X.columns:
            le = self.label_encoders[column]
            X_transformed[column] = le.transform(X[column])
        return X_transformed
    
    def inverse_transform(self, X):
        X_inverse_transformed = X.copy()
        for column in X.columns:
            le = self.label_encoders[column]
            X_inverse_transformed[column] = le.inverse_transform(X[column])
        return X_inverse_transformed


target_column = "Attrition"

# Choose numeric columns except the target column
numeric_columns = df.drop(columns=[target_column]).select_dtypes(include=['int64', 'float64']).columns.tolist()
category_columns = df.select_dtypes(include=['object']).columns.tolist()

processor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numeric_columns),
        ('cat', LabelEncoderTransformer(), category_columns)
    ]
)

X = processor.fit_transform(df.drop(columns=[target_column]), df[target_column])
y = df[target_column].values.reshape(-1, 1)

X

array([[0.26829268, 0.18      , 0.1986112 , ..., 0.        , 3.        ,
        0.        ],
       [0.        , 0.04      , 0.52019146, ..., 0.        , 2.        ,
        2.        ],
       [0.29268293, 0.16      , 0.49174139, ..., 0.        , 1.        ,
        1.        ],
       ...,
       [0.51219512, 0.02      , 0.23582552, ..., 0.        , 2.        ,
        1.        ],
       [0.24390244, 0.3       , 0.6059462 , ..., 0.        , 2.        ,
        2.        ],
       [0.43902439, 0.44      , 0.20339783, ..., 0.        , 2.        ,
        2.        ]])

In [54]:
from model import LogisticRegression
from time import time

lr = LogisticRegression(learning_rate=0.05, num_iterations=50, solver='gradient-descent', log=True)

start = time()
lr.fit(X, y)
end = time()

print(f"Elapsed time: {end - start}")

Elapsed time: 0.32703590393066406


In [55]:
y_pred = lr.predict(X)

from sklearn.metrics import classification_report

report = classification_report(y, y_pred)

print(report)

              precision    recall  f1-score   support

           0       0.70      0.57      0.63     28338
           1       0.67      0.78      0.72     31260

    accuracy                           0.68     59598
   macro avg       0.68      0.68      0.67     59598
weighted avg       0.68      0.68      0.68     59598



In [56]:
from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression
from time import time

lr = SklearnLogisticRegression(verbose=0)

start = time()
lr.fit(X, y)
end = time()

print(f"Elapsed time: {end - start}")

  y = column_or_1d(y, warn=True)


Elapsed time: 0.33415985107421875


In [50]:
y_pred = lr.predict(X)

from sklearn.metrics import classification_report

report = classification_report(y, y_pred)

print(report)

              precision    recall  f1-score   support

           0       0.70      0.69      0.70     28338
           1       0.72      0.73      0.73     31260

    accuracy                           0.71     59598
   macro avg       0.71      0.71      0.71     59598
weighted avg       0.71      0.71      0.71     59598

