In [12]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
import os 
warnings.filterwarnings('ignore')

In [7]:

# Read the HR dataset, then split it into the target (the 'left' column)
# and the feature frame (every remaining column).
hr = pd.read_csv("HR_comma_sep.csv")
y = hr['left']
X = hr.drop(columns='left')

In [8]:

# Hold out 30% of the rows as a test set. Stratifying on y keeps the class
# proportions of the target the same in both partitions; the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

In [9]:

# Scalers that can be plugged into the pipeline's 'SCL' step.
scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()

# Column handling: object-typed columns are one-hot encoded (first level of
# each dropped), every other column is passed through untouched. Both the
# encoder and the column transformer are configured to emit pandas DataFrames.
ohe = OneHotEncoder(drop='first', sparse_output=False).set_output(transform='pandas')
ct = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=object)),
    (ohe, make_column_selector(dtype_include=object)),
    verbose_feature_names_out=False,
).set_output(transform='pandas')

# Pipeline: column transformer -> scaler slot (None here) -> logistic regression.
lr = LogisticRegression(random_state=24)
pipe = Pipeline(steps=[('CT', ct), ('SCL', None), ('LR', lr)])

# Baseline: fit on the training split and report log loss on the test split.
pipe.fit(X_train, y_train)
y_pred_prob = pipe.predict_proba(X_test)
print(log_loss(y_test, y_pred_prob))

0.43644541058582403


In [10]:
#### K-FOLDS with ROC AUC

# Shuffled, stratified 5-fold cross-validation with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Search grid: 6 solvers x 20 values of C x 3 options for the 'SCL' step
# (MinMax scaling, standard scaling, or no scaling) = 360 candidates.
params = dict(
    LR__solver=[
        'lbfgs', 'liblinear',
        'newton-cg', 'newton-cholesky',
        'sag', 'saga',
    ],
    LR__C=np.linspace(0.001, 10, 20),
    SCL=[scaler_mm, scaler_std, None],
)

# Candidates are ranked by ROC AUC (scoring='roc_auc'); verbose=3 logs the
# score of every individual fold.
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='roc_auc',
    cv=kfold,
    verbose=3,
)
gcv.fit(X, y)

print("Best Parameters: ", gcv.best_params_)
print("Best Score: ", gcv.best_score_)

# Tabulate the full cross-validation results, one row per candidate.
pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)


Fitting 5 folds for each of 360 candidates, totalling 1800 fits
[CV 1/5] END LR__C=0.001, LR__solver=lbfgs, SCL=MinMaxScaler();, score=0.767 total time=   0.0s
[CV 2/5] END LR__C=0.001, LR__solver=lbfgs, SCL=MinMaxScaler();, score=0.746 total time=   0.0s
[CV 3/5] END LR__C=0.001, LR__solver=lbfgs, SCL=MinMaxScaler();, score=0.773 total time=   0.0s
[CV 4/5] END LR__C=0.001, LR__solver=lbfgs, SCL=MinMaxScaler();, score=0.767 total time=   0.0s
[CV 5/5] END LR__C=0.001, LR__solver=lbfgs, SCL=MinMaxScaler();, score=0.781 total time=   0.0s
[CV 1/5] END LR__C=0.001, LR__solver=lbfgs, SCL=StandardScaler();, score=0.814 total time=   0.0s
[CV 2/5] END LR__C=0.001, LR__solver=lbfgs, SCL=StandardScaler();, score=0.800 total time=   0.0s
[CV 3/5] END LR__C=0.001, LR__solver=lbfgs, SCL=StandardScaler();, score=0.814 total time=   0.0s
[CV 4/5] END LR__C=0.001, LR__solver=lbfgs, SCL=StandardScaler();, score=0.808 total time=   0.0s
[CV 5/5] END LR__C=0.001, LR__solver=lbfgs, SCL=StandardScaler()

In [11]:
# Grid search scored through the log loss function

# Shuffled, stratified 5-fold cross-validation with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Search grid over the solver, the regularisation parameter C and the
# choice of scaler for the 'SCL' step (None disables scaling).
params = dict(
    LR__solver=[
        'lbfgs', 'liblinear',
        'newton-cg', 'newton-cholesky',
        'sag', 'saga',
    ],
    LR__C=np.linspace(0.001, 10, 20),
    SCL=[scaler_mm, scaler_std, None],
)

# scoring='neg_log_loss': the reported score is the negated log loss, so the
# best candidate is the one whose score is closest to zero.
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)
gcv.fit(X, y)

print("Best Parameters: ", gcv.best_params_)
print("Best Score: ", gcv.best_score_)

# Tabulate the full cross-validation results, one row per candidate.
pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)


Best Parameters:  {'LR__C': np.float64(3.158578947368421), 'LR__solver': 'lbfgs', 'SCL': MinMaxScaler()}
Best Score:  -0.42981139278142005
(360, 16)
