In [72]:
import catboost
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMRegressor
import xgboost
from xgboost import XGBClassifier, XGBRFRegressor
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
import warnings
warnings.filterwarnings('ignore')

In [73]:
from ucimlrepo import fetch_ucirepo 
  
productivity_prediction_of_garment_employees = fetch_ucirepo(id=597) 

X = productivity_prediction_of_garment_employees.data.features 
y = productivity_prediction_of_garment_employees.data.targets 

In [74]:
X = X.drop(columns='wip')
X = X.drop(columns='date')

In [75]:
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore').set_output(transform='pandas')
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False).set_output(transform='pandas')

In [78]:
# Using LightGBoost
lgbm = LGBMRegressor(random_state=24)
pipe = Pipeline([('CT',ct),('LGBM',lgbm)])
kfold = KFold(shuffle=True, n_splits=5, random_state=24)
params= {'LGBM__max_depth':[2,3,4],
         'LGBM__n_estimators':[10,50],
         'LGBM__learning_rate': np.linspace(0.001,1,5)}
gcv = GridSearchCV(pipe, param_grid=params,cv =kfold,scoring='r2', verbose=3)
gcv.fit(X,y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 247
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.735678
[CV 1/5] END LGBM__learning_rate=0.001, LGBM__max_depth=2, LGBM__n_estimators=10;, score=0.005 total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 20
[LightGBM] [Info] Start training from score 0.731576
[CV 2/5] END LGBM__learning_rate=0.001, LGBM__max_depth=2, LGBM__n_estimators=10;, score=-0.006 total time=   0.0s
[LightGBM] [Info] Auto-choosing

In [79]:
print("Score :",gcv.best_score_)
print("Params :",gcv.best_params_)

Score : 0.49776817407312857
Params : {'LGBM__learning_rate': 0.25075, 'LGBM__max_depth': 4, 'LGBM__n_estimators': 50}


In [80]:
# Using the XGBoost
xgbm = XGBRFRegressor(random_state=24, verbose=False)
pipe = Pipeline([('CT',ct),('XGBM',xgbm)])
kfold = KFold(shuffle=True, n_splits=5, random_state=24)
params= {'XGBM__max_depth':[2,3,4],
         'XGBM__n_estimators':[10,50],
         'XGBM__learning_rate': np.linspace(0.001,1,5)}
gcv = GridSearchCV(pipe, param_grid=params,cv =kfold, scoring='r2', verbose=3)
gcv.fit(X,y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END XGBM__learning_rate=0.001, XGBM__max_depth=2, XGBM__n_estimators=10;, score=0.000 total time=   0.0s
[CV 2/5] END XGBM__learning_rate=0.001, XGBM__max_depth=2, XGBM__n_estimators=10;, score=-0.010 total time=   0.0s
[CV 3/5] END XGBM__learning_rate=0.001, XGBM__max_depth=2, XGBM__n_estimators=10;, score=-0.000 total time=   0.0s
[CV 4/5] END XGBM__learning_rate=0.001, XGBM__max_depth=2, XGBM__n_estimators=10;, score=-0.006 total time=   0.0s
[CV 5/5] END XGBM__learning_rate=0.001, XGBM__max_depth=2, XGBM__n_estimators=10;, score=-0.000 total time=   0.0s
[CV 1/5] END XGBM__learning_rate=0.001, XGBM__max_depth=2, XGBM__n_estimators=50;, score=0.000 total time=   0.0s
[CV 2/5] END XGBM__learning_rate=0.001, XGBM__max_depth=2, XGBM__n_estimators=50;, score=-0.010 total time=   0.0s
[CV 3/5] END XGBM__learning_rate=0.001, XGBM__max_depth=2, XGBM__n_estimators=50;, score=-0.000 total time=   0.0s
[CV 4/5] END XGBM__l

In [81]:
print("Score :",gcv.best_score_)
print("Params :",gcv.best_params_)

Score : 0.42113299369812013
Params : {'XGBM__learning_rate': 1.0, 'XGBM__max_depth': 4, 'XGBM__n_estimators': 50}


In [82]:
# Using CatBoost
cbc = CatBoostRegressor(random_state=24, verbose=False)
pipe = Pipeline([('CT',ct),('CBC',cbc)])
kfold = KFold(shuffle=True, n_splits=5, random_state=24)
params= {'CBC__max_depth':[2,3,4],'CBC__n_estimators':[10,50],'CBC__learning_rate': np.linspace(0.001,1,5)}
gcv = GridSearchCV(pipe, param_grid=params,cv =kfold, scoring='r2', verbose=3)
gcv.fit(X,y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END CBC__learning_rate=0.001, CBC__max_depth=2, CBC__n_estimators=10;, score=0.004 total time=   0.0s
[CV 2/5] END CBC__learning_rate=0.001, CBC__max_depth=2, CBC__n_estimators=10;, score=-0.007 total time=   0.0s
[CV 3/5] END CBC__learning_rate=0.001, CBC__max_depth=2, CBC__n_estimators=10;, score=0.003 total time=   0.0s
[CV 4/5] END CBC__learning_rate=0.001, CBC__max_depth=2, CBC__n_estimators=10;, score=-0.002 total time=   0.0s
[CV 5/5] END CBC__learning_rate=0.001, CBC__max_depth=2, CBC__n_estimators=10;, score=0.002 total time=   0.0s
[CV 1/5] END CBC__learning_rate=0.001, CBC__max_depth=2, CBC__n_estimators=50;, score=0.019 total time=   0.0s
[CV 2/5] END CBC__learning_rate=0.001, CBC__max_depth=2, CBC__n_estimators=50;, score=0.007 total time=   0.0s
[CV 3/5] END CBC__learning_rate=0.001, CBC__max_depth=2, CBC__n_estimators=50;, score=0.018 total time=   0.0s
[CV 4/5] END CBC__learning_rate=0.001, CBC__max_

In [83]:
print("Score :",gcv.best_score_)
print("Params :",gcv.best_params_)

Score : 0.49363156435038286
Params : {'CBC__learning_rate': 0.5005, 'CBC__max_depth': 2, 'CBC__n_estimators': 50}


In [84]:
# Using CatBoost without Onehotencoding
cbc = CatBoostRegressor(random_state=24, 
                        cat_features = list(X.columns[X.dtypes==object]),
                        verbose=False)
kfold = KFold(shuffle=True, n_splits=5, random_state=24)
params= {'max_depth':[2,3,4],'n_estimators':[10,50],'learning_rate': np.linspace(0.001,1,5)}
gcv = GridSearchCV(cbc, param_grid=params,cv =kfold, scoring='r2', verbose=3)
gcv.fit(X,y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.004 total time=   0.1s
[CV 2/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=-0.007 total time=   0.1s
[CV 3/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.003 total time=   0.0s
[CV 4/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=-0.002 total time=   0.1s
[CV 5/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.002 total time=   0.1s
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.020 total time=   0.6s
[CV 2/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.007 total time=   0.7s
[CV 3/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.018 total time=   0.7s
[CV 4/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.012 total time=   0.7s
[CV 5/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0

In [85]:
print("Score :",gcv.best_score_)
print("Params :",gcv.best_params_)

Score : 0.49768646968713837
Params : {'learning_rate': 0.5005, 'max_depth': 4, 'n_estimators': 50}
