In [25]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, LogisticRegression, ElasticNet, LinearRegression
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, KFold
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss, roc_auc_score, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.decomposition import PCA
from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, VotingClassifier, VotingRegressor, BaggingClassifier, BaggingRegressor, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

import warnings
warnings.filterwarnings('ignore')

In [5]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
productivity_prediction_of_garment_employees = fetch_ucirepo(id=597) 
  
# data (as pandas dataframes) 
x = productivity_prediction_of_garment_employees.data.features 
y = productivity_prediction_of_garment_employees.data.targets 
  



In [None]:
x = x.drop("wip", axis = 1)

In [32]:

x = x.set_index("date")

In [34]:
x.head(3)

Unnamed: 0_level_0,quarter,department,day,team,targeted_productivity,smv,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1/1/2015,Quarter1,sweing,Thursday,8,0.8,26.16,7080,98,0.0,0,0,59.0
1/1/2015,Quarter1,finishing,Thursday,1,0.75,3.94,960,0,0.0,0,0,8.0
1/1/2015,Quarter1,sweing,Thursday,11,0.8,11.41,3660,50,0.0,0,0,30.5


# XGBoost

In [40]:


# ______________________________________________________________________________________
# Pipeline

# one hot encoder
ohe = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
    drop='first'
).set_output(transform='pandas')

trans_ohe = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=['category', object])),
    (ohe, make_column_selector(dtype_include=['category', object])),
    verbose_feature_names_out=False
).set_output(transform='pandas')


# scaler
scl_std = StandardScaler().set_output(transform = "pandas")
scl_mm = MinMaxScaler().set_output(transform = "pandas")


# Model

xgbm = XGBRegressor()

pipe = Pipeline([("OHE", trans_ohe), ("SCL", scl_mm), ("XGB", xgbm)])


# _____________________________________________________________________________________
# GCV

params = {
    "XGB__n_estimators" : [10, 20, 30, 40, 50],
    "XGB__max_depth" : [2, 3, 4],
    "XGB__learning_rate" : np.linspace(0.001, 1, 5)
}

kfolds = StratifiedKFold(n_splits = 5,
                        random_state = 24,
                        shuffle = True)

kfold = KFold(n_splits = 5,
            random_state = 24,
            shuffle = True)

gcv = GridSearchCV(pipe,
                  param_grid = params,
                  scoring = "r2",
                  cv = kfold,
                  verbose = 3)


# gcv.fit(x, y)

In [42]:
gcv.fit(x, y)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END XGB__learning_rate=0.001, XGB__max_depth=2, XGB__n_estimators=35;, score=0.018 total time=   0.0s
[CV 2/5] END XGB__learning_rate=0.001, XGB__max_depth=2, XGB__n_estimators=35;, score=0.004 total time=   0.0s
[CV 3/5] END XGB__learning_rate=0.001, XGB__max_depth=2, XGB__n_estimators=35;, score=0.017 total time=   0.0s
[CV 4/5] END XGB__learning_rate=0.001, XGB__max_depth=2, XGB__n_estimators=35;, score=0.010 total time=   0.0s
[CV 5/5] END XGB__learning_rate=0.001, XGB__max_depth=2, XGB__n_estimators=35;, score=0.012 total time=   0.0s
[CV 1/5] END XGB__learning_rate=0.001, XGB__max_depth=2, XGB__n_estimators=36;, score=0.018 total time=   0.0s
[CV 2/5] END XGB__learning_rate=0.001, XGB__max_depth=2, XGB__n_estimators=36;, score=0.005 total time=   0.0s
[CV 3/5] END XGB__learning_rate=0.001, XGB__max_depth=2, XGB__n_estimators=36;, score=0.017 total time=   0.0s
[CV 4/5] END XGB__learning_rate=0.001, XGB__max_de

In [44]:
print(gcv.best_score_)
print(gcv.best_params_)

0.5004059314727783
{'XGB__learning_rate': 0.25075, 'XGB__max_depth': 4, 'XGB__n_estimators': 36}


# LightGBM

In [48]:


# ______________________________________________________________________________________
# Pipeline

# one hot encoder
ohe = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
    drop='first'
).set_output(transform='pandas')

trans_ohe = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=['category', object])),
    (ohe, make_column_selector(dtype_include=['category', object])),
    verbose_feature_names_out=False
).set_output(transform='pandas')


# scaler
scl_std = StandardScaler().set_output(transform = "pandas")
scl_mm = MinMaxScaler().set_output(transform = "pandas")


# Model

lgbm = LGBMRegressor()

pipe = Pipeline([("OHE", trans_ohe), ("SCL", scl_mm), ("LGB", lgbm)])


# _____________________________________________________________________________________
# GCV

params = {
    "LGB__n_estimators" : [10, 20, 30, 40, 50],
    "LGB__max_depth" : [2, 3, 4],
    "LGB__learning_rate" : np.linspace(0.001, 1, 5)
}

kfolds = StratifiedKFold(n_splits = 5,
                        random_state = 24,
                        shuffle = True)

kfold = KFold(n_splits = 5,
            random_state = 24,
            shuffle = True)

gcv = GridSearchCV(pipe,
                  param_grid = params,
                  scoring = "r2",
                  cv = kfold,
                  verbose = 3)


# gcv.fit(x, y)

In [50]:
gcv.fit(x, y)

Fitting 5 folds for each of 75 candidates, totalling 375 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000212 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 238
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 17
[LightGBM] [Info] Start training from score 0.735678
[CV 1/5] END LGB__learning_rate=0.001, LGB__max_depth=2, LGB__n_estimators=10;, score=0.005 total time=   0.5s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 241
[LightGBM] [Info] Number of data points in the train set: 957, number of used features: 17
[LightGBM] [Info] Start training from score 0.731576
[CV 2/5] END LGB__learning_rate=0.001, LGB__max_depth=2, LGB__n_estimators=10;, score

In [52]:
print(gcv.best_score_)
print(gcv.best_params_)

0.5000940577036161
{'LGB__learning_rate': 0.25075, 'LGB__max_depth': 4, 'LGB__n_estimators': 50}


# CatBoost

In [55]:


# ______________________________________________________________________________________
# Pipeline

# one hot encoder
ohe = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
    drop='first'
).set_output(transform='pandas')

trans_ohe = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=['category', object])),
    (ohe, make_column_selector(dtype_include=['category', object])),
    verbose_feature_names_out=False
).set_output(transform='pandas')


# scaler
scl_std = StandardScaler().set_output(transform = "pandas")
scl_mm = MinMaxScaler().set_output(transform = "pandas")


# Model

cbr = CatBoostRegressor(
    cat_features= list(x.columns[x.dtypes == object])
)

pipe = Pipeline([("CBR", cbr)])


# _____________________________________________________________________________________
# GCV

params = {
    "CBR__n_estimators" : [10, 20, 30, 40, 50],
    "CBR__max_depth" : [2, 3, 4],
    "CBR__learning_rate" : np.linspace(0.001, 1, 5)
}

kfolds = StratifiedKFold(n_splits = 5,
                        random_state = 24,
                        shuffle = True)

kfold = KFold(n_splits = 5,
            random_state = 24,
            shuffle = True)

gcv = GridSearchCV(pipe,
                  param_grid = params,
                  scoring = "r2",
                  cv = kfold,
                  verbose = 3)


# gcv.fit(x, y)

In [57]:
gcv.fit(x, y)

Fitting 5 folds for each of 75 candidates, totalling 375 fits
0:	learn: 0.1734874	total: 150ms	remaining: 1.35s
1:	learn: 0.1734578	total: 164ms	remaining: 657ms
2:	learn: 0.1734308	total: 173ms	remaining: 404ms
3:	learn: 0.1734017	total: 193ms	remaining: 290ms
4:	learn: 0.1733730	total: 209ms	remaining: 209ms
5:	learn: 0.1733461	total: 223ms	remaining: 148ms
6:	learn: 0.1733216	total: 239ms	remaining: 103ms
7:	learn: 0.1732882	total: 253ms	remaining: 63.2ms
8:	learn: 0.1732544	total: 266ms	remaining: 29.6ms
9:	learn: 0.1732208	total: 280ms	remaining: 0us
[CV 1/5] END CBR__learning_rate=0.001, CBR__max_depth=2, CBR__n_estimators=10;, score=0.004 total time=   0.3s
0:	learn: 0.1743353	total: 12.7ms	remaining: 114ms
1:	learn: 0.1742966	total: 20.7ms	remaining: 82.7ms
2:	learn: 0.1742651	total: 29.5ms	remaining: 68.9ms
3:	learn: 0.1742282	total: 43.3ms	remaining: 64.9ms
4:	learn: 0.1741965	total: 59.4ms	remaining: 59.4ms
5:	learn: 0.1741678	total: 75.9ms	remaining: 50.6ms
6:	learn: 0.1741

In [59]:
print(gcv.best_score_)
print(gcv.best_params_)

0.5002655572520116
{'CBR__learning_rate': 0.5005, 'CBR__max_depth': 3, 'CBR__n_estimators': 50}
