In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder
from joblib import dump

# Load the dataset
# The CSV location can be overridden with the TRAIN_CSV_PATH environment
# variable so the notebook is not tied to one machine's folder layout.
# When the variable is unset, the original path below is used unchanged.
DEFAULT_TRAIN_CSV = r"C:\Users\anoop\OneDrive\MS\1st Quarter\Data Mining ALY6040\Data mining Project\Train new.csv"
TRAIN_CSV_PATH = os.environ.get("TRAIN_CSV_PATH", DEFAULT_TRAIN_CSV)
df = pd.read_csv(TRAIN_CSV_PATH)

# Custom categorization functions
def categorize_discount(discount):
    """Map a single discount value to a coarse label.

    Returns 'Low' for values in [0, 5], 'Medium' for (5, 20],
    'High' for (20, 100], and 'Undefined' for anything else
    (negative values, values above 100, NaN).
    """
    # Guard first: anything outside [0, 100] (NaN included, because every
    # comparison with NaN is False) has no bucket.
    if not (0 <= discount <= 100):
        return 'Undefined'
    if discount <= 5:
        return 'Low'
    if discount <= 20:
        return 'Medium'
    return 'High'

def categorize_weight(weight):
    """Map a single weight value to a coarse label.

    Returns 'Low' for values in [0, 1900), 'Medium' for [1900, 4200),
    'High' for [4200, 9000], and 'Undefined' for anything else
    (negative values, values above 9000, NaN).

    The bins are contiguous, so fractional weights such as 1899.5 or
    4199.5 land in a real bucket instead of falling between two
    integer-style bounds. Integer inputs are classified exactly as before.
    """
    # Guard first: anything outside [0, 9000] (NaN included, because every
    # comparison with NaN is False) has no bucket.
    if not (0 <= weight <= 9000):
        return 'Undefined'
    if weight < 1900:
        return 'Low'
    if weight < 4200:
        return 'Medium'
    return 'High'

# Adapted functions for pipeline
def categorize_discount_array(X):
    """Apply categorize_discount to every value of a single-column input.

    Parameters
    ----------
    X : array-like
        One column of discount values, either 1-D or of shape (n, 1).

    Returns
    -------
    numpy.ndarray
        Object array of shape (n, 1) holding the label for each row.

    Raises
    ------
    ValueError
        If X is 2-D with more than one column.
    """
    column = np.asarray(X)
    if column.ndim == 2 and column.shape[1] != 1:
        raise ValueError("categorize_discount_array expects exactly one column")
    # Flatten explicitly instead of np.squeeze: squeeze collapses a single-row
    # input to a scalar / 0-d array, which cannot be iterated.
    labels = [categorize_discount(value) for value in column.reshape(-1)]
    # Object dtype keeps the labels as plain Python strings, independent of
    # which labels happen to be present (and of an empty input).
    return np.array(labels, dtype=object).reshape(-1, 1)

def categorize_weight_array(X):
    """Apply categorize_weight to every value of a single-column input.

    Parameters
    ----------
    X : array-like
        One column of weight values, either 1-D or of shape (n, 1).

    Returns
    -------
    numpy.ndarray
        Object array of shape (n, 1) holding the label for each row.

    Raises
    ------
    ValueError
        If X is 2-D with more than one column.
    """
    column = np.asarray(X)
    if column.ndim == 2 and column.shape[1] != 1:
        raise ValueError("categorize_weight_array expects exactly one column")
    # Flatten explicitly instead of np.squeeze: squeeze collapses a single-row
    # input to a scalar / 0-d array, which cannot be iterated.
    labels = [categorize_weight(value) for value in column.reshape(-1)]
    # Object dtype keeps the labels as plain Python strings, independent of
    # which labels happen to be present (and of an empty input).
    return np.array(labels, dtype=object).reshape(-1, 1)

# Preprocessing pipeline
# Discount_offered and Weight_in_gms are each replaced by an ordinal code of
# their bucket label; every other column is forwarded untouched.
def _binned_ordinal_pipeline(binning_func):
    """Build a two-step pipeline: label one column with binning_func, then ordinal-encode the labels."""
    # Explicit category order so the codes are Undefined=0, Low=1, Medium=2, High=3.
    label_order = ['Undefined', 'Low', 'Medium', 'High']
    return Pipeline(steps=[
        ('categorize', FunctionTransformer(binning_func, validate=False)),
        ('encode', OrdinalEncoder(categories=[label_order])),
    ])


# NOTE(review): remainder='passthrough' hands the remaining columns to the
# model as-is. If any of them contain strings they presumably need encoding
# or a cat_features declaration for CatBoost - confirm against the CSV schema.
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('discount_category', _binned_ordinal_pipeline(categorize_discount_array), ['Discount_offered']),
        ('weight_category', _binned_ordinal_pipeline(categorize_weight_array), ['Weight_in_gms']),
    ],
    remainder='passthrough',
)

# Define model and pipeline
# CatBoost runs on CPU, silently, with class weights balanced automatically.
model = CatBoostClassifier(
    task_type='CPU',
    verbose=0,
    auto_class_weights='Balanced',
)
# Chain preprocessing and the classifier so both are fitted together
# (and re-fitted per fold during cross-validation).
pipeline_steps = [
    ('preprocess', preprocessing_pipeline),
    ('model', model),
]
full_pipeline = Pipeline(steps=pipeline_steps)

# Split data
# Features: everything except the target and the row identifier.
X = df.drop(columns=['Reached.on.Time_Y.N', 'ID'])
y = df['Reached.on.Time_Y.N']
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified - consider stratify=y if the
# classes turn out to be imbalanced.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# GridSearchCV for hyperparameter tuning
# 3 x 3 x 3 x 3 = 81 parameter combinations, each evaluated with 3-fold CV.
# The "model__" prefix routes each parameter to the 'model' step of the pipeline.
param_grid = dict(
    model__depth=[6, 8, 10],
    model__learning_rate=[0.01, 0.1, 0.2],
    model__iterations=[100, 500, 1000],
    model__l2_leaf_reg=[1, 3, 5],
)
# No scoring argument is given, so the estimator's default score is used.
# NOTE(review): n_jobs=-1 parallelises across folds/candidates; CatBoost may
# also use several threads per fit - check for CPU oversubscription.
grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best score found: ", grid_search.best_score_)

# Model evaluation
# Score the held-out test set with the best pipeline found by the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# Keep only the second probability column (index 1) for the curve plots.
class_probabilities = best_model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Confusion Matrix
# Rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(7, 7))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
# NOTE(review): the tick labels assume the first class means "not on time"
# and the second "on time". In the public e-commerce shipping dataset the
# target is reportedly coded the other way round (1 = NOT on time) - confirm
# the coding of 'Reached.on.Time_Y.N' before trusting these labels.
tick_marks = np.arange(2)
plt.xticks(tick_marks, ['Not On Time', 'On Time'], rotation=45)
plt.yticks(tick_marks, ['Not On Time', 'On Time'])
# Write the count into every cell so the matrix can be read without
# decoding the colour scale; switch to white text on dark cells.
text_threshold = cm.max() / 2
for row, col in np.ndindex(cm.shape):
    plt.text(
        col, row, str(cm[row, col]),
        ha='center', va='center',
        color='white' if cm[row, col] > text_threshold else 'black',
    )
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# tight_layout only accounts for artists that exist when it is called, so it
# must run after the axis labels have been created.
plt.tight_layout()
plt.show()

# Classification Report
# Text summary of the test-set predictions produced by classification_report.
report_text = classification_report(y_test, y_pred)
print(report_text)

# ROC Curve and AUC
# Build the curve from the predicted probabilities and integrate it for the AUC.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.legend(loc="lower right")
plt.show()

# Precision-Recall Curve
# The third return value (the thresholds) is not needed for the plot.
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)

pr_fig, pr_ax = plt.subplots()
pr_ax.plot(recall, precision, label='Precision-Recall curve')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend(loc="lower left")
plt.show()

# Save the best model from GridSearch
# The whole fitted pipeline (preprocessing + classifier) is written to the
# current working directory.
# NOTE(review): the pipeline embeds the notebook-defined categorize_*_array
# functions; pickle stores functions by reference, so whoever loads this file
# presumably needs those functions defined under the same module name -
# confirm before deploying.
fitted_pipeline = grid_search.best_estimator_
dump(fitted_pipeline, 'trained_pipeline.joblib')

print("The trained pipeline has been saved successfully.")

Fitting 3 folds for each of 81 candidates, totalling 243 fits


ValueError: 
All the 243 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
243 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\utils\_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\compose\_column_transformer.py", line 727, in fit_transform
    result = self._fit_transform(X, y, _fit_transform_one)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\compose\_column_transformer.py", line 658, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\utils\parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\joblib\parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\joblib\parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\joblib\parallel.py", line 819, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
             ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\joblib\_parallel_backends.py", line 597, in __init__
    self.results = batch()
                   ^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\joblib\parallel.py", line 288, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\joblib\parallel.py", line 288, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\utils\parallel.py", line 123, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\pipeline.py", line 437, in fit_transform
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\utils\_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\base.py", line 881, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\utils\_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\preprocessing\_function_transformer.py", line 238, in transform
    return self._transform(X, func=self.func, kw_args=self.kw_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anoop\anaconda3\envs\Data_ALY_6040\Lib\site-packages\sklearn\preprocessing\_function_transformer.py", line 310, in _transform
    return func(X, **(kw_args if kw_args else {}))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: transform_with_categories() got multiple values for argument 'func'
