In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import mlflow
import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import numpy as np
import joblib

# Load your data (use your actual file path)
file_path = '../data/combined_data.csv'
data = pd.read_csv(file_path)

# Renaming the columns as per the user's mapping
data.rename(columns={
    'Material': 'Material Code',
    'Material Description': 'Material Description',
    'Opening Stock': 'Open Stock',
    'Total Issue Quantities': 'Material Issued',
    'Total Receipt Qties': 'Material Received',
    'Closing Stock': 'Closing Stock',
    'From Date': 'Date',
    'BUn': 'Unit'
}, inplace=True)

# Preprocessing
data['Date'] = pd.to_datetime(data['Date'], format="%d.%m.%Y")
data.sort_values('Date', inplace=True)
data = data.dropna(subset=['Material Issued', 'Open Stock', 'Closing Stock'])
data['Material Issued'] = data['Material Issued'].abs()
data['Material Code'] = data['Material Code'].astype(str)

# Feature Engineering
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year

# Add 1-day Lag Feature
data['Lag_1_Issued'] = data.groupby('Material Code')['Material Issued'].shift(1)

# Add 7-day Rolling Statistics
data['Rolling_7_Issued'] = data.groupby('Material Code')['Material Issued'].rolling(7).mean().reset_index(level=0, drop=True)

# Add Seasonality Indicator (e.g., Weekday)
data['Weekday'] = data['Date'].dt.weekday  # 0=Monday, 6=Sunday

# Fill NaN values introduced by lagging and rolling
data.fillna(0, inplace=True)

# Target and Features
features = ['Open Stock', 'Closing Stock', 'Day', 'Month', 'Year', 'Material Code', 'Lag_1_Issued', 'Rolling_7_Issued', 'Weekday']
X = data[features]
y = data['Material Issued']

# Set up MLflow
mlflow.set_tracking_uri("sqlite:///mlruns.db")

# Create MLflow experiment
experiment_name = "Hyperparameter Tuning with Optuna (CatBoost)"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(name=experiment_name)
else:
    experiment_id = experiment.experiment_id

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    depth = trial.suggest_int("depth", 3, 15)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)

    # Define model
    model = CatBoostRegressor(
        iterations=n_estimators,
        depth=depth,
        learning_rate=learning_rate,
        subsample=subsample,
        cat_features=['Material Code'],
        random_state=42,
        verbose=0
    )

    # Time-based cross-validation
    tscv = TimeSeriesSplit(n_splits=3)
    rmse_scores = []
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        model.fit(X_train, y_train, cat_features=['Material Code'])
        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        rmse_scores.append(rmse)

    mean_rmse = np.mean(rmse_scores)

    # Log parameters and metrics to MLflow
    with mlflow.start_run(experiment_id=experiment_id, nested=True):
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("depth", depth)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("subsample", subsample)
        mlflow.log_metric("rmse", mean_rmse)

    return mean_rmse

# Suppress experimental warnings
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Run Optuna optimization
study = optuna.create_study(study_name=experiment_name, direction="minimize", storage="sqlite:///optuna.db", load_if_exists=True)
study.optimize(objective, n_trials=50)

# Best trial results
best_params = study.best_params
print("Best Parameters:", best_params)

# Train and evaluate the best model
best_model = CatBoostRegressor(
    iterations=best_params["n_estimators"],
    depth=best_params["depth"],
    learning_rate=best_params["learning_rate"],
    subsample=best_params["subsample"],
    cat_features=['Material Code'],
    random_state=42,
    verbose=0
)

# Time-based split
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

best_model.fit(X_train, y_train, cat_features=['Material Code'])
predictions = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Best Model RMSE: {rmse}")

# Save the model as a .pkl file
model_path = "best_catboost_model.pkl"
joblib.dump(best_model, model_path)

# Log the best model to MLflow
with mlflow.start_run(experiment_id=experiment_id):
    mlflow.log_params(best_params)
    mlflow.log_metric("final_rmse", rmse)
    mlflow.log_artifact(model_path, artifact_path="models")

print("Hyperparameter tuning and logging to specified experiment complete.")


Best Parameters: {'n_estimators': 455, 'depth': 5, 'learning_rate': 0.014084457904985567, 'subsample': 0.70839194529896}
Best Model RMSE: 214.87467987253933
Hyperparameter tuning and logging to specified experiment complete.


In [4]:
import json

# Save best_params to a JSON file
with open('best_params_catboost.json', 'w') as json_file:
    json.dump(best_params, json_file)

print("Best parameters saved to best_params.json")

Best parameters saved to best_params.json


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import mlflow
import optuna
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import numpy as np
import joblib

# Load your data (use your actual file path)
file_path = '../data/inventory.csv'
data = pd.read_csv(file_path)

data = data[data['Type'] == 'Material']
data['Date'] = pd.to_datetime(data['Date'])
data['Material Code'] = data['Material Code'].astype(str)

data.sort_values('Date', inplace=True)
data = data.dropna(subset=['Material Issued', 'Open Stock', 'Closing Stock'])

# Feature Engineering
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year

# Add 1-day Lag Feature
data['Lag_1_Issued'] = data.groupby('Material Code')['Material Issued'].shift(1)

# Add 7-day Rolling Statistics
data['Rolling_7_Issued'] = data.groupby('Material Code')['Material Issued'].rolling(7).mean().reset_index(level=0, drop=True)

# Add Seasonality Indicator (e.g., Weekday)
data['Weekday'] = data['Date'].dt.weekday  # 0=Monday, 6=Sunday

# Fill NaN values introduced by lagging and rolling
data.fillna(0, inplace=True)

# Target and Features
features = ['Open Stock', 'Closing Stock', 'Day', 'Month', 'Year', 'Material Code', 'Lag_1_Issued', 'Rolling_7_Issued', 'Weekday', 'BFP']
X = data[features]
y = data['Material Issued']

# Set up MLflow
mlflow.set_tracking_uri("sqlite:///mlruns.db")

# Create MLflow experiment
experiment_name = "Hyperparameter Tuning with Optuna (CatBoost - Material)"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(name=experiment_name)
else:
    experiment_id = experiment.experiment_id

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    iterations = trial.suggest_int("iterations", 50, 500)
    depth = trial.suggest_int("depth", 3, 15)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    colsample_bylevel = trial.suggest_float("colsample_bylevel", 0.6, 1.0)

    # Define model
    model = CatBoostRegressor(
        iterations=iterations,
        depth=depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bylevel=colsample_bylevel,
        random_state=42,
        verbose=0
    )

    # Time-based cross-validation
    tscv = TimeSeriesSplit(n_splits=3)
    rmse_scores = []
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        train_pool = Pool(X_train, y_train, cat_features=['Material Code'])
        test_pool = Pool(X_test, y_test, cat_features=['Material Code'])
        model.fit(train_pool)
        preds = model.predict(test_pool)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        rmse_scores.append(rmse)

    mean_rmse = np.mean(rmse_scores)

    # Log parameters and metrics to MLflow
    with mlflow.start_run(experiment_id=experiment_id, nested=True):
        mlflow.log_param("iterations", iterations)
        mlflow.log_param("depth", depth)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("subsample", subsample)
        mlflow.log_param("colsample_bylevel", colsample_bylevel)
        mlflow.log_metric("rmse", mean_rmse)

    return mean_rmse

# Suppress experimental warnings
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Run Optuna optimization
study = optuna.create_study(study_name=experiment_name, direction="minimize", storage="sqlite:///optuna.db", load_if_exists=True)
study.optimize(objective, n_trials=50)

# Best trial results
best_params = study.best_params
print("Best Parameters:", best_params)

# Train and evaluate the best model
best_model = CatBoostRegressor(
    iterations=best_params["iterations"],
    depth=best_params["depth"],
    learning_rate=best_params["learning_rate"],
    subsample=best_params["subsample"],
    colsample_bylevel=best_params["colsample_bylevel"],
    random_state=42,
    verbose=0
)

# Time-based split
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

train_pool = Pool(X_train, y_train, cat_features=['Material Code'])
test_pool = Pool(X_test, y_test, cat_features=['Material Code'])

best_model.fit(train_pool)
predictions = best_model.predict(test_pool)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Best Model RMSE: {rmse}")

# Save the model as a .pkl file
model_path = "best_catboost_model_material.pkl"
joblib.dump(best_model, model_path)

# Log the best model to MLflow
with mlflow.start_run(experiment_id=experiment_id):
    mlflow.log_params(best_params)
    mlflow.log_metric("final_rmse", rmse)
    mlflow.catboost.log_model(best_model, "model", input_example=X_train.iloc[:5])

print("Hyperparameter tuning and logging to specified experiment complete.")


Best Parameters: {'iterations': 180, 'depth': 5, 'learning_rate': 0.07082428899762874, 'subsample': 0.9650214403608193, 'colsample_bylevel': 0.6490829813452552}
Best Model RMSE: 224.98576815377612




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Hyperparameter tuning and logging to specified experiment complete.


In [2]:
import json

# Save best_params to a JSON file
with open('best_params_catboost_material.json', 'w') as json_file:
    json.dump(best_params, json_file)

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import mlflow
import optuna
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import numpy as np
import joblib

# Load your data (use your actual file path)
file_path = '../data/inventory.csv'
data = pd.read_csv(file_path)

data = data[data['Type'] == 'Material']
data['Date'] = pd.to_datetime(data['Date'])
data['Material Code'] = data['Material Code'].astype(str)

data.sort_values('Date', inplace=True)
data = data.dropna(subset=['Material Issued', 'Open Stock', 'Closing Stock', 'Price'])

# Calculate Amount Issued and Amount Received
data['Amount Issued'] = data['Price'] * data['Material Issued']
data['Amount Received'] = data['Price'] * data['Material Received']

# Feature Engineering
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year
data['Quarter'] = data['Date'].dt.quarter

# Add 1-day Lag Feature
data['Lag_1_Issued'] = data.groupby('Material Code')['Material Issued'].shift(1)
data['Lag_1_Amount_Issued'] = data.groupby('Material Code')['Amount Issued'].shift(1)

# Add 7-day Rolling Statistics
data['Rolling_7_Issued'] = data.groupby('Material Code')['Material Issued'].rolling(7).mean().reset_index(level=0, drop=True)
data['Rolling_7_Amount_Issued'] = data.groupby('Material Code')['Amount Issued'].rolling(7).mean().reset_index(level=0, drop=True)

# Add Seasonality Indicator (e.g., Weekday)
data['Weekday'] = data['Date'].dt.weekday  # 0=Monday, 6=Sunday

# Cumulative Features
data['Cumulative_Issued'] = data.groupby('Material Code')['Material Issued'].cumsum()
data['Cumulative_Received'] = data.groupby('Material Code')['Material Received'].cumsum()
data['Cumulative_Amount_Issued'] = data.groupby('Material Code')['Amount Issued'].cumsum()

# Interaction Features
data['Price_Issued_Interaction'] = data['Price'] * data['Material Issued']

# Fill NaN values introduced by lagging and rolling
data.fillna(0, inplace=True)

# Target and Features
features = ['Open Stock', 'Closing Stock', 'Day', 'Year', 'Material Code', 'Lag_1_Issued', 'Rolling_7_Issued', 'Weekday', 'BFP', 'Lag_1_Amount_Issued', 'Rolling_7_Amount_Issued', 'Cumulative_Issued', 'Cumulative_Received', 'Cumulative_Amount_Issued', 'Price_Issued_Interaction']
X = data[features]
y = data['Material Issued']

# Set up MLflow
mlflow.set_tracking_uri("sqlite:///mlruns.db")

# Create MLflow experiment
experiment_name = "Hyperparameter Tuning with Optuna (CatBoost - Material - Added Features)"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(name=experiment_name)
else:
    experiment_id = experiment.experiment_id

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    iterations = trial.suggest_int("iterations", 50, 500)
    depth = trial.suggest_int("depth", 3, 15)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    colsample_bylevel = trial.suggest_float("colsample_bylevel", 0.6, 1.0)

    # Define model
    model = CatBoostRegressor(
        iterations=iterations,
        depth=depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bylevel=colsample_bylevel,
        random_state=42,
        verbose=0
    )

    # Time-based cross-validation
    tscv = TimeSeriesSplit(n_splits=3)
    rmse_scores = []
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        train_pool = Pool(X_train, y_train, cat_features=['Material Code'])
        test_pool = Pool(X_test, y_test, cat_features=['Material Code'])
        model.fit(train_pool)
        preds = model.predict(test_pool)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        rmse_scores.append(rmse)

    mean_rmse = np.mean(rmse_scores)

    # Log parameters and metrics to MLflow
    with mlflow.start_run(experiment_id=experiment_id, nested=True):
        mlflow.log_param("iterations", iterations)
        mlflow.log_param("depth", depth)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("subsample", subsample)
        mlflow.log_param("colsample_bylevel", colsample_bylevel)
        mlflow.log_metric("rmse", mean_rmse)

    return mean_rmse

# Suppress experimental warnings
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Run Optuna optimization
study = optuna.create_study(study_name=experiment_name, direction="minimize", storage="sqlite:///optuna.db", load_if_exists=True)
study.optimize(objective, n_trials=50)

# Best trial results
best_params = study.best_params
print("Best Parameters:", best_params)

# Train and evaluate the best model
best_model = CatBoostRegressor(
    iterations=best_params["iterations"],
    depth=best_params["depth"],
    learning_rate=best_params["learning_rate"],
    subsample=best_params["subsample"],
    colsample_bylevel=best_params["colsample_bylevel"],
    random_state=42,
    verbose=0
)

# Time-based split
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

train_pool = Pool(X_train, y_train, cat_features=['Material Code'])
test_pool = Pool(X_test, y_test, cat_features=['Material Code'])

best_model.fit(train_pool)
predictions = best_model.predict(test_pool)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Best Model RMSE: {rmse}")

# Save the model as a .pkl file
model_path = "best_catboost_model_material_features_added.pkl"
joblib.dump(best_model, model_path)

# Log the best model to MLflow with dependencies and input example
with mlflow.start_run(experiment_id=experiment_id):
    mlflow.log_params(best_params)
    mlflow.log_metric("final_rmse", rmse)
    mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=best_model,
        conda_env={
            'name': 'catboost_env',
            'channels': ['defaults'],
            'dependencies': ['python=3.8.10', 'pip', {
                'pip': [
                    'mlflow',
                    'catboost==1.0.6',
                    'pandas',
                    'numpy',
                    'scikit-learn'
                ]
            }]
        },
        input_example=X_train.iloc[:5]  # Provide a sample of the input data
    )

print("Hyperparameter tuning and logging to specified experiment complete.")


Best Parameters: {'iterations': 418, 'depth': 6, 'learning_rate': 0.21849155230550643, 'subsample': 0.8535524515709212, 'colsample_bylevel': 0.9583336280226162}
Best Model RMSE: 95.27078769392703


MlflowException: `python_model` must be a PythonModel instance, callable object, or path to a script that uses set_model() to set a PythonModel instance or callable object.

In [None]:
import json

# Save best_params to a JSON file
with open('best_params_catboost_material_features_added.json', 'w') as json_file:
    json.dump(best_params, json_file)

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import mlflow
import optuna
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import numpy as np
import joblib

# Load your data (use your actual file path)
file_path = '../data/monthly_inventory.csv'
data = pd.read_csv(file_path)

data = data[data['Type'] == 'Material']
data['Date'] = pd.to_datetime(data['Date'])
data['Material Code'] = data['Material Code'].astype(str)

data.sort_values('Date', inplace=True)
data = data.dropna(subset=['Material Issued', 'Open Stock', 'Closing Stock', 'Price'])

# Calculate Amount Issued and Amount Received
data['Amount Issued'] = data['Price'] * data['Material Issued']
data['Amount Received'] = data['Price'] * data['Material Received']

# Feature Engineering
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year
data['Quarter'] = data['Date'].dt.quarter

# Add 1-day Lag Feature
data['Lag_1_Issued'] = data.groupby('Material Code')['Material Issued'].shift(1)
data['Lag_1_Amount_Issued'] = data.groupby('Material Code')['Amount Issued'].shift(1)

# Add 7-day Rolling Statistics
data['Rolling_7_Issued'] = data.groupby('Material Code')['Material Issued'].rolling(3).mean().reset_index(level=0, drop=True)
data['Rolling_7_Amount_Issued'] = data.groupby('Material Code')['Amount Issued'].rolling(3).mean().reset_index(level=0, drop=True)

# Add Seasonality Indicator (e.g., Weekday)
data['Weekday'] = data['Date'].dt.weekday  # 0=Monday, 6=Sunday

# Cumulative Features
data['Cumulative_Issued'] = data.groupby('Material Code')['Material Issued'].cumsum()
data['Cumulative_Received'] = data.groupby('Material Code')['Material Received'].cumsum()
data['Cumulative_Amount_Issued'] = data.groupby('Material Code')['Amount Issued'].cumsum()

# Interaction Features
data['Price_Issued_Interaction'] = data['Price'] * data['Material Issued']

# Fill NaN values introduced by lagging and rolling
data.fillna(0, inplace=True)

# Target and Features
features = ['Open Stock', 'Closing Stock', 'Day', 'Year', 'Material Code', 'Lag_1_Issued', 'Rolling_7_Issued', 'Weekday', 'BFP', 'Lag_1_Amount_Issued', 'Rolling_7_Amount_Issued', 'Cumulative_Issued', 'Cumulative_Received', 'Cumulative_Amount_Issued', 'Price_Issued_Interaction']
X = data[features]
y = data['Material Issued']

# Set up MLflow
mlflow.set_tracking_uri("sqlite:///mlruns.db")

# Create MLflow experiment
experiment_name = "Hyperparameter Tuning with Optuna (CatBoost - Monthly)"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(name=experiment_name)
else:
    experiment_id = experiment.experiment_id

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    iterations = trial.suggest_int("iterations", 50, 500)
    depth = trial.suggest_int("depth", 3, 15)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    colsample_bylevel = trial.suggest_float("colsample_bylevel", 0.6, 1.0)

    # Define model
    model = CatBoostRegressor(
        iterations=iterations,
        depth=depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bylevel=colsample_bylevel,
        random_state=42,
        verbose=0
    )

    # Time-based cross-validation
    tscv = TimeSeriesSplit(n_splits=3)
    rmse_scores = []
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        train_pool = Pool(X_train, y_train, cat_features=['Material Code'])
        test_pool = Pool(X_test, y_test, cat_features=['Material Code'])
        model.fit(train_pool)
        preds = model.predict(test_pool)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        rmse_scores.append(rmse)

    mean_rmse = np.mean(rmse_scores)

    # Log parameters and metrics to MLflow
    with mlflow.start_run(experiment_id=experiment_id, nested=True):
        mlflow.log_param("iterations", iterations)
        mlflow.log_param("depth", depth)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("subsample", subsample)
        mlflow.log_param("colsample_bylevel", colsample_bylevel)
        mlflow.log_metric("rmse", mean_rmse)

    return mean_rmse

# Suppress experimental warnings
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Run Optuna optimization
study = optuna.create_study(study_name=experiment_name, direction="minimize", storage="sqlite:///optuna.db", load_if_exists=True)
study.optimize(objective, n_trials=50)

# Best trial results
best_params = study.best_params
print("Best Parameters:", best_params)

# Train and evaluate the best model
best_model = CatBoostRegressor(
    iterations=best_params["iterations"],
    depth=best_params["depth"],
    learning_rate=best_params["learning_rate"],
    subsample=best_params["subsample"],
    colsample_bylevel=best_params["colsample_bylevel"],
    random_state=42,
    verbose=0
)

# Time-based split
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

train_pool = Pool(X_train, y_train, cat_features=['Material Code'])
test_pool = Pool(X_test, y_test, cat_features=['Material Code'])

best_model.fit(train_pool)
predictions = best_model.predict(test_pool)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Best Model RMSE: {rmse}")

# Save the model as a .pkl file
model_path = "best_catboost_model_material_features_added.pkl"
joblib.dump(best_model, model_path)

# Log the best model to MLflow with dependencies and input example
with mlflow.start_run(experiment_id=experiment_id):
    mlflow.log_params(best_params)
    mlflow.log_metric("final_rmse", rmse)
    mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=best_model,
        conda_env={
            'name': 'catboost_env',
            'channels': ['defaults'],
            'dependencies': ['python=3.8.10', 'pip', {
                'pip': [
                    'mlflow',
                    'catboost==1.0.6',
                    'pandas',
                    'numpy',
                    'scikit-learn'
                ]
            }]
        },
        input_example=X_train.iloc[:5]  # Provide a sample of the input data
    )

print("Hyperparameter tuning and logging to specified experiment complete.")


[W 2025-01-14 19:42:49,421] Trial 26 failed with parameters: {'iterations': 445, 'depth': 12, 'learning_rate': 0.04763191825317156, 'subsample': 0.6726097446898451, 'colsample_bylevel': 0.9514938333249534} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "d:\IBA-Project\iba-inventory-management-project\project_env\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\azeem\AppData\Local\Temp\ipykernel_33924\3093344631.py", line 99, in objective
    model.fit(train_pool)
  File "d:\IBA-Project\iba-inventory-management-project\project_env\Lib\site-packages\catboost\core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

KeyboardInterrupt: 