In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import mlflow
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
import numpy as np
import joblib

In [None]:
# Load your data (use your actual file path)
file_path = '../data/inventory.csv'
data = pd.read_csv(file_path)

In [6]:

data = data[data['Type'] == 'Material']
data['Date'] = pd.to_datetime(data['Date'])
data['Material Code'] = data['Material Code'].astype(str)

data.sort_values('Date', inplace=True)
data = data.dropna(subset=['Material Issued', 'Open Stock', 'Closing Stock'])

# Feature Engineering
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year

# Add 1-day Lag Feature
data['Lag_1_Issued'] = data.groupby('Material Code')['Material Issued'].shift(1)

# Add 7-day Rolling Statistics
data['Rolling_7_Issued'] = data.groupby('Material Code')['Material Issued'].rolling(7).mean().reset_index(level=0, drop=True)

# Add Seasonality Indicator (e.g., Weekday)
data['Weekday'] = data['Date'].dt.weekday  # 0=Monday, 6=Sunday

# Fill NaN values introduced by lagging and rolling
data.fillna(0, inplace=True)

# Encode Material Code as a categorical feature
encoder = LabelEncoder()
data['Material Code Encoded'] = encoder.fit_transform(data['Material Code'])

# Target and Features
features = ['Open Stock', 'Closing Stock', 'Day', 'Month', 'Year', 'Material Code Encoded', 'Lag_1_Issued', 'Rolling_7_Issued', 'Weekday', 'BFP']
X = data[features]
y = data['Material Issued']


# Set up MLflow
mlflow.set_tracking_uri("sqlite:///mlruns.db")

# Create MLflow experiment
experiment_name = "Hyperparameter Tuning with Optuna (XGBoost - Material)"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(name=experiment_name)
else:
    experiment_id = experiment.experiment_id

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)

    # Define model
    model = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=42
    )

    # Time-based cross-validation
    tscv = TimeSeriesSplit(n_splits=3)
    rmse_scores = []
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        rmse_scores.append(rmse)

    mean_rmse = np.mean(rmse_scores)

    # Log parameters and metrics to MLflow
    with mlflow.start_run(experiment_id=experiment_id, nested=True):
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("subsample", subsample)
        mlflow.log_param("colsample_bytree", colsample_bytree)
        mlflow.log_metric("rmse", mean_rmse)

    return mean_rmse

# Suppress experimental warnings
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Run Optuna optimization
study = optuna.create_study(study_name=experiment_name, direction="minimize", storage="sqlite:///optuna.db", load_if_exists=True)
study.optimize(objective, n_trials=50)

# Best trial results
best_params = study.best_params
print("Best Parameters:", best_params)

# Train and evaluate the best model
best_model = XGBRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    learning_rate=best_params["learning_rate"],
    subsample=best_params["subsample"],
    colsample_bytree=best_params["colsample_bytree"],
    random_state=42
)

# Time-based split
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

best_model.fit(X_train, y_train)
predictions = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Best Model RMSE: {rmse}")

# Save the model as a .pkl file
model_path = "best_xgboost_model_material.pkl"
joblib.dump(best_model, model_path)

# Log the best model to MLflow
with mlflow.start_run(experiment_id=experiment_id):
    mlflow.log_params(best_params)
    mlflow.log_metric("final_rmse", rmse)
    mlflow.xgboost.log_model(best_model, "model")

print("Hyperparameter tuning and logging to specified experiment complete.")


Best Parameters: {'n_estimators': 93, 'max_depth': 3, 'learning_rate': 0.03528727863319686, 'subsample': 0.7353924419153355, 'colsample_bytree': 0.9960774589920796}
Best Model RMSE: 227.59094078527252




Hyperparameter tuning and logging to specified experiment complete.


In [7]:
import json

# Save best_params to a JSON file
with open('best_params_xgboost_material.json', 'w') as json_file:
    json.dump(best_params, json_file)


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import mlflow
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
import numpy as np
import joblib

# Load your data (use your actual file path)
file_path = '../data/inventory.csv'
data = pd.read_csv(file_path)

data = data[data['Type'] == 'Material']
data['Date'] = pd.to_datetime(data['Date'])
data['Material Code'] = data['Material Code'].astype(str)

data.sort_values('Date', inplace=True)
data = data.dropna(subset=['Material Issued', 'Open Stock', 'Closing Stock', 'Price'])

# Calculate Amount Issued and Amount Received
data['Amount Issued'] = data['Price'] * data['Material Issued']
data['Amount Received'] = data['Price'] * data['Material Received']

# Feature Engineering
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year
data['Quarter'] = data['Date'].dt.quarter

# Add 1-day Lag Feature
data['Lag_1_Issued'] = data.groupby('Material Code')['Material Issued'].shift(1)
data['Lag_1_Amount_Issued'] = data.groupby('Material Code')['Amount Issued'].shift(1)

# Add 7-day Rolling Statistics
data['Rolling_7_Issued'] = data.groupby('Material Code')['Material Issued'].rolling(7).mean().reset_index(level=0, drop=True)
data['Rolling_7_Amount_Issued'] = data.groupby('Material Code')['Amount Issued'].rolling(7).mean().reset_index(level=0, drop=True)

# Add Seasonality Indicator (e.g., Weekday)
data['Weekday'] = data['Date'].dt.weekday  # 0=Monday, 6=Sunday

# Cumulative Features
data['Cumulative_Issued'] = data.groupby('Material Code')['Material Issued'].cumsum()
data['Cumulative_Received'] = data.groupby('Material Code')['Material Received'].cumsum()
data['Cumulative_Amount_Issued'] = data.groupby('Material Code')['Amount Issued'].cumsum()

# Interaction Features
data['Price_Issued_Interaction'] = data['Price'] * data['Material Issued']

# Fill NaN values introduced by lagging and rolling
data.fillna(0, inplace=True)

# Encode Material Code as a categorical feature
encoder = LabelEncoder()
data['Material Code Encoded'] = encoder.fit_transform(data['Material Code'])

# One-hot encode Month and Quarter
data = pd.get_dummies(data, columns=['Month', 'Quarter'], drop_first=True)

# Target and Features
features = ['Open Stock', 'Closing Stock', 'Day', 'Year', 'Material Code Encoded', 'Lag_1_Issued', 'Rolling_7_Issued', 'Weekday', 'BFP', 'Lag_1_Amount_Issued', 'Rolling_7_Amount_Issued', 'Cumulative_Issued', 'Cumulative_Received', 'Cumulative_Amount_Issued', 'Price_Issued_Interaction'] + [col for col in data.columns if col.startswith('Month_') or col.startswith('Quarter_')]
X = data[features]
y = data['Material Issued']

# Set up MLflow
mlflow.set_tracking_uri("sqlite:///mlruns.db")

# Create MLflow experiment
experiment_name = "Hyperparameter Tuning with Optuna (XGBoost - Material - Added Features)"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(name=experiment_name)
else:
    experiment_id = experiment.experiment_id

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)

    # Define model
    model = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=42
    )

    # Time-based cross-validation
    tscv = TimeSeriesSplit(n_splits=3)
    rmse_scores = []
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        rmse_scores.append(rmse)

    mean_rmse = np.mean(rmse_scores)

    # Log parameters and metrics to MLflow
    with mlflow.start_run(experiment_id=experiment_id, nested=True):
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("subsample", subsample)
        mlflow.log_param("colsample_bytree", colsample_bytree)
        mlflow.log_metric("rmse", mean_rmse)

    return mean_rmse

# Suppress experimental warnings
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Run Optuna optimization
study = optuna.create_study(study_name=experiment_name, direction="minimize", storage="sqlite:///optuna.db", load_if_exists=True)
study.optimize(objective, n_trials=50)

# Best trial results
best_params = study.best_params
print("Best Parameters:", best_params)

# Train and evaluate the best model
best_model = XGBRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    learning_rate=best_params["learning_rate"],
    subsample=best_params["subsample"],
    colsample_bytree=best_params["colsample_bytree"],
    random_state=42
)

# Time-based split
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

best_model.fit(X_train, y_train)
predictions = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Best Model RMSE: {rmse}")

# Save the model as a .pkl file
model_path = "best_xgboost_model_material_features_added.pkl"
joblib.dump(best_model, model_path)

# Log the best model to MLflow with dependencies and input example
with mlflow.start_run(experiment_id=experiment_id):
    mlflow.log_params(best_params)
    mlflow.log_metric("final_rmse", rmse)
    mlflow.xgboost.log_model(
        best_model,
        "model",
        pip_requirements=["xgboost==2.1.3", "pandas", "numpy", "scikit-learn"],
        input_example=X_train.iloc[:5]  # Provide a sample of the input data
    )

print("Hyperparameter tuning and logging to specified experiment complete.")


Best Parameters: {'n_estimators': 80, 'max_depth': 15, 'learning_rate': 0.043372000935757236, 'subsample': 0.7157188226215301, 'colsample_bytree': 0.9773657559025434}
Best Model RMSE: 80.91429457321547




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  "dataframe_split": {
    "columns": [
      "Open Stock",
      "Closing Stock",
      "Day",
      "Year",
      "Material Code Encoded",
      "Lag_1_Issued",
      "Rolling_7_Issued",
      "Weekday",
      "BFP",
      "Lag_1_Amount_Issued",
      "Rolling_7_Amount_Issued",
      "Cumulative_Issued",
      "Cumulative_Received",
      "Cumulative_Amount_Issued",
      "Price_Issued_Interaction",
      "Month_2",
      "Month_3",
      "Month_4",
      "Month_5",
      "Month_6",
      "Month_7",
      "Month_8",
      "Month_9",
      "Month_10",
      "Month_11",
      "Month_12",
      "Quarter_2",
      "Quarter_3",
      "Quarter_4"
    ],
    "data": [
      [
        88.0,
        88.0,
        1,
        2023,
        0,
        0.0,
        0.0,
        6,
        0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        false,
        false,
        false,
        false,
        false,
        false,
        false,
        false,
        fa

Hyperparameter tuning and logging to specified experiment complete.


In [3]:
import json

# Save best_params to a JSON file
with open('best_params_xgboost_material_features_added.json', 'w') as json_file:
    json.dump(best_params, json_file)


In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import mlflow
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import numpy as np
import joblib

# Load your data (use your actual file path)
file_path = '../data/inventory.csv'
data = pd.read_csv(file_path)

data = data[data['Type'] == 'Material']
data['Date'] = pd.to_datetime(data['Date'])
data['Material Code'] = data['Material Code'].astype(str)

data.sort_values('Date', inplace=True)
data = data.dropna(subset=['Material Issued', 'Open Stock', 'Closing Stock'])
data.drop(columns=['Material Description'], inplace=True)

# Feature Engineering
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year

# Add 1-day Lag Feature
data['Lag_1_Issued'] = data.groupby('Material Code')['Material Issued'].shift(1)

# Add 7-day Rolling Statistics
data['Rolling_7_Issued'] = data.groupby('Material Code')['Material Issued'].rolling(7).mean().reset_index(level=0, drop=True)

# Add Seasonality Indicator (e.g., Weekday)
data['Weekday'] = data['Date'].dt.weekday  # 0=Monday, 6=Sunday

# Fill NaN values introduced by lagging and rolling
data.fillna(0, inplace=True)

# Encode Material Code using one-hot encoding
data = pd.get_dummies(data, columns=['Material Code'], prefix='Material')

# Target and Features
features = ['Open Stock', 'Closing Stock', 'Day', 'Month', 'Year', 'Lag_1_Issued', 'Rolling_7_Issued', 'Weekday', 'BFP'] + [col for col in data.columns if col.startswith('Material')]
X = data[features]
y = data['Material Issued']

# Set up MLflow
mlflow.set_tracking_uri("sqlite:///mlruns.db")

# Create MLflow experiment
experiment_name = "Hyperparameter Tuning with Optuna (XGBoost - Material - One-Hot Encoded)"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(name=experiment_name)
else:
    experiment_id = experiment.experiment_id

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)

    # Define model
    model = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=42
    )

    # Time-based cross-validation
    tscv = TimeSeriesSplit(n_splits=3)
    rmse_scores = []
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        rmse_scores.append(rmse)

    mean_rmse = np.mean(rmse_scores)

    # Log parameters and metrics to MLflow
    with mlflow.start_run(experiment_id=experiment_id, nested=True):
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("subsample", subsample)
        mlflow.log_param("colsample_bytree", colsample_bytree)
        mlflow.log_metric("rmse", mean_rmse)

    return mean_rmse

# Suppress experimental warnings
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Run Optuna optimization
study = optuna.create_study(study_name=experiment_name, direction="minimize", storage="sqlite:///optuna.db", load_if_exists=True)
study.optimize(objective, n_trials=50)

# Best trial results
best_params = study.best_params
print("Best Parameters:", best_params)

# Train and evaluate the best model
best_model = XGBRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    learning_rate=best_params["learning_rate"],
    subsample=best_params["subsample"],
    colsample_bytree=best_params["colsample_bytree"],
    random_state=42
)

# Time-based split
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

best_model.fit(X_train, y_train)
predictions = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Best Model RMSE: {rmse}")

# Save the model as a .pkl file
model_path = "best_xgboost_model_material.pkl"
joblib.dump(best_model, model_path)

# Log the best model to MLflow
with mlflow.start_run(experiment_id=experiment_id):
    mlflow.log_params(best_params)
    mlflow.log_metric("final_rmse", rmse)
    mlflow.xgboost.log_model(best_model, "model")

print("Hyperparameter tuning and logging to specified experiment complete.")


Best Parameters: {'n_estimators': 179, 'max_depth': 6, 'learning_rate': 0.027181060081164225, 'subsample': 0.84623016278226, 'colsample_bytree': 0.621326407063899}
Best Model RMSE: 96.9911023162212




Hyperparameter tuning and logging to specified experiment complete.


In [6]:
import json

# Save best_params to a JSON file
with open('best_params_xgboost_material_ohe.json', 'w') as json_file:
    json.dump(best_params, json_file)


In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import mlflow
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import numpy as np
import joblib

# Load your data (use your actual file path)
file_path = '../data/inventory.csv'
data = pd.read_csv(file_path)

data = data[data['Type'] == 'Material']
data['Date'] = pd.to_datetime(data['Date'])
data['Material Code'] = data['Material Code'].astype(str)

data.sort_values('Date', inplace=True)
data = data.dropna(subset=['Material Issued', 'Open Stock', 'Closing Stock', 'Price'])
data.drop(columns=['Material Description'], inplace=True)

# Calculate Amount Issued and Amount Received
data['Amount Issued'] = data['Price'] * data['Material Issued']
data['Amount Received'] = data['Price'] * data['Material Received']

# Feature Engineering
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year
data['Quarter'] = data['Date'].dt.quarter

# Add 1-day Lag Feature
data['Lag_1_Issued'] = data.groupby('Material Code')['Material Issued'].shift(1)
data['Lag_1_Amount_Issued'] = data.groupby('Material Code')['Amount Issued'].shift(1)

# Add 7-day Rolling Statistics
data['Rolling_7_Issued'] = data.groupby('Material Code')['Material Issued'].rolling(7).mean().reset_index(level=0, drop=True)
data['Rolling_7_Amount_Issued'] = data.groupby('Material Code')['Amount Issued'].rolling(7).mean().reset_index(level=0, drop=True)

# Add Seasonality Indicator (e.g., Weekday)
data['Weekday'] = data['Date'].dt.weekday  # 0=Monday, 6=Sunday

# Cumulative Features
data['Cumulative_Issued'] = data.groupby('Material Code')['Material Issued'].cumsum()
data['Cumulative_Received'] = data.groupby('Material Code')['Material Received'].cumsum()
data['Cumulative_Amount_Issued'] = data.groupby('Material Code')['Amount Issued'].cumsum()

# Interaction Features
data['Price_Issued_Interaction'] = data['Price'] * data['Material Issued']

# Fill NaN values introduced by lagging and rolling
data.fillna(0, inplace=True)

# One-hot encode Material Code, Month, and Quarter
data = pd.get_dummies(data, columns=['Material Code', 'Month', 'Quarter'], drop_first=True)

# Target and Features
features = ['Open Stock', 'Closing Stock', 'Day', 'Year', 'Lag_1_Issued', 'Rolling_7_Issued', 'Weekday', 'BFP', 'Lag_1_Amount_Issued', 'Rolling_7_Amount_Issued', 'Cumulative_Issued', 'Cumulative_Received', 'Cumulative_Amount_Issued', 'Price_Issued_Interaction'] + [col for col in data.columns if col.startswith('Material Code_') or col.startswith('Month_') or col.startswith('Quarter_')]
X = data[features]
y = data['Material Issued']

# Set up MLflow
mlflow.set_tracking_uri("sqlite:///mlruns.db")

# Create MLflow experiment
experiment_name = "Hyperparameter Tuning with Optuna (XGBoost - Material - Added Features - One-Hot Encoded)"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(name=experiment_name)
else:
    experiment_id = experiment.experiment_id

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)

    # Define model
    model = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=42
    )

    # Time-based cross-validation
    tscv = TimeSeriesSplit(n_splits=3)
    rmse_scores = []
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        rmse_scores.append(rmse)

    mean_rmse = np.mean(rmse_scores)

    # Log parameters and metrics to MLflow
    with mlflow.start_run(experiment_id=experiment_id, nested=True):
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("subsample", subsample)
        mlflow.log_param("colsample_bytree", colsample_bytree)
        mlflow.log_metric("rmse", mean_rmse)

    return mean_rmse

# Suppress experimental warnings
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Run Optuna optimization
study = optuna.create_study(study_name=experiment_name, direction="minimize", storage="sqlite:///optuna.db", load_if_exists=True)
study.optimize(objective, n_trials=50)

# Best trial results
best_params = study.best_params
print("Best Parameters:", best_params)

# Train and evaluate the best model
best_model = XGBRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    learning_rate=best_params["learning_rate"],
    subsample=best_params["subsample"],
    colsample_bytree=best_params["colsample_bytree"],
    random_state=42
)

# Time-based split
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

best_model.fit(X_train, y_train)
predictions = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Best Model RMSE: {rmse}")

# Save the model as a .pkl file
model_path = "best_xgboost_model_material_features_added.pkl"
joblib.dump(best_model, model_path)

# Log the best model to MLflow with dependencies and input example
with mlflow.start_run(experiment_id=experiment_id):
    mlflow.log_params(best_params)
    mlflow.log_metric("final_rmse", rmse)
    mlflow.xgboost.log_model(
        best_model,
        "model",
        pip_requirements=["xgboost==2.1.3", "pandas", "numpy", "scikit-learn"],
        input_example=X_train.iloc[:5]  # Provide a sample of the input data
    )

print("Hyperparameter tuning and logging to specified experiment complete.")


Best Parameters: {'n_estimators': 349, 'max_depth': 5, 'learning_rate': 0.07558172057059602, 'subsample': 0.6701051897300988, 'colsample_bytree': 0.8108020702912683}
Best Model RMSE: 74.63251207059598




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  "dataframe_split": {
    "columns": [
      "Open Stock",
      "Closing Stock",
      "Day",
      "Year",
      "Lag_1_Issued",
      "Rolling_7_Issued",
      "Weekday",
      "BFP",
      "Lag_1_Amount_Issued",
      "Rolling_7_Amount_Issued",
      "Cumulative_Issued",
      "Cumulative_Received",
      "Cumulative_Amount_Issued",
      "Price_Issued_Interaction",
      "Material Code_10000045",
      "Material Code_10000046",
      "Material Code_10000195",
      "Material Code_10000200",
      "Material Code_10000304",
      "Material Code_10000314",
      "Material Code_10000315",
      "Material Code_10000316",
      "Material Code_10000317",
      "Material Code_10000318",
      "Material Code_10000319",
      "Material Code_10000320",
      "Material Code_10000324",
      "Material Code_10000325",
      "Material Code_10000326",
      "Material Code_10000327",
      "Material Code_10000328",
      "Material Code_10000329",
      "Material Code_10000330",
      "Material Co

Hyperparameter tuning and logging to specified experiment complete.


In [9]:
import json

# Save best_params to a JSON file
with open('best_params_xgboost_material_features_added_ohe.json', 'w') as json_file:
    json.dump(best_params, json_file)
