In [0]:
# import sys
# from pathlib import Path
# sys.path
# import sys
# sys.path.append(str(Path.cwd().parent / 'src'))

In [0]:
import numpy as np
import pandas as pd
import mlflow
import lightgbm as lgb
import pyspark.sql.functions as F
from sklearn.preprocessing import LabelEncoder
from databricks.connect import DatabricksSession
from src.pricing.config import ProjectConfig
from src.pricing.data_processor import DataProcessor, FeatureProducer
from src.pricing.models.baseline_model import LGBMModel
from databricks.feature_engineering import FeatureEngineeringClient, FeatureLookup

spark = DatabricksSession.builder.getOrCreate()
config = ProjectConfig.from_yaml(config_path="../project_config.yml", env="dev")

# Enable automatic reloading of modules
%load_ext autoreload
%autoreload 2

In [0]:
# Client for the Databricks Feature Engineering API; used below to assemble
# the training set from the feature table.
fe = FeatureEngineeringClient()

# NOTE(review): the feature table is only read here, never (re)built. It was
# presumably published beforehand with DataProcessor.preprocess() followed by
# FeatureProducer.generate_features_spark() -- confirm before a fresh run.

# Fully qualified feature table name: <catalog>.<schema>.price_features
feature_table_name = f"{config.catalog_name}.{config.schema_name}.price_features"

# Columns of the "label" frame: lookup key, timestamp, label and the id columns.
training_label_cols = ["id", "date", "demand", "item_id", "store_id", "dept_id", "cat_id", "state_id"]
label_columns_sql = ",".join(training_label_cols)
training_set_labels = spark.sql(f"select distinct {label_columns_sql} from {feature_table_name}")

# Feature columns to pull from the feature table for every label row.
feature_names = [
    'wday', 'month', 'year', 'dayofweek', 'dayofyear', 'week',
    'snap_CA', 'snap_TX', 'snap_WI',
    'sell_price', 'days_since_first_sale', 'is_event',
    'lag_t7', 'rolling_mean_lag7_w7',
    'lag_t28', 'rolling_mean_lag28_w7',
    'item_running_avg', 'store_running_avg',
]

# Single lookup against the feature table, keyed on "id" with "date" as the
# timestamp lookup key.
price_feature_lookup = FeatureLookup(
    table_name=feature_table_name,
    feature_names=feature_names,
    lookup_key="id",
    timestamp_lookup_key="date",
)
feature_lookups = [price_feature_lookup]

# No exclude_columns are passed (an earlier draft excluded "item_store_id").
training_set = fe.create_training_set(
    df=training_set_labels,
    feature_lookups=feature_lookups,
    label="demand",
)

# Materialise the joined labels + features as a Spark DataFrame.
training_df = training_set.load_df()

##############################
# sample of data for testing
##############################
# Restrict the training frame to at most SAMPLE_SIZE items and SAMPLE_SIZE
# stores so the rest of the notebook runs quickly.
#
# Spark's limit() without an ordering returns an arbitrary subset that can
# differ between runs, so the distinct ids are sorted first. That makes the
# sample (and every model trained on it) reproducible, and re-running this
# cell selects the same ids again.
SAMPLE_SIZE = 10

selected_items = [
    row.item_id
    for row in (
        training_df.select("item_id")
        .distinct()
        .orderBy("item_id")
        .limit(SAMPLE_SIZE)
        .collect()
    )
]
selected_stores = [
    row.store_id
    for row in (
        training_df.select("store_id")
        .distinct()
        .orderBy("store_id")
        .limit(SAMPLE_SIZE)
        .collect()
    )
]

# Keep only rows whose item AND store are both in the sampled sets.
training_df = training_df.filter(
    F.col("item_id").isin(selected_items)
    & F.col("store_id").isin(selected_stores)
)

# Collect the (sampled) Spark training frame to pandas and drop every row that
# contains a missing value. dropna() returns a new frame, so no in-place
# mutation is needed.
training_df_pandas = training_df.toPandas().dropna()

# Label-encode the categorical id columns.
# LabelEncoder.fit_transform() refits the encoder on every call, so a single
# shared instance would only remember the classes of the last column. One
# encoder per column is kept in `label_encoders` so that each encoding can
# later be reversed with label_encoders[col].inverse_transform(...).
categorical_cols = ['item_id', 'store_id', 'dept_id', 'cat_id', 'state_id']
label_encoders = {}

for col in categorical_cols:
    # Skip columns that are absent rather than raising a KeyError.
    if col in training_df_pandas.columns:
        # `le` is rebound per column; after the loop it refers to the encoder
        # of the last encoded column, as it did before this change.
        le = LabelEncoder()
        training_df_pandas[col] = le.fit_transform(training_df_pandas[col])
        label_encoders[col] = le

training_df_pandas.head()

In [0]:
# Hyper-parameters handed to LGBMModel for the demand ("forecast") model.
lgbm_params = dict(
    objective='poisson',
    metric='rmse',
    n_estimators=2_000,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    verbose=-1,
    n_jobs=-1,
    seed=42,
)

# Build the forecast model on a deep copy of the pandas frame, so the object
# the model receives is separate from the notebook's own frame. No baseline
# forecasts are supplied for this model type (empty dict).
demand_model = LGBMModel(
    spark=spark,
    config=config,
    training_df=training_df_pandas.copy(deep=True),
    params=lgbm_params,
    model_type='forecast',
    baseline_forecasts_daily_dict={},
)

# Train with cross-validation, then log the result.
demand_model.train_with_cv()
demand_model.log_model()

In [0]:
# Gain-based feature importance of the first model in the demand ensemble.
# The trailing semicolon suppresses the Axes repr in the cell output.
lgb.plot_importance(
    demand_model.ensemble_models[0],
    importance_type="gain",
    precision=0,
    height=0.5,
    figsize=(6, 10),
);

In [0]:
# mlflow is already imported in the imports cell at the top of the notebook,
# so it is not re-imported here.

# NOTE(review): this run id is hard-coded, so the cell loads whatever was
# logged in that specific run rather than the model trained above. Presumably
# it should point at the run created by demand_model.log_model() -- confirm
# and derive the URI from that run instead.
logged_model = 'runs:/71ce2a4b05de4994a69939c26e8104bf/lightgbm-ensemble-model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Baseline predictions on the demand model's training rows; they are passed
# to the elasticity model further down.
baseline_features_to_predict = demand_model.X_train[demand_model.features]
baseline_predictions_array = loaded_model.predict(baseline_features_to_predict)

train_rows = demand_model.X_train

# Predictions are matched to rows by position, so the lengths must agree.
if len(baseline_predictions_array) != len(train_rows):
    raise ValueError(
        f"Got {len(baseline_predictions_array)} predictions for {len(train_rows)} training rows"
    )

# Map (item_id, store_id, 'YYYY-MM-DD') -> baseline prediction.
# Iterating the three columns side by side avoids iterrows(), which builds a
# full Series for every row; the resulting keys and values are equivalent.
baseline_forecasts_daily_dict = {
    (item_id, store_id, sale_date.strftime('%Y-%m-%d')): prediction
    for item_id, store_id, sale_date, prediction in zip(
        train_rows['item_id'],
        train_rows['store_id'],
        train_rows['date'],
        baseline_predictions_array,
    )
}

In [0]:
# logged_model = mlflow.get_logged_model(demand_model.model_info.model_id)
# model = mlflow.sklearn.load_model(f"models:/{demand_model.model_info.model_id}")

# run_id = mlflow.search_runs(
#     experiment_names=["/Shared/marvel-characters-basic"]
# ).run_id[0]

# model = mlflow.sklearn.load_model(f"runs:/{run_id}/lightgbm-pipeline-model")

In [0]:
# Hyper-parameters handed to LGBMModel for the elasticity model
# (same values as used for the demand model).
lgbm_params = dict(
    objective='poisson',
    metric='rmse',
    n_estimators=2_000,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    verbose=-1,
    n_jobs=-1,
    seed=42,
)

# Build the elasticity model on its own deep copy of the pandas frame and
# hand it the per-(item, store, day) baseline forecasts computed earlier.
elasticity_model = LGBMModel(
    spark=spark,
    config=config,
    training_df=training_df_pandas.copy(deep=True),
    params=lgbm_params,
    model_type='elasticity',
    baseline_forecasts_daily_dict=baseline_forecasts_daily_dict,
)

# Train with cross-validation, then log the result.
elasticity_model.train_with_cv()
elasticity_model.log_model()

In [0]:
# Latest date in the notebook frame and in each model's frame, next to the
# demand model's split date.
(
    training_df_pandas['date'].max(),
    demand_model.training_df_pandas['date'].max(),
    elasticity_model.training_df_pandas['date'].max(),
    demand_model.split_date,
)

In [0]:
# Peek at the first rows of the frame held by the elasticity model.
temp = elasticity_model.training_df_pandas
temp.head(n=5)

In [0]:
# Rows of the elasticity model's frame dated on or after its split date.
temp = elasticity_model.training_df_pandas.query(
    'date >= @elasticity_model.split_date'
)
temp

In [0]:
# Elasticity model's split date next to the latest date in its frame.
(
    elasticity_model.split_date,
    elasticity_model.training_df_pandas['date'].max(),
)