In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("ADS_Data.csv")

In [3]:
df.head()

Unnamed: 0,Date,MRP,NoPromoPrice,SellingPrice,UnitsSold,LogSellingPrice,StockStart,Demand,DemandFulfilled,Backorders,...,ReturningVisitorRatio,AvgSessionDuration_sec,MRP_Nirma,MRP_Surf Excel,DiscountRate_Nirma,DiscountRate_Surf Excel,BasePrice_Nirma,BasePrice_Surf Excel,FinalPrice_Nirma,FinalPrice_Surf Excel
0,01-01-2021,99.0,97.95,97.95,12,4.584457,15662,1831,1831,0,...,0.2411,83.59,90,110,0.0,0.0,84.67,107.24,84.67,107.24
1,02-01-2021,99.0,85.09,85.09,10,4.44371,14162,2097,2097,0,...,0.2132,50.25,90,110,0.0,0.07,81.16,107.9,81.16,100.35
2,03-01-2021,99.0,92.93,92.93,4,4.531847,12691,1837,1814,23,...,0.3151,21.43,90,110,0.0,0.0,79.33,105.83,79.33,105.83
3,04-01-2021,99.0,90.24,90.24,7,4.502473,13081,1588,1588,0,...,0.3321,17.01,90,110,0.0,0.0,76.72,101.36,76.72,101.36
4,05-01-2021,99.0,92.67,92.67,12,4.529045,13358,1912,1912,0,...,0.335,35.88,90,110,0.24,0.0,70.8,107.07,60.0,107.07


In [5]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from mlflow.models.signature import infer_signature
import mlflow
import mlflow.sklearn


# Load Data
df = pd.read_csv("ADS_Data.csv")


# Preprocessing & Feature Engineering
# -------------------------------
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df['DiscountPct'] = (df['MRP'] - df['SellingPrice']) / df['MRP']
df['PriceGap_Nirma'] = df['SellingPrice'] - df['FinalPrice_Nirma']
df['PriceGap_SurfExcel'] = df['SellingPrice'] - df['FinalPrice_Surf Excel']
df['DayOfWeek'] = df['Date'].dt.dayofweek
df['Month'] = df['Date'].dt.month
df['Lag7_UnitsSold'] = df['UnitsSold'].shift(7)
df['Rolling7_UnitsSold'] = df['UnitsSold'].rolling(7).mean()
df_model = df.dropna().copy()

# -------------------------------
# 4️⃣ Features and Target
# -------------------------------
target = 'UnitsSold'
features = [
    'SellingPrice', 'DiscountPct', 'MRP', 
    'StockStart', 'StockEnd', 'Demand', 'Backorders', 'LeadTimeFloat', 'SafetyStock',
    'CTR', 'AbandonedCartRate', 'BounceRate', 'FunnelDrop_ViewToCart',
    'FunnelDrop_CartToCheckout', 'ReturningVisitorRatio', 'AvgSessionDuration_sec',
    'PriceGap_Nirma', 'PriceGap_SurfExcel',
    'DayOfWeek', 'Month', 'Lag7_UnitsSold', 'Rolling7_UnitsSold'
]
X = df_model[features]
y = df_model[target]

# -------------------------------
# 5️⃣ Train-Test Split
# -------------------------------
split_idx = int(len(df_model) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# -------------------------------
# 6️⃣ Set up MLflow Experiment
# -------------------------------
mlflow.set_experiment("Tide_Pricing_Optimization")

# -------------------------------
# 7️⃣ Hyperparameter Grid
# -------------------------------
param_grid = {
    "n_estimators": [100, 300, 500],
    "max_depth": [5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

with mlflow.start_run(run_name="RandomForest_GridSearch"):

    # -------------------------------
    # 8️⃣ Grid Search with CV
    # -------------------------------
    rf_model = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(
        estimator=rf_model,
        param_grid=param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    
    # Log best hyperparameters
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_param("model_type", "RandomForestRegressor")
    mlflow.log_param("features", features)
    
    # -------------------------------
    # 9️⃣ Predict & Evaluate
    # -------------------------------
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # compute RMSE manually
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    n = X_test.shape[0]
    p = X_test.shape[1]
    adj_r2 = 1 - (1 - r2) * (n - 1)/(n - p - 1)
    
    # Log metrics
    mlflow.log_metrics({
        "RMSE": rmse,
        "MSE": mse,
        "MAE": mae,
        "R2": r2,
        "Adjusted_R2": adj_r2
    })
    
    # -------------------------------
    # 🔟 Residual Plot
    # -------------------------------
    plt.figure(figsize=(8,6))
    sns.scatterplot(x=y_test, y=y_test - y_pred)
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel("Actual UnitsSold")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    residual_plot_path = "residual_plot.png"
    plt.savefig(residual_plot_path)
    mlflow.log_artifact(residual_plot_path)
    plt.close()
    
    # -------------------------------
    # 1️⃣1️⃣ Feature Importance
    # -------------------------------
    importances = best_model.feature_importances_
    fi_df = pd.DataFrame({"Feature": features, "Importance": importances}).sort_values("Importance", ascending=False)
    
    plt.figure(figsize=(10,6))
    sns.barplot(x="Importance", y="Feature", data=fi_df)
    plt.title("Feature Importance")
    fi_plot_path = "feature_importance.png"
    plt.savefig(fi_plot_path)
    mlflow.log_artifact(fi_plot_path)
    plt.close()
    
    # -------------------------------
    # 1️⃣2️⃣ Log Model with Signature
    # -------------------------------
    signature = infer_signature(X_test, y_pred)
    mlflow.sklearn.log_model(best_model, "tide_pricing_model", signature=signature)
    
    # -------------------------------
    # 1️⃣3️⃣ Revenue Simulation (Optional)
    # -------------------------------
    df_test = df_model.iloc[split_idx:].copy()
    df_test['PredictedUnitsSold'] = y_pred
    df_test['PredictedRevenue'] = df_test['PredictedUnitsSold'] * df_test['SellingPrice']
    total_revenue = df_test['PredictedRevenue'].sum()
    mlflow.log_metric("TotalPredictedRevenue", total_revenue)
    
    print(f"Best Params: {grid_search.best_params_}")
    print(f"RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.4f}, Adjusted R2: {adj_r2:.4f}")
    print(f"Total predicted revenue for test period: {total_revenue:.2f}")


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best Params: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 500}
RMSE: 4.44, MAE: 3.66, R2: 0.0968, Adjusted R2: -0.3087
Total predicted revenue for test period: 55038.94
