In [1]:
import pandas as pd

df = pd.read_csv("retail_profit_margin_dataset_30k.csv")

df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_id          30000 non-null  object 
 1   product_name        30000 non-null  object 
 2   category            30000 non-null  object 
 3   brand               30000 non-null  object 
 4   store_id            30000 non-null  object 
 5   store_name          30000 non-null  object 
 6   store_location      30000 non-null  object 
 7   base_price          30000 non-null  float64
 8   discount_rate       30000 non-null  float64
 9   promotion_type      18001 non-null  object 
 10  day_of_year         30000 non-null  int64  
 11  month               30000 non-null  int64  
 12  day_of_week         30000 non-null  object 
 13  season              30000 non-null  object 
 14  is_holiday          30000 non-null  int64  
 15  avg_units_sold_30d  30000 non-null  int64  
 16  avg_

In [8]:
# Fix promotion_type specifically
for df_ in [X_train, X_test]:
    df_["promotion_type"] = (
        df_["promotion_type"]
        .fillna("No_Promotion")
        .astype(str)
    )

# Ensure other categorical features are strings (no NaNs expected there)
for col in high_card_cat + low_card_cat:
    if col != "promotion_type":
        X_train[col] = X_train[col].astype(str)
        X_test[col] = X_test[col].astype(str)


In [9]:
from sklearn.model_selection import train_test_split

# =====================
# 1. Define target
# =====================
target = "profit_margin"

# =====================
# 2. Columns to drop
# (IDs & names not useful directly)
# =====================
drop_cols = [
    "product_name",
    "store_name"
]

# =====================
# 3. Feature groups
# =====================

# High-cardinality categorical features
high_card_cat = [
    "product_id",
    "store_id"
]

# Low-cardinality categorical features
low_card_cat = [
    "category",
    "brand",
    "store_location",
    "promotion_type",
    "day_of_week",
    "season"
]

# Numerical features
num_features = [
    "base_price",
    "discount_rate",
    "day_of_year",
    "month",
    "is_holiday",
    "avg_units_sold_30d",
    "avg_customers_30d"
]

# =====================
# 4. Prepare X and y
# =====================
X = df.drop(columns=drop_cols + [target])
y = df[target]

# =====================
# 5. Train / Test split
# =====================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

# =====================
# 6. Sanity check
# =====================
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

print("\nNumerical features:", num_features)
print("Low-card categorical:", low_card_cat)
print("High-card categorical:", high_card_cat)


Train shape: (24000, 15)
Test shape: (6000, 15)

Numerical features: ['base_price', 'discount_rate', 'day_of_year', 'month', 'is_holiday', 'avg_units_sold_30d', 'avg_customers_30d']
Low-card categorical: ['category', 'brand', 'store_location', 'promotion_type', 'day_of_week', 'season']
High-card categorical: ['product_id', 'store_id']


In [None]:
# Final safety cleanup for ALL categorical features
cat_features = high_card_cat + low_card_cat

for col in cat_features:
    X_train[col] = X_train[col].astype(str).fillna("No_Promotion")
    X_test[col] = X_test[col].astype(str).fillna("No_Promotion")


In [11]:
X_train[cat_features].isna().sum()


product_id           0
store_id             0
category             0
brand                0
store_location       0
promotion_type    9620
day_of_week          0
season               0
dtype: int64

In [12]:
# Explicit, domain-correct fix
X_train["promotion_type"] = (
    X_train["promotion_type"]
    .fillna("No_Promotion")
    .astype(str)
)

X_test["promotion_type"] = (
    X_test["promotion_type"]
    .fillna("No_Promotion")
    .astype(str)
)

# Safety: ensure all other categorical columns are strings
for col in high_card_cat + low_card_cat:
    X_train[col] = X_train[col].astype(str)
    X_test[col] = X_test[col].astype(str)

In [13]:
X_train[cat_features].isna().sum()

product_id        0
store_id          0
category          0
brand             0
store_location    0
promotion_type    0
day_of_week       0
season            0
dtype: int64

In [14]:
from catboost import CatBoostRegressor

cat_features = high_card_cat + low_card_cat

cat_model_tuned = CatBoostRegressor(
    iterations=2000,              # allow many trees
    learning_rate=0.03,           # smaller LR = better learning
    depth=6,                      # less overfitting
    l2_leaf_reg=5,                # regularization
    loss_function="RMSE",
    eval_metric="RMSE",
    random_seed=42,
    early_stopping_rounds=100,    # VERY important
    verbose=100
)

cat_model_tuned.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_test, y_test),
    use_best_model=True
)


0:	learn: 0.2234038	test: 0.2256535	best: 0.2256535 (0)	total: 211ms	remaining: 7m 2s
100:	learn: 0.1081897	test: 0.1085160	best: 0.1085160 (100)	total: 4.13s	remaining: 1m 17s
200:	learn: 0.1070777	test: 0.1074293	best: 0.1074293 (200)	total: 7.76s	remaining: 1m 9s
300:	learn: 0.1067506	test: 0.1073689	best: 0.1073686 (298)	total: 11.7s	remaining: 1m 5s
400:	learn: 0.1064229	test: 0.1073315	best: 0.1073290 (377)	total: 15.8s	remaining: 1m 3s
500:	learn: 0.1060395	test: 0.1073291	best: 0.1073263 (460)	total: 20.8s	remaining: 1m 2s
600:	learn: 0.1056738	test: 0.1073200	best: 0.1073067 (576)	total: 25.5s	remaining: 59.3s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.1073067456
bestIteration = 576

Shrink model to first 577 iterations.


<catboost.core.CatBoostRegressor at 0x237be7a9910>

In [15]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

y_pred = cat_model_tuned.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R2: {r2:.4f}")


RMSE: 0.1073
R2: 0.7837


In [16]:
comparison_df = pd.DataFrame({
    "y_actual": y_test.values,
    "y_predicted": y_pred
})

# Optional: absolute error
comparison_df["error"] = comparison_df["y_actual"] - comparison_df["y_predicted"]
comparison_df["abs_error"] = comparison_df["error"].abs()
comparison_df.head(10)

Unnamed: 0,y_actual,y_predicted,error,abs_error
0,0.24,0.241893,-0.001893,0.001893
1,-0.061,0.133985,-0.194985,0.194985
2,0.226,0.345667,-0.119667,0.119667
3,0.463,0.347609,0.115391,0.115391
4,0.383,0.35177,0.03123,0.03123
5,0.069,0.216345,-0.147345,0.147345
6,0.092,0.162332,-0.070332,0.070332
7,0.051,0.181052,-0.130052,0.130052
8,0.425,0.34635,0.07865,0.07865
9,0.342,0.348747,-0.006747,0.006747


In [None]:
import mlflow
import mlflow.catboost

mlflow.set_experiment("profit_prediction_experiment")

with mlflow.start_run(run_name="CatBoost_Regressor"):

        # Log metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("best_iteration", cat_model_tuned.get_best_iteration()) 

       # Log parameters
    params = cat_model_tuned.get_params()
    for key, value in params.items():
        mlflow.log_param(key, value)

        # Feature importance
    feature_importance = pd.DataFrame({
        "feature": X_train.columns,
        "importance": cat_model_tuned.get_feature_importance()
    }).sort_values(by="importance", ascending=False)

    feature_importance.to_csv("feature_importance_catboost.csv", index=False)
    mlflow.log_artifact("feature_importance_catboost.csv")

    
    # Log model
    mlflow.catboost.log_model(
        cat_model_tuned,
        artifact_path="model"
    )
    print("✅ CatBoost logged to MLflow successfully")

In [18]:
import mlflow
import mlflow.catboost
import pandas as pd

mlflow.set_experiment("Profit_Prediction_Experiment")

MODEL_NAME = "ProfitPrediction_CatBoost"

with mlflow.start_run(run_name="CatBoost_Regressor"):

    # ===== 1. Log metrics =====
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("best_iteration", cat_model_tuned.get_best_iteration())

    # ===== 2. Log parameters =====
    params = cat_model_tuned.get_params()
    for key, value in params.items():
        mlflow.log_param(key, value)

    # ===== 3. Feature importance =====
    feature_importance = pd.DataFrame({
        "feature": X_train.columns,
        "importance": cat_model_tuned.get_feature_importance()
    }).sort_values(by="importance", ascending=False)

    feature_importance.to_csv("feature_importance_catboost.csv", index=False)
    mlflow.log_artifact("feature_importance_catboost.csv")

    # ===== 4. Log + REGISTER model =====
    mlflow.catboost.log_model(
        cat_model_tuned,
        artifact_path="model",
        registered_model_name=MODEL_NAME
    )

    # ===== 5. Add model description =====
    client = mlflow.tracking.MlflowClient()

    latest_version = client.get_latest_versions(
        MODEL_NAME, stages=["None"]
    )[0].version

    description = f"""
    CatBoost Regressor for profit prediction.

    - Handles categorical features natively (no one-hot or scaling required)
    - Trained with early stopping to prevent overfitting
    - Best iteration: {cat_model_tuned.get_best_iteration()}
    - RMSE: {rmse:.4f}
    - R²: {r2:.4f}

    This model significantly outperforms previous baselines
    (Linear Regression and Gradient Boosting) and is registered
    as a candidate model for further evaluation.
    """

    client.update_model_version(
        name=MODEL_NAME,
        version=latest_version,
        description=description
    )

    print(f"✅ CatBoost logged and registered as {MODEL_NAME}, version {latest_version}")




✅ CatBoost logged and registered as ProfitPrediction_CatBoost, version 2


Registered model 'ProfitPrediction_CatBoost' already exists. Creating a new version of this model...
Created version '2' of model 'ProfitPrediction_CatBoost'.
  latest_version = client.get_latest_versions(


In [20]:
feature_importance.head(10)



Unnamed: 0,feature,importance
6,discount_rate,86.638309
7,promotion_type,7.410392
1,category,0.91026
2,brand,0.817871
4,store_location,0.594394
5,base_price,0.533612
3,store_id,0.48594
10,day_of_week,0.478768
11,season,0.475016
0,product_id,0.472557


In [21]:

feature_importance.tail(10)


Unnamed: 0,feature,importance
5,base_price,0.533612
3,store_id,0.48594
10,day_of_week,0.478768
11,season,0.475016
0,product_id,0.472557
8,day_of_year,0.353773
14,avg_customers_30d,0.332833
13,avg_units_sold_30d,0.322954
9,month,0.089963
12,is_holiday,0.083356
