In [43]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import FunctionTransformer

In [69]:
# Read the synthetic dataset and prepare it in a single chain:
#   * missing 'Weight (g/ml)' entries are replaced with 0
#   * the 'Final Price' column is removed before any modelling
data = (
    pd.read_csv('synthetic_dataset3.csv')
    .assign(**{'Weight (g/ml)': lambda frame: frame['Weight (g/ml)'].fillna(0)})
    .drop(columns=['Final Price'])
)

In [70]:
# Preview the first rows of the prepared frame (head() with no argument shows five rows).
data.head()

Unnamed: 0,Product Name,Category,Location,MRP,Blinkit Price,Zepto Price,Instamart Price,Discount (%),Margin (%),Festive/Seasonal Impact,...,Min Stock,Max Stock,Customer Sentiment,Weight (g/ml),Weight Unit,Order Year,Order Month,Order Day,Order Hour,Order Time Category
0,17,3,1,0.333333,0.307714,0.306157,0.305886,13,0.85,3,...,0.025,0.12234,0,1.0,2,0.0,0.181818,0.8,0.913043,3
1,15,3,4,0.722222,0.726021,0.727159,0.727268,5,0.65,0,...,0.0,0.207447,1,0.249249,2,0.0,0.727273,0.9,0.130435,3
2,3,2,2,0.227778,0.227216,0.233157,0.225761,9,0.6,4,...,0.05,0.994681,0,0.0,1,0.0,0.454545,0.866667,0.434783,2
3,8,2,1,0.372222,0.375526,0.378305,0.376116,7,0.55,2,...,0.05,0.904255,1,1.0,2,0.0,0.181818,0.933333,0.521739,0
4,16,5,1,0.15,0.154113,0.148331,0.149332,12,0.55,5,...,0.425,0.962766,0,1.0,2,0.0,0.0,0.9,0.73913,1


In [71]:
# 'Discount (%)' is the regression target; every other column of `data`
# is kept as an input feature.
y = data['Discount (%)']
X = data.drop(columns=['Discount (%)'])

In [72]:
# Hold out 20% of the rows for evaluation and train on the remaining 80%.
# The fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [73]:
# Configure the XGBoost regressor; it is only constructed here, not fitted.
model = xgb.XGBRegressor(
    n_estimators=100,              # boosting rounds, i.e. number of trees
    learning_rate=0.1,             # step size applied at each boosting round
    objective='reg:squarederror',  # squared-error loss for regression
    random_state=42,               # fixed seed
)

In [74]:
# Fit the regressor on the training split: X_train holds the feature columns,
# y_train the matching 'Discount (%)' targets.
model.fit(X_train, y_train)


In [75]:
# Predict discount values for the held-out test features; y_pred is compared
# against y_test in the cells that follow.
y_pred = model.predict(X_test)

In [76]:
# Show the predicted discounts for the test rows beneath a heading line.
print("Predictions on Test Data:", y_pred, sep="\n")

Predictions on Test Data:
[17.835928 14.008185 13.638787 ... 18.66929  10.823193  9.907915]


In [77]:
# Side-by-side table of true and predicted discounts, indexed like y_test,
# followed by a preview of its first rows.
results = y_test.to_frame(name='Actual Discount (%)').assign(
    **{'Predicted Discount (%)': y_pred}
)
print("\nComparison of Actual and Predicted Discounts:", results.head(), sep="\n")


Comparison of Actual and Predicted Discounts:
      Actual Discount (%)  Predicted Discount (%)
6252                   18               17.835928
4684                   13               14.008185
1731                   14               13.638787
4742                   10                9.113073
4521                   10                9.900046


In [78]:
# Score the held-out predictions with mean squared error and the
# coefficient of determination (R²), then report both.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("\nMean Squared Error:", mse)
print("R² Score:", r2)


Mean Squared Error: 0.5539786554937859
R² Score: 0.9736069300155528


In [79]:
# Count the missing values in each column of the prepared frame.
data.isnull().sum()

Product Name               0
Category                   0
Location                   0
MRP                        0
Blinkit Price              0
Zepto Price                0
Instamart Price            0
Discount (%)               0
Margin (%)                 0
Festive/Seasonal Impact    0
Delivery Distance (km)     0
Shelf Life (days)          0
Min Stock                  0
Max Stock                  0
Customer Sentiment         0
Weight (g/ml)              0
Weight Unit                0
Order Year                 0
Order Month                0
Order Day                  0
Order Hour                 0
Order Time Category        0
dtype: int64

In [67]:
# List the column labels of `data`.
# NOTE(review): this cell's execution count (67) is lower than the load cell's (69),
# and its saved output still lists 'Final Price', which the load cell drops.
# The output is stale; re-run after loading (Restart & Run All) to refresh it.
data.columns

Index(['Product Name', 'Category', 'Location', 'MRP', 'Blinkit Price',
       'Zepto Price', 'Instamart Price', 'Discount (%)', 'Margin (%)',
       'Festive/Seasonal Impact', 'Delivery Distance (km)',
       'Shelf Life (days)', 'Min Stock', 'Max Stock', 'Final Price',
       'Customer Sentiment', 'Weight (g/ml)', 'Weight Unit', 'Order Year',
       'Order Month', 'Order Day', 'Order Hour', 'Order Time Category'],
      dtype='object')