In [1]:
# 1. Importing libraries

from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# 2. Loading cleaned dataset

# Read the cleaned product table through a named path, then preview the
# first rows as this cell's displayed output.
csv_path = '../data/cleaned_data_final.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,discount_percent
0,"Redmi 10 Power (Power Black, 8GB RAM, 128GB St...","tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/I/81eM15lVcJ...,https://www.amazon.in/Redmi-Power-Black-128GB-...,4.0,965,10999.0,18999.0,42.11
1,"OnePlus Nord CE 2 Lite 5G (Blue Tide, 6GB RAM,...","tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/I/71AvQd3Vzq...,https://www.amazon.in/OnePlus-Nord-Lite-128GB-...,4.3,113956,18999.0,19999.0,5.0
2,OnePlus Bullets Z2 Bluetooth Wireless in Ear E...,"tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/I/51UhwaQXCp...,https://www.amazon.in/Oneplus-Bluetooth-Wirele...,4.2,90304,1999.0,2299.0,13.05
3,"Samsung Galaxy M33 5G (Mystique Green, 6GB, 12...","tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/I/81I3w4J6yj...,https://www.amazon.in/Samsung-Mystique-Storage...,4.1,24863,15999.0,24999.0,36.0
4,"OnePlus Nord CE 2 Lite 5G (Black Dusk, 6GB RAM...","tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/I/71V--WZVUI...,https://www.amazon.in/OnePlus-Nord-Black-128GB...,4.3,113956,18999.0,19999.0,5.0


In [3]:
# 3. Feature engineering

# Create new features based on product name.
# A missing name is treated as an empty string, so it yields length 0 and
# keyword flags of 0 instead of raising in len() or in the cast to int.
product_names = df['name'].fillna('')
df['name_length'] = product_names.str.len()
df['keyword_boat'] = product_names.str.contains('boAt', case=False).astype(int)
df['keyword_amazon'] = product_names.str.contains('Amazon', case=False).astype(int)

# Define feature set and target.
# NOTE(review): in the df.head() preview, discount_price is approximately
# actual_price * (1 - discount_percent / 100), so these two inputs may leak
# the target almost exactly -- confirm that this is intended before relying
# on the reported error metrics.
feature_columns = [
    'ratings', 'no_of_ratings', 'actual_price', 'discount_percent',
    'name_length', 'keyword_boat', 'keyword_amazon',
]
features = df[feature_columns]
target = df['discount_price']

In [4]:
# 4. Splitting dataset into training and testing sets

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [5]:
# 5. Training baseline models

# Both baselines use the same ensemble size and seed.
baseline_params = {"n_estimators": 100, "random_state": 42}

# Random Forest Regressor
rf_model = RandomForestRegressor(**baseline_params)
rf_model.fit(X_train, y_train)

# Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(**baseline_params)
# Kept as the cell's final bare expression so the fitted estimator is still
# what the cell displays.
gb_model.fit(X_train, y_train)

In [7]:
# 6. Evaluation

def evaluate(model, X_test, y_test, name):
    """Print the RMSE and MAE of a fitted model on held-out data.

    Args:
        model: Object exposing ``predict(X_test)``.
        X_test: Feature rows passed to ``model.predict``.
        y_test: True target values compared against the predictions.
        name: Label printed as the heading of the report.

    Returns:
        None. The metrics are only printed, formatted to two decimals.
    """
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    print(f"{name} Performance:")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE : {mae:.2f}\n")

# Report both baselines on the same held-out split, in the original order.
for label, fitted_model in (
    ("Random Forest", rf_model),
    ("Gradient Boosting", gb_model),
):
    evaluate(fitted_model, X_test, y_test, label)

Random Forest Performance:
RMSE: 439.70
MAE : 81.11

Gradient Boosting Performance:
RMSE: 556.17
MAE : 169.70



In [8]:
# 7. Saving the best model

# Persist the random forest (lower RMSE and MAE than gradient boosting in the
# evaluation output). The target folder is created first because joblib.dump
# does not create missing directories, so a checkout without ../models would
# otherwise fail here.
model_path = "../models/random_forest_model.joblib"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(rf_model, model_path)

['../models/random_forest_model.joblib']

In [None]:
!streamlit run "/Users/macbookpro/Library/CloudStorage/OneDrive-Personal/Work/Projets Solo/Projet 2/app/price_prediction.py"