# Product Price Prediction Pipeline
This notebook loads scraped product data, performs feature engineering, trains a regression model, and saves it for use in Flask/Streamlit.

In [38]:
%pip install scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib





[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [39]:
# Read the scraped product table into a DataFrame.
# NOTE(review): the preview output includes an 'Unnamed: 0' column, presumably
# an index that was written out with the CSV -- confirm whether it is needed.
df = pd.read_csv(filepath_or_buffer='products.csv')

# Show the first five rows as the cell's output.
df.head(n=5)

Unnamed: 0,id,title,price,description,category,image,rating
0,1,"Fjallraven - Foldsack No. 1 Backpack, Fits 15 ...",109.95,Your perfect pack for everyday use and walks i...,men's clothing,https://fakestoreapi.com/img/81fPKd-2AYL._AC_S...,"{'rate': 3.9, 'count': 120}"
1,2,Mens Casual Premium Slim Fit T-Shirts,22.3,"Slim-fitting style, contrast raglan long sleev...",men's clothing,https://fakestoreapi.com/img/71-3HjGNDUL._AC_S...,"{'rate': 4.1, 'count': 259}"
2,3,Mens Cotton Jacket,55.99,great outerwear jackets for Spring/Autumn/Wint...,men's clothing,https://fakestoreapi.com/img/71li-ujtlUL._AC_U...,"{'rate': 4.7, 'count': 500}"
3,4,Mens Casual Slim Fit,15.99,The color could be slightly different between ...,men's clothing,https://fakestoreapi.com/img/71YXzeOuslL._AC_U...,"{'rate': 2.1, 'count': 430}"
4,5,John Hardy Women's Legends Naga Gold & Silver ...,695.0,"From our Legends Collection, the Naga was insp...",jewelery,https://fakestoreapi.com/img/71pWzhdJNwL._AC_U...,"{'rate': 4.6, 'count': 400}"


In [40]:
# Feature engineering for all 10 features
import ast
def extract_rate(x):
    """Pull the ``rate`` value out of a raw rating cell.

    A rating cell may be a dict such as ``{'rate': 3.9, 'count': 120}`` or the
    string form of such a dict (as seen in the loaded CSV preview).

    Args:
        x: Raw rating value; may be a dict, a string, a number, or anything else.

    Returns:
        ``x['rate']`` when ``x`` is a dict containing ``'rate'``; the ``'rate'``
        entry of the parsed dict when ``x`` is a string that evaluates to such
        a dict; otherwise ``x`` itself, unchanged.
    """
    if isinstance(x, dict) and 'rate' in x:
        return x['rate']
    if isinstance(x, str):
        try:
            d = ast.literal_eval(x)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Best-effort parse: only the errors literal_eval raises for
            # malformed input mean "not a rating dict". A bare ``except`` would
            # also swallow KeyboardInterrupt/SystemExit.
            return x
        if isinstance(d, dict) and 'rate' in d:
            return d['rate']
    return x
# Unwrap the rating payload, then force the column to be numeric.
# extract_rate returns anything it cannot parse unchanged, so without the
# coercion stray strings/dicts would flow straight into the feature matrix.
# Entries that cannot be converted become NaN.
if 'rating' in df.columns:
    df['rating'] = pd.to_numeric(df['rating'].apply(extract_rate), errors='coerce')

# Placeholder values for features the loaded data does not carry
# (demo only; real data should provide these columns).
PLACEHOLDER_FEATURES = {
    'brand': 'premium',
    'reviews': 100,
    'shipping': 'standard',
    'seller': 'high',
    'competition': 'medium',
    'demand': 'regular',
    'productAge': 'recent',
    'stock': 'medium',
}
for col, default in PLACEHOLDER_FEATURES.items():
    if col not in df.columns:
        df[col] = default

# Encode categorical columns as integer category codes.
# NOTE(review): the label -> code mapping is derived from the values present in
# this frame and is not stored anywhere in this cell; whatever consumes the
# model presumably has to reproduce the same mapping -- confirm.
CATEGORICAL_COLUMNS = [
    'category', 'brand', 'shipping', 'seller',
    'competition', 'demand', 'productAge', 'stock',
]
for col in CATEGORICAL_COLUMNS:
    df[col] = df[col].astype('category').cat.codes

# Ensure reviews is numeric; values that fail to convert are replaced by 0.
df['reviews'] = pd.to_numeric(df['reviews'], errors='coerce').fillna(0)

# Final feature set (order must match backend)
FEATURE_COLUMNS = [
    'category', 'brand', 'rating', 'reviews', 'shipping',
    'seller', 'competition', 'demand', 'productAge', 'stock',
]
X = df[FEATURE_COLUMNS]
y = df['price']

In [41]:
# Train/test split: hold out 20% of the rows; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest, seeded so repeated runs build the same model.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
print('Model trained!')

# Use the held-out split that was previously created but never consulted:
# report R^2 on unseen rows so the model's quality is visible before saving.
print(f'Held-out R^2: {model.score(X_test, y_test):.3f}')

Model trained!


In [42]:
# Persist the fitted estimator to disk with joblib so it can be loaded later.
joblib.dump(value=model, filename='price_model.joblib')

# Confirm where the artifact was written.
print('Model saved as price_model.joblib')

Model saved as price_model.joblib
