# In [2]:
# backend_price_estimator.py

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import joblib

# ---------------- Step 1: Load Data ----------------
# Read car_data.csv from the working directory into a DataFrame, decoding the
# file as latin1.
df = pd.read_csv("car_data.csv", encoding="latin1")

# ---------------- Step 2: Clean Price Column ----------------
def clean_price(price):
    """Convert a raw price cell into a float.

    Dollar signs, commas and surrounding whitespace are stripped first.
    A value that still contains a hyphen is treated as a ``low-high`` range
    and collapsed to the mean of its first two parts.

    Args:
        price: Raw cell value (string, number, or missing).

    Returns:
        The parsed price as a float, or ``np.nan`` when the value is missing
        or cannot be parsed as a number / range.
    """
    if pd.isnull(price):
        return np.nan
    text = str(price).replace("$", "").replace(",", "").strip()
    try:
        if "-" in text:  # Handle range e.g. 12000-15000
            # split() yields at least two parts whenever "-" is present.
            low, high = text.split("-")[:2]
            return (float(low) + float(high)) / 2
        return float(text)
    except ValueError:
        # Only unparsable text maps to NaN; anything else (including
        # KeyboardInterrupt) is allowed to propagate instead of being hidden.
        return np.nan

# Overwrite the raw "Cars Prices" cells with the value clean_price returns
# for each one (a float, or NaN when the cell is missing or unparsable).
df["Cars Prices"] = df["Cars Prices"].apply(clean_price)

# ---------------- Step 3: Clean Numeric Features ----------------
# A comma that sits between digits and is followed by exactly three digits,
# i.e. a thousands separator as in "1,200" or "1,200,000".
_THOUSANDS_SEP_RE = re.compile(r"(?<=\d),(?=\d{3}(?!\d))")
# A well-formed unsigned decimal: "12", "12.5" or ".5" (never a lone ".").
_NUMBER_RE = re.compile(r"\d+(?:\.\d+)?|\.\d+")


def extract_number(value):
    """Return the first number found in *value* as a float.

    Thousands separators are removed before searching, so ``"1,200 cc"``
    yields ``1200.0`` rather than ``1.0``.  Only well-formed decimals are
    matched, so text made of stray dots (``"N.A."``) yields ``np.nan``
    instead of raising ``ValueError`` from ``float(".")``.

    Args:
        value: Raw cell value (string, number, or missing).

    Returns:
        The first number in the text as a float, or ``np.nan`` when the value
        is missing or contains no number.
    """
    if pd.isnull(value):
        return np.nan
    text = _THOUSANDS_SEP_RE.sub("", str(value))
    match = _NUMBER_RE.search(text)
    if match is None:
        return np.nan
    return float(match.group())

# Free-text spec columns: keep only the number extract_number finds in each.
for spec_col in (
    "CC/Battery Capacity",
    "HorsePower",
    "Total Speed",
    "Performance(0 - 100 )KM/H",
    "Torque",
):
    df[spec_col] = df[spec_col].apply(extract_number)
# Seats is converted directly; values that are not numeric become NaN.
df["Seats"] = pd.to_numeric(df["Seats"], errors='coerce')

# ---------------- Step 4: Drop NaNs ----------------
# Discard every row that has a missing value in any column.
df.dropna(inplace=True)

# ---------------- Step 5: Encode Categorical Columns ----------------
# One LabelEncoder per categorical column, kept by column name so the fitted
# encoders can be saved alongside the model.
label_encoders = {
    col: LabelEncoder()
    for col in ["Company Names", "Cars Names", "Engines", "Fuel Types"]
}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# ---------------- Step 6: Define Features & Target ----------------
target_col = "Cars Prices"
y = df[target_col]
X = df.drop(columns=[target_col])

# ---------------- Step 7: Train-Test Split ----------------
# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---------------- Step 8: Train Model ----------------
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# ---------------- Step 9: Save Model & Encoders ----------------
# Persist the fitted model, the encoders and the feature column order.
for artifact, artifact_path in (
    (model, "car_price_model.pkl"),
    (label_encoders, "car_price_encoders.pkl"),
    (list(X.columns), "car_price_features.pkl"),
):
    joblib.dump(artifact, artifact_path)

print("✅ Model training completed & files saved!")


# Output: ✅ Model training completed & files saved!
