In [22]:
# pandas is imported here as well so this cell runs on a fresh kernel; the main
# import cell sits further down the notebook and would otherwise have to be
# executed first (hidden-state dependency).
import pandas as pd

# Quick look at the raw schema before any cleaning or feature selection.
df = pd.read_csv("data/dataset.csv")
print(df.columns)


Index(['brand', 'model', 'year', 'age', 'mileage', 'mileage_per_year',
       'engine_size', 'fuel_type', 'transmission', 'condition',
       'condition_score', 'owner_count', 'color', 'body_type', 'is_luxury',
       'mpg_estimate', 'price_usd', 'price_local', 'price_category',
       'market_segment', 'depreciation_rate', 'value_retention',
       'insurance_estimate', 'power_to_weight_ratio', 'regional_factor',
       'source'],
      dtype='object')


In [28]:
# -----------------------------
# 1️⃣ Import Libraries
# -----------------------------
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# -----------------------------
# 2️⃣ Load Dataset
# -----------------------------
df = pd.read_csv("data/dataset.csv")  # Replace with your dataset path

# -----------------------------
# 3️⃣ Data Cleaning
# -----------------------------
# Coerce numeric columns (unparseable entries become NaN).
# Missing *feature* values are filled with 0. The target ('price_local') is
# deliberately NOT filled: a fabricated price of 0 would be learned by the
# model as if it were a real label.
numeric_cols = ['year', 'age', 'mileage', 'mileage_per_year', 'engine_size', 'owner_count', 'price_local']
for col in numeric_cols:
    coerced = pd.to_numeric(df[col], errors='coerce')
    df[col] = coerced if col == 'price_local' else coerced.fillna(0)

# Rows without a usable price carry no label to learn from, so drop them.
# When no price is missing this is a no-op and the frame is unchanged.
df = df.dropna(subset=['price_local']).reset_index(drop=True)

# Fill missing categorical values with 'Unknown' and force a string dtype so
# every category can be label-encoded later.
cat_cols = ['brand', 'model', 'fuel_type', 'transmission', 'condition', 'color', 'body_type']
for col in cat_cols:
    df[col] = df[col].fillna('Unknown').astype(str)

# -----------------------------
# 4️⃣ Select Features (Reduced Set)
# -----------------------------
# Column the model learns to predict.
target = 'price_local'

# Model inputs: the categorical columns first, then the numeric ones.
features = [
    'brand', 'model', 'fuel_type', 'transmission', 'condition', 'color', 'body_type',
    'year', 'age', 'mileage', 'mileage_per_year', 'engine_size', 'owner_count',
]

# Independent copy, so the encoding below never touches the cleaned frame.
df_model = df.loc[:, features + [target]].copy()

# -----------------------------
# 5️⃣ Encode Categorical Columns
# -----------------------------
# One fitted LabelEncoder per categorical column, keyed by column name.
encoders = {name: LabelEncoder().fit(df_model[name]) for name in cat_cols}

# Replace each categorical column with its integer codes.
for name, fitted_encoder in encoders.items():
    df_model[name] = fitted_encoder.transform(df_model[name])

# Persist the encoders so the same mapping can be reused at prediction time.
joblib.dump(encoders, "label_encoders.pkl")

# -----------------------------
# 6️⃣ Train/Test Split
# -----------------------------
# Separate the label from the inputs.
y = df_model[target]
X = df_model.drop(columns=[target])

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# -----------------------------
# 7️⃣ Train XGBoost Model
# -----------------------------
xg = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
xg.fit(X_train, y_train)

# Persist the fitted model for later predictions.
joblib.dump(xg, "car_price_xgb_model.pkl")

# -----------------------------
# 8️⃣ Predict & Evaluate
# -----------------------------
y_pred = xg.predict(X_test)

# RMSE is taken as the square root of the MSE via numpy, which works
# regardless of the installed sklearn version.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("XGBoost -> RMSE:", round(rmse, 0), "R2:", round(r2, 4))

# -----------------------------
# 9️⃣ Save Processed Dataset
# -----------------------------
# Encoded features plus target, written without the index column.
df_model.to_csv("processed_car_dataset.csv", index=False)

# -----------------------------
# 🔟 Function to Predict New Car Price
# -----------------------------
def predict_car_price(car_details: dict):
    """Predict the price of a single car with the saved XGBoost model.

    The input is normalised the same way the training data was before it is
    encoded: categorical values that are missing become ``'Unknown'`` and are
    cast to ``str``; numeric values are coerced with ``pd.to_numeric`` and
    anything unparseable or missing becomes 0.

    Parameters
    ----------
    car_details : dict
        One value for each of the training features: 'brand', 'model',
        'fuel_type', 'transmission', 'condition', 'color', 'body_type',
        'year', 'age', 'mileage', 'mileage_per_year', 'engine_size',
        'owner_count'. Extra keys are ignored.

    Returns
    -------
    The model's prediction for this car, rounded to zero decimals.

    Raises
    ------
    KeyError
        If one or more required features are absent from ``car_details``.
    ValueError
        If a categorical value is not among the classes the matching encoder
        was fitted on.
    """
    # Column order must match the order used when the model was trained.
    feature_order = ['brand', 'model', 'fuel_type', 'transmission', 'condition', 'color',
                     'body_type', 'year', 'age', 'mileage', 'mileage_per_year',
                     'engine_size', 'owner_count']

    # Fail early with a readable message instead of an opaque pandas KeyError.
    missing = [name for name in feature_order if name not in car_details]
    if missing:
        raise KeyError(f"Missing feature(s) in car_details: {missing}")

    df_input = pd.DataFrame([car_details])

    # Encode categorical columns after the same normalisation as training
    # (missing -> 'Unknown', everything cast to str).
    for col in cat_cols:
        raw_value = car_details[col]
        label = 'Unknown' if pd.isna(raw_value) else str(raw_value)
        if label not in encoders[col].classes_:
            raise ValueError(f"Unknown label '{label}' in column '{col}'")
        df_input[col] = encoders[col].transform([label])

    # Coerce the remaining (numeric) features exactly as the training data was.
    for col in feature_order:
        if col not in cat_cols:
            df_input[col] = pd.to_numeric(df_input[col], errors='coerce').fillna(0)

    # Ensure columns are in the same order as training
    df_input = df_input[feature_order]

    # NOTE: joblib.load unpickles the file; only load model files you trust.
    model = joblib.load("car_price_xgb_model.pkl")
    predicted_price = model.predict(df_input)[0]
    return round(predicted_price, 0)


# -----------------------------
# 1️⃣1️⃣ Test Prediction
# -----------------------------
# Example listing used as a smoke test for the prediction helper.
sample_car = {
    # categorical attributes
    'brand': 'Audi',
    'model': 'A3',
    'fuel_type': 'Diesel',
    'transmission': 'Automatic',
    'condition': 'Good',
    'color': 'Red',
    'body_type': 'Hatchback',
    # numeric attributes
    'year': 2016,
    'age': 8,
    'mileage': 135798,
    'mileage_per_year': 16974.75,
    'engine_size': 2.0,
    'owner_count': 1,
}

pred_price = predict_car_price(sample_car)
print(f"Predicted Price (INR): ₹{pred_price:,}")


XGBoost -> RMSE: 5259.0 R2: 0.9461
Predicted Price (INR): ₹28,175.0
