In [2]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Read the cleaned used-car listings into a DataFrame and preview the first rows.
DATA_PATH = "../data/used_cars_data_cleaned.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Brand_Name,Model_Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti,Wagon R LXI CNG,Mumbai,2010,72000.0,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75
1,Hyundai,Creta 1.6 CRDi SX Option,Pune,2015,41000.0,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Honda,Jazz V,Chennai,2011,46000.0,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Maruti,Ertiga VDI,Chennai,2012,87000.0,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Audi,A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670.0,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74


### Feature Engineering (row-wise transforms, safe to apply before the split)

In [3]:
# Separate the target from the predictors.
# "Unnamed: 0" looks like a row index that was written into the CSV (it counts
# 0, 1, 2, ... in the preview). The training cell treats every non-categorical
# column as numeric, so leaving it in would feed the row number to the model as
# a feature. errors="ignore" keeps this cell working if the CSV is ever
# re-exported without that column.
y = df["Price"]
X = df.drop(columns=["Price"]).drop(columns=["Unnamed: 0"], errors="ignore")

# Car age relative to a fixed reference year (a constant, so re-runs are reproducible).
CURRENT_YEAR = 2020
X["Car_Age"] = CURRENT_YEAR - X["Year"]

# Log-transform the wide-ranging numeric columns; log1p keeps zero values finite.
for col in ["Kilometers_Driven", "Engine", "Power"]:
    X[f"{col}_log"] = np.log1p(X[col])

# The raw columns are replaced by their derived versions.
X = X.drop(columns=["Year", "Kilometers_Driven", "Engine", "Power"])

# Ordinal encoding of ownership history. Labels missing from the map become NaN.
owner_map = {"First": 1, "Second": 2, "Third": 3, "Fourth & Above": 4}
X["Owner_Type"] = X["Owner_Type"].map(owner_map)

# Interaction feature: ratio of log-power to log-displacement.
# A zero Engine value gives Engine_log == 0 and therefore +/-inf here; those are
# turned into NaN so downstream imputation can handle them instead of the
# estimator rejecting non-finite input.
power_ratio = X["Power_log"] / X["Engine_log"]
X["Power_per_CC"] = power_ratio.replace([np.inf, -np.inf], np.nan)


### Training and Testing the model

In [4]:
# Hold out 20% of the rows before any transformer is fitted, so the imputers and
# the encoder below only ever learn from the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Categorical columns are listed explicitly; every other column is handled as numeric.
cat_cols = ["Brand_Name", "Model_Name", "Fuel_Type", "Transmission", "Location"]
num_cols = X_train.columns[~X_train.columns.isin(cat_cols)].tolist()

# Numeric branch: fill missing values with the median.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical branch: fill missing values with the most frequent label, then
# one-hot encode. handle_unknown="ignore" lets prediction proceed on categories
# that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

model = RandomForestRegressor(n_estimators=400, n_jobs=-1, random_state=42)

# Preprocessing and estimator are chained so both are fitted (and later saved) together.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out rows.
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Random Forest Pipeline Results")
print(f"MAE  : {mae:.3f}")
print(f"RMSE : {rmse:.3f}")
print(f"R2   : {r2:.3f}")


Random Forest Pipeline Results
MAE  : 1.545
RMSE : 4.293
R2   : 0.862


### ✅ Save the Model

In [5]:
# Persist the fitted preprocessing + model pipeline as a single artifact.
# The target directory is created first so the save also works on a fresh
# checkout where ../models does not exist yet.
# Note: the pipeline was fitted on the engineered feature frame, so inputs given
# to the reloaded model need the same feature-engineering steps applied first.
MODEL_PATH = Path("../models/rf_used_car_price_model.pkl")
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(pipeline, MODEL_PATH)
print("Model saved successfully!")


Model saved successfully!
