In [19]:
import pandas as pd, numpy as np, joblib, pathlib, random
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
print("✅ Libraries ready")


✅ Libraries ready


In [20]:
RAW_COLS = {
    'name'           : 'name',
    'year'           : 'year',
    'selling_price'  : 'selling_price',
    'km_driven'      : 'km_driven',
    'fuel'           : 'fuel',
    'seller_type'    : 'seller_type',
    'transmission'   : 'transmission',
    'owner'          : 'owner'
}

df = pd.read_csv('Cardetails.csv')[list(RAW_COLS.values())].copy()
print("Rows:", len(df))
df.head()


Rows: 8128


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner


In [21]:
# Brand = first word, Model = name without brand
df['brand']  = df['name'].str.split().str[0]
df['model']  = df.apply(lambda r: r['name'].replace(r['brand']+' ', '', 1), axis=1)

# We keep only the 8 desired features + target
keep = ['brand','model','year','km_driven','fuel',
        'seller_type','transmission','owner','selling_price']
df = df[keep].dropna()

X = df.drop('selling_price', axis=1)
y = df['selling_price']


In [22]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor

cat_cols = ['brand','model','fuel','seller_type','transmission','owner']
num_cols = ['year','km_driven']

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
        ('num', 'passthrough', num_cols)
    ],
    sparse_threshold=0          # <‑‑ force dense matrix
)

hgb  = HistGradientBoostingRegressor(random_state=42)

pipe = Pipeline([
    ('prep', preprocess),
    ('reg',  hgb)
])


In [None]:
param_grid = {
    'reg__learning_rate':  np.linspace(0.02, 0.2, 10),
    'reg__max_depth':      [None, 4, 6, 8],
    'reg__l2_regularization':[0.0, 0.1, 0.5, 1.0]
}
search = RandomizedSearchCV(
    pipe, param_grid, n_iter=20, scoring='neg_mean_absolute_error',
    cv=3, random_state=42, n_jobs=-1
).fit(X, y)

print("Best MAE (CV):",
      abs(search.best_score_).round(0),
      "\nBest params:", search.best_params_)
best_pipe = search.best_estimator_


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
best_pipe.fit(X_train, y_train)
pred = best_pipe.predict(X_test)
print("MAE : ₹{:,.0f}".format(mean_absolute_error(y_test, pred)))
print("R²  : {:.3f}".format(r2_score(y_test, pred)))


MAE : ₹127,979
R²  : 0.911


In [None]:
import joblib
from pathlib import Path

# Make sure the folder exists (capital M, as in your tree)
Path("Model").mkdir(exist_ok=True)

# Save the trained pipeline (best_pipe) as car_model.pkl
joblib.dump(best_pipe, "Model/car_model.pkl")

print("✅  Model saved to Model/car_model.pkl")


✅  Model saved to Model/car_model.pkl
