In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('8cars.csv')
df

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner
...,...,...,...,...,...,...,...,...
4335,Hyundai i20 Magna 1.4 CRDi (Diesel),2014,409999,80000,Diesel,Individual,Manual,Second Owner
4336,Hyundai i20 Magna 1.4 CRDi,2014,409999,80000,Diesel,Individual,Manual,Second Owner
4337,Maruti 800 AC BSIII,2009,110000,83000,Petrol,Individual,Manual,Second Owner
4338,Hyundai Creta 1.6 CRDi SX Option,2016,865000,90000,Diesel,Individual,Manual,First Owner


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [5]:
X = df.drop("selling_price", axis=1)
X

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,141000,Diesel,Individual,Manual,Second Owner
...,...,...,...,...,...,...,...
4335,Hyundai i20 Magna 1.4 CRDi (Diesel),2014,80000,Diesel,Individual,Manual,Second Owner
4336,Hyundai i20 Magna 1.4 CRDi,2014,80000,Diesel,Individual,Manual,Second Owner
4337,Maruti 800 AC BSIII,2009,83000,Petrol,Individual,Manual,Second Owner
4338,Hyundai Creta 1.6 CRDi SX Option,2016,90000,Diesel,Individual,Manual,First Owner


In [6]:
y = df["selling_price"]
y

0        60000
1       135000
2       600000
3       250000
4       450000
         ...  
4335    409999
4336    409999
4337    110000
4338    865000
4339    225000
Name: selling_price, Length: 4340, dtype: int64

In [None]:
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
num_cols

Index(['year', 'km_driven'], dtype='object')

In [9]:
cat_cols = X.select_dtypes(include=["object"]).columns
cat_cols

Index(['name', 'fuel', 'seller_type', 'transmission', 'owner'], dtype='object')

In [17]:
#Preprocessing Pipeline:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])


In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [19]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [20]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.001),
    "SVR": SVR(kernel="rbf", C=100),
    "Random Forest": RandomForestRegressor(
        n_estimators=200, random_state=42, n_jobs=-1
    ),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

***Training + Evaluation Loop:***

In [21]:
results = []

for name, model in models.items():
    pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])
    
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    
    results.append({
        "Model": name,
        "R2": r2_score(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
        "RMSE": mean_squared_error(y_test, y_pred) ** 0.5
    })

  model = cd_fast.sparse_enet_coordinate_descent(


In [22]:
results_df = pd.DataFrame(results).sort_values(by="R2", ascending=False)
results_df

Unnamed: 0,Model,R2,MAE,RMSE
0,Linear Regression,0.619265,120932.270291,340864.910114
1,Ridge,0.609111,136106.310292,345380.350754
4,Random Forest,0.573884,118816.258861,360607.9063
5,Gradient Boosting,0.53856,169221.9313,375257.129508
2,Lasso,0.518911,123101.74558,383163.363444
3,SVR,-0.025521,277403.343455,559426.987054


***

***R² Score (Coefficient of Determination):***
<br>*R² tells you how much of the variation in the target variable is explained by the model.*

*In simple words:*
<br>*How well does my model explain why prices go up and down?*

***Range:***
- *1.0 → perfect model*
- *0.0 → model is no better than predicting the mean price*
- *< 0 → model is worse than a dumb average prediction*

***Formula:***
> *R2 = 1 − (Error of Model / Error of Mean Baseline)*
	

***

***MAE (Mean Absolute Error)***
<br>*MAE is the average absolute mistake your model makes.*

*In simple words:*<br>
*On average, how much money am I wrong by?*

***Why MAE is important:***
- *Easy to interpret*
- *Same unit as target (₹)*
- *Treats all errors equally*

***Formula:***
> MAE = 1/n ∑ ∣ *y_true* − *y_pred* ∣

*eg:*
<br>*in our project: MAE ≈ 120,932*
<br>*means on average, the model predicts car prices ₹1.2 lakh away from the true price*

*well his is not bad when:*
- *cars cost ₹5–15 lakh*
- *prices vary a lot*

***

***RMSE (Root Mean Squared Error)***
<br>*RMSE penalizes large mistakes more heavily.*

*In simple words:*<br>
*How bad are my worst mistakes?*

***Why RMSE is always ≥ MAE,***
<br>because:
- *errors are squared*
- *large errors hurt more*

***Formula:***
> RMSE = rootover(1/n ∑(*y_true − y_pred*)^2
	​
*eg:*
<br>*in our project: MAE ≈ RMSE ≈ 340,864*
<br>*means The model makes some big mistakes (likely on expensive cars)*

*This tells us:*
- *Dataset has **price outliers***
- *Model struggles on luxury / rare cars*

***

***Metric Comparison:***

| Metric           | What it tells                              |
| ---------------- | ------------------------------------------ |
| **R² = 0.62**    | Model explains majority of price variation |
| **MAE ≈ ₹1.2L**  | Typical prediction mistake                 |
| **RMSE ≈ ₹3.4L** | Large errors exist (outliers)              |


***

> *R² measures explanatory power, MAE measures average absolute error in monetary terms, and RMSE penalizes large prediction errors, making it sensitive to outliers.*

***

***Why Linear Regression looks best here:***
- *Strong linear trends in car data*
- *RF needs tuning*
- *SVR failed due to scaling + skewed target*