In [74]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [2]:
# Load the car listings CSV that every later cell reads from.
DATA_PATH = "cleaned_car_data.csv"
car_df = pd.read_csv(DATA_PATH)

#### Extracting Training Data


In [3]:
# Split the frame into the five predictor columns and the "Price" target.
FEATURE_COLUMNS = ['name', 'company', 'year', 'kms_driven', 'fuel_type']
TARGET_COLUMN = "Price"

X = car_df[FEATURE_COLUMNS]
y = car_df[TARGET_COLUMN]

In [77]:
# 10-fold cross-validation of a linear-regression pipeline built inside this cell.
# The original call referenced `pipe`, which is only created further down the
# notebook, so a fresh "Restart & Run All" failed here with a NameError.
cv_preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category missing from a training fold is
        # encoded as all zeros at validation time instead of raising an error.
        ("OneHotEncoder", OneHotEncoder(handle_unknown="ignore"), ["name", "company", "fuel_type"]),
        ("scaler", StandardScaler(), ["year", "kms_driven"]),
    ],
    remainder="passthrough",
)
cv_pipe = Pipeline([("preprocessor", cv_preprocessor), ("Model", LinearRegression())])

# `scoring` can be swapped for e.g. 'neg_mean_squared_error'.
cv_scores = cross_val_score(cv_pipe, X, y, cv=10, scoring='r2')


#### Applying the Train-Test Split

In [6]:
# Hold out 30% of the rows for evaluation. The split is seeded so that the
# fitted model and the R² computed from it are reproducible across kernel
# restarts (the original split changed on every run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### Creating a OneHotEncoder object to collect all the possible categories


In [7]:
# Stand-alone encoder; it is fitted in the next cell so its learned category
# lists can be handed to the encoder inside the column transformer.
ohe = OneHotEncoder()

In [8]:
# Learn the categories from every row of X (not only the training split), so
# the category lists also cover values that occur only in held-out rows.
CATEGORICAL_COLUMNS = ["name", "company", "fuel_type"]
ohe.fit(X[CATEGORICAL_COLUMNS])

In [9]:
# One array of learned category values per encoded column
# (in the order name, company, fuel_type).
ohe.categories_

[array(['Audi A3 Cabriolet', 'Audi A4 1.8', 'Audi A4 2.0', 'Audi A6 2.0',
        'Audi A8', 'Audi Q3 2.0', 'Audi Q5 2.0', 'Audi Q7', 'BMW 3 Series',
        'BMW 5 Series', 'BMW 7 Series', 'BMW X1', 'BMW X1 sDrive20d',
        'BMW X1 xDrive20d', 'Chevrolet Beat', 'Chevrolet Beat Diesel',
        'Chevrolet Beat LS', 'Chevrolet Beat LT', 'Chevrolet Beat PS',
        'Chevrolet Cruze LTZ', 'Chevrolet Enjoy', 'Chevrolet Enjoy 1.4',
        'Chevrolet Sail 1.2', 'Chevrolet Sail UVA', 'Chevrolet Spark',
        'Chevrolet Spark 1.0', 'Chevrolet Spark LS', 'Chevrolet Spark LT',
        'Chevrolet Tavera LS', 'Chevrolet Tavera Neo', 'Datsun GO T',
        'Datsun Go Plus', 'Datsun Redi GO', 'Fiat Linea Emotion',
        'Fiat Petra ELX', 'Fiat Punto Emotion', 'Force Motors Force',
        'Force Motors One', 'Ford EcoSport', 'Ford EcoSport Ambiente',
        'Ford EcoSport Titanium', 'Ford EcoSport Trend',
        'Ford Endeavor 4x4', 'Ford Fiesta', 'Ford Fiesta SXi', 'Ford Figo',
        '

#### Creating a column transformer to transform categorical columns


In [27]:
# Column-wise preprocessing: one-hot encode the categorical columns using the
# category lists collected by `ohe`, and standardise the two numeric columns.
categorical_step = (
    "OneHotEncoder",
    OneHotEncoder(categories=ohe.categories_),
    ["name", "company", "fuel_type"],
)
numeric_step = ("scaler", StandardScaler(), ["year", "kms_driven"])

column_trans = ColumnTransformer(
    transformers=[categorical_step, numeric_step],
    remainder='passthrough',
)

In [28]:
# Show the transformer's repr to inspect its configuration.
column_trans

#### Linear Regression Model 

In [29]:
# Linear regression estimator with default settings.
lr = LinearRegression()

#### Making the Pipeline

In [30]:
# Chain the preprocessing and the regressor so they are fitted and applied together.
pipeline_steps = [
    ("preprocessor", column_trans),
    ("Model", lr),
]
pipe = Pipeline(steps=pipeline_steps)

In [31]:
# Show the pipeline's repr to inspect its steps.
pipe

#### Fitting the Model

In [32]:
# Fit the preprocessing and the regressor on the training split.
pipe.fit(X_train , y_train)

In [33]:
# Predict the target for the held-out rows.
y_pred = pipe.predict(X_test)

In [34]:
# Inspect the raw predictions for the held-out rows.
y_pred

array([ 251273.201855  ,  522034.83555564,  299623.38037674,
        472717.19473335,  141234.93474299,  262848.17755229,
        534067.15348664,  500997.51523636,   60032.79327017,
        277833.23118205,  360997.13090067,  182011.82842322,
       1108566.0931138 ,  663822.97519535,  585912.0069479 ,
        426158.77549168,  282324.5433346 ,  299632.33009122,
        290480.79524816,  242580.84526703,  408224.71841867,
        146685.53775118, 1558549.95771995,  180712.79169148,
         82822.0989428 ,  461486.5020009 ,  269947.39776799,
        660614.87520785,  358081.58852354,  566343.90131113,
        571572.67773391,  208976.61663759,  428665.68936168,
        680304.82716588,    8184.02483122,   63521.32302135,
        723240.20209816,   92738.81663773,  507478.54269615,
        782029.58451684,  469886.23192689,  162204.31840854,
        227434.57546881,  413829.47973544,  186953.58963631,
        219245.87255139,   31030.18559765,  272656.66440234,
        358318.36300578,

In [36]:
# Coefficient of determination of the predictions on the held-out split.
r2_score(y_true=y_test, y_pred=y_pred)

0.8290353648001109

#### Searching over `train_test_split` random states for the split on which the model scores best (R² ≈ 0.92)


In [37]:
# Refit the same pipeline on 1000 differently seeded 90/10 splits and record
# the held-out R² of each one (index in `scores` == seed used for the split).
# NOTE: the "best" score is picked using the test rows themselves, so it is an
# optimistic figure; the average over seeds is the more representative number.
scores = []

for seed in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=seed
    )

    lr = LinearRegression()
    pipe = Pipeline(steps=[("preprocessing", column_trans), ("model", lr)])

    y_pred = pipe.fit(X_train, y_train).predict(X_test)
    scores.append(r2_score(y_test, y_pred))

best_score = max(scores)
average_score = sum(scores) / len(scores)
print("Best R² score:", best_score)
print("Average R² score:", average_score)

Best R² score: 0.9198885323149603
Average R² score: 0.2965643986382824


In [38]:
# Position of the highest R²; the search loop used the loop index as
# random_state, so this index is also the best-scoring seed.
np.argmax(scores)

np.int64(996)

In [39]:
# Look up the R² achieved at the best-scoring seed.
best_index = np.argmax(scores)
scores[best_index]

0.9198885323149603

In [41]:
# Recreate the split with the best-scoring seed, refit the pipeline on it and
# report the held-out R² for that split.
best_seed = np.argmax(scores)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=best_seed
)

lr = LinearRegression()
pipe = Pipeline(steps=[("preprocessing", column_trans), ("model", lr)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.9198885323149603

In [42]:
import pickle

In [43]:
# Persist the fitted pipeline. The context manager closes (and flushes) the
# file as soon as the dump finishes; the original left the handle returned by
# open() unclosed.
with open("LinearRegressionModel.pkl", "wb") as model_file:
    pickle.dump(pipe, model_file)

In [71]:
# Single hand-written listing, shaped like a row of X, for a prediction smoke test.
sample_listing = {
    "name": "Mahindra Jeep CL550",
    "company": "Mahindra",
    "year": 2007,
    "kms_driven": 4500,
    "fuel_type": "Diesel",
}
new_data = pd.DataFrame(
    [list(sample_listing.values())],
    columns=list(sample_listing.keys()),
)

In [72]:
# Predict for the single sample row. Note: this rebinds `y_pred`, replacing the
# held-out predictions computed earlier.
y_pred = pipe.predict(new_data)

In [73]:
# Show the prediction for the sample row.
y_pred

array([233655.39594185])

In [61]:
# Display the loaded frame (the rendered output shows the first and last rows).
car_df

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...
811,Maruti Suzuki Ritz,Maruti,2011,270000,50000,Petrol
812,Tata Indica V2,Tata,2009,110000,30000,Diesel
813,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
814,Tata Zest XM,Tata,2018,260000,27000,Diesel


In [80]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
import numpy as np


In [81]:
# Candidate regressors, keyed by the label printed in the comparison below.
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(alpha=1.0),
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42),
)


In [82]:
# Shuffled 10-fold splitter with a fixed seed, shared by every model in the
# comparison below.
cv = KFold(n_splits=10, shuffle=True, random_state=42)


In [83]:
# Cross-validate every candidate behind the same preprocessing and splitter,
# printing per-fold R², its mean and its standard deviation.
# `pipe` and `scores` are rebound on each pass; after the loop they hold the
# pipeline and scores of the last entry in `models`.
separator = "-" * 50

for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", column_trans), ("model", model)])
    scores = cross_val_score(pipe, X, y, cv=cv, scoring='r2')

    print(
        f"{name} R² scores: {np.round(scores, 3)}",
        f"Average R²: {scores.mean():.4f}",
        f"Standard Deviation: {scores.std():.4f}",
        separator,
        sep="\n",
    )


LinearRegression R² scores: [ 0.104  0.805  0.69   0.637  0.533  0.663  0.793  0.744 -0.595  0.862]
Average R²: 0.5235
Standard Deviation: 0.4244
--------------------------------------------------
Ridge R² scores: [0.108 0.792 0.674 0.667 0.547 0.666 0.749 0.762 0.196 0.831]
Average R²: 0.5992
Standard Deviation: 0.2368
--------------------------------------------------
RandomForest R² scores: [0.082 0.736 0.625 0.67  0.384 0.714 0.73  0.721 0.626 0.855]
Average R²: 0.6144
Standard Deviation: 0.2113
--------------------------------------------------


In [84]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


In [85]:
# Hyper-parameter grid; the "model__" prefix addresses the pipeline step named "model".
param_grid = dict([
    ('model__n_estimators', [100, 200]),
    ('model__max_depth', [None, 10, 20]),
    ('model__min_samples_split', [2, 5, 10]),
])


In [86]:
# Re-declares the same grid as the previous cell (identical keys and values),
# so this cell is redundant and can be dropped without changing the search.
param_grid = {}
param_grid['model__n_estimators'] = [100, 200]
param_grid['model__max_depth'] = [None, 10, 20]
param_grid['model__min_samples_split'] = [2, 5, 10]


In [87]:
# Tune an explicitly constructed random-forest pipeline. The original passed
# `pipe`, i.e. whatever pipeline the model-comparison loop happened to leave
# behind, so the search silently depended on the ordering of `models`.
rf_pipe = Pipeline([
    ("preprocessor", column_trans),
    # n_estimators / max_depth / min_samples_split are supplied by param_grid.
    ("model", RandomForestRegressor(random_state=42)),
])

grid_search = GridSearchCV(
    estimator=rf_pipe,
    param_grid=param_grid,
    # An integer cv means unshuffled K-fold for a regressor, so these scores are
    # not directly comparable with the shuffled 10-fold `cv` splitter results.
    cv=5,
    scoring='r2',
    n_jobs=-1,  # Use all CPU cores
    verbose=2
)


In [88]:
# Cross-validate every parameter combination in the grid on the full X, y.
grid_search.fit(X, y)


Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [89]:
# Report the winning parameter combination and its mean cross-validated R².
best_params = grid_search.best_params_
best_r2 = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best R² Score:", best_r2)


Best Parameters: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Best R² Score: 0.44201723891656874
