In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.linear_model import Ridge

In [2]:
cars = pd.read_csv('cleaned_data.csv')

In [3]:
cars.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand,Model,Age
0,Mumbai,2010,72000,CNG,Manual,First,26.6,998,58.16,5.0,1.75,Maruti,Wagon,10
1,Pune,2015,41000,Diesel,Manual,First,31.6687,1582,126.2,5.0,12.5,Hyundai,Creta,5
2,Chennai,2011,46000,Petrol,Manual,First,29.302,1199,88.7,5.0,4.5,Honda,Jazz,9
3,Chennai,2012,87000,Diesel,Manual,First,33.4397,1248,88.76,7.0,6.0,Maruti,Ertiga,8
4,Coimbatore,2013,40670,Diesel,Automatic,Second,24.472,1968,140.8,5.0,17.74,Audi,A4,7


In [4]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5849 entries, 0 to 5848
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Location           5849 non-null   object 
 1   Year               5849 non-null   int64  
 2   Kilometers_Driven  5849 non-null   int64  
 3   Fuel_Type          5849 non-null   object 
 4   Transmission       5849 non-null   object 
 5   Owner_Type         5849 non-null   object 
 6   Mileage            5849 non-null   float64
 7   Engine             5849 non-null   int64  
 8   Power              5776 non-null   float64
 9   Seats              5849 non-null   float64
 10  Price              5849 non-null   float64
 11  Brand              5849 non-null   object 
 12  Model              5849 non-null   object 
 13  Age                5849 non-null   int64  
dtypes: float64(4), int64(4), object(6)
memory usage: 639.9+ KB


In [5]:

num_cols = ['Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Year']
nom_cat_cols = ['Location', 'Fuel_Type', 'Transmission', 'Brand', 'Model']  
ord_cat_cols = ['Owner_Type']  

In [9]:
X = cars.drop(['Price', 'Age', 'Seats'], axis= 1)
y = cars['Price'].copy()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
from category_encoders import BinaryEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

num_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy= 'median')),
                                  ('poly', PolynomialFeatures()),
                           ('scaler', StandardScaler())])
ord_transformer = Pipeline(steps=[('ordinal', OrdinalEncoder(categories=[['Fourth & Above', 'Third', 'Second', 'First']]))])
nom_transformer = Pipeline(steps=[('binary', BinaryEncoder())])
preprocessor = ColumnTransformer(transformers=[('num_prep', num_transformer, num_cols),
                                               ('nom_prep', nom_transformer, nom_cat_cols),
                                               ('ord_prep', ord_transformer, ord_cat_cols)])

model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('model', Ridge())])

param_grid = {
    'preprocessor__num_prep__poly__degree':[2,3,4],
    'model__alpha':[0.01, 0.1, 1, 10]
    
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)


0,1,2
,estimator,"Pipeline(step...l', Ridge())])"
,param_grid,"{'model__alpha': [0.01, 0.1, ...], 'preprocessor__num_prep__poly__degree': [2, 3, ...]}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num_prep', ...), ('nom_prep', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,degree,3
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,verbose,0
,cols,
,mapping,
,drop_invariant,False
,return_df,True
,base,2
,handle_unknown,'value'
,handle_missing,'value'

0,1,2
,categories,"[['Fourth & Above', 'Third', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,alpha,0.01
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [43]:
grid_search.best_params_

{'model__alpha': 0.01, 'preprocessor__num_prep__poly__degree': 3}