In [21]:
import pandas as pd
import numpy as np

In [22]:
# Read the car listings CSV from the working directory and preview the first rows.
csv_path = 'cardekho_imputated.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [23]:
# Count the missing values in every column.
df.isna().sum()

Unnamed: 0           0
car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [24]:
# Remove the columns that are not used as features: the free-text name, the
# brand, and 'Unnamed: 0' (appears to be a row index saved with the CSV).
# Columns are dropped by name rather than by position so that re-running this
# cell can never remove a real feature; errors='ignore' turns a re-run into a
# no-op instead of a KeyError.
df = df.drop(columns=['Unnamed: 0', 'car_name', 'brand'], errors='ignore')
df.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [25]:
from sklearn.model_selection import train_test_split
# Separate the regression target from the predictor columns.
target_col = 'selling_price'
y = df[target_col]
X = df.drop(columns=[target_col])


In [26]:
from sklearn.preprocessing import LabelEncoder
# Replace each distinct value of 'model' with an integer code; the fitted
# encoder is kept in `e`.
# NOTE(review): these codes end up among the scaled numeric features, which
# imposes an arbitrary ordering on car models - confirm this is intended.
e = LabelEncoder().fit(X['model'])
X['model'] = e.transform(X['model'])

In [27]:
X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [28]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Transformers used by the ColumnTransformer: standardise numeric columns and
# one-hot encode the rest (first level dropped).
scaler = StandardScaler()
onehot = OneHotEncoder(drop='first')

# Split the columns by dtype. Testing for "numeric" (rather than "not object")
# keeps pandas string/category columns out of the scaler, which would fail on
# non-numeric values.
numeric_mask = [pd.api.types.is_numeric_dtype(dtype) for dtype in X.dtypes]
num_features = [col for col, is_num in zip(X.columns, numeric_mask) if is_num]
cat_features = [col for col, is_num in zip(X.columns, numeric_mask) if not is_num]

In [29]:
# Column-wise preprocessing: one-hot encode the categorical columns and
# standardise the numeric ones; any column in neither list is passed through.
preprocess = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', onehot, cat_features),
        ('StandardScaler', scaler, num_features),
    ],
    remainder='passthrough',
)

In [32]:
# Fit the ColumnTransformer on the full feature frame and rebind X to the
# transformed output.
# NOTE(review): this fit runs before the train/test split in the next cell, so
# the fitted statistics also see rows that end up in the test set - consider
# splitting first and fitting on the training rows only.
# NOTE(review): X no longer refers to the original DataFrame after this line,
# so the cell presumably cannot be re-run without re-creating X first.
X =preprocess.fit_transform(X)

In [33]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
X_train

array([[ 0.        ,  0.        ,  0.        , ..., -0.76773286,
        -0.89566754, -0.40302241],
       [ 0.        ,  0.        ,  1.        , ...,  0.92487372,
         0.9404295 , -0.40302241],
       [ 0.        ,  0.        ,  0.        , ..., -0.55087963,
        -0.61874036, -0.40302241],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -0.9366097 ,
        -0.78070786, -0.40302241],
       [ 0.        ,  0.        ,  0.        , ..., -0.55471774,
        -0.43582879, -0.40302241],
       [ 1.        ,  0.        ,  0.        , ..., -0.04616815,
         0.06194201, -0.40302241]])

In [37]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def evaluate(y_test,y_pred):
    """Score predictions against the true target values.

    Args:
        y_test: True target values.
        y_pred: Predicted values, passed to each metric alongside ``y_test``.

    Returns:
        Tuple ``(mse, mae, r2)``: the results of ``mean_squared_error``,
        ``mean_absolute_error`` and ``r2_score``, in that order.
    """
    scorers = (mean_squared_error, mean_absolute_error, r2_score)
    return tuple(score(y_test, y_pred) for score in scorers)


In [35]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
# Baseline regressors to compare, keyed by a short label.
models = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'kneigh': KNeighborsRegressor(),
    # Seeded so the forest's reported metrics are the same on every run.
    'forest': RandomForestRegressor(random_state=42),
    }

In [38]:
# Fit every baseline model on the training split and print its test-split metrics.
for model in models.values():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse, mae, r2 = evaluate(y_test, y_pred)

    report = (
        f"The result for {model}:",
        f" MSE:{mse}",
        f" MAE: {mae}",
        f"r2 :{r2}",
    )
    print("\n".join(report))

The result for LinearRegression():
 MSE:270286925822.75293
 MAE: 284283.445953383
r2 :0.6524693637784766
The result for Ridge():
 MSE:270275613895.96524
 MAE: 284241.11291362526
r2 :0.6524839084742715
The result for Lasso():
 MSE:270286207881.65326
 MAE: 284283.78899632714
r2 :0.6524702868957395
The result for KNeighborsRegressor():
 MSE:78779566186.25098
 MAE: 116579.3732156761
r2 :0.8987064850635158
The result for RandomForestRegressor():
 MSE:57766821186.1278
 MAE: 105052.0108942302
r2 :0.9257243388365921


In [39]:
# Candidate hyperparameter values for the randomized searches.
knn_params = dict(n_neighbors=[2, 3, 10, 20, 40, 50])
rf_params = dict(
    n_estimators=[50, 100, 200, 300, 500],
    min_samples_split=[2, 8, 15, 20],
    max_depth=[None, 5, 8, 10, 15],
    max_features=[5, 7, 8],
)

# (label, estimator, search space) triples for the hyperparameter search.
randomcv = [
    ('KNN', KNeighborsRegressor(), knn_params),
    # Seeded so the tuned forest's scores are repeatable between runs.
    ('RF', RandomForestRegressor(random_state=42), rf_params),
]

In [40]:
from sklearn.model_selection import RandomizedSearchCV
# Tune each estimator with a randomized search on the training split, record
# the best parameters per label, and print the tuned search's test metrics.
model_param = {}
for name, model, params in randomcv:
    # Never request more draws than the search space holds (the KNN space has
    # only 6 candidates); asking for more only makes sklearn emit a warning.
    grid_size = int(np.prod([len(values) for values in params.values()]))
    random = RandomizedSearchCV(estimator=model,
                                param_distributions=params,
                                n_iter=min(100, grid_size),
                                cv=3,
                                n_jobs=-1,
                                random_state=42)  # repeatable candidate sampling

    random.fit(X_train,y_train)
    model_param[name] = random.best_params_

    y_pred = random.predict(X_test)
    mse, mae, r2 = evaluate(y_test,y_pred)

    print(f"The result for {name}:")
    print(f" MSE:{mse}")
    print(f" MAE: {mae}")
    print(f"r2 :{r2}")
    



The result for KNN:
 MSE:87994611511.97119
 MAE: 121765.48793148196
r2 :0.8868579261474333
The result for RF:
 MSE:51981399937.306786
 MAE: 101446.1257456331
r2 :0.9331631415877499
