In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv("car_dataset.csv")

In [3]:
df.head()

Unnamed: 0,car_name,brand,car_model,model_year,transmission,body_type,fuel_type,engine_capacity,kilometers_run,price
0,Toyota Fielder 2011,Toyota,Fielder,2016,Automatic,Estate,CNG,1500,45852,1200000
1,Toyota Noah 2005,Toyota,Noah,2010,Automatic,Hatchback,CNG,1998,219630,1200000
2,Toyota LiteAce 2001,Toyota,LiteAce,2001,Manual,,CNG,1800,102000,165000
3,Maruti Suzuki Vitara Brezza 1994,Maruti Suzuki,Vitara Brezza,1994,Manual,,CNG,1600,113000,230000
4,Toyota Starlet good 1992,Toyota,Starlet,1992,Automatic,Saloon,CNG,1300,15000,330000


In [83]:
X = df.drop(columns="price")
Y = df["price"]

In [6]:
df.isnull().sum()

car_name            0
brand               0
car_model           0
model_year          0
transmission        0
body_type          18
fuel_type           0
engine_capacity     0
kilometers_run      0
price               0
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1209 entries, 0 to 1208
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   car_name         1209 non-null   object
 1   brand            1209 non-null   object
 2   car_model        1209 non-null   object
 3   model_year       1209 non-null   int64 
 4   transmission     1209 non-null   object
 5   body_type        1191 non-null   object
 6   fuel_type        1209 non-null   object
 7   engine_capacity  1209 non-null   int64 
 8   kilometers_run   1209 non-null   int64 
 9   price            1209 non-null   int64 
dtypes: int64(4), object(6)
memory usage: 94.6+ KB


In [8]:
df["body_type"].value_counts()

body_type
Saloon          623
SUV / 4x4       203
MPV             201
Hatchback        80
Estate           79
Convertible       4
Coupé/Sports      1
Name: count, dtype: int64

In [14]:
df["body_type"] = df["body_type"].fillna(df["body_type"].mode()[0])

In [15]:
df.isnull().sum()

car_name           0
brand              0
car_model          0
model_year         0
transmission       0
body_type          0
fuel_type          0
engine_capacity    0
kilometers_run     0
price              0
dtype: int64

In [18]:
df.drop(columns= ["car_name", "brand"],inplace=True)

In [19]:
df.head()

Unnamed: 0,car_model,model_year,transmission,body_type,fuel_type,engine_capacity,kilometers_run,price
0,Fielder,2016,Automatic,Estate,CNG,1500,45852,1200000
1,Noah,2010,Automatic,Hatchback,CNG,1998,219630,1200000
2,LiteAce,2001,Manual,Saloon,CNG,1800,102000,165000
3,Vitara Brezza,1994,Manual,Saloon,CNG,1600,113000,230000
4,Starlet,1992,Automatic,Saloon,CNG,1300,15000,330000


In [23]:
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']

In [24]:
cat_features

['car_model', 'transmission', 'body_type', 'fuel_type']

In [25]:
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']

In [26]:
num_features

['model_year', 'engine_capacity', 'kilometers_run', 'price']

In [28]:
discrete_features = [feature for feature in num_features if len(df[feature].unique())<25]
continious_features = [feature for feature in num_features if feature not in discrete_features]

In [27]:
df["model_year"].unique()

array([2016, 2010, 2001, 1994, 1992, 1998, 2009, 2000, 1996, 2002, 2003,
       2004, 2005, 2006, 2007, 2011, 2012, 2013, 2015, 1995, 1999, 1989,
       1997, 2008, 2014, 1983, 1990, 1991, 1993, 2017, 1987, 2018, 2019,
       2020, 2021], dtype=int64)

In [29]:
df.head()

Unnamed: 0,car_model,model_year,transmission,body_type,fuel_type,engine_capacity,kilometers_run,price
0,Fielder,2016,Automatic,Estate,CNG,1500,45852,1200000
1,Noah,2010,Automatic,Hatchback,CNG,1998,219630,1200000
2,LiteAce,2001,Manual,Saloon,CNG,1800,102000,165000
3,Vitara Brezza,1994,Manual,Saloon,CNG,1600,113000,230000
4,Starlet,1992,Automatic,Saloon,CNG,1300,15000,330000


In [84]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

X["car_model"] = le.fit_transform(X["car_model"])

In [85]:
X.head()

Unnamed: 0,car_model,model_year,transmission,body_type,fuel_type,engine_capacity,kilometers_run
0,40,2016,Automatic,Estate,CNG,1500,45852
1,67,2010,Automatic,Hatchback,CNG,1998,219630
2,61,2001,Manual,Saloon,CNG,1800,102000
3,111,1994,Manual,Saloon,CNG,1600,113000
4,98,1992,Automatic,Saloon,CNG,1300,15000


In [87]:
X["transmission"].value_counts()

transmission
Automatic    1146
Manual         63
Name: count, dtype: int64

In [89]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

scaler = StandardScaler()
encoder = OneHotEncoder(drop="first")

one_hot_columns = X.select_dtypes(include='O').columns
num_features = X.select_dtypes(exclude='O').columns

preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", encoder, one_hot_columns),
        ("StandardScaler", scaler, num_features)
    ],
    remainder="passthrough"
)
X = preprocessor.fit_transform(X)

In [94]:
X

<1209x34 sparse matrix of type '<class 'numpy.float64'>'
	with 7305 stored elements in Compressed Sparse Row format>

In [36]:
from sklearn.model_selection import train_test_split

In [91]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [56]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [60]:
def evalute_Model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    score = r2_score(true, predicted)
    rmse = np.sqrt(mse)
    return mae, mse, score, rmse

In [93]:
models = {
    "RF" : RandomForestRegressor(),
    "DT" : DecisionTreeRegressor(),
    "KNN" : KNeighborsRegressor(),
    "laso" : Lasso(),
    "Ridge" : Ridge(),
    "LR" : LinearRegression()
}

for name, model in models.items():
    estimator = model
    estimator.fit(X_train, Y_train)
    
    
    y_pred = estimator.predict(X_test)
    
    mae, mse, score, rmse = evalute_Model(Y_test, y_pred)
    
    print()
    
    print(f"-----error for {name} model------")
    
    print("mean absolute error : ",mae)
    print("mean squred error : ", mse)
    print("r2 score is : ", score)
    print("rmse : ", rmse)
    
    print()
    



error for RF model
mean absolute error :  303672.88122917485
mean squred error :  655393594158.2112
r2 score is :  0.7264846358138689
rmse :  809563.8295762794


error for DT model
mean absolute error :  355920.4903581267
mean squred error :  1048508382943.7797
r2 score is :  0.5624260676801027
rmse :  1023966.9833269917


error for KNN model
mean absolute error :  431843.78842975205
mean squred error :  948675060027.8159
r2 score is :  0.6040895015596225
rmse :  973999.5174679585


error for laso model
mean absolute error :  555894.136921501
mean squred error :  932688002347.6144
r2 score is :  0.6107613792566899
rmse :  965757.7348111763


error for Ridge model
mean absolute error :  560563.4179490263
mean squred error :  944750338527.1392
r2 score is :  0.6057274053172348
rmse :  971982.6842733049


error for LR model
mean absolute error :  555902.9286422714
mean squred error :  932679119822.2231
r2 score is :  0.6107650862004088
rmse :  965753.1360664708



  model = cd_fast.sparse_enet_coordinate_descent(


## hyper parameter tunning with random forest model

In [100]:
params = {
    "n_estimators" : [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    "criterion" : ["squared_error", "absolute_error", "friedman_mse", "poisson"],
    "max_features" : ["sqrt", "log2", None],
    "max_depth" : [5,7, 8, 10],
    "min_samples_split" :[2, 4, 8, 10]
}

In [101]:
randomized_models = [
    ("Random forest",RandomForestRegressor(), params)
]

In [104]:
from sklearn.model_selection import RandomizedSearchCV




for name, model, params in randomized_models:
    estimator =  RandomizedSearchCV(model, params, random_state=42, n_jobs = -1, verbose = 2, cv= 3, n_iter = 100)
    estimator.fit(X_train, Y_train)
    
    print(estimator.best_params_)


Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'n_estimators': 500, 'min_samples_split': 8, 'max_features': None, 'max_depth': 10, 'criterion': 'absolute_error'}


In [109]:
model = RandomForestRegressor(n_estimators=500, min_samples_split=8, max_features=None, max_depth=10, criterion="absolute_error",n_jobs=-10, verbose=2)

In [110]:
model.fit(X_train, Y_train)

building tree 1 of 500
building tree 2 of 500
building tree 3 of 500
building tree 4 of 500
building tree 5 of 500
building tree 6 of 500
building tree 7 of 500
building tree 8 of 500
building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500
building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500
building tree 34 of 500
building tree 35 of 500
building tree 36 of 500
building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 42 of 500
b

[Parallel(n_jobs=-10)]: Done  40 tasks      | elapsed:    1.6s


building tree 45 of 500
building tree 46 of 500
building tree 47 of 500
building tree 48 of 500
building tree 49 of 500
building tree 50 of 500
building tree 51 of 500
building tree 52 of 500
building tree 53 of 500
building tree 54 of 500
building tree 55 of 500
building tree 56 of 500
building tree 57 of 500
building tree 58 of 500
building tree 59 of 500
building tree 60 of 500
building tree 61 of 500
building tree 62 of 500
building tree 63 of 500
building tree 64 of 500
building tree 65 of 500
building tree 66 of 500
building tree 67 of 500
building tree 68 of 500
building tree 69 of 500
building tree 70 of 500
building tree 71 of 500
building tree 72 of 500
building tree 73 of 500
building tree 74 of 500
building tree 75 of 500
building tree 76 of 500
building tree 77 of 500
building tree 78 of 500
building tree 79 of 500
building tree 80 of 500
building tree 81 of 500
building tree 82 of 500
building tree 83 of 500
building tree 84 of 500
building tree 85 of 500
building tree 86

[Parallel(n_jobs=-10)]: Done 161 tasks      | elapsed:    6.7s


building tree 164 of 500
building tree 165 of 500
building tree 166 of 500
building tree 167 of 500
building tree 168 of 500
building tree 169 of 500
building tree 170 of 500
building tree 171 of 500
building tree 172 of 500
building tree 173 of 500
building tree 174 of 500
building tree 175 of 500
building tree 176 of 500
building tree 177 of 500
building tree 178 of 500
building tree 179 of 500
building tree 180 of 500
building tree 181 of 500
building tree 182 of 500
building tree 183 of 500
building tree 184 of 500
building tree 185 of 500
building tree 186 of 500
building tree 187 of 500
building tree 188 of 500
building tree 189 of 500
building tree 190 of 500
building tree 191 of 500
building tree 192 of 500
building tree 193 of 500
building tree 194 of 500
building tree 195 of 500
building tree 196 of 500
building tree 197 of 500
building tree 198 of 500
building tree 199 of 500
building tree 200 of 500
building tree 201 of 500
building tree 202 of 500
building tree 203 of 500


[Parallel(n_jobs=-10)]: Done 364 tasks      | elapsed:   15.4s


building tree 369 of 500
building tree 370 of 500
building tree 371 of 500
building tree 372 of 500
building tree 373 of 500
building tree 374 of 500
building tree 375 of 500
building tree 376 of 500
building tree 377 of 500
building tree 378 of 500
building tree 379 of 500
building tree 380 of 500
building tree 381 of 500
building tree 382 of 500
building tree 383 of 500
building tree 384 of 500
building tree 385 of 500
building tree 386 of 500
building tree 387 of 500
building tree 388 of 500
building tree 389 of 500
building tree 390 of 500
building tree 391 of 500
building tree 392 of 500
building tree 393 of 500
building tree 394 of 500
building tree 395 of 500
building tree 396 of 500
building tree 397 of 500
building tree 398 of 500
building tree 399 of 500
building tree 400 of 500
building tree 401 of 500
building tree 402 of 500
building tree 403 of 500
building tree 404 of 500
building tree 405 of 500
building tree 406 of 500
building tree 407 of 500
building tree 408 of 500


In [111]:
 y_pred = model.predict(X_test)
    
mae, mse, score, rmse = evalute_Model(Y_test, y_pred)

print()

print(f"-----error for {name} model------")

print("mean absolute error : ",mae)
print("mean squred error : ", mse)
print("r2 score is : ", score)
print("rmse : ", rmse)

print()


-----error for Random forest model------
mean absolute error :  316625.879707989
mean squred error :  592228061167.2412
r2 score is :  0.7528455034116466
rmse :  769563.5523900812



[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 364 tasks      | elapsed:    0.1s
