In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline

In [4]:
# Read the imputed CarDekho listings; the CSV's first column becomes the row index.
df = pd.read_csv(filepath_or_buffer='datasets/cardekho_imputated.csv', index_col=0)
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


DATA CLEANING 

1. handling missing values
2. handling outliers
3. handling duplicates
4. handling categorical data


In [5]:
# Count the rows that exactly repeat an earlier row across every column.
df.duplicated().sum()

np.int64(167)

In [6]:
# Summarise column dtypes and non-null counts before any cleaning step.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15411 entries, 0 to 19543
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   car_name           15411 non-null  object 
 1   brand              15411 non-null  object 
 2   model              15411 non-null  object 
 3   vehicle_age        15411 non-null  int64  
 4   km_driven          15411 non-null  int64  
 5   seller_type        15411 non-null  object 
 6   fuel_type          15411 non-null  object 
 7   transmission_type  15411 non-null  object 
 8   mileage            15411 non-null  float64
 9   engine             15411 non-null  int64  
 10  max_power          15411 non-null  float64
 11  seats              15411 non-null  int64  
 12  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(5), object(6)
memory usage: 1.6+ MB


In [7]:
# Discard exact duplicate rows (the first occurrence is kept) and rebind `df`
# to the de-duplicated frame.
df = df.drop_duplicates()

In [8]:
# Re-inspect the frame after de-duplication to confirm the reduced row count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15244 entries, 0 to 19543
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   car_name           15244 non-null  object 
 1   brand              15244 non-null  object 
 2   model              15244 non-null  object 
 3   vehicle_age        15244 non-null  int64  
 4   km_driven          15244 non-null  int64  
 5   seller_type        15244 non-null  object 
 6   fuel_type          15244 non-null  object 
 7   transmission_type  15244 non-null  object 
 8   mileage            15244 non-null  float64
 9   engine             15244 non-null  int64  
 10  max_power          15244 non-null  float64
 11  seats              15244 non-null  int64  
 12  selling_price      15244 non-null  int64  
dtypes: float64(2), int64(5), object(6)
memory usage: 1.6+ MB


In [9]:
# Inspect the last rows. The index labels come from the CSV's first column,
# so they are not contiguous.
df.tail()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
19537,Hyundai i10,Hyundai,i10,9,10723,Dealer,Petrol,Manual,19.81,1086,68.05,5,250000
19540,Maruti Ertiga,Maruti,Ertiga,2,18000,Dealer,Petrol,Manual,17.5,1373,91.1,7,925000
19541,Skoda Rapid,Skoda,Rapid,6,67000,Dealer,Diesel,Manual,21.14,1498,103.52,5,425000
19542,Mahindra XUV500,Mahindra,XUV500,5,3800000,Dealer,Diesel,Manual,16.0,2179,140.0,7,1225000
19543,Honda City,Honda,City,2,13000,Dealer,Petrol,Automatic,18.0,1497,117.6,5,1200000


In [10]:
# Per-column count of missing values
df.isna().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [11]:
# Remove the columns that are not used as features: `car_name` and `brand`.
# The `model` column is kept.
# `errors='ignore'` keeps this cell re-runnable: a second execution is a no-op
# instead of a KeyError. Rebinding `df` avoids mutating the frame in place.
df = df.drop(columns=['car_name', 'brand'], errors='ignore')

In [15]:
# Distinct values of the `model` column, in order of first appearance.
df['model'].unique()

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

In [17]:
# Group the columns by kind: numeric vs. non-numeric, then split the numeric
# columns into discrete (fewer than 20 distinct values) and continuous.
num_features = list(df.select_dtypes(include=[np.number]).columns)
cat_features = list(df.select_dtypes(exclude=[np.number]).columns)
discrete_features = list(filter(lambda name: len(df[name].unique()) < 20, num_features))
continous_features = list(filter(lambda name: name not in discrete_features, num_features))

# Report how many columns fell into each group.
print(f"Numerical Features: {len(num_features)}")
print(f"Categorical Features: {len(cat_features)}")
print(f"Discrete Features: {len(discrete_features)}")
print(f"Continuous Features: {len(continous_features)}")

Numerical Features: 7
Categorical Features: 4
Discrete Features: 1
Continuous Features: 6


In [18]:
# Separate the target (`selling_price`) from the predictor columns.
y = df['selling_price']
X = df.drop(columns=['selling_price'])

FEATURE ENCODING AND SCALING

In [20]:
from sklearn.preprocessing import LabelEncoder

# Replace each `model` string with an integer code. `le` keeps the fitted
# classes so the codes can be mapped back to names.
# NOTE(review): scikit-learn documents LabelEncoder for target labels;
# OrdinalEncoder is the feature-side equivalent — consider switching.
le = LabelEncoder()
le.fit(X['model'])
X['model'] = le.transform(X['model'])

In [21]:
# Build a column transformer with two explicit transformers (standard scaling
# for numeric columns, one-hot encoding for the low-cardinality categoricals).
# Any remaining column (the already-encoded `model`) is passed through as is.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# `num_features` was collected from `df` while it still held the target, so it
# contains `selling_price`. The transformer is applied to X, which has no such
# column, so the target has to be excluded here.
numerical_features = [col for col in num_features if col != 'selling_price']
# `model` is handled separately, so it is left out of the one-hot group.
onehot_features = [col for col in cat_features if col != 'model']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), onehot_features),
    ],
    remainder='passthrough'
)

In [None]:
# Keep only the numeric column names that exist in X; this filters out the
# target, `selling_price`, which X does not contain.
numerical_features = [name for name in numerical_features if name in X.columns] # type: ignore

# Rebuild the preprocessor so that it uses the filtered column list.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), onehot_features),
    ],
)

# NOTE(review): the scaler and encoder are fitted on every row, before the
# train/test split, so test rows influence the fitted statistics.
# NOTE(review): X is rebound to the transformed matrix; re-running this cell
# needs X to be recreated first because the filter above reads `X.columns`.
X = preprocessor.fit_transform(X)

In [25]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((12195, 14), (3049, 14), (12195,), (3049,))

In [32]:
# Preview the transformed training matrix. Column order follows the
# ColumnTransformer: scaled numeric features first, then the one-hot columns,
# then the passed-through `model` code as the last column.
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,-0.345188,-0.263995,1.156745,-0.457668,-0.282136,2.069705,0.0,0.0,1.0,0.0,0.0,0.0,1.0,42.0
1,3.301871,0.663782,-1.366539,-0.817005,-0.853041,-0.403299,1.0,0.0,0.0,0.0,0.0,1.0,1.0,100.0
2,1.312566,0.4706,0.202119,0.214888,0.069728,-0.403299,1.0,0.0,1.0,0.0,0.0,0.0,1.0,95.0
3,0.649465,-0.001306,-1.438495,0.599207,1.331546,-0.403299,0.0,0.0,0.0,0.0,0.0,1.0,0.0,87.0
4,-0.676739,1.050147,0.794563,-0.936144,-0.783134,-0.403299,1.0,0.0,0.0,0.0,0.0,1.0,1.0,64.0


MODEL TRAINING AND MODEL SELECTION

In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [34]:
# Helper that scores a set of predictions against the observed values
def evaluate_model(true, pred):
    """Return regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Observed target values.
    pred : array-like
        Predicted target values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2)``, where ``rmse`` is the square root of ``mse``.
    """
    squared_error = mean_squared_error(true, pred)
    return (
        mean_absolute_error(true, pred),
        squared_error,
        np.sqrt(squared_error),
        r2_score(true, pred),
    )

In [None]:
# Baseline comparison: fit each candidate regressor with default settings and
# report the same metrics on the held-out set and on the training set.
# The stochastic estimators (random forest, decision tree) get a fixed seed so
# the reported scores are repeatable across kernel restarts.
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Support Vector Regression': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor()
}

# Iterate over (name, estimator) pairs directly instead of rebuilding
# list(models.values()) / list(models.keys()) on every pass.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    mae1, mse1, rmse1, r21 = evaluate_model(y_test, y_test_pred) # test set evaluation
    mae2, mse2, rmse2, r22 = evaluate_model(y_train, y_train_pred) # train set evaluation
    print(f"Model: {name}")
    print('test set evaluation')
    print(f"mean_absolute_error : {mae1}\nmean_squared_error : {mse1}\nroot_mean_squared_error : {rmse1}\nr2_score : {r21}")
    print('-' * 50)
    print('train set evaluation')
    print(f"mean_absolute_error : {mae2}\nmean_squared_error : {mse2}\nroot_mean_squared_error : {rmse2}\nr2_score : {r22}")
    print("=" * 50)
    print("\n")

Model: Random Forest
test set evaluation
mean_absolute_error : 103626.62884999553
mean_squared_error : 95077451246.07443
root_mean_squared_error : 308346.3170626081
r2_score : 0.8436279686717381
--------------------------------------------------
train set evaluation
mean_absolute_error : 39730.512706904956
mean_squared_error : 15312181543.43192
root_mean_squared_error : 123742.39994210521
r2_score : 0.9819535978316094



Model: Linear Regression
test set evaluation
mean_absolute_error : 262717.18861080636
mean_squared_error : 199599130834.09274
root_mean_squared_error : 446765.1853424713
r2_score : 0.6717231990253706
--------------------------------------------------
train set evaluation
mean_absolute_error : 275284.38995814894
mean_squared_error : 320371249674.76685
root_mean_squared_error : 566013.4712838263
r2_score : 0.6224216387180521



Model: Ridge Regression
test set evaluation
mean_absolute_error : 262720.9742999098
mean_squared_error : 199594789954.5153
root_mean_squared_erro

In [38]:
# Search spaces for hyperparameter tuning.
# Each key is an estimator constructor argument; each list holds the candidate
# values the randomized search may sample.
knn_params = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    # 'auto' is no longer an accepted value for RandomForestRegressor in
    # current scikit-learn: candidates using it fail to fit and are scored as
    # NaN. For regressors 'auto' meant "use every feature", which is what 1.0
    # expresses.
    'max_features': [1.0, 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

In [41]:
# (name, estimator, search space) triples consumed by the tuning loop.
# The forest is seeded so that repeated searches score the same candidate
# identically; an unseeded forest gives different scores on every run.
models = [
    ('K-Nearest Neighbors', KNeighborsRegressor(), knn_params),
    ('Random Forest', RandomForestRegressor(random_state=42), rf_params)
]

In [42]:
# Tune each model with a randomized search: 10 sampled candidates, each
# scored with 5-fold cross-validation on the training split.
from sklearn.model_selection import RandomizedSearchCV

model_best_params = {}
for name, model, params in models:
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=10,
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=-1,
    )
    random_search.fit(X_train, y_train)
    best_params = random_search.best_params_
    model_best_params[name] = best_params

# Report the winning configuration of every tuned model.
for name, best_params in model_best_params.items():
    print(f'------------ Best parameters for {name} model ------------')
    print(best_params)
    print("\n\n")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
------------ Best parameters for K-Nearest Neighbors model ------------
{'weights': 'distance', 'n_neighbors': 7, 'algorithm': 'ball_tree'}



------------ Best parameters for Random Forest model ------------
{'n_estimators': 200, 'min_samples_split': 2, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': False}





In [43]:
# Refit the two tuned models and compare them on the train and test sets.
# The hyperparameters are read from `model_best_params` (filled in by the
# randomized search) rather than retyped by hand, so this cell cannot drift
# out of sync with the search results. The forest is seeded for repeatability.
best_models = {
    'Random Forest': RandomForestRegressor(**model_best_params['Random Forest'], n_jobs=-1, random_state=42),
    'K-Nearest Neighbors': KNeighborsRegressor(**model_best_params['K-Nearest Neighbors'], n_jobs=-1)
}

# Iterate over (name, estimator) pairs directly instead of indexing into
# freshly built key/value lists on every pass.
for name, model in best_models.items():
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    mae1, mse1, rmse1, r21 = evaluate_model(y_test, y_test_pred) # test set evaluation
    mae2, mse2, rmse2, r22 = evaluate_model(y_train, y_train_pred) # train set evaluation
    print(f"Model: {name}")
    print('test set evaluation')
    print(f"mean_absolute_error : {mae1}\nmean_squared_error : {mse1}\nroot_mean_squared_error : {rmse1}\nr2_score : {r21}")
    print('-' * 50)
    print('train set evaluation')
    print(f"mean_absolute_error : {mae2}\nmean_squared_error : {mse2}\nroot_mean_squared_error : {rmse2}\nr2_score : {r22}")
    print("=" * 50)
    print("\n")

Model: Random Forest
test set evaluation
mean_absolute_error : 100288.37200006281
mean_squared_error : 52812056418.53569
root_mean_squared_error : 229808.73877756626
r2_score : 0.9131410399357948
--------------------------------------------------
train set evaluation
mean_absolute_error : 14931.922472082351
mean_squared_error : 720673102.7281806
root_mean_squared_error : 26845.35532877486
r2_score : 0.9991506398610227


Model: K-Nearest Neighbors
test set evaluation
mean_absolute_error : 98811.56742027224
mean_squared_error : 48483685110.03116
root_mean_squared_error : 220190.11129029197
r2_score : 0.9202598278816566
--------------------------------------------------
train set evaluation
mean_absolute_error : 4798.996856635234
mean_squared_error : 387378021.04687715
root_mean_squared_error : 19681.92117266191
r2_score : 0.9995434497991564


