# **Preprocess The Dataset**

### **Importing Libraries**

In [1]:
!pip install lightgbm -q

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import plotly.express as px
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score

# Regression Models
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor


import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('../Data/preprocessing.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Model,Engine_type,Transmission,Km_Driven,prices,Power,Unit,Company,Model_name
0,0,2019,Petrol,Automatic,26755,72.5,1500.0,cc,Honda,Vezel
1,1,2020,Hybrid,Automatic,9744,32.5,660.0,cc,Suzuki,Wagon R
2,2,2008,Petrol,Automatic,85000,29.8,1500.0,cc,Toyota,Corolla Axio
3,3,2018,Petrol,Manual,40000,33.25,1300.0,cc,Toyota,Corolla
4,4,2020,Petrol,Automatic,23300,35.75,660.0,cc,Daihatsu,Mira


In [4]:
df.drop(columns='Unnamed: 0', axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,Model,Engine_type,Transmission,Km_Driven,prices,Power,Unit,Company,Model_name
0,2019,Petrol,Automatic,26755,72.5,1500.0,cc,Honda,Vezel
1,2020,Hybrid,Automatic,9744,32.5,660.0,cc,Suzuki,Wagon R
2,2008,Petrol,Automatic,85000,29.8,1500.0,cc,Toyota,Corolla Axio
3,2018,Petrol,Manual,40000,33.25,1300.0,cc,Toyota,Corolla
4,2020,Petrol,Automatic,23300,35.75,660.0,cc,Daihatsu,Mira


In [6]:
df_sample = df.copy()

In [7]:
df.shape

(60553, 9)

In [8]:
df.duplicated().sum()

8310

In [9]:
60553 - 8310

52243

In [10]:
df[df.duplicated()].head(20)

Unnamed: 0,Model,Engine_type,Transmission,Km_Driven,prices,Power,Unit,Company,Model_name
40,2021,Petrol,Automatic,22989,73.0,1800.0,cc,Honda,Civic
41,2006,Petrol,Automatic,41000,35.75,1600.0,cc,Honda,Civic Eagle
48,2017,Petrol,Manual,33000,32.65,1300.0,cc,Honda,City
50,2017,Petrol,Manual,79000,26.0,1000.0,cc,Suzuki,Cultus
75,2015,Hybrid,Automatic,72000,40.75,1500.0,cc,Toyota,Aqua
76,2020,Petrol,Automatic,4500,37.5,660.0,cc,Honda,N One
78,2020,Petrol,Automatic,28000,40.0,996.0,cc,Toyota,Vitz
80,2021,Petrol,Automatic,48500,65.75,1500.0,cc,MG,HS
87,2017,Petrol,Automatic,76692,26.7,660.0,cc,Honda,N Wgn
94,2017,Petrol,Automatic,86412,50.0,1800.0,cc,Honda,Civic


In [11]:
df.drop_duplicates(inplace=True)

In [12]:
df.shape

(52243, 9)

In [13]:
df.isnull().sum()

Model             0
Engine_type       0
Transmission      0
Km_Driven         0
prices          512
Power             0
Unit              0
Company           0
Model_name        0
dtype: int64

In [14]:
df.dropna(inplace=True)

In [15]:
df.isnull().sum()

Model           0
Engine_type     0
Transmission    0
Km_Driven       0
prices          0
Power           0
Unit            0
Company         0
Model_name      0
dtype: int64

In [16]:
df.shape

(51731, 9)

In [17]:
df.columns

Index(['Model', 'Engine_type', 'Transmission', 'Km_Driven', 'prices', 'Power',
       'Unit', 'Company', 'Model_name'],
      dtype='object')

In [18]:
## Detecting the Outlier

Q1 = df['Power'].quantile(0.25)
Q3 = df['Power'].quantile(0.75)

IRQ = Q3 - Q1

lower_bound = Q1 - 1.5 * IRQ
upper_bound = Q3 + 1.5 * IRQ

outlier_power = df[(df['Power'] < lower_bound) | (df['Power'] > upper_bound)]
outlier_power

Unnamed: 0,Model,Engine_type,Transmission,Km_Driven,prices,Power,Unit,Company,Model_name
13,2013,Petrol,Automatic,52000,2.25,2700.0,cc,Toyota,Prado
14,2014,Petrol,Automatic,86500,73.00,2700.0,cc,Toyota,Fortuner
39,2022,Electric,Automatic,6500,51.00,20.0,kWh,Mitsubishi,EK X
58,2013,Petrol,Automatic,134000,3.10,4600.0,cc,Toyota,Land Cruiser
62,2002,Petrol,Automatic,180000,29.50,3000.0,cc,Toyota,Estima
...,...,...,...,...,...,...,...,...,...
60474,1996,Petrol,Automatic,99376,35.00,2700.0,cc,Toyota,Surf
60477,2005,Diesel,Automatic,200000,85.00,3000.0,cc,Toyota,Prado
60497,2018,Petrol,Automatic,60000,2.32,2700.0,cc,Toyota,Prado
60498,1991,Diesel,Automatic,160000,48.00,4164.0,cc,Toyota,Land Cruiser


In [19]:
df = df[(df['Power'] >= lower_bound) & (df['Power'] <= upper_bound)]

In [20]:
## Detecting the Outlier

Q1 = df['Km_Driven'].quantile(0.25)
Q3 = df['Km_Driven'].quantile(0.75)

IRQ = Q3 - Q1

lower_bound = Q1 - 1.5 * IRQ
upper_bound = Q3 + 1.5 * IRQ

outlier_Km_Driven = df[(df['Km_Driven'] < lower_bound) | (df['Km_Driven'] > upper_bound)]
outlier_Km_Driven

Unnamed: 0,Model,Engine_type,Transmission,Km_Driven,prices,Power,Unit,Company,Model_name
26,2002,Petrol,Manual,300000,12.25,1500.0,cc,Honda,Civic
126,2004,Petrol,Automatic,259840,6.80,1000.0,cc,Suzuki,Alto
176,2008,Petrol,Automatic,262933,20.00,1800.0,cc,Honda,Civic Reborn
505,2001,Petrol,Manual,295000,11.50,1600.0,cc,Suzuki,Baleno
517,2017,Petrol,Manual,881587,12.80,796.0,cc,Suzuki,Mehran
...,...,...,...,...,...,...,...,...,...
60384,2010,Petrol,Manual,300000,9.50,1000.0,cc,Suzuki,Alto
60525,2004,Petrol,Manual,276994,14.00,1300.0,cc,Honda,City
60530,2007,Petrol,Automatic,300000,17.75,1500.0,cc,Nissan,AD Van
60532,2005,Diesel,Manual,452000,24.00,2000.0,cc,Toyota,Corolla 9th


In [11]:
df["Model"].min()

1980

In [13]:
df["Power"].min()

0.0

In [12]:
df["Km_Driven"].min()

1

In [8]:
df["Model_name"].value_counts()

Model_name
Corolla                7610
Civic                  3995
Mehran                 3069
Alto                   2903
Cultus                 2653
                       ... 
Rover Autobiography       1
Charger                   1
Celerio                   1
Clipper 11th              1
Rover Series              1
Name: count, Length: 518, dtype: int64

In [21]:
df[(df['Km_Driven'] >= lower_bound) & (df['Km_Driven'] <= upper_bound)]

Unnamed: 0,Model,Engine_type,Transmission,Km_Driven,prices,Power,Unit,Company,Model_name
0,2019,Petrol,Automatic,26755,72.50,1500.0,cc,Honda,Vezel
1,2020,Hybrid,Automatic,9744,32.50,660.0,cc,Suzuki,Wagon R
2,2008,Petrol,Automatic,85000,29.80,1500.0,cc,Toyota,Corolla Axio
3,2018,Petrol,Manual,40000,33.25,1300.0,cc,Toyota,Corolla
4,2020,Petrol,Automatic,23300,35.75,660.0,cc,Daihatsu,Mira
...,...,...,...,...,...,...,...,...,...
60545,1984,Petrol,Manual,190000,2.75,1000.0,cc,Nissan,Pulsar
60546,1986,Petrol,Manual,10000,4.00,1000.0,cc,Suzuki,Khyber
60548,2022,Petrol,Automatic,1400,65.00,1500.0,cc,Honda,City
60549,2002,Petrol,Automatic,94000,10.50,1500.0,cc,Honda,City


In [22]:
df = df[(df['Km_Driven'] >= lower_bound) & (df['Km_Driven'] <= upper_bound)]

In [23]:
df

Unnamed: 0,Model,Engine_type,Transmission,Km_Driven,prices,Power,Unit,Company,Model_name
0,2019,Petrol,Automatic,26755,72.50,1500.0,cc,Honda,Vezel
1,2020,Hybrid,Automatic,9744,32.50,660.0,cc,Suzuki,Wagon R
2,2008,Petrol,Automatic,85000,29.80,1500.0,cc,Toyota,Corolla Axio
3,2018,Petrol,Manual,40000,33.25,1300.0,cc,Toyota,Corolla
4,2020,Petrol,Automatic,23300,35.75,660.0,cc,Daihatsu,Mira
...,...,...,...,...,...,...,...,...,...
60545,1984,Petrol,Manual,190000,2.75,1000.0,cc,Nissan,Pulsar
60546,1986,Petrol,Manual,10000,4.00,1000.0,cc,Suzuki,Khyber
60548,2022,Petrol,Automatic,1400,65.00,1500.0,cc,Honda,City
60549,2002,Petrol,Automatic,94000,10.50,1500.0,cc,Honda,City


In [24]:
def draw_boxplot(data, column):
    fig = px.box(data, x=column, title=f'Box Plot of {column}')
    fig.show()

In [26]:
!pip install nbformat>=4.2.0

In [27]:
draw_boxplot(df, 'Km_Driven')

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [28]:
df.columns

Index(['Model', 'Engine_type', 'Transmission', 'Km_Driven', 'prices', 'Power',
       'Unit', 'Company', 'Model_name'],
      dtype='object')

In [38]:
draw_boxplot(df, 'Power')

In [39]:
draw_boxplot(df, 'prices')

In [29]:
df.columns

Index(['Model', 'Engine_type', 'Transmission', 'Km_Driven', 'prices', 'Power',
       'Unit', 'Company', 'Model_name'],
      dtype='object')

In [30]:
X = df.drop('prices', axis=1)
y = df['prices']

In [31]:
X.columns

Index(['Model', 'Engine_type', 'Transmission', 'Km_Driven', 'Power', 'Unit',
       'Company', 'Model_name'],
      dtype='object')

In [32]:
numerical_cols = ['Km_Driven', 'Power']
categorical_cols = ['Engine_type', 'Transmission', 'Unit', 'Company', 'Model_name']

In [33]:
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

In [34]:
numerical_cols

Index(['Model', 'Km_Driven', 'Power'], dtype='object')

In [35]:
categorical_cols

Index(['Engine_type', 'Transmission', 'Unit', 'Company', 'Model_name'], dtype='object')

In [36]:
transformations = ColumnTransformer(transformers=[
    ('standardize', StandardScaler(), numerical_cols),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=True, drop='first'), categorical_cols)
], remainder='passthrough')

In [37]:
transformations

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
pipeline = Pipeline(steps=[
    ('preprocessor', transformations),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [40]:
# Train the model with the pipeline
pipeline.fit(X_train, y_train)

In [41]:
# Make predictions on the test set
y_pred = pipeline.predict(X_test)

In [42]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

# Output the metrics
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R² Score: {r2}")

Mean Squared Error: 42.08371368136197
Root Mean Squared Error: 6.48719613402909
R² Score: 0.9006460292138315


In [44]:
import pickle
with open('car_price_model', 'wb') as file:
    pickle.dump(pipeline , file)


In [69]:
pipeline = Pipeline(steps=[
    ('preprocessor', transformations),
    ('regressor', LGBMRegressor(random_state=42))
])

In [70]:
# Train the model with the pipeline
pipeline.fit(X_train, y_train)

In [71]:
# Make predictions on the test set
y_pred = pipeline.predict(X_test)

In [72]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

# Output the metrics
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R² Score: {r2}")

Mean Squared Error: 44.45925071029012
Root Mean Squared Error: 6.667777044134733
R² Score: 0.8950377067553952


----

### I am using `RandomForestRegressor` as my final model

In [75]:
pipeline = Pipeline(steps=[
    ('preprocessor', transformations),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [76]:
param_grid = {
    'regressor__n_estimators': [100, 200, 300, 500],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': ['auto', 'sqrt', 'log2'],
    'regressor__bootstrap': [True, False]
}

In [None]:
# Define GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # Choose your scoring metric
    cv=5,  # Number of cross-validation folds
    n_jobs=-1,  # Use all available cores
    verbose=2
)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


KeyboardInterrupt: 

In [None]:
# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R² Score: {r2}")