# Car Price Prediction

## Data Loading & Preprocessing

https://www.kaggle.com/datasets/nehalbirla/vehicle-dataset-from-cardekho/data?select=car+data.csv

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

%matplotlib inline
# Show every row and column when a frame is displayed
pd.set_option("display.max_rows", None,"display.max_columns", None)
# Silence all warnings for the rest of the notebook
warnings.simplefilter(action='ignore')
# The 'seaborn' style was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and the
# old name was removed in 3.8, so use whichever name this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [None]:
# Read the car listings CSV from the Kaggle input directory and preview it

csv_path = '/kaggle/input/vehicle-dataset-from-cardekho/car data.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

In [None]:
# Print the column summary of the freshly loaded frame
dataset.info()

### Feature Extraction 

Two features, `Year` and `Car_Name`, are replaced by derived features: `Driving_Age` denotes how many years the vehicle has been in use, and `Brand` is the brand the vehicle belongs to.

In [None]:
# Age of the vehicle in years, measured against a fixed reference year
REFERENCE_YEAR = 2023
dataset['Driving_Age'] = REFERENCE_YEAR - dataset['Year']
# 'Year' is now redundant, so keep only the derived column
dataset = dataset.drop(columns='Year')

In [None]:
def extract_brand(car_name):
    """Return the first whitespace-separated token of ``car_name`` as the brand."""
    # Only the leading token is needed, so stop splitting after the first gap
    first_token = car_name.split(maxsplit=1)[0]
    return first_token

# Derive the 'Brand' column from the first word of each car name
dataset['Brand'] = dataset['Car_Name'].map(extract_brand)

# The full car name is no longer needed once the brand is extracted
dataset = dataset.drop(columns='Car_Name')


In [None]:
# Give the price columns an explicit unit and the owner column a clearer name

new_names = {
    'Selling_Price': 'Selling_Price(lacs)',
    'Present_Price': 'Present_Price(lacs)',
    'Owner': 'Past_Owners',
}
dataset = dataset.rename(columns=new_names)

In [None]:
# Preview the frame after the feature extraction and renaming steps
dataset.head()

### Exploratory Data Analysis (EDA)

Here we visualize the data with three types of plots: `count`, `box`, and `distribution`. The box plots reveal outliers; however, removing the outliers did not perform well in the evaluation.

#### <b> Univariate Analysis </b>

#### Count Plot

In [None]:
cat_cols = ['Fuel_Type', 'Seller_Type', 'Transmission', 'Past_Owners', 'Brand']

fig, axes = plt.subplots(3, 2, figsize=(10, 4))

# Flatten the 3x2 grid so each axis can be paired with one column
axes = axes.flatten()

# One count plot per categorical column
for ax, column in zip(axes, cat_cols):
    sns.countplot(x=column, data=dataset, ax=ax)

# The grid has more panels than columns; hide the unused ones so no empty
# frame is drawn
for ax in axes[len(cat_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


#### Box Plot

In [None]:
num_cols = ['Selling_Price(lacs)','Present_Price(lacs)','Kms_Driven','Driving_Age']

fig, axes = plt.subplots(2, 2, figsize=(13, 3))

# Flatten the 2x2 grid so each axis can be paired with one column
axes = axes.flatten()

# One box plot per numerical column. Iterate over num_cols itself: the loop
# previously ran over len(cat_cols) and only avoided an index error because
# of an extra bounds check on the axes.
for ax, column in zip(axes, num_cols):
    sns.boxplot(x=column, data=dataset, ax=ax)

plt.tight_layout()
plt.show()


#### Distribution Plot

In [None]:
# Histogram with a KDE overlay for every numerical column

num_cols = ['Selling_Price(lacs)', 'Present_Price(lacs)', 'Kms_Driven', 'Driving_Age']

fig, axes = plt.subplots(2, 2, figsize=(10, 6))

# Work with a flat sequence of axes rather than a 2x2 array
axes = axes.flatten()

# Pair each axis with the column it should display
for ax, column in zip(axes, num_cols):
    sns.histplot(dataset[column], ax=ax, kde=True)

plt.tight_layout()
plt.show()


### Label Encoding

All the categorical features are converted to numeric features.

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# LabelEncoder instance (maps each distinct value of a column to an integer code)
encoder = LabelEncoder()

In [None]:
cat_cols = ['Fuel_Type', 'Seller_Type', 'Transmission', 'Past_Owners', 'Brand']

# Fit a separate encoder per column and keep it: refitting one shared encoder
# overwrites its classes_ each time, so the code -> label mapping of every
# column except the last would be lost.
label_encoders = {}
for column in cat_cols:
    encoder = LabelEncoder()
    dataset[column] = encoder.fit_transform(dataset[column])
    label_encoders[column] = encoder

In [None]:
# Print the column summary again after label encoding
dataset.info()

#### <b> Bivariate/Multivariate Analysis </b>

### Correlation

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap
corr = dataset.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, ax=ax, cmap='Blues', annot=True, square=True, fmt='.2f')
plt.show()

In [None]:
# Correlation of every column with the target, Selling_Price(lacs)
corr['Selling_Price(lacs)']

In [None]:
# Separate the target column (y) from the predictor columns (X)
target_col = 'Selling_Price(lacs)'
y = dataset[target_col]
X = dataset.drop(columns=target_col)

### Feature Transformation


In [None]:
# Distribution plot of the numerical features before any transformation

num_cols = ['Present_Price(lacs)', 'Kms_Driven', 'Driving_Age']

fig, axes = plt.subplots(2, 2, figsize=(15, 5))

# Flatten the 2x2 grid so each axis can be paired with one column
axes = axes.flatten()

# Histogram with a KDE overlay for each numerical feature
for ax, column in zip(axes, num_cols):
    sns.histplot(X[column], ax=ax, kde=True)

# Three features in a four-panel grid: hide the unused panel
for ax in axes[len(num_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


#### Feature and Label Split

In [None]:
# Select only continuous data

col_num = ['Present_Price(lacs)', 'Kms_Driven', 'Driving_Age']
# Take an explicit copy: a later cell assigns into X_num's columns, and doing
# that on a slice of X is chained assignment (SettingWithCopyWarning).
X_num = X[col_num].copy()
X_num.shape

#### Z-score using StandardScaler

In [None]:
# Normalization z-score

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# NOTE(review): the scaler is fitted on every row, before the train/test
# split below, so test-set statistics influence the scaling.
X_norm = scaler.fit_transform(X_num)

column_names = ['Present_Price(lacs)', 'Kms_Driven', 'Driving_Age']
# Carry X_num's index over so the later pd.concat with X aligns row for row
# even if the frame no longer has a default RangeIndex.
X_nnorm = pd.DataFrame(X_norm, columns=column_names, index=X_num.index)

#### Log transformation

In [None]:
# Replace each continuous column of X_num with its natural logarithm (base e)

for log_col in ('Driving_Age', 'Present_Price(lacs)', 'Kms_Driven'):
    X_num[log_col] = np.log(X_num[log_col])

In [None]:
# Distribution after transformation
# X_nnorm holds the StandardScaler (z-score) output, so these are the z-scored
# distributions, not the log-transformed values stored in X_num.

num_cols = ['Present_Price(lacs)', 'Kms_Driven', 'Driving_Age']

fig, axes = plt.subplots(2, 2, figsize=(15, 5))

# Flatten the 2x2 grid so each axis can be paired with one column
axes = axes.flatten()

# Histogram with a KDE overlay for each scaled feature
for ax, column in zip(axes, num_cols):
    sns.histplot(X_nnorm[column], ax=ax, kde=True)

# Three features in a four-panel grid: hide the unused panel
for ax in axes[len(num_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Remove the untransformed numeric columns from X
raw_numeric_cols = ['Driving_Age', 'Present_Price(lacs)', 'Kms_Driven']
X = X.drop(columns=raw_numeric_cols)
X.head()

In [None]:
# Preview the continuous-feature frame X_num
X_num.head()

#### Concatenate Categorical and Transformed Numerical Features

In [None]:
# Place the columns of X and the z-scored numeric columns side by side
X = pd.concat((X, X_nnorm), axis='columns')
X.head()

### Feature Selection 

Using `ExtraTreesRegressor` (ETR) we can estimate the importance of each feature and select the features used for prediction accordingly. Using the 6 most important features from ETR gives a 1% increase in the evaluation score, from 84 to 85.

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Seed the ensemble so the feature-importance ranking is the same on every
# run; the selected feature list below is based on it.
model = ExtraTreesRegressor(random_state=42)
model.fit(X, y)


In [None]:
# Label each importance with its column name, then chart the 8 largest
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
top_features = feature_importances.nlargest(8)
top_features.plot.barh()
plt.show()

#### Selected Features

In [None]:
# The six features kept for modelling
columns = ['Present_Price(lacs)', 'Seller_Type', 'Fuel_Type', 'Driving_Age', 'Transmission', 'Brand']

In [None]:
# Keep only the six selected feature columns, in the order listed in `columns`
X = X.loc[:, columns]
X.head()

In [None]:
# (rows, columns) of the final feature matrix
X.shape

## Model Creation/Evaluation

#### Applying regression models

- Linear Regression
- Ridge Regression
- Lasso Regression
- Random Forest Regression
- Gradient Boosting regression

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for repeatability

from sklearn.model_selection import train_test_split

split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split
print(X_train.shape, X_test.shape)

In [None]:
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

In [None]:
# Per-model score accumulators, filled by car_pred_model in call order
CV, R2_train, R2_test = [], [], []

def car_pred_model(model,model_name):
    """Fit ``model`` on the training split, report its R2 scores and plot diagnostics.

    Uses the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    Each call appends one rounded value to the module-level ``R2_train``,
    ``R2_test`` and ``CV`` lists, so calling it again for the same model adds
    a duplicate entry.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    model_name : str
        Not used by this function; kept so existing calls stay valid.

    Returns
    -------
    None
    """
    # Training model
    model.fit(X_train,y_train)

    # R2 score of train set
    y_pred_train = model.predict(X_train)
    R2_train_model = r2_score(y_train,y_pred_train)
    R2_train.append(round(R2_train_model,2))

    # R2 score of test set
    y_pred_test = model.predict(X_test)
    R2_test_model = r2_score(y_test,y_pred_test)
    R2_test.append(round(R2_test_model,2))

    # Mean 5-fold cross-validation score on the train set
    cross_val = cross_val_score(model ,X_train ,y_train ,cv=5)
    cv_mean = cross_val.mean()
    CV.append(round(cv_mean,2))

    # Printing results
    print("Train R2-score :",round(R2_train_model,2))
    print("Test R2-score :",round(R2_test_model,2))
    print("Train CV scores :",cross_val)
    print("Train CV mean :",round(cv_mean,2))

    # Plotting Graphs
    # Density of the train residuals. sns.distplot is deprecated; kdeplot is
    # its replacement for the hist=False case.
    fig, ax = plt.subplots(1,2,figsize = (10,4))
    ax[0].set_title('Residual Plot of Train samples')
    sns.kdeplot(y_train-y_pred_train,ax = ax[0])
    ax[0].set_xlabel('y_train - y_pred_train')

    # Actual vs predicted values on the test set
    ax[1].set_title('y_test vs y_pred_test')
    ax[1].scatter(x = y_test, y = y_pred_test)
    ax[1].set_xlabel('y_test')
    ax[1].set_ylabel('y_pred_test')

    plt.show()

#### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression with default settings
lr = LinearRegression()
car_pred_model(model=lr, model_name="Linear_regressor.pkl")

#### Ridge

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV

# Creating Ridge model object
rg = Ridge()
# 14 candidate alphas, log-spaced from 1e-3 to 1e3
alpha = np.logspace(-3,3,num=14)

# Randomized search over alpha. Seeded so the sampled candidates, and with
# them the reported scores, are the same on every run.
rg_rs = RandomizedSearchCV(estimator = rg, param_distributions = dict(alpha=alpha), random_state = 42)

car_pred_model(rg_rs,"ridge.pkl")

#### Lasso

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import RandomizedSearchCV

ls = Lasso()
# 14 candidate alphas, log-spaced from 1e-3 to 1e3
alpha = np.logspace(-3,3,num=14)

# Randomized search over alpha. Seeded so the sampled candidates, and with
# them the reported scores, are the same on every run.
ls_rs = RandomizedSearchCV(estimator = ls, param_distributions = dict(alpha=alpha), random_state = 42)
car_pred_model(ls_rs,"lasso.pkl")

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Seeded so the fitted forest is reproducible
rf = RandomForestRegressor(random_state = 42)

# Number of trees in Random forest
n_estimators=list(range(500,1000,100))
# Maximum number of levels in a tree
max_depth=list(range(4,9,4))
# Minimum number of samples required to split an internal node
min_samples_split=list(range(4,9,2))
# Minimum number of samples required to be at a leaf node.
min_samples_leaf=[1,2,5,7]
# Number of features to be considered at each split.
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for regressors
# it meant "all features", which 1.0 (a fraction of the features) expresses
# in both old and new releases.
max_features=[1.0,'sqrt']

# Hyperparameters dict
param_grid = {"n_estimators":n_estimators,
              "max_depth":max_depth,
              "min_samples_split":min_samples_split,
              "min_samples_leaf":min_samples_leaf,
              "max_features":max_features}

# Seeded so the same parameter combinations are sampled on every run
rf_rs = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, random_state = 42)

In [None]:
# Fit the random-forest search, then report its scores and plots
car_pred_model(rf_rs,'random_forest.pkl')

In [None]:
# Show the best estimator found by the randomized search
print(rf_rs.best_estimator_)

#### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

# Seeded so the fitted ensemble is reproducible
gb = GradientBoostingRegressor(random_state = 42)

# Rate at which correcting is being made
learning_rate = [0.001, 0.01, 0.1, 0.2]
# Number of trees in Gradient boosting
n_estimators=list(range(500,1000,100))
# Maximum number of levels in a tree
max_depth=list(range(4,9,4))
# Minimum number of samples required to split an internal node
min_samples_split=list(range(4,9,2))
# Minimum number of samples required to be at a leaf node.
min_samples_leaf=[1,2,5,7]
# Number of features to be considered at each split.
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; it meant
# "all features", which 1.0 (a fraction of the features) expresses in both
# old and new releases.
max_features=[1.0,'sqrt']

# Hyperparameters dict
param_grid = {"learning_rate":learning_rate,
              "n_estimators":n_estimators,
              "max_depth":max_depth,
              "min_samples_split":min_samples_split,
              "min_samples_leaf":min_samples_leaf,
              "max_features":max_features}

# Seeded so the same parameter combinations are sampled on every run
gb_rs = RandomizedSearchCV(estimator = gb, param_distributions = param_grid, random_state = 42)

In [None]:
# Fit the gradient-boosting search, then report its scores and plots
car_pred_model(gb_rs,"gradient_boosting.pkl")

In [None]:
# Model names, in the same order the models were evaluated above
Technique = ["LinearRegression","Ridge","Lasso","RandomForestRegressor","GradientBoostingRegressor"]

# One row per model, one column per recorded score
score_columns = {
    'Model': Technique,
    'R Squared(Train)': R2_train,
    'R Squared(Test)': R2_test,
    'CV score mean(Train)': CV,
}
results = pd.DataFrame(score_columns)
display(results)

In [None]:
# Scores recorded for the two feature transformations tried in this notebook
# NOTE(review): the metric and model are not stated — presumably a test R2; confirm
# 0.8505845915709578 Z-score transformation
# 0.7597076694902594 Log transformation