<a href="https://colab.research.google.com/github/Anand-1932/Car_Price_Prediction/blob/main/car_price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ydata_profiling

In [None]:
# importing basic library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from ydata_profiling import ProfileReport

In [None]:
# importing dataset
data=pd.read_csv("/content/CarPriceprediction.csv")

In [None]:
data

In [None]:
data.shape

# Basic Checks

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
data.isnull().sum()

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data.describe(include='O')

# Insights from basic checks

*   This dataset contains 26 columns and 205 rows.
*   The dataset contains no null values.
*   The average price of a car is 13276.710571.
*   The most frequent car name is toyota corona.



# Exploratory Data Analysis(EDA)

In [None]:
data.columns

In [None]:
EDA=ProfileReport(data,title="EDA Report")
EDA

# Insights from EDA

*   Dataset has no missing value.

*   As carlength is increasing the price of the car is also increasing.

*   As carwidth is increasing the price of the car is also increasing.

*   Enginesize is directly proportional to the price of the car, as enginesize increasing price of car is also increasing.
*   Cars with a high boreratio have a high price.


*   As the horsepower of a car increases, its price also increases: the more horsepower, the higher the price.


*   citympg is inversely related to the car price: as citympg increases, the price of the car decreases.



*   highwaympg is inversely related to the car price: as highwaympg increases, the price of the car decreases.

*   citympg and highwaympg are highly correlated, so we will drop one of these columns during preprocessing.



# Data Preprocessing

In [None]:
data.value_counts()

In [None]:
# finding missing value
data.isnull().sum()

In [None]:
# Since dataset has no missing value there is no need to handle missing value.

In [None]:
# Draw one box plot per numeric column (at most 16) to look for outliers.
plt.figure(figsize=(10, 10))
numeric_columns = data.select_dtypes(include='number').columns

for position, column in enumerate(numeric_columns[:16], start=1):
  # Panels fill a 6x3 grid, left to right and top to bottom.
  plt.subplot(6, 3, position)
  sns.boxplot(x=data[column])
  plt.xlabel(column, fontsize=10)
  plt.ylabel('count', fontsize=10)

plt.tight_layout()  # keep neighbouring panels from overlapping

In [None]:
# checking for duplicated value
data.duplicated().unique()

In [None]:
# since there is no null value so skipping the handle missing value step.
# since there is no duplicated value so skipping handle duplicated value step.

In [None]:
# Treating outliers

In [None]:
def wisker(col):
  """Return the box-plot whisker limits of ``col``.

  The limits are the usual IQR fences: ``Q1 - 1.5 * IQR`` and
  ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the 25th and 75th percentiles
  computed with ``np.percentile``.

  Parameters:
    col: array-like of numeric values.

  Returns:
    A ``(lower, upper)`` tuple with the two fence values.
  """
  first_quartile, third_quartile = np.percentile(col, (25, 75))
  fence_offset = 1.5 * (third_quartile - first_quartile)
  return first_quartile - fence_offset, third_quartile + fence_offset

In [None]:
# Numeric columns whose outliers are capped at the IQR whisker limits.
columns_to_cap = [
    'wheelbase', 'carlength', 'carwidth', 'enginesize', 'stroke',
    'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
]

for column in columns_to_cap:
  lower_limit, upper_limit = wisker(data[column])
  # First raise values below the lower whisker, then lower values above the
  # upper whisker; everything in between is kept as is.
  floored = np.where(data[column] < lower_limit, lower_limit, data[column])
  data[column] = np.where(floored > upper_limit, upper_limit, floored)

In [None]:
# Plot the box plots again to check whether the outliers have been handled.
plt.figure(figsize=(10, 10))
numeric_columns = data.select_dtypes(include='number').columns

for position, column in enumerate(numeric_columns[:16], start=1):
  # Same 6x3 grid layout as the box plots drawn before capping.
  plt.subplot(6, 3, position)
  sns.boxplot(x=data[column])
  plt.xlabel(column, fontsize=10)
  plt.ylabel('count', fontsize=10)

plt.tight_layout()  # keep neighbouring panels from overlapping

In [None]:
# outliers are handled using the IQR method
# values lower than the lower limit are replaced by the lower limit (Q1-1.5*IQR)
# values higher than the upper limit are replaced by the upper limit (Q3+1.5*IQR)

In [None]:
# checking correlation between the numerical features
new_data=data.select_dtypes(include='number').corr()
new_data


In [None]:
# plotting the correlation matrix as a heatmap
plt.figure(figsize=(15,15))
sns.heatmap(new_data,annot=True)

# Feature Engineering

In [None]:
# feature encoding
# converting categorical column to numerical column

In [None]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder

In [None]:
label=LabelEncoder()
OHE=OneHotEncoder()
OE=OrdinalEncoder()

In [None]:
data.select_dtypes(include='object').columns

In [None]:
# All the categorical data has to maintain hierarchy so we will apply ordinal
# encoding on all the categorical column.

In [None]:
# since the name of the car has effect on the price of the car so we will apply
# Ordinal encoding

In [None]:
column_to_transform=['CarName','fueltype','aspiration','doornumber','carbody','drivewheel',
     'enginelocation','enginetype','cylindernumber','fuelsystem']

In [None]:
data[column_to_transform]=OE.fit_transform(data[column_to_transform])

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [None]:
# Columns to standardise (zero mean, unit variance). 'price' is not in the
# list, so the target keeps its original scale.
columns_to_scale = [
    'car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
    'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
    'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
    'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
    'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
]

data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

In [None]:
data

# Feature Selection

In [None]:
# drop car_ID because it has unique value
data.drop('car_ID',axis=1,inplace=True)

In [None]:
# Removing the input columns listed below since each of them is highly
# correlated with another input column
remove_col=['wheelbase','carwidth','carlength','highwaympg']

In [None]:
data.drop(remove_col,axis=1,inplace=True)


In [None]:
final_data=data

In [None]:
final_data

# Model Creation

In [None]:
# separating independent and dependent variables
x=data.drop('price',axis=1)
y=data['price']

In [None]:
# creating testing and training dataset
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=3)

In [None]:
x_test.shape

# Linear Regression

In [None]:
# creating linear regression model
from sklearn.linear_model import LinearRegression
LR=LinearRegression()
LR.fit(x_train,y_train)
y_pred=LR.predict(x_test)

In [None]:
y_pred

In [None]:
# Model Evaluation
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,root_mean_squared_error

In [None]:
r2=r2_score(y_test,y_pred)
r2

In [None]:
mse=mean_squared_error(y_test,y_pred)
mse

In [None]:
mae=mean_absolute_error(y_test,y_pred)
mae

In [None]:
rmse=np.sqrt(mse)
rmse

# Support Vector Machine

In [None]:
from sklearn.svm import SVR
svr=SVR()

In [None]:
svr.fit(x_train,y_train)
y_pred=svr.predict(x_test)

In [None]:
y_pred

In [None]:
y_test

In [None]:
# Model Evaluation
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,root_mean_squared_log_error
r2=r2_score(y_test,y_pred)
r2

In [None]:
mse=mean_squared_error(y_test,y_pred)
mse

# With its default hyperparameters, the support vector machine regression model is not suitable for predicting car prices here, since its r2_score is very low.

# Model Creation Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# splitter='random' draws the split thresholds at random, so the seed is
# pinned to keep the fitted tree and its scores reproducible across runs.
dt=DecisionTreeRegressor(criterion='friedman_mse',max_depth=10,min_samples_leaf=1,min_samples_split=3,splitter='random',random_state=3)
dt.fit(x_train,y_train)
y_hat=dt.predict(x_test)
y_hat

In [None]:
# Evaluating the model
r2=r2_score(y_test,y_hat)
r2

In [None]:
mse=mean_squared_error(y_test,y_hat)
mse

In [None]:
# Applying hyperparameter tuning
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for the decision tree regressor.
# The criterion names follow the current scikit-learn API: "squared_error"
# and "absolute_error" replace the removed "mse"/"mae" aliases, and "poisson"
# is the correct spelling. Invalid names make every fit for that candidate
# fail, which silently wastes most of the search.
params={
    "criterion":("squared_error","absolute_error","friedman_mse","poisson"),
    "splitter":("best","random"),
    "max_depth":list(range(1,20)),
    "min_samples_split":[2,3,4],
    "min_samples_leaf":list(range(1,20))
}

# A fixed seed keeps the splitter="random" candidates reproducible.
tree_rgr=DecisionTreeRegressor(random_state=3)
# Exhaustive search over the grid, scored by r2 with 5-fold cross-validation.
tree_cv=GridSearchCV(tree_rgr,params,scoring="r2",n_jobs=-1,cv=5)
tree_cv.fit(x_train,y_train)

best_params=tree_cv.best_params_
print(f"Best parameters: {best_params}")

In [None]:
tree_cv.best_score_

In [None]:
# Tuned decision tree. splitter='random' makes training stochastic, so the
# seed is pinned to keep the reported r2 score reproducible.
dt1=DecisionTreeRegressor(criterion='friedman_mse',max_depth=10,min_samples_leaf=3,min_samples_split=4,splitter='random',random_state=3)

In [None]:
dt1.fit(x_train,y_train)
y_pred_dt=dt1.predict(x_test)
y_pred_dt

In [None]:
# Evaluating the model
r2=r2_score(y_test,y_pred_dt)
r2

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline forest with 100 trees. The seed fixes the random sampling done
# during training so the score is the same on every run.
rf_rgr=RandomForestRegressor(n_estimators=100,random_state=3)
rf_rgr.fit(x_train,y_train)

In [None]:
y_pred_rf=rf_rgr.predict(x_test)
y_pred_rf

In [None]:
# Evaluating the model
r2=r2_score(y_test,y_pred_rf)
r2

In [None]:
# hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyperparameter.
n_estimators=[int(x) for x in np.linspace(start=200,stop=2000,num=10)]
# 1.0 means "use all features", which is what the removed 'auto' option did
# for regressors; 'auto' itself is rejected by current scikit-learn versions.
max_features=[1.0,'sqrt','log2']
max_depth=[int(x) for x in np.linspace(10,110,num=11)]
min_samples_split=[2,5,18]
min_samples_leaf=[1,2,4]

random_grid={
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf
}

rf_rgr1=RandomForestRegressor(random_state=3)

# Sample 100 random combinations from the grid, scored by r2 with 4-fold
# cross-validation; both the estimator and the search are seeded.
rf_cv=RandomizedSearchCV(estimator=rf_rgr1,scoring='r2',param_distributions=random_grid,n_iter=100,cv=4,
                         verbose=2,random_state=3,n_jobs=-1)

rf_cv.fit(x_train,y_train)
rf_best_params=rf_cv.best_params_
print(f"Best parameters: {rf_best_params}")

In [None]:
# Final random forest. This is the model that gets pickled, so the seed is
# pinned to make the saved model and its r2 score reproducible.
rf_rgr2=RandomForestRegressor(n_estimators=1400,min_samples_split=2,min_samples_leaf=1,max_features='sqrt',max_depth=80,random_state=3)

In [None]:
rf_rgr2.fit(x_train,y_train)

In [None]:
y_pred_rf1=rf_rgr2.predict(x_test)
y_pred_rf1

In [None]:
# Evaluating the model
r2=r2_score(y_test,y_pred_rf1)
r2

# After applying Linear Regression, Support Vector Machine (regression), Decision Tree (regression) and Random Forest (regression), the Random Forest Regressor has the best r2 score.

In [None]:
import pickle

In [None]:
# Save the final model. The with-block closes the file handle, so the pickle
# is fully flushed to disk before anything tries to read it.
with open('model.pkl','wb') as model_file:
  pickle.dump(rf_rgr2, model_file)

In [None]:
x_train.columns

In [None]:
x_train.head(1)

In [None]:
# Sanity check: predict the price for the first training row. Passing the
# DataFrame slice instead of a hand-copied list of rounded numbers keeps the
# feature names and column order in sync with what the model was fitted on.
rf_rgr2.predict(x_train.head(1))

In [None]:
y_train.head(1)