## Random Forest Regression

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings 
warnings.filterwarnings('ignore')
%matplotlib inline

In [41]:
df = pd.read_csv('cardekho_imputated.csv', index_col=[0])


In [42]:
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


# Data Cleaning
## Handling missing values

In [43]:
## Checking for null values
df.isnull().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [44]:
## Removing the unnecessary columns
df.drop(columns=['brand','car_name'],axis=1,inplace=True)

In [45]:
## train test split
from sklearn.model_selection import train_test_split
X=df.drop(['selling_price'],axis=1)
y=df['selling_price']

In [46]:
X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


## Feature Encoding Scaling

In [47]:
from sklearn.preprocessing import StandardScaler, LabelEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer
le=LabelEncoder()
X['model']=le.fit_transform(X['model'])

In [48]:
X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [49]:
# Creating column transformer with 3 types of transformers
num_features=X.select_dtypes(exclude='object').columns
onehot_columns=['seller_type','fuel_type','transmission_type']
numeric_transformer=StandardScaler()
oh_transformer=OneHotEncoder(drop='first')
preprocessor = ColumnTransformer([
    ('OneHotEncoder',oh_transformer,onehot_columns),
    ('StandardScaler',numeric_transformer,num_features)
],remainder='passthrough')


In [50]:
X=preprocessor.fit_transform(X)

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=10)

## Model Training and Model Selection

In [52]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [53]:

## Creating a function to evaluate model
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    r2_square=r2_score(true,predicted)
    return mae,mse,r2_square

In [57]:
## Model Training
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'K-Neighbour Regressor':KNeighborsRegressor(),
    'Decision Tree':DecisionTreeRegressor(),
    'RandomForestRegressor':RandomForestRegressor()
}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train) #Training the Model
    
    # Making Predictions
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    ## Evaluating Train and Test Dataset
    model_train_mae, model_train_mse, model_train_r2_square = evaluate_model(y_train_pred,y_train)
    model_test_mae, model_test_mse, model_test_r2_square = evaluate_model(y_test_pred,y_test)

    print(list(models.keys())[i])

    print('Model Performance for training set')
    print('Mean Squared Error: {:.4f}'.format(model_train_mse))
    print('Mean_Absolute Error: {:.4f}'.format(model_train_mae))
    print('R2: {:.4f}'.format(model_train_r2_square))

    print('************************')

    print('Model Performance for testing set')
    print('Mean Squared Error: {:.4f}'.format(model_test_mse))
    print('Mean_Absolute Error: {:.4f}'.format(model_test_mae))
    print('R2: {:.4f}'.format(model_test_r2_square))

   # print(','*35)
    print('\n')

    






LinearRegression
Model Performance for training set
Mean Squared Error: 229251209296.3019
Mean_Absolute Error: 260031.9512
R2: 0.5214
************************
Model Performance for testing set
Mean Squared Error: 567633907510.6738
Mean_Absolute Error: 280915.9804
R2: -0.1340


Lasso
Model Performance for training set
Mean Squared Error: 229251214150.1726
Mean_Absolute Error: 260030.0543
R2: 0.5214
************************
Model Performance for testing set
Mean Squared Error: 567632802563.0322
Mean_Absolute Error: 280914.8656
R2: -0.1340


Ridge
Model Performance for training set
Mean Squared Error: 229251876065.9362
Mean_Absolute Error: 260001.4870
R2: 0.5213
************************
Model Performance for testing set
Mean Squared Error: 567653596621.1935
Mean_Absolute Error: 280894.8976
R2: -0.1343


K-Neighbour Regressor
Model Performance for training set
Mean Squared Error: 53889001336.1352
Mean_Absolute Error: 88358.6328
R2: 0.9113
************************
Model Performance for test