In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Read the car listings. The path is relative, so it is resolved against the
# kernel's current working directory.
dataset = pd.read_csv(filepath_or_buffer="car data.csv")

# Show the first five rows as a quick check that the columns parsed sensibly.
dataset.head(n=5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [4]:
# Per-column count of missing values (`isnull` is pandas' alias for `isna`).
dataset.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Driven_kms       0
Fuel_Type        0
Selling_type     0
Transmission     0
Owner            0
dtype: int64

In [5]:
# Number of rows that exactly repeat an earlier row; the first occurrence of
# each row is not counted as a duplicate.
dataset.duplicated(keep="first").sum()

2

In [6]:
# Remove exact duplicate rows. Rebind the name to the returned frame instead
# of using inplace=True, so the frame object that earlier cells displayed is
# not mutated behind the reader's back.
dataset = dataset.drop_duplicates()

# Re-check: this should now display 0.
dataset.duplicated().sum()

0

In [27]:
# Regression target: the "Present_Price" column of the de-duplicated frame.
y = dataset.loc[:, "Present_Price"]
print(y.shape)
y.head()

(299,)


0    5.59
1    9.54
2    9.85
3    4.15
4    6.87
Name: Present_Price, dtype: float64

In [26]:
# Feature matrix: the four numeric columns used as predictors.
X = dataset.loc[:, ["Year", "Selling_Price", "Driven_kms", "Owner"]]
print(X.shape)
X.head()

(299, 4)


Unnamed: 0,Year,Selling_Price,Driven_kms,Owner
0,2014,3.35,27000,0
1,2013,4.75,43000,0
2,2017,7.25,6900,0
3,2011,2.85,5200,0
4,2014,4.6,42450,0


In [33]:
# Hold out a test set. No test_size is given, so scikit-learn's default split
# proportion applies; random_state=7 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=7,
)

In [34]:
# Baseline model: a decision tree with default hyperparameters and a fixed
# seed. `fit` returns the estimator itself, so construction and fitting chain.
default_DT = DecisionTreeRegressor(random_state=7).fit(X_train, y_train)

# Mean absolute error of the baseline on the held-out test set.
mae = mean_absolute_error(y_true=y_test, y_pred=default_DT.predict(X_test))
print(mae)

1.6693333333333331


In [35]:
# Scale reference for the errors: the mean of the target over all rows of `y`.
# NOTE(review): this is the mean of the full target, not just y_test — confirm
# that is the intended baseline for a test-set MAE.
average_present_price = y.mean()
print(average_present_price)

# MAE as a percentage of the mean present price, rounded to two decimals.
relative_error = round(
    mae / average_present_price * 100,
    2,
)
print(relative_error, "%")

7.541036789297662
22.14 %


In [37]:
# Sweep `max_leaf_nodes` to see how limiting the tree size changes the
# test-set MAE, then report the best setting.
# NOTE(review): the best value is selected on the test set itself, so the
# reported "best" MAE is an optimistic estimate.
max_leaf_sizes = [5, 10, 50, 100, 500]
maes = []

for leaf_size in max_leaf_sizes:
    DT = DecisionTreeRegressor(random_state=7, max_leaf_nodes=leaf_size)
    DT.fit(X_train, y_train)
    # Use a sweep-specific name: reusing the notebook-level `mae` here would
    # overwrite it with the last sweep value and make any cell that reads
    # `mae` give a different answer when re-run.
    leaf_mae = mean_absolute_error(y_test, DT.predict(X_test))
    print(f'Max Leafs: {leaf_size} | MAE: {leaf_mae}')

    maes.append(leaf_mae)

# `maes` is index-aligned with `max_leaf_sizes`, so argmin picks the winner.
best_leaf_size = max_leaf_sizes[np.argmin(maes)]
best_mae = np.min(maes)
print()
print(f'Best Leaf Size: {best_leaf_size} | Best MAE: {best_mae}')



Max Leafs: 5 | MAE: 2.3890808214008215
Max Leafs: 10 | MAE: 2.5523355559075074
Max Leafs: 50 | MAE: 1.6066773766058149
Max Leafs: 100 | MAE: 1.6366038095238098
Max Leafs: 500 | MAE: 1.6527999999999998

Best Leaf Size: 50 | Best MAE: 1.6066773766058149


In [38]:
# Best sweep MAE as a percentage of the mean present price (two decimals).
relative_error = round(
    best_mae / average_present_price * 100,
    2,
)
print(relative_error, "%")

21.31 %


In [40]:
# Random forest with default hyperparameters and a fixed seed, evaluated on
# the same train/test split as the decision trees.
RF = RandomForestRegressor(random_state=7)
RF.fit(X_train, y_train)

# Test-set MAE of the forest (the duplicated `mae = mae = ...` assignment
# was a typo and has been collapsed to a single assignment).
mae = mean_absolute_error(y_test, RF.predict(X_test))
print(mae)

1.217540000000001


In [42]:
# Current `mae` as a percentage of the mean present price (two decimals).
relative_error = round(
    mae / average_present_price * 100,
    2,
)
print(relative_error, "%")

16.15 %
