In [106]:
import pandas as pd

# Read the raw car listings from the CSV sitting next to the notebook.
dataset = pd.read_csv("car data.csv")

# Show (rows, columns), then preview the first five records.
print(dataset.shape)
dataset.head(n=5)

(301, 9)


Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [135]:
# Remove exact duplicate rows. Rebinding the name (instead of inplace=True)
# avoids mutating the loaded frame in place and keeps the cell chainable and
# safe to re-run.
dataset = dataset.drop_duplicates()

# Target is the 'Present_Price' column; every remaining column is a feature.
y = dataset['Present_Price']
X = dataset.drop(columns='Present_Price')
print(X.shape, y.shape)

(299, 8) (299,)


In [136]:
# Partition the feature names by dtype: object-typed columns are treated as
# categorical, everything else as numeric. Column order of X is preserved.
cat_col_names = [name for name in X.columns if X[name].dtype == 'object']
num_col_names = [name for name in X.columns if name not in cat_col_names]

print(cat_col_names)
print(num_col_names)

['Car_Name', 'Fuel_Type', 'Selling_type', 'Transmission']
['Year', 'Selling_Price', 'Driven_kms', 'Owner']


In [137]:
# Print the number of distinct values of each categorical feature, one per
# line (missing values, if any, count as one distinct value).
for name in cat_col_names:
    print(X[name].nunique(dropna=False))

98
3
2
2


In [138]:
# Exclude 'Car_Name' from the categorical features (it has 98 distinct values
# in the counts printed above; presumably too many to one-hot encode usefully).
# Filtering by name rather than slicing off position 0 keeps this cell
# idempotent: re-running it cannot silently drop further columns, and it does
# not depend on 'Car_Name' happening to be listed first.
cat_col_names = [name for name in cat_col_names if name != 'Car_Name']
print(cat_col_names)

['Fuel_Type', 'Selling_type', 'Transmission']


In [139]:
# Slice the feature frame into its categorical and numeric parts (all rows,
# selected columns); both keep X's row index.
cat_cols = X.loc[:, cat_col_names]
num_cols = X.loc[:, num_col_names]

In [140]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical features into a dense frame.
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4, so passing it breaks on current releases. Keeping the
# encoder's default (sparse) output and densifying with .toarray() yields the
# same float matrix on every version.
encoder = OneHotEncoder()
cat_cols_OH = pd.DataFrame(encoder.fit_transform(cat_cols).toarray())
cat_cols_OH.head()


Unnamed: 0,0,1,2,3,4,5,6
0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [141]:
# Give the encoded frame readable column names and the same row labels as the
# source columns (the index has gaps after de-duplication, so it must match
# for the later column-wise concat to align rows).
OH_col_names = encoder.get_feature_names_out(cat_col_names)
cat_cols_OH = (
    cat_cols_OH
    .set_axis(cat_cols.index, axis=0)
    .set_axis(OH_col_names, axis=1)
)
cat_cols_OH.head()

Unnamed: 0,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Selling_type_Dealer,Selling_type_Individual,Transmission_Automatic,Transmission_Manual
0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [142]:
# Final model matrix: numeric features followed by the one-hot columns,
# joined side by side on the shared row index.
X_OH = pd.concat([num_cols, cat_cols_OH], axis="columns")
X_OH.head()

Unnamed: 0,Year,Selling_Price,Driven_kms,Owner,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Selling_type_Dealer,Selling_type_Individual,Transmission_Automatic,Transmission_Manual
0,2014,3.35,27000,0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,2013,4.75,43000,0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,2017,7.25,6900,0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,2011,2.85,5200,0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,2014,4.6,42450,0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [143]:
from xgboost import XGBRegressor

# Baseline: fit a default XGBoost regressor on the one-hot encoded features
# and predict the very same rows.
# NOTE: this is an in-sample (training) error, so it is an optimistic figure;
# it says nothing about performance on unseen cars.
model = XGBRegressor(random_state=7)
model.fit(X_OH, y)
predictions = model.predict(X_OH)


from sklearn.metrics import mean_absolute_error

# scikit-learn metrics take (y_true, y_pred) in that order. MAE happens to be
# symmetric, so the value is unchanged, but following the signature keeps this
# line correct if the metric is ever swapped for an asymmetric one.
mae = mean_absolute_error(y, predictions)
print(mae)

0.02727319561119458


In [144]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline


# Preprocessing: one-hot encode the categorical columns, pass numeric columns
# through unchanged; columns not listed (e.g. Car_Name) are dropped by the
# ColumnTransformer default.
# The `sparse=` keyword of OneHotEncoder was removed in scikit-learn 1.4, so it
# is no longer passed. `sparse_threshold=0` makes the ColumnTransformer always
# return a dense array, which is what `sparse=False` produced before.
preprocessor = ColumnTransformer(
    [
        ("cat_cols", OneHotEncoder(handle_unknown='ignore'), cat_col_names),
        ("num_cols", 'passthrough', num_col_names),
    ],
    sparse_threshold=0,
)

model = XGBRegressor(random_state=7)


pipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', model)
])

# Fit once on all rows so `pipe` is left fitted after this cell. The former
# in-sample predict/MAE was removed: its value was overwritten by the
# cross-validated score below before ever being used.
pipe.fit(X, y)

# 5-fold cross-validated MAE; the scorer returns negated errors, hence the sign flip.
cv_scores = -cross_val_score(pipe, X, y, scoring='neg_mean_absolute_error', cv=5)
mae = cv_scores.mean()


print(mae)

1.6320915778564191


In [145]:

def test(model=None):
    """Return the mean 5-fold cross-validated MAE of `model` on the notebook data.

    The model is placed behind the notebook-level `preprocessor` in a Pipeline,
    fitted once on the full notebook-level `X` / `y`, and then scored with
    `cross_val_score` using the 'neg_mean_absolute_error' scorer.

    Parameters
    ----------
    model : estimator, optional
        Regressor to evaluate. When omitted, a fresh
        ``XGBRegressor(random_state=7)`` is created for this call (previously a
        single instance built at definition time was shared by every call).

    Returns
    -------
    float
        Mean of the per-fold mean absolute errors (positive; lower is better).
    """
    if model is None:
        model = XGBRegressor(random_state=7)

    pipe = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('model', model)
    ])

    # Kept so the estimator handed in by the caller is left fitted, as before.
    # The in-sample predict/MAE that used to follow was dead code: its result
    # was overwritten by the cross-validated score before being returned.
    pipe.fit(X, y)

    # The scorer yields negated MAE per fold; flip the sign and average.
    cv_scores = -cross_val_score(pipe, X, y, scoring='neg_mean_absolute_error', cv=5)
    return cv_scores.mean()



In [146]:
# Manual grid sweep over learning rate x number of trees, scoring each
# combination with the cross-validated MAE returned by test().
learning_rates = [0.01, 0.05, 0.1, 0.5, 1]
nums_estimators = [10, 50, 100, 500, 1000]
results = []

for lr in learning_rates:
    for num in nums_estimators:
        res = test(XGBRegressor(learning_rate=lr, n_estimators=num, random_state=7))
        # Named `row`, not `set`: assigning to `set` would shadow the builtin
        # for every later cell of the notebook.
        row = [lr, num, res]
        results.append(row)
        print(row)

[0.01, 10, 6.427630022609302]
[0.01, 50, 4.568808706464053]
[0.01, 100, 3.19247252974098]
[0.01, 500, 1.6441062259886894]
[0.01, 1000, 1.6379666842856646]
[0.05, 10, 4.522972872389562]
[0.05, 50, 1.8272774986642226]
[0.05, 100, 1.638149958855284]
[0.05, 500, 1.6516682606164341]
[0.05, 1000, 1.6553186400206072]
[0.1, 10, 3.1077222015764754]
[0.1, 50, 1.6458545217703289]
[0.1, 100, 1.6425568786842337]
[0.1, 500, 1.6728641214630695]
[0.1, 1000, 1.6728094499967192]
[0.5, 10, 1.7287719670771877]
[0.5, 50, 1.7346822195094522]
[0.5, 100, 1.7353795035820894]
[0.5, 500, 1.7353952245058313]
[0.5, 1000, 1.7353952245058313]
[1, 10, 2.013215060647598]
[1, 50, 2.010976291248758]
[1, 100, 2.0109762962158113]
[1, 500, 2.010976296811858]
[1, 1000, 2.010976296811858]


In [147]:
# Re-evaluate a single configuration (learning_rate=0.1, n_estimators=100) with
# test(); `mae` is reused by the relative-error cell below.
# NOTE(review): the sweep output above lists marginally lower MAE for
# (0.01, 1000) and (0.05, 100) than for this setting -- confirm the choice is
# intentional.
m = XGBRegressor(learning_rate = 0.1, n_estimators = 100, random_state=7)
mae = test(m)
print(mae)


1.6425568786842337


In [148]:
# Put the MAE in context: express it as a percentage of the mean target value.
average_present_price = y.mean()
print(average_present_price)

# MAE divided by the mean Present_Price, times 100, rounded to two decimals.
relative_error = round(mae/average_present_price *100, 2)
print(relative_error, "%")

7.541036789297662
21.78 %
