In [534]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [535]:
# Loading data
car_data = pd.read_csv('car data.csv')

car_data

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.60,6.87,42450,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...,...
296,city,2016,9.50,11.60,33988,Diesel,Dealer,Manual,0
297,brio,2015,4.00,5.90,60000,Petrol,Dealer,Manual,0
298,city,2009,3.35,11.00,87934,Petrol,Dealer,Manual,0
299,city,2017,11.50,12.50,9000,Diesel,Dealer,Manual,0


In [536]:
car_data.info()
car_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [537]:
# Encoding data
car_encoded = pd.get_dummies(car_data, columns=['Fuel_Type', 'Seller_Type', 'Transmission'], drop_first=True)

In [538]:
car_encoded.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Owner,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,ritz,2014,3.35,5.59,27000,0,False,True,False,True
1,sx4,2013,4.75,9.54,43000,0,True,False,False,True
2,ciaz,2017,7.25,9.85,6900,0,False,True,False,True
3,wagon r,2011,2.85,4.15,5200,0,False,True,False,True
4,swift,2014,4.6,6.87,42450,0,True,False,False,True


In [539]:
# Replacing True 1 and False 0
car_encoded = car_encoded.replace({True: 1, False: 0})


  car_encoded = car_encoded.replace({True: 1, False: 0})


In [540]:
car_encoded.describe().loc['std']

Year                          2.891554
Selling_Price                 5.082812
Present_Price                 8.644115
Kms_Driven                38886.883882
Owner                         0.247915
Fuel_Type_Diesel              0.400166
Fuel_Type_Petrol              0.405089
Seller_Type_Individual        0.478439
Transmission_Manual           0.340021
Name: std, dtype: float64

In [541]:
car_encoded.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Owner,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,ritz,2014,3.35,5.59,27000,0,0,1,0,1
1,sx4,2013,4.75,9.54,43000,0,1,0,0,1
2,ciaz,2017,7.25,9.85,6900,0,0,1,0,1
3,wagon r,2011,2.85,4.15,5200,0,0,1,0,1
4,swift,2014,4.6,6.87,42450,0,1,0,0,1


In [542]:
# Standarizing columns
scaler = StandardScaler()
features_for_standarize = ['Kms_Driven', 'Selling_Price', 'Present_Price', 'Year']
scaled_features = scaler.fit_transform(car_encoded[features_for_standarize])

In [543]:
scaled_df = pd.DataFrame(scaled_features, columns=features_for_standarize)

In [544]:
car_encoded[features_for_standarize] = scaled_df

In [545]:
car_encoded.describe().loc['std']

Year                      1.001665
Selling_Price             1.001665
Present_Price             1.001665
Kms_Driven                1.001665
Owner                     0.247915
Fuel_Type_Diesel          0.400166
Fuel_Type_Petrol          0.405089
Seller_Type_Individual    0.478439
Transmission_Manual       0.340021
Name: std, dtype: float64

In [546]:
# Dropping unimportant columns
car_encoded = car_encoded.drop(columns='Car_Name')

In [547]:
# Separating data into label and features
X = car_encoded.drop(columns='Selling_Price')
y = car_encoded['Selling_Price']

In [548]:
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [549]:
from sklearn.ensemble import GradientBoostingRegressor

In [550]:
# Initializing model
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, max_depth=5, random_state=2)

In [551]:
# Training model
model.fit(X_train, y_train)

In [552]:
# Predicting X_test
y_pred = model.predict(X_test)

In [553]:
# Evaluate the model's performance 
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

scores = {
    'MAE' : mae, 
    'MSE' : mse, 
    'RMSE' : rmse, 
    'R2 Score' : r2
}

In [554]:
for metric, value in scores.items():
    print(f'{metric}: {value:.2f}')

MAE: 0.09
MSE: 0.02
RMSE: 0.15
R2 Score: 0.97
