### Import required libraries


In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error, r2_score

import warnings
# NOTE(review): with no category or module filter this silences every warning
# raised through the `warnings` module for the rest of the session, including
# deprecation notices from the libraries imported above. Consider narrowing it.
warnings.filterwarnings('ignore')

### Load data
Dataset source: https://www.kaggle.com/datasets/aishwaryamuthukumar/cars-dataset-audi-bmw-ford-hyundai-skoda-vw

In [2]:
# Load the Audi listings; the relative path is resolved against the current
# working directory.
df = pd.read_csv("audi.csv")

# Preview the first ten rows, then print the (rows, columns) shape.
display(df.head(10))
print(df.shape)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize
0,A1,2017,12500,Manual,15735,Petrol,55.4,1.4
1,A6,2016,16500,Automatic,36203,Diesel,64.2,2.0
2,A1,2016,11000,Manual,29946,Petrol,55.4,1.4
3,A4,2017,16800,Automatic,25952,Diesel,67.3,2.0
4,A3,2019,17300,Manual,1998,Petrol,49.6,1.0
5,A1,2016,13900,Automatic,32260,Petrol,58.9,1.4
6,A6,2016,13250,Automatic,76788,Diesel,61.4,2.0
7,A4,2016,11750,Manual,75185,Diesel,70.6,2.0
8,A3,2015,10200,Manual,46112,Petrol,60.1,1.4
9,A1,2016,12000,Manual,22451,Petrol,55.4,1.4


(10668, 8)


In [3]:
# Summarise the frame: per-column non-null counts and dtypes, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10668 entries, 0 to 10667
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         10668 non-null  object 
 1   year          10668 non-null  int64  
 2   price         10668 non-null  int64  
 3   transmission  10668 non-null  object 
 4   mileage       10668 non-null  int64  
 5   fuelType      10668 non-null  object 
 6   mpg           10668 non-null  float64
 7   engineSize    10668 non-null  float64
dtypes: float64(2), int64(3), object(3)
memory usage: 666.9+ KB


In [4]:
# List the column labels available for feature selection.
df.columns

Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'mpg',
       'engineSize'],
      dtype='object')

In [5]:
# Optional exploration, left disabled: uncomment a line to see how often each
# distinct value occurs in that numeric column.
# df["year"].value_counts()
# df["mileage"].value_counts()
# df["mpg"].value_counts()
# df["engineSize"].value_counts()

### Separating Feature and Target variable 


In [6]:
# Numeric predictor columns and the column the model has to predict.
num_features = ['year', 'mileage', 'mpg', 'engineSize']
target = ['price']

# Selecting with a list of labels keeps both x and y as two-dimensional frames.
x, y = df[num_features], df[target]

# Sanity check: both must have the same number of rows.
print('Shape of x =', x.shape)
print('Shape of y =', y.shape)

Shape of x = (10668, 4)
Shape of y = (10668, 1)


### Train dataset and Test dataset separation

In [7]:
# Keep 80% of the rows for fitting and hold out the rest for evaluation.
# The fixed random_state makes the shuffle, and so the split, repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=11,
)

# Report the shape of each partition on a single line.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(8534, 4) (2134, 4) (8534, 1) (2134, 1)


### Training: Linear Regression

In [8]:
# Fit an ordinary least-squares model on the training partition.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(x_train, y_train)

### Coefficients and intercept (c)

In [9]:
# Fitted slope for each feature and the intercept term of the fitted model.
coefficients, c = model.coef_, model.intercept_

# Print each on its own line.
print(coefficients, c, sep='\n')

[[ 2.13712958e+03 -9.37102466e-02 -1.69539639e+02  1.09600599e+04]]
[-4298133.06101474]


In [10]:
# Feature names, in the order they were passed to the model.
x.columns

Index(['year', 'mileage', 'mpg', 'engineSize'], dtype='object')

In [11]:
# Pair every feature name with its fitted coefficient for easier reading.
# np.squeeze drops the length-1 leading axis of the coefficient array.
coef_df = pd.DataFrame(
    {
        'num_features': x.columns,
        'coefficients': np.squeeze(coefficients),
    }
)

display(coef_df)

Unnamed: 0,num_features,coefficients
0,year,2137.129579
1,mileage,-0.09371
2,mpg,-169.539639
3,engineSize,10960.059907


- A positive coefficient indicates that, holding the other features fixed, the target variable increases as that feature increases.
- A negative coefficient indicates that, holding the other features fixed, the target variable decreases as that feature increases.

### Price Prediction

In [12]:
# Predict prices for the held-out rows; the result has one row per test sample.
y_pred = model.predict(x_test)
print(y_pred)

[[28558.85252379]
 [32067.12374236]
 [19918.95489603]
 ...
 [15401.51880938]
 [19397.20198932]
 [25478.96301734]]


In [13]:
# Actual prices of the held-out rows, for eyeballing against y_pred.
print(y_test)

      price
9618  33990
5742  32790
811   18695
3412  12802
7686  15990
...     ...
3326  57000
8370  20000
5997  13745
3912  20990
7183  17400

[2134 rows x 1 columns]


### Evaluating prediction error using MAE, MAPE, MSE, RMSE, and R² as KPIs

In [14]:
# Every metric compares the held-out prices (y_test) with the predictions (y_pred).

# Mean absolute error, in the same units as price: lower is better.
MAE = np.round(mean_absolute_error(y_test, y_pred), 2)
print('MAE =', MAE)

# Mean absolute percentage error: lower is better.
# scikit-learn returns it as a fraction (0.18 means 18%), not multiplied by 100.
MAPE = np.round(mean_absolute_percentage_error(y_test, y_pred), 2)
print('MAPE =', MAPE)

# Mean squared error: lower is better.
# Kept unrounded as well, so that RMSE below is derived from the exact value.
mse_unrounded = mean_squared_error(y_test, y_pred)
MSE = np.round(mse_unrounded, 2)
print('MSE =', MSE)

# Root mean squared error, back in the units of price: lower is better.
# Computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the old call fails on current releases. For a single target column the
# two forms give the same number.
RMSE = np.round(np.sqrt(mse_unrounded), 2)
print('RMSE =', RMSE)

# Coefficient of determination (R squared): higher is better.
# The best possible value is 1; it has no lower bound and can be negative
# when the model does worse than always predicting the mean.
r2 = np.round(r2_score(y_test, y_pred), 2)
print('r2 =', r2)

MAE = 3409.13
MAPE = 0.18
MSE = 29334262.3
RMSE = 5416.11
r2 = 0.75
