### Import Libraries

In [44]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score


### (A) Loading Dataset

In [45]:
# Load the Ford listings from a CSV located in the notebook's working directory.
df = pd.read_csv('ford.csv')
print(df.head())
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()

     model  year  price transmission  mileage fuelType   mpg  engineSize
0   Fiesta  2017  12000    Automatic    15944   Petrol  57.7         1.0
1    Focus  2018  14000       Manual     9083   Petrol  57.7         1.0
2    Focus  2017  13000       Manual    12456   Petrol  57.7         1.0
3   Fiesta  2019  17500       Manual    10460   Petrol  40.3         1.5
4   Fiesta  2019  16500    Automatic     1482   Petrol  48.7         1.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17964 entries, 0 to 17963
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17964 non-null  object 
 1   year          17964 non-null  int64  
 2   price         17964 non-null  int64  
 3   transmission  17964 non-null  object 
 4   mileage       17964 non-null  int64  
 5   fuelType      17964 non-null  object 
 6   mpg           17964 non-null  float64
 7   engineSize    17964 non-null  float64
dtypes: float64(2), int64(3), object(3)

In [46]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
print(df.describe())

               year         price        mileage           mpg    engineSize
count  17964.000000  17964.000000   17964.000000  17964.000000  17964.000000
mean    2016.864173  12280.078435   23361.880149     57.907832      1.350824
std        2.024987   4741.318119   19471.243292     10.125632      0.432383
min     1996.000000    495.000000       1.000000     20.800000      0.000000
25%     2016.000000   8999.000000    9987.000000     52.300000      1.000000
50%     2017.000000  11291.000000   18242.500000     58.900000      1.200000
75%     2018.000000  15299.000000   31052.000000     65.700000      1.500000
max     2020.000000  54995.000000  177644.000000    201.800000      5.000000


### (B) Separating the numeric features and target variable

In [47]:
# Numeric predictor columns and the column to be predicted.
features = ['year', 'mileage', 'mpg', 'engineSize']
target = ['price']

# Both selections use a list of labels, so X and Y are DataFrames
# (Y keeps a single column rather than collapsing to a Series).
X, Y = df[features], df[target]

print(
    'shape of X = ', X.shape,
    '\nshape of Y = ', Y.shape,
)

shape of X =  (17964, 4) 
shape of Y =  (17964, 1)


### (C) Split the original dataset into the train set (80%) and the test set (20%)

In [48]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=13,
)
# Print the four shapes on one line, space-separated.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

(14371, 4) (3593, 4) (14371, 1) (3593, 1)


### (D) Perform Linear Regression and Predict the 'Price' from the test set

In [49]:
# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, Y_train)

print(
    'intercept = ', model.intercept_,
    '\ncoefficients = ', model.coef_,
)

intercept =  [-2621611.14946318] 
coefficients =  [[ 1.30546764e+03 -5.12279548e-02 -8.69539675e+01  5.30526307e+03]]


### Prediction

In [50]:
# Predict the target for every row of the held-out test features.
Y_pred = model.predict(X_test)
print(Y_pred)

[[12606.65131439]
 [15093.53607394]
 [12699.88607429]
 ...
 [15417.00766717]
 [14736.2271355 ]
 [ 7114.56553147]]


### (E) Compute the RMSE, MAE, MAPE, MSE, and R-squared metrics from the actual test data and the predicted data

In [51]:
# Compute MSE once and derive RMSE from it. The `squared` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on both old and new versions.
MSE = mean_squared_error(Y_test, Y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(Y_test, Y_pred)
# MAPE is returned as a fraction (e.g. 0.18), not as a percentage.
MAPE = mean_absolute_percentage_error(Y_test, Y_pred)
R_squared = r2_score(Y_test, Y_pred)

print('RMSE = ', RMSE, '\nMAE = ',MAE, '\nMAPE =', MAPE, '\nMSE =', MSE, '\nR_squared = ', R_squared)

RMSE =  2513.4976844291205 
MAE =  1792.2681729845815 
MAPE = 0.17732865042083468 
MSE = 6317670.609630549 
R_squared =  0.7393574763804996
