In [21]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [14]:
# Load the car price dataset from a comma-separated, UTF-8 encoded file
# located in the notebook's working directory.
df = pd.read_csv(
    "car_price.csv",
    sep=",",
    encoding="utf-8",
)

In [15]:
# Single seed for the notebook; seeding NumPy's global generator here makes
# any code that draws from it repeatable on a fresh top-to-bottom run.
SEED = 7
np.random.seed(seed=SEED)

# Adjusting data

In [16]:
# Remove columns that are not used as model inputs:
#   - car_ID is only a row identifier and does not help in any conclusion;
#   - curbweight and highwaympg are highly correlated with other variables.
# The result is reassigned instead of using inplace=True, so the cell is an
# explicit df -> df transformation rather than a hidden mutation.
df = df.drop(columns=['car_ID', 'curbweight', 'highwaympg'])

In [17]:
## Replace the full car name with the car brand only,
## because each individual model appears too rarely to be useful.

# Known misspellings / abbreviations of brand names -> canonical brand
fix_values = {
    "maxda": "mazda",
    "porcshce": "porsche",
    "toyouta": "toyota",
    "vokswagen": "volkswagen",
    "vw": "volkswagen",
}


def car_brand(x):
    """Return the lower-cased first space-separated word of a car name."""
    return x.split(' ')[0].lower()


def fix_car_brand(x):
    """Return the corrected brand if `x` is a known misspelling, else `x`."""
    return fix_values.get(x, x)


# Only the (corrected) brand is kept; the original name is no longer needed.
df = (
    df.assign(brand=df.CarName.apply(car_brand).apply(fix_car_brand))
      .drop(columns=["CarName"])
)

In [18]:
# Convert the door count from its spelled-out form to an integer.
# Note: Series.map yields NaN for any value that is not a key of the mapping.
doors_to_int = dict(two=2, four=4)
df["doornumber"] = df["doornumber"].map(doors_to_int)

# Model

In [25]:
# Every column still stored with object dtype is treated as categorical.
categoric_columns = df.select_dtypes(include="object").columns

# Replace each categorical column with one indicator column per category.
df = pd.get_dummies(data=df, columns=categoric_columns)

# Target is the price; every remaining column is a feature.
y = df["price"]
x = df.drop(columns=["price"])

In [26]:
# Hold out 25% of the rows for evaluation.
# random_state=SEED ties the shuffle to the notebook seed directly, so the
# split is the same every time this cell runs, regardless of how much of the
# global NumPy random state earlier (or re-executed) cells have consumed.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.25, random_state=SEED
)

In [27]:
# Fit a linear regression with default settings on the training split.
lr = LinearRegression()
lr.fit(X=train_x, y=train_y)

In [28]:
# Score of the fitted model on the held-out split
# (for scikit-learn regressors this is the R^2 coefficient of determination).
lr.score(X=test_x, y=test_y)

0.8623829378478416

In [44]:
predictions = lr.predict(test_x)
real = test_y.to_numpy()

# Absolute error of every test-set prediction, kept at full precision so that
# rounding happens once, on the reported statistics, instead of on each
# element before it is aggregated.
abs_errors = np.abs(real - predictions)

# Per-row absolute errors rounded to 2 decimals, kept as a plain list.
dif_list = abs_errors.round(2).tolist()

avg_error = round(float(abs_errors.mean()), 2)  # mean absolute error
max_error = round(float(abs_errors.max()), 2)   # worst single prediction
min_error = round(float(abs_errors.min()), 2)   # best single prediction

print(f"Test size = {test_x.shape[0]}\nAVG error = {avg_error}\nMax Error = {max_error}\nMin error = {min_error}")

Test size = 52
AVG error = 2002.52
Max Error = 9713.98
Min error = 5.2
