### Importing libraries to be used

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

### Reading the dataset

In [2]:
# Load the car listings from the CSV that sits next to the notebook.
df = pd.read_csv("car_price_dataset.csv")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() only appends a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Brand         10000 non-null  object 
 1   Model         10000 non-null  object 
 2   Year          10000 non-null  int64  
 3   Engine_Size   10000 non-null  float64
 4   Fuel_Type     10000 non-null  object 
 5   Transmission  10000 non-null  object 
 6   Mileage       10000 non-null  int64  
 7   Doors         10000 non-null  int64  
 8   Owner_Count   10000 non-null  int64  
 9   Price         10000 non-null  int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 781.4+ KB
None


### Converting object values to categorical values

In [3]:
# Cast each text column to the pandas "category" dtype, one column at a time.
for column in ("Brand", "Model", "Fuel_Type", "Transmission"):
    df[column] = df[column].astype("category")

In [4]:
# Re-inspect the dtypes after the cast. info() prints its own report and
# returns None, so no print() wrapper is used (it would emit an extra "None").
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Brand         10000 non-null  category
 1   Model         10000 non-null  category
 2   Year          10000 non-null  int64   
 3   Engine_Size   10000 non-null  float64 
 4   Fuel_Type     10000 non-null  category
 5   Transmission  10000 non-null  category
 6   Mileage       10000 non-null  int64   
 7   Doors         10000 non-null  int64   
 8   Owner_Count   10000 non-null  int64   
 9   Price         10000 non-null  int64   
dtypes: category(4), float64(1), int64(5)
memory usage: 509.9 KB
None


### Preparing the columns for the model

In [5]:
# Columns routed to the one-hot encoder.
categorical_features = [
    "Brand",
    "Model",
    "Fuel_Type",
    "Transmission",
]
# Columns routed to the standard scaler.
numerical_features = [
    "Year",
    "Engine_Size",
    "Mileage",
    "Doors",
    "Owner_Count",
]

In [6]:
# Preprocessing step: scale the numeric columns and one-hot encode the
# categorical ones; categories unseen during fit are ignored at transform time.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

### Splitting the dataset for the model

In [7]:
# Target is the "Price" column; every other column is a feature.
y = df["Price"]
X = df.drop(columns="Price")

In [8]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split, and therefore every metric reported below, is identical after
# Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Training the model

In [9]:
# Chain the column preprocessing with an ordinary least-squares regressor so
# that fit/predict apply both steps in order.
model = Pipeline(
    steps=[
        ("preparation", full_pipeline),
        ("model", LinearRegression()),
    ]
)

In [10]:
# Fit the preprocessing and the regressor on the training split only.
model.fit(X=X_train, y=y_train)

### Examining the model prediction results

In [11]:
# Score the fitted pipeline on the held-out split.
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
# RMSE is the square root of the MSE, i.e. the error in the same units as Price.
rmse = np.sqrt(mse)
print(f"rmse: {rmse}, r^2 score: {r2}")

rmse: 92.09112910396324, r^2 score: 0.9991109117850725


### Checking the model prediction

In [12]:
# Show the first rows as the cell's last expression so the notebook renders the
# frame as a table; print() falls back to a plain-text dump that wraps wide frames.
df.head()

        Brand   Model  Year  Engine_Size Fuel_Type    Transmission  Mileage  \
0         Kia     Rio  2020          4.2    Diesel          Manual   289944   
1   Chevrolet  Malibu  2012          2.0    Hybrid       Automatic     5356   
2    Mercedes     GLA  2020          4.2    Diesel       Automatic   231440   
3        Audi      Q5  2023          2.0  Electric          Manual   160971   
4  Volkswagen    Golf  2003          2.6    Hybrid  Semi-Automatic   286618   

   Doors  Owner_Count  Price  
0      3            5   8501  
1      2            3  12092  
2      4            2  11171  
3      2            1  11780  
4      3            3   2867  


In [13]:
# One-row frame holding the same feature values as row 3 of df.head() above,
# used as a sanity check of the fitted pipeline's prediction.
new_data = pd.DataFrame(
    [
        {
            "Brand": "Audi",
            "Model": "Q5",
            "Year": 2023,
            "Engine_Size": 2.0,
            "Fuel_Type": "Electric",
            "Transmission": "Manual",
            "Mileage": 160971,
            "Doors": 2,
            "Owner_Count": 1,
        }
    ]
)

print(model.predict(new_data))

[11771.60601576]
