# Using the CarPrice dataset, the goal is to predict the price of a car based on its other features

In [29]:
import numpy as np
import pandas as pd
import sklearn

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [5]:
# Load the raw CSV and discard the columns that are not wanted in the
# working frame, producing `Data` in a single chained expression.
unused_columns = ['car_ID', 'symboling', 'CarName']
Data = pd.read_csv("C:\\CarPrice.csv").drop(columns=unused_columns)
Data.head()

Unnamed: 0,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [31]:
# Show the (rows, columns) dimensions of the working frame.
Data.shape
# Data.info()

(205, 23)

In [10]:
# Report, per column, whether it contains at least one missing value.
# The boolean mask is reduced directly: indexing the frame with the mask
# (`Data[Data.isnull()]`) replaces every non-null cell with NaN, and `.any()`
# on an all-NaN frame is False for every column, which would hide real gaps.
Data.isnull().any()

fueltype            False
aspiration          False
doornumber          False
carbody             False
drivewheel          False
enginelocation      False
wheelbase           False
carlength           False
carwidth            False
carheight           False
curbweight          False
enginetype          False
cylindernumber      False
enginesize          False
fuelsystem          False
boreratio           False
stroke              False
compressionratio    False
horsepower          False
peakrpm             False
citympg             False
highwaympg          False
price               False
dtype: bool

In [6]:
# List the column labels of the working frame (the predictors plus `price`).
Data.columns

Index(['fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel',
       'enginelocation', 'wheelbase', 'carlength', 'carwidth', 'carheight',
       'curbweight', 'enginetype', 'cylindernumber', 'enginesize',
       'fuelsystem', 'boreratio', 'stroke', 'compressionratio', 'horsepower',
       'peakrpm', 'citympg', 'highwaympg', 'price'],
      dtype='object')

In [8]:
# Target vector: the `price` column.
y = Data.loc[:, 'price']
# Feature matrix: every column except the target.
X = Data.drop('price', axis=1)
X.shape

(205, 22)

In [30]:
# Train and evaluate a random-forest price model behind a preprocessing pipeline.

# One seed shared by the split and the forest, so a fresh
# "Restart Kernel -> Run All" reproduces the same metrics.
RANDOM_STATE = 42

# Hold out 20% of the rows for evaluation.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, train_size=0.8, random_state=RANDOM_STATE
)

# Select columns by dtype family instead of exact dtype names, so numeric
# columns stored as e.g. int32/float32 and text columns stored with a
# dedicated string dtype are not silently left out of the model.
num_features = X.select_dtypes(include='number').columns
cat_features = X.select_dtypes(exclude='number').columns

num_transformer = StandardScaler()
# handle_unknown='ignore': a category that only appears in the test split is
# encoded as all zeros instead of raising at predict time.
Cat_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', Cat_transformer, cat_features),
    ]
)

# The preprocessing lives inside the pipeline, so it is fitted on the
# training split only and merely applied to the test split.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('forest', RandomForestRegressor(random_state=RANDOM_STATE)),
    ]
)

pipeline.fit(Xtrain, ytrain)
y_pred = pipeline.predict(Xtest)

# Hold-out metrics: RMSE and MAE are in the units of `price`; R^2 is unitless.
rmse = np.sqrt(mean_squared_error(ytest, y_pred))
r2 = r2_score(ytest, y_pred)
mae = mean_absolute_error(ytest, y_pred)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

print(f"R^2 Score: {r2}")

RMSE: 1808.7336965993525
MAE: 1262.4639756097558
R^2 Score: 0.95855902006561
