In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('vehical.csv')
df.head(3)

Unnamed: 0,Brand,Year,Model,Car/Suv,Title,UsedOrNew,Transmission,Engine,DriveType,FuelType,FuelConsumption,Kilometres,ColourExtInt,Location,CylindersinEngine,BodyType,Doors,Seats,Price
0,Ssangyong,2022.0,Rexton,Sutherland Isuzu Ute,2022 Ssangyong Rexton Ultimate (awd),DEMO,Automatic,"4 cyl, 2.2 L",AWD,Diesel,8.7 L / 100 km,5595,White / Black,"Caringbah, NSW",4 cyl,SUV,4 Doors,7 Seats,51990
1,MG,2022.0,MG3,Hatchback,2022 MG MG3 Auto Excite (with Navigation),USED,Automatic,"4 cyl, 1.5 L",Front,Premium,6.7 L / 100 km,16,Black / Black,"Brookvale, NSW",4 cyl,Hatchback,5 Doors,5 Seats,19990
2,BMW,2022.0,430I,Coupe,2022 BMW 430I M Sport,USED,Automatic,"4 cyl, 2 L",Rear,Premium,6.6 L / 100 km,8472,Grey / White,"Sylvania, NSW",4 cyl,Coupe,2 Doors,4 Seats,108988


In [3]:
# Droping unnecessary columns and NA values
df.drop(columns=['Title', "Brand", "Car/Suv", "Location", "Engine", "ColourExtInt"], inplace=True)
df.dropna(inplace=True)
df.isnull().sum()

Year                 0
Model                0
UsedOrNew            0
Transmission         0
DriveType            0
FuelType             0
FuelConsumption      0
Kilometres           0
CylindersinEngine    0
BodyType             0
Doors                0
Seats                0
Price                0
dtype: int64

In [4]:
# Converting columns from obj to int/float

def from_cat_to_num(df: pd.DataFrame, col: str, dtypes) -> pd.DataFrame:
    if dtypes == int:
        df[col] = df[col].str.replace('[^0-9]', '', regex=True).astype(dtypes)
    else:
       df[col] = df[col].str.extract(r'(\d+\.?\d*)').astype(float)
       
int_cols = ['Seats', 'Doors', 'CylindersinEngine']
float_cols = ['FuelConsumption']

for col in int_cols:
    from_cat_to_num(df, col, int)

for col in float_cols:
    from_cat_to_num(df, col, float)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15028 entries, 0 to 16733
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               15028 non-null  float64
 1   Model              15028 non-null  object 
 2   UsedOrNew          15028 non-null  object 
 3   Transmission       15028 non-null  object 
 4   DriveType          15028 non-null  object 
 5   FuelType           15028 non-null  object 
 6   FuelConsumption    15028 non-null  float64
 7   Kilometres         15028 non-null  object 
 8   CylindersinEngine  15028 non-null  int64  
 9   BodyType           15028 non-null  object 
 10  Doors              15028 non-null  int64  
 11  Seats              15028 non-null  int64  
 12  Price              15028 non-null  object 
dtypes: float64(2), int64(3), object(8)
memory usage: 1.6+ MB


In [5]:
# Build a mask to remove "-" values from df
mask = df.apply(lambda col: col.astype(str).str.contains('-')).any(axis=1)
df = df[~mask]

In [6]:
# Removing all "POA" values from Price column
df = df[df['Price'] != 'POA'].reset_index(drop=True)

In [7]:
# Convert Kilometres and Price columns to int
int_cols = ['Kilometres', 'Price', 'Year']

for col in int_cols:
    df[col] = df[col].astype(int)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12892 entries, 0 to 12891
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               12892 non-null  int64  
 1   Model              12892 non-null  object 
 2   UsedOrNew          12892 non-null  object 
 3   Transmission       12892 non-null  object 
 4   DriveType          12892 non-null  object 
 5   FuelType           12892 non-null  object 
 6   FuelConsumption    12892 non-null  float64
 7   Kilometres         12892 non-null  int64  
 8   CylindersinEngine  12892 non-null  int64  
 9   BodyType           12892 non-null  object 
 10  Doors              12892 non-null  int64  
 11  Seats              12892 non-null  int64  
 12  Price              12892 non-null  int64  
dtypes: float64(1), int64(6), object(6)
memory usage: 1.3+ MB


In [8]:
# Removing outliers
df = df[df['Year'] > 1990]
df = df[(df['FuelConsumption'] >= 1) & (df['FuelConsumption'] < 25)]
df = df[df['CylindersinEngine'] > 1]
df = df[df['Seats'] < 15]
df.reset_index(drop=True, inplace=True)

In [9]:
# Showing numer of unique value for each column
categorical_cols = ['Model', "UsedOrNew", "Transmission", "DriveType", "FuelType", "BodyType"]

for col in categorical_cols:
    print(len(df[col].unique()))

571
3
2
5
5
10


In [10]:
# Using OneHotEncoder to encode categorical values (Less then 7 unique values in column)
from sklearn.preprocessing import OneHotEncoder

categorical_cols_for_one_hot = ["UsedOrNew", "Transmission", "DriveType", "FuelType"]

one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = one_hot_encoder.fit_transform(df[categorical_cols_for_one_hot])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=one_hot_encoder.get_feature_names_out(categorical_cols_for_one_hot))

df_encoded = pd.concat([df.drop(categorical_cols_for_one_hot, axis=1),
                        one_hot_df], axis=1)

In [11]:
# Using LabelEncoder to encode categorical values (More then 7 unique values in column)
from sklearn.preprocessing import LabelEncoder

categorical_cols_for_label_encoder = ['Model', "BodyType"]

label_encoder = LabelEncoder()
df_encoded["Model"] = label_encoder.fit_transform(df["Model"])
df_encoded['BodyType'] = label_encoder.fit_transform(df['BodyType'])

In [12]:
# Spliting data for training and testing  
from sklearn.model_selection import train_test_split

X = df_encoded.drop(columns='Price')
y = df_encoded['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# This script performs hyperparameter tuning for an XGBoost regression model using GridSearchCV with
# cross-validation to find the best parameter combination for minimizing mean squared error.
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

reg = XGBRegressor(n_estimators=1000, objective='reg:squarederror', random_state=1234)

param_grid = {
    'n_estimators': [500, 1000],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
}

grid_search = GridSearchCV(estimator=reg,
                           param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2,
                           scoring='neg_mean_squared_error')

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


In [15]:
# Showing the best model
print(grid_search.best_params_)
print(grid_search.best_score_)

{'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 1000, 'subsample': 1.0}
-122932149.5726471


In [18]:
# Predict on test set
best_model = grid_search.best_estimator_
reg_pred = best_model.predict(X_test)

In [19]:
# Testing the model on the following metrics
# - Mean Squared Error
# - Mean Absolute Error
# - Root Mean Squared Error
# - R2 Score
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score

print(f'Mean Squared Error: {mean_squared_error(y_test, reg_pred)}')
print(f'Mean Absolute Error: {root_mean_squared_error(y_test, reg_pred)}')
print(f'Root Mean Squared Error: {mean_absolute_error(y_test, reg_pred)}')
print(f'R2 Score: {r2_score(y_test, reg_pred)}')

Mean Squared Error: 83371906.370383
Mean Absolute Error: 9130.821779576196
Root Mean Squared Error: 4334.625943730793
R2 Score: 0.8935127699196715
