In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [23]:
# Load the raw car-advert table from the CSV file in the working directory.
csv_path = "all_car_adverts.csv"
data = pd.read_csv(csv_path)

In [24]:
# Optional down-sampling so the rest of the notebook runs quickly while iterating.
# Set SAMPLE_SIZE to None to train and evaluate on the full dataset instead of
# deleting this cell.
SAMPLE_SIZE = 20000
if SAMPLE_SIZE is not None:
    # Cap the request at the number of available rows: DataFrame.sample raises
    # ValueError when n is larger than the frame and replace=False (the default).
    data = data.sample(n=min(SAMPLE_SIZE, len(data)), random_state=42)
print(f"Dataset sampled: {data.shape[0]} rows and {data.shape[1]} columns")

Dataset sampled: 20000 rows and 32 columns


In [25]:
# Build a categorical target for the Naive Bayes classifier by splitting
# 'car_price' at its median: strictly above the median -> 'expensive',
# at or below the median -> 'cheap'.
#
# Rows without a price are removed first. A missing price compares as
# "not greater than the median", so it would otherwise be silently
# labelled 'cheap' and pollute the target.
data = data.dropna(subset=['car_price'])
median_price = data['car_price'].median()

# Vectorised comparison instead of a per-row Python lambda; assign() returns a
# new frame, so re-running this cell gives the same result.
data = data.assign(
    price_category=data['car_price']
    .gt(median_price)
    .map({True: 'expensive', False: 'cheap'})
)

In [26]:
# Separate the target from the feature matrix.
# Both the raw price and the label derived from it are removed from the
# features, so the model never sees the quantity it is asked to predict.
target_col = 'price_category'
y = data[target_col]
X = data.drop(columns=['car_price', target_col])

In [27]:
# Partition the feature columns by dtype so each group gets its own preprocessing.
# Only int64/float64 columns count as numeric and only object columns as
# categorical; a column of any other dtype ends up in neither list.
numeric_frame = X.select_dtypes(include=['int64', 'float64'])
categorical_frame = X.select_dtypes(include=['object'])

numeric_cols = list(numeric_frame.columns)
categorical_cols = list(categorical_frame.columns)

In [28]:
# Preprocessor (essentially the same as for the previous algorithms).

# Numeric columns: fill missing values with the column mean, then standardise.
numeric_steps = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value,
# then one-hot encode. Categories not seen during fit are ignored rather than
# raising, and the encoder is asked for a dense (non-sparse) output.
categorical_steps = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_steps, numeric_cols),
    ('cat', categorical_steps, categorical_cols),
])

In [29]:

# Full model: preprocessing followed by a Gaussian Naive Bayes classifier.
# Keeping both in one Pipeline means the preprocessing is fitted only on
# whatever data is passed to pipeline.fit().
model_steps = [
    ('pre', preprocessor),
    ('nb', GaussianNB()),
]
pipeline = Pipeline(steps=model_steps)

In [30]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
print("Splitting the dataset...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Dataset split complete.")

# Train on the training rows, then label the held-out rows.
# fit() returns the fitted pipeline itself, so predict() can be chained onto it.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_test, y_pred)
print(report)

Splitting the dataset...
Dataset split complete.
              precision    recall  f1-score   support

       cheap       0.90      0.70      0.78      1977
   expensive       0.76      0.92      0.83      2023

    accuracy                           0.81      4000
   macro avg       0.83      0.81      0.81      4000
weighted avg       0.83      0.81      0.81      4000

