In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [48]:
# Read the CSV file into a DataFrame; every later cell works from `data`
csv_path = 'all_car_adverts.csv'
data = pd.read_csv(csv_path)

In [49]:
# Optional down-sampling... FOR QUICK TESTING ONLY
# Set SAMPLE_SIZE = None to run on the full dataset instead of deleting this cell.
# NOTE: `data` is overwritten in place, so re-running this cell samples from the
# already-sampled frame; re-run the loading cell first to start from the raw data.
SAMPLE_SIZE = 100000

if SAMPLE_SIZE is not None:
    # Cap at the number of available rows: DataFrame.sample raises a ValueError
    # when n is larger than the frame and sampling is done without replacement.
    data = data.sample(n=min(SAMPLE_SIZE, len(data)), random_state=42)

print(f"Dataset sampled: {data.shape[0]} rows and {data.shape[1]} columns")

Dataset sampled: 100000 rows and 32 columns


In [50]:
# Pull the prediction target out of the frame; everything else becomes a feature
target_column = 'car_price'
y = data[target_column]
X = data.drop(columns=[target_column])


In [51]:
# Identify numeric and categorical features by dtype.
# 'number' matches every numeric dtype (int32, float32, ...), not only the 64-bit
# ones, and 'category' is accepted alongside 'object' for text-like columns, so
# such columns are no longer left out of both lists.
# NOTE(review): columns of any other dtype (e.g. bool, datetime) still end up in
# neither list and are therefore not handed to any transformer built from these
# lists -- confirm that is intended for this dataset.
numeric_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [52]:
# Preprocessing: each column group gets its own impute -> transform chain

# Numeric columns: fill gaps with the column mean, then standardize
# (subtract the mean and scale to unit variance).
numeric_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into binary indicator columns. handle_unknown='ignore' keeps the
# encoder from failing on categories that only appear in the test set.
categorical_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [55]:
# Chain the preprocessor and the KNN regressor into a single estimator
knn_regressor = KNeighborsRegressor(n_neighbors=10)  # predictions use the 10 nearest neighbors
pipeline = Pipeline(steps=[('pre', preprocessor), ('knn', knn_regressor)])

In [56]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
print("Splitting the dataset...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Dataset split complete.")

Splitting the dataset...
Dataset split complete.


In [57]:
# Fit the full pipeline (preprocessing + KNN) on the training split
print("Training the KNN model...")
pipeline.fit(X_train, y_train)
print("Training complete.")  # progress marker confirming that fitting finished

# Score the fitted pipeline on the held-out test split
y_pred = pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

Training the KNN model...
Training complete.
Mean Squared Error: 192867090.44
R^2 Score: 0.64
