Import needed Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_regression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score

Create a `dataframe`

In [2]:
df = pd.read_csv('cleaned_cars.csv')

## ***Polynomial Model***

Choosing the `optimal_degree`

In [3]:
X, y = make_regression(n_samples=100, n_features=1, noise=30, random_state=42)

degrees = np.arange(1, 10)

cv_scores = []
for degree in degrees:
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_scores.append(-np.mean(scores))  # Convert to positive MSE

optimal_degree = degrees[np.argmin(cv_scores)]
print("Optimal polynomial degree:", optimal_degree)

Optimal polynomial degree: 1


Applying the model

In [4]:
X = df.drop('Price', axis=1)
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

poly = PolynomialFeatures(degree=optimal_degree)
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)

poly_reg = LinearRegression()
poly_reg.fit(X_poly_train, y_train)

y_poly_pred_train = poly_reg.predict(X_poly_train)
y_poly_pred_test = poly_reg.predict(X_poly_test)

mse_poly_train = mean_squared_error(y_train, y_poly_pred_train)
mse_poly_test = mean_squared_error(y_test, y_poly_pred_test)

## ***kNN Model***

Choose the `optimal_k` for the model.

In [5]:
k_values = list(range(1, 21, 2))

cv_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
    cv_scores.append(scores.mean())

optimal_k = k_values[np.argmax(cv_scores)]
print("Optimal k value:", optimal_k)



Optimal k value: 1


Applying the model

In [6]:
knn_reg = KNeighborsRegressor(n_neighbors=optimal_k)
knn_reg.fit(X_train, y_train)

y_knn_pred_train = knn_reg.predict(X_train)
y_knn_pred_test = knn_reg.predict(X_test)

mse_knn_train = mean_squared_error(y_train, y_knn_pred_train)
mse_knn_test = mean_squared_error(y_test, y_knn_pred_test)

## ***Decision Tree Model***

Applying the model

In [7]:
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train, y_train)

y_dt_pred_train = dt_reg.predict(X_train)
y_dt_pred_test = dt_reg.predict(X_test)

mse_dt_train = mean_squared_error(y_train, y_dt_pred_train)
mse_dt_test = mean_squared_error(y_test, y_dt_pred_test)

## **Evaluation**

### RMSE Evaluation 

In [8]:
rmse_poly_train = np.sqrt(mse_poly_train)
rmse_poly_test = np.sqrt(mse_poly_test)
print(f'Polynomial Regression RMSE on Training Set: {rmse_poly_train}')
print(f'Polynomial Regression RMSE on Testing Set: {rmse_poly_test}')

rmse_knn_train = np.sqrt(mse_knn_train)
rmse_knn_test = np.sqrt(mse_knn_test)
print(f'kNN RMSE on Training Set: {rmse_knn_train}')
print(f'kNN RMSE on Testing Set: {rmse_knn_test}')

rmse_dt_train = np.sqrt(mse_dt_train)
rmse_dt_test = np.sqrt(mse_dt_test)
print(f'Decision Tree RMSE on Training Set: {rmse_dt_train}')
print(f'Decision Tree RMSE on Testing Set: {rmse_dt_test}')

Polynomial Regression RMSE on Training Set: 33660.77838540121
Polynomial Regression RMSE on Testing Set: 21906460564.19311
kNN RMSE on Training Set: 12832.808237146432
kNN RMSE on Testing Set: 62101.135468897526
Decision Tree RMSE on Training Set: 9198.677402846764
Decision Tree RMSE on Testing Set: 59425.958549534684


### R-square Evaluation

In [9]:
r2_poly_train = r2_score(y_train, y_poly_pred_train)
r2_poly_test = r2_score(y_test, y_poly_pred_test)
print("Polynomial R2 Score on Training Set:", r2_poly_train)
print("Polynomial R2 Score on Training Set:", r2_poly_test)

r2_knn_train = r2_score(y_train, y_knn_pred_train)
r2_knn_test = r2_score(y_test, y_knn_pred_test)
print("kNN R2 Score on Training Set:", r2_knn_train)
print("kNN R2 Score on Training Set:", r2_knn_test)

r2_dt_train = r2_score(y_train, y_dt_pred_train)
r2_dt_test = r2_score(y_test, y_dt_pred_test)
print("DT R2 Score on Training Set:", r2_dt_train)
print("DT R2 Score on Training Set:", r2_dt_test)

Polynomial R2 Score on Training Set: 0.5724697551674993
Polynomial R2 Score on Training Set: -140507078225.96735
kNN R2 Score on Training Set: 0.9378613314225563
kNN R2 Score on Training Set: -0.1291531663382226
DT R2 Score on Training Set: 0.9680721766977209
DT R2 Score on Training Set: -0.03396578119243143


### Adjusted R-square Evaluation

In [10]:
def adjusted_r2_score(r2, n, k):
    return 1 - ((1 - r2) * (n - 1) / (n - k - 1))

n_poly_train = len(y_train)
k_poly = X_poly_train.shape[1]
adj_r2_poly_train = adjusted_r2_score(r2_poly_train, n_poly_train, k_poly)
adj_r2_poly_test = adjusted_r2_score(r2_poly_test, len(y_test), k_poly)
print("Adjusted R2 Score for Polynomial Regression on Training Set:", adj_r2_poly_train)
print("Adjusted R2 Score for Polynomial Regression on Testing Set:", adj_r2_poly_test)

n_knn_train = len(y_train)
k_knn = X_train.shape[1] 
adj_r2_knn_train = adjusted_r2_score(r2_knn_train, n_knn_train, k_knn)
adj_r2_knn_test = adjusted_r2_score(r2_knn_test, len(y_test), k_knn)
print("Adjusted R2 Score for kNN Regression on Training Set:", adj_r2_knn_train)
print("Adjusted R2 Score for kNN Regression on Testing Set:", adj_r2_knn_test)

n_dt_train = len(y_train)
k_dt = X_train.shape[1] 
adj_r2_dt_train = adjusted_r2_score(r2_dt_train, n_dt_train, k_dt)
adj_r2_dt_test = adjusted_r2_score(r2_dt_test, len(y_test), k_dt)
print("Adjusted R2 Score for Decision Tree Regression on Training Set:", adj_r2_dt_train)
print("Adjusted R2 Score for Decision Tree Regression on Testing Set:", adj_r2_dt_test)

Adjusted R2 Score for Polynomial Regression on Training Set: 0.5286497179578682
Adjusted R2 Score for Polynomial Regression on Testing Set: -223748666587.32196
Adjusted R2 Score for kNN Regression on Training Set: 0.9315089772220522
Adjusted R2 Score for kNN Regression on Testing Set: -0.7955904197574255
Adjusted R2 Score for Decision Tree Regression on Training Set: 0.964808237396955
Adjusted R2 Score for Decision Tree Regression on Testing Set: -0.6442225079941182


### Based on the previous evaluations, the Decision Tree Model is the best one between the models