Import the required libraries

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_regression
from sklearn.metrics import r2_score
import pickle


Load the cleaned car data into a `DataFrame`

In [34]:
# Read the cleaned car dataset; the path is relative to the notebook's working directory.
csv_path = '../csv/cleaned_data_car.csv'
df = pd.read_csv(csv_path)


***Polynomial Model***

In [35]:
# Choose the polynomial degree by 5-fold cross-validation on the car data itself.
# Tuning on a synthetic make_regression sample would say nothing about the
# dataset the model is actually fitted on.
X = df.drop('car_price', axis=1)
y = df['car_price']

# Tune on the training portion only so the held-out test rows stay unseen.
# Same split arguments (test_size, random_state) as the model-fitting cell,
# so both cells work with the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The number of polynomial terms grows combinatorially with the degree and the
# number of columns, so the candidate range is kept small for a multi-column
# frame. NOTE(review): widen the range if the frame has only a few columns.
degrees = np.arange(1, 4)

cv_scores = []
for degree in degrees:
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    cv_scores.append(-np.mean(scores))  # scorer returns negated MSE; flip the sign

# Lowest mean cross-validated MSE wins.
optimal_degree = degrees[np.argmin(cv_scores)]
print("Optimal polynomial degree:", optimal_degree)

Optimal polynomial degree: 1


In [36]:
# Split the frame into the target column and the remaining predictor columns.
y = df['car_price']
X = df.drop(columns='car_price')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the polynomial expansion on the training rows, then apply it to both splits.
poly = PolynomialFeatures(degree=optimal_degree)
poly.fit(X_train)
X_poly_train = poly.transform(X_train)
X_poly_test = poly.transform(X_test)

# Ordinary least squares on the expanded features.
poly_reg = LinearRegression().fit(X_poly_train, y_train)

y_poly_pred_train, y_poly_pred_test = (
    poly_reg.predict(expanded) for expanded in (X_poly_train, X_poly_test)
)

# MSE values are kept for the RMSE comparison further down the notebook.
mse_poly_train = mean_squared_error(y_true=y_train, y_pred=y_poly_pred_train)
mse_poly_test = mean_squared_error(y_true=y_test, y_pred=y_poly_pred_test)

r2_train = r2_score(y_true=y_train, y_pred=y_poly_pred_train)
r2_test = r2_score(y_true=y_test, y_pred=y_poly_pred_test)

print("R-squared for training set:", r2_train)
print("R-squared for testing set:", r2_test)

R-squared for training set: 0.8240533836277965
R-squared for testing set: -14.589246506505818


***kNN Model***

In [37]:
# Candidate neighbourhood sizes: the odd values 1, 3, ..., 19.
k_values = list(range(1, 21, 2))

# Score every candidate with 5-fold cross-validation on the training split only.
# Using the full X, y here would let the held-out test rows influence the choice
# of k and make the later test-set score optimistically biased.
# NOTE(review): kNN works on raw feature distances; if the columns are on very
# different scales, standardizing them first may be worthwhile — confirm.
cv_scores = []
for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    cv_scores.append(-scores.mean())  # scorer returns negated MSE; flip the sign

# Lowest mean cross-validated MSE wins.
optimal_k = k_values[np.argmin(cv_scores)]
print("Optimal k value:", optimal_k)

Optimal k value: 3


In [38]:
# Fit the k-nearest-neighbours regressor with the k chosen by cross-validation.
knn_reg = KNeighborsRegressor(n_neighbors=optimal_k).fit(X_train, y_train)

# Predictions for both splits, in (train, test) order.
y_knn_pred_train, y_knn_pred_test = (
    knn_reg.predict(features) for features in (X_train, X_test)
)

# MSE values are kept for the RMSE comparison further down the notebook.
mse_knn_train = mean_squared_error(y_true=y_train, y_pred=y_knn_pred_train)
mse_knn_test = mean_squared_error(y_true=y_test, y_pred=y_knn_pred_test)

r2_train = r2_score(y_true=y_train, y_pred=y_knn_pred_train)
r2_test = r2_score(y_true=y_test, y_pred=y_knn_pred_test)

print("R-squared for training set:", r2_train)
print("R-squared for testing set:", r2_test)

R-squared for training set: 0.6695605764451381
R-squared for testing set: 0.2923902969457144


***Decision Tree Model***

In [39]:
# Depth-limited regression tree; the fixed seed keeps the fit repeatable.
dt_reg = DecisionTreeRegressor(max_depth=10, random_state=42)
dt_reg = dt_reg.fit(X_train, y_train)  # fit() returns the estimator itself

# Predictions for both splits, in (train, test) order.
y_dt_pred_train, y_dt_pred_test = map(dt_reg.predict, (X_train, X_test))

# MSE values are kept for the RMSE comparison further down the notebook.
mse_dt_train = mean_squared_error(y_true=y_train, y_pred=y_dt_pred_train)
mse_dt_test = mean_squared_error(y_true=y_test, y_pred=y_dt_pred_test)

r2_train = r2_score(y_true=y_train, y_pred=y_dt_pred_train)
r2_test = r2_score(y_true=y_test, y_pred=y_dt_pred_test)

print("R-squared for training set:", r2_train)
print("R-squared for testing set:", r2_test)

R-squared for training set: 0.8535561192338307
R-squared for testing set: 0.6566421608757356


In [40]:
# Convert each model's stored MSE into RMSE (square root of the MSE),
# train and test together per model.
rmse_poly_train, rmse_poly_test = map(np.sqrt, (mse_poly_train, mse_poly_test))
rmse_knn_train, rmse_knn_test = map(np.sqrt, (mse_knn_train, mse_knn_test))
rmse_dt_train, rmse_dt_test = map(np.sqrt, (mse_dt_train, mse_dt_test))

# Report in a fixed order: polynomial regression, kNN, decision tree.
print(f'Polynomial Regression RMSE on Training Set: {rmse_poly_train}')
print(f'Polynomial Regression RMSE on Test Set: {rmse_poly_test}')

print(f'kNN RMSE on Training Set: {rmse_knn_train}')
print(f'kNN RMSE on Test Set: {rmse_knn_test}')

print(f'Decision Tree RMSE on Training Set: {rmse_dt_train}')
print(f'Decision Tree RMSE on Test Set: {rmse_dt_test}')

Polynomial Regression RMSE on Training Set: 13240.06171979727
Polynomial Regression RMSE on Test Set: 120313.81068190443
kNN RMSE on Training Set: 18144.519754946446
kNN RMSE on Test Set: 25633.02197863929
Decision Tree RMSE on Training Set: 12079.11634939182
Decision Tree RMSE on Test Set: 17855.680393569648


In [47]:
# Bundle the fitted decision tree with the training column labels and pickle
# both into a single file.
model_and_columns = {'model': dt_reg, 'columns': X_train.columns}

# A context manager guarantees the file handle is flushed and closed even if
# pickling raises; passing a bare open(...) to pickle.dump leaves it open.
with open('decision_tree_with_columns.sav', 'wb') as model_file:
    pickle.dump(model_and_columns, model_file)