In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import random



from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR

from xgboost import XGBRegressor

import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve

from math import sqrt

In [2]:
# Loading the dataset
df = pd.read_csv("../../Dataset/cleaned_used_cars_v3.csv")
print(df.columns)


Index(['seller', 'offerType', 'price', 'abtest', 'gearbox', 'powerPS',
       'kilometer', 'notRepairedDamage', 'vehicleType.andere',
       'vehicleType.bus',
       ...
       'model.yeti', 'model.ypsilon', 'model.z_reihe', 'model.zafira',
       'year_range.[1991, 1996)', 'year_range.[1996, 2001)',
       'year_range.[2001, 2006)', 'year_range.[2006, 2011)',
       'year_range.[2011, 2016)', 'year_range.[2016, 2021)'],
      dtype='object', length=313)


In [3]:
X = df.drop(['price'], axis=1)
y = df['price']
# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
models = {
    "Linear Regressor":LinearRegression(),
    "Ridge Regressor":Ridge(.0001),

}

In [10]:
# Looping through each model, fitting it to the training data, and evaluating its performance
for name, model in models.items():
    # Fitting the model to the training data
    history=model.fit(X_train, y_train)
    
    # Making predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluating the model's performance using mean squared error and R-squared value
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name}: Mean squared error: {mse}, R-squared value: {r2}")
    print('mean_absolute_error = ', mean_absolute_error(y_test,y_pred))
    print('R2 score is ',r2_score(y_test, y_pred))
    print('mean_squared_error is ', mean_squared_error(y_test, y_pred))
    rmse = sqrt(mse)
    print('Root mean squared error:', rmse)
     # Printing a sample of predicted and actual prices
    print("Sample of predicted and actual prices:")
    sample_indices = random.sample(range(len(y_test)), 5)
    for i in sample_indices:
        print(f"Predicted price: {y_pred[i]}, Actual price: {y_test.iloc[i]}")
        

    # Evaluating the model's performance using cross-validation
    scores = cross_val_score(model, X, y, cv=5, scoring="neg_mean_squared_error")
    print(f"Cross-validation scores for {name}: {np.sqrt(-scores)}, Mean RMSE: {np.sqrt(-scores.mean())}\n")

Linear Regressor: Mean squared error: 0.003066635258874616, R-squared value: 0.7805515217680901
mean_absolute_error =  0.04023987693837992
R2 score is  0.7805515217680901
mean_squared_error is  0.003066635258874616
Root mean squared error: 0.05537720883968978
Sample of predicted and actual prices:
Predicted price: 2.102360725402832, Actual price: 2.1179272289544824
Predicted price: 2.278902530670166, Actual price: 2.264955957558561
Predicted price: 2.257345199584961, Actual price: 2.312708916871605
Predicted price: 2.0802364349365234, Actual price: 2.1179272289544824
Predicted price: 2.3527674674987793, Actual price: 2.365427987015708
Cross-validation scores for Linear Regressor: [5.51669481e-02 5.62018850e-02 9.24967582e+00 1.18529300e+08
 5.58054347e-02], Mean RMSE: 53007914.27594136

Ridge Regressor: Mean squared error: 0.0030666252541725635, R-squared value: 0.7805522377047618
mean_absolute_error =  0.04024121036586947
R2 score is  0.7805522377047618
mean_squared_error is  0.003066

In [6]:
from sklearn import linear_model
reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
reg.fit(X_train, y_train)

In [7]:
reg.alpha_

0.1

In [8]:
y_pred = reg.predict(X_test)
    
# Evaluating the model's performance using mean squared error and R-squared value
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"{name}: Mean squared error: {mse}, R-squared value: {r2}")
print('mean_absolute_error = ', mean_absolute_error(y_test,y_pred))
print('R2 score is ',r2_score(y_test, y_pred))
print('mean_squared_error is ', mean_squared_error(y_test, y_pred))
 # Printing a sample of predicted and actual prices
print("Sample of predicted and actual prices:")
sample_indices = random.sample(range(len(y_test)), 5)
for i in sample_indices:
    print(f"Predicted price: {y_pred[i]}, Actual price: {y_test.iloc[i]}")
    
# Evaluating the model's performance using cross-validation
scores = cross_val_score(model, X, y, cv=5, scoring="neg_mean_squared_error")
print(f"Cross-validation scores for {name}: {np.sqrt(-scores)}, Mean RMSE: {np.sqrt(-scores.mean())}\n")

Ridge Regressor: Mean squared error: 0.003066793156190158, R-squared value: 0.7805402226331466
mean_absolute_error =  0.0402420481100382
R2 score is  0.7805402226331466
mean_squared_error is  0.003066793156190158
Sample of predicted and actual prices:
Predicted price: 2.023781023261062, Actual price: 1.9763847732858
Predicted price: 2.057395802105476, Actual price: 1.9763847732858
Predicted price: 2.3220376222173145, Actual price: 2.3000563597136088
Predicted price: 2.2787986880922553, Actual price: 2.214933939131339
Predicted price: 2.243343389275317, Actual price: 2.2921070499349456
Cross-validation scores for Ridge Regressor: [0.05516555 0.05620015 0.05596832 0.05555538 0.05580291], Mean RMSE: 0.05573959624905057

