### **Import Libraries**

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

### **Data Loading and Overview**

In [4]:
df=pd.read_csv("../data/processed/Prepared_turbo_az.csv")

In [5]:
df.head()

Unnamed: 0,Price,Make,Year,Color,Kilometer,Transmission,New,Engine_Size,Horsepower,Fuel_Type,...,GLS 450 4MATIC,Grandeur,K5,Santa Fe,Sonata,Sorento,Sportage,Tucson,X5,X7
0,72100.0,2,2024,3,-0.860128,0,0,0.5,0.0,0,...,0,0,1,0,0,0,0,0,0,0
1,41000.0,2,2020,9,1.54823,0,1,-0.4,-0.12963,0,...,0,0,1,0,0,0,0,0,0,0
2,24500.0,2,2019,4,1.115507,3,1,0.0,-0.435185,0,...,0,0,0,0,0,0,0,0,0,0
3,321100.0,3,2019,8,-0.109471,0,1,2.0,3.62037,0,...,0,0,0,0,0,0,0,0,0,0
4,79900.0,0,2022,8,-0.469161,0,1,0.0,0.907407,3,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1391 entries, 0 to 1390
Data columns (total 34 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Price           1391 non-null   float64
 1   Make            1391 non-null   int64  
 2   Year            1391 non-null   int64  
 3   Color           1391 non-null   int64  
 4   Kilometer       1391 non-null   float64
 5   Transmission    1391 non-null   int64  
 6   New             1391 non-null   int64  
 7   Engine_Size     1391 non-null   float64
 8   Horsepower      1391 non-null   float64
 9   Fuel_Type       1391 non-null   int64  
 10  330             1391 non-null   int64  
 11  520             1391 non-null   int64  
 12  530             1391 non-null   int64  
 13  530e            1391 non-null   int64  
 14  Accent          1391 non-null   int64  
 15  Carnival        1391 non-null   int64  
 16  Cerato          1391 non-null   int64  
 17  E 220 d         1391 non-null   i

### **Model Training and Evaluation**

In [7]:
X = df.drop("Price", axis=1)
y = df.Price

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [9]:
def regression_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)


    print(f"      MAE: {mae:.2f}")
    print(f"      MSE: {mse:.2f}")
    print(f"      RMSE: {rmse:.2f}")
    print(f"      R²: {r2:.4f}")

In [10]:
model = LinearRegression()

model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("=== Linear Regression (Training Set) ===")
regression_metrics(y_train, y_train_pred)
print('-' * 50)
print("=== Linear Regression (Test Set) ===")
regression_metrics(y_test, y_test_pred)


=== Linear Regression (Training Set) ===
      MAE: 11407.26
      MSE: 346276839.87
      RMSE: 18608.52
      R²: 0.9299
--------------------------------------------------
=== Linear Regression (Test Set) ===
      MAE: 12892.31
      MSE: 350324604.62
      RMSE: 18716.96
      R²: 0.9279


In [11]:
model = KNeighborsRegressor()

param_grid = {
    'n_neighbors': [4, 5, 6],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

print(f"Best Parameters: {grid_search.best_params_}")
print("\n=== K-Nearest Neighbors (Training Set) ===")
regression_metrics(y_train, y_train_pred)
print('-' * 50)
print("=== K-Nearest Neighbors (Test Set) ===")
regression_metrics(y_test, y_test_pred)

Best Parameters: {'algorithm': 'auto', 'n_neighbors': 4, 'weights': 'distance'}

=== K-Nearest Neighbors (Training Set) ===
      MAE: 262.54
      MSE: 4336631.51
      RMSE: 2082.46
      R²: 0.9991
--------------------------------------------------
=== K-Nearest Neighbors (Test Set) ===
      MAE: 7095.03
      MSE: 122505841.10
      RMSE: 11068.24
      R²: 0.9748


In [12]:
model = DecisionTreeRegressor(random_state=0)

param_grid = {
    "max_depth": [5, 10, 15],
    "min_samples_split": [5, 10, 15],
    "min_samples_leaf": [2, 5, 10] 
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)
print(f"Best Parameters: {grid_search.best_params_}")
print("\n=== Decision Tree Regression (Training Set) ===")
regression_metrics(y_train, y_train_pred)
print('-' * 50)
print("=== Decision Tree Regression (Test Set) ===")
regression_metrics(y_test, y_test_pred)

Best Parameters: {'max_depth': 15, 'min_samples_leaf': 5, 'min_samples_split': 15}

=== Decision Tree Regression (Training Set) ===
      MAE: 6301.02
      MSE: 196140787.11
      RMSE: 14005.03
      R²: 0.9603
--------------------------------------------------
=== Decision Tree Regression (Test Set) ===
      MAE: 8805.81
      MSE: 222942017.72
      RMSE: 14931.24
      R²: 0.9541


In [13]:
model = RandomForestRegressor(random_state=0)

param_grid = {
    "n_estimators": [200, 300],
    "max_depth": [15, 20, None],
    "min_samples_split": [2, 5, 8],
    "min_samples_leaf": [1, 2, 3],
    "max_features": ["sqrt", "log2"]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1,)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

print(f"Best Parameters: {grid_search.best_params_}")
print("\n=== Random Forest (Training Set) ===")
regression_metrics(y_train, y_train_pred)
print('-' * 50)
print("=== Random Forest (Test Set) ===")
regression_metrics(y_test, y_test_pred)


Best Parameters: {'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

=== Random Forest (Training Set) ===
      MAE: 3296.69
      MSE: 39325557.46
      RMSE: 6271.01
      R²: 0.9920
--------------------------------------------------
=== Random Forest (Test Set) ===
      MAE: 6809.77
      MSE: 123882781.59
      RMSE: 11130.26
      R²: 0.9745


In [None]:
base_models = [
    #('linear_reg', LinearRegression()),
    ('knn', KNeighborsRegressor(n_neighbors=4, weights='distance', algorithm='auto')),
    ('dt', DecisionTreeRegressor(random_state=0, max_depth=15, min_samples_split=15, min_samples_leaf=5)),
    ('rf', RandomForestRegressor(random_state=0, n_estimators=200, max_depth=15, min_samples_split=2, min_samples_leaf=1, max_features='sqrt'))
]

meta_model = LinearRegression()

stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)
stacking_model.fit(X_train, y_train)

y_train_pred = stacking_model.predict(X_train)
y_test_pred = stacking_model.predict(X_test)

print("\n=== Stacking Regression (Training Set) ===")
regression_metrics(y_train, y_train_pred)
print('-' * 50)
print("=== Stacking Regression (Test Set) ===")
regression_metrics(y_test, y_test_pred)


=== Stacking Regression (Training Set) ===
      MAE: 3308.25
      MSE: 37392741.10
      RMSE: 6114.96
      R²: 0.9924
--------------------------------------------------
=== Stacking Regression (Test Set) ===
      MAE: 6714.14
      MSE: 121101077.96
      RMSE: 11004.59
      R²: 0.9751
