In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import math

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.feature_selection import VarianceThreshold
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import RFE

In [6]:
from WinesDatasetCleaning import wine_dataset_cleaning as wdc
wines_dataset = wdc()

In [7]:
# Dropping unnecessary columns:
wines_dataset.drop(columns=["country",
                            "apellation",
                            "taster_name",
                            "taster_twitter_handle",
                            "title",
                            "variety",
                            "winery",
                            "noble_international",
                            "monovarietal",
                            "taste_alcohol",
                            "primary_flavors",
                            "title_new"], inplace=True)

In [8]:
iqr = np.percentile(wines_dataset["vintage"], 75)-np.percentile(wines_dataset["vintage"], 25)
lower_limit = np.percentile(wines_dataset["vintage"], 25)-3*iqr

In [9]:
wines_predictor_v0 = wines_dataset[wines_dataset["vintage"] >= lower_limit]

In [10]:
# Defining the dependent variable:
y_v0 = wines_predictor_v0["price_usd"]

In [11]:
# Defining the independent variables:
X_v0 = wines_predictor_v0.drop(columns=["price_usd"])

X_num_v0 = X_v0.select_dtypes(np.number)     # It takes all "numerical" variables
X_cat_v0 = X_v0.select_dtypes(object)     # It takes all "categorical" variables

In [12]:
# Scaling "numerical" variables:
scaler_num_v0 = MinMaxScaler().fit_transform(X_num_v0)

In [13]:
# Getting dummies for "categorical" variables:
dummies_cat_v0 = pd.get_dummies(X_cat_v0, drop_first=True)

In [14]:
# Concatenating both "X_num" and "X_cat" variables:
X_v0 = np.concatenate((scaler_num_v0, dummies_cat_v0), axis=1)

In [15]:
# Splitting data between "train" and "test":
X_train, X_test, y_train, y_test = train_test_split(X_v0, y_v0, train_size=0.8, random_state=42)

# Linear Regression (Simple)_v0

In [16]:
# Defining the model:
lm = LinearRegression()

# Fitting the model:
lm.fit(X_train, y_train)

# Making predictions:
y_pred_lm = lm.predict(X_test)

# Evaluating performance of the model:
mse_lm = mean_squared_error(y_test, y_pred_lm)
r2_lm = r2_score(y_test, y_pred_lm)

# Printing the performance's results:
print("📘 Linear Regression (Simple):")
print(f"MSE (Mean Squared Error): {mse_lm:.4f}")
print(f"R2 (Coefficient of Determination): {r2_lm:.4f}")

📘 Linear Regression (Simple):
MSE (Mean Squared Error): 1499.2795
R2 (Coefficient of Determination): 0.2004


# Linear Regression (Ridge)_v0

In [17]:
# Defining the parameter grid (check the model's documentation to know the exact names of the hyperparameters):
param_grid_ridge = {
    "alpha": [0.01, 0.1, 1, 10, 100],
    "solver": ["auto", "sag", "lsqr"],
    "max_iter": [1000, 5000, 10000]}

# Creating GridSearchCV:
grid_search_ridge = GridSearchCV(
    estimator=Ridge(),     # Model we want to fit
    param_grid=param_grid_ridge,     # Hyperparameters grid
    cv=5,     # Number of Cross-Validation folds
    scoring="r2",     # Scoring method we'll use to determine the best hyperparameters
	verbose=1)     # It prints the results at each step

# Performing Grid Search:
grid_search_ridge.fit(X_train, y_train)

# Returning Best parameters & Best score:
print("Ridge's Best Parameters:", grid_search_ridge.best_params_)
print("Ridge's Best Score:", grid_search_ridge.best_score_)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Ridge's Best Parameters: {'alpha': 0.1, 'max_iter': 5000, 'solver': 'sag'}
Ridge's Best Score: 0.20778502764636383


In [18]:
# Defining the model with the best parameters:
ridge = Ridge(alpha=1, max_iter=1000, solver="sag")

# Fitting the model:
ridge.fit(X_train, y_train)

# Making predictions:
y_pred_ridge = ridge.predict(X_test)

# Evaluating performance of the model:
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

# Printing the performance's results:
print("📘 Linear Regression (Ridge):")
print(f"MSE (Mean Squared Error): {mse_ridge:.4f}")
print(f"R2 (Coefficient of Determination): {r2_ridge:.4f}")

📘 Linear Regression (Ridge):
MSE (Mean Squared Error): 1499.2934
R2 (Coefficient of Determination): 0.2004


# Linear Regression (Lasso)_v0

In [19]:
# Defining the parameter grid (check the model's documentation to know the exact names of the hyperparameters):
param_grid_lasso = {
    "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
    "tol": [1e-4, 1e-5, 1e-6],
    "max_iter": [1000, 5000, 10000]}

# Creating GridSearchCV:
grid_search_lasso = GridSearchCV(
    estimator=Lasso(),     # Model we want to fit
    param_grid=param_grid_lasso,     # Hyperparameters grid
    cv=5,     # Number of Cross-Validation folds
    scoring="r2",     # Scoring method we'll use to determine the best hyperparameters
	verbose=1)     # It prints the results at each step

# Performing Grid Search:
grid_search_lasso.fit(X_train, y_train)

# Returning Best parameters & Best score:
print("Lasso's Best Parameters:", grid_search_lasso.best_params_)
print("Lasso's Best Score:", grid_search_lasso.best_score_)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Lasso's Best Parameters: {'alpha': 0.0001, 'max_iter': 1000, 'tol': 1e-06}
Lasso's Best Score: 0.20778439478417782


In [20]:
# Defining the model with the best parameters:
lasso = Lasso(alpha=0.0001, max_iter=1000, tol=0.000001)

# Fitting the model:
lasso.fit(X_train, y_train)

# Making predictions:
y_pred_lasso = lasso.predict(X_test)

# Evaluating performance of the model:
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

# Printing the performance's results:
print("📘 Linear Regression (Lasso):")
print(f"MSE (Mean Squared Error): {mse_lasso:.4f}")
print(f"R2 (Coefficient of Determination): {r2_lasso:.4f}")

📘 Linear Regression (Lasso):
MSE (Mean Squared Error): 1499.2809
R2 (Coefficient of Determination): 0.2004


# Support Vector Regressor_v0

In [21]:
""" # Defining the parameter grid (check the model's documentation to know the exact names of the hyperparameters):
param_grid_svr = {
    "C": [0.1, 1, 10, 100],
    "epsilon": [0.001, 0.01, 0.1, 0.5],
    "kernel": ["rbf", "linear", "sigmoid"]}

# Creating GridSearchCV:
grid_search_svr = GridSearchCV(
    estimator=SVR(),     # Model we want to fit
    param_grid=param_grid_svr,     # Hyperparameters grid
    cv=5,     # Number of Cross-Validation folds
    scoring="r2",     # Scoring method we'll use to determine the best hyperparameters
	verbose=1)     # It prints the results at each step

# Performing Grid Search:
grid_search_svr.fit(X_train, y_train)

# Returning Best parameters & Best score:
print("SVR's Best Parameters:", grid_search_svr.best_params_)
print("SVR's Best Score:", grid_search_svr.best_score_) """

' # Defining the parameter grid (check the model\'s documentation to know the exact names of the hyperparameters):\nparam_grid_svr = {\n    "C": [0.1, 1, 10, 100],\n    "epsilon": [0.001, 0.01, 0.1, 0.5],\n    "kernel": ["rbf", "linear", "sigmoid"]}\n\n# Creating GridSearchCV:\ngrid_search_svr = GridSearchCV(\n    estimator=SVR(),     # Model we want to fit\n    param_grid=param_grid_svr,     # Hyperparameters grid\n    cv=5,     # Number of Cross-Validation folds\n    scoring="r2",     # Scoring method we\'ll use to determine the best hyperparameters\n\tverbose=1)     # It prints the results at each step\n\n# Performing Grid Search:\ngrid_search_svr.fit(X_train, y_train)\n\n# Returning Best parameters & Best score:\nprint("SVR\'s Best Parameters:", grid_search_svr.best_params_)\nprint("SVR\'s Best Score:", grid_search_svr.best_score_) '

In [22]:
# Defining the model with the best parameters:
svr = SVR(C=100, epsilon=0.5, kernel="rbf")

# Fitting the model:
svr.fit(X_train, y_train)

# Making predictions:
y_pred_svr = svr.predict(X_test)

# Evaluating performance of the model:
mse_svr = mean_squared_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

# Printing the performance's results:
print("📘 Support Vector Regression:")
print(f"MSE (Mean Squared Error): {mse_svr:.4f}")
print(f"R2 (Coefficient of Determination): {r2_svr:.4f}")

📘 Support Vector Regression:
MSE (Mean Squared Error): 1421.2532
R2 (Coefficient of Determination): 0.2420


# Random Forest Regressor_v0

In [23]:
""" # Defining the parameter grid (check the model's documentation to know the exact names of the hyperparameters):
param_grid_rfr = {
    "n_estimators": [100, 300, 500],
    "max_depth": [10, 20],
    "min_samples_leaf": [1, 2, 4]}

# Creating GridSearchCV:
grid_search_rfr = GridSearchCV(
    estimator=RandomForestRegressor(),     # Model we want to fit
    param_grid=param_grid_rfr,     # Hyperparameters grid
    cv=5,     # Number of Cross-Validation folds
    scoring="r2",     # Scoring method we'll use to determine the best hyperparameters
	verbose=1)     # It prints the results at each step

# Performing Grid Search:
grid_search_rfr.fit(X_train, y_train)

# Returning Best parameters & Best score:
print("RFR's Best Parameters:", grid_search_rfr.best_params_)
print("RFR's Best Score:", grid_search_rfr.best_score_) """

' # Defining the parameter grid (check the model\'s documentation to know the exact names of the hyperparameters):\nparam_grid_rfr = {\n    "n_estimators": [100, 300, 500],\n    "max_depth": [10, 20],\n    "min_samples_leaf": [1, 2, 4]}\n\n# Creating GridSearchCV:\ngrid_search_rfr = GridSearchCV(\n    estimator=RandomForestRegressor(),     # Model we want to fit\n    param_grid=param_grid_rfr,     # Hyperparameters grid\n    cv=5,     # Number of Cross-Validation folds\n    scoring="r2",     # Scoring method we\'ll use to determine the best hyperparameters\n\tverbose=1)     # It prints the results at each step\n\n# Performing Grid Search:\ngrid_search_rfr.fit(X_train, y_train)\n\n# Returning Best parameters & Best score:\nprint("RFR\'s Best Parameters:", grid_search_rfr.best_params_)\nprint("RFR\'s Best Score:", grid_search_rfr.best_score_) '

In [24]:
# Defining the model:
rfr = RandomForestRegressor(n_estimators=500, max_depth=10, min_samples_leaf=4, random_state=42) 

# Fitting the model:
rfr.fit(X_train, y_train)

# Making predictions:
y_pred_rfr = rfr.predict(X_test)

# Evaluating performance of the model:
mse_rfr = mean_squared_error(y_test, y_pred_rfr)
r2_rfr = r2_score(y_test, y_pred_rfr)

# Printing the performance's results:
print("📘 Random Forest Regression:")
print(f"MSE (Mean Squared Error): {mse_rfr:.4f}")
print(f"R2 (Coefficient of Determination): {r2_rfr:.4f}")

📘 Random Forest Regression:
MSE (Mean Squared Error): 1262.1406
R2 (Coefficient of Determination): 0.3269


# Applying VIF for Multicollinearity

In [25]:
wines_vif = wines_dataset.drop(columns=["wine_type"])

In [26]:
def calcular_vif(wines_vif):
    """Calcula el VIF para cada variable en el DataFrame."""
    vif_data = pd.DataFrame()
    vif_data["Variable"] = wines_vif.columns
    vif_data["VIF"] = [variance_inflation_factor(wines_vif.values, i) for i in range(wines_vif.shape[1])]
    return vif_data

def eliminar_variables_vif(wines_vif, umbral=10):
    """
    Elimina las variables con el mayor VIF de forma iterativa, 
    hasta que todas las variables tengan un VIF inferior al umbral.
    """
    iteracion = 1
    while True:
        print(f"\n--- Iteración {iteracion} ---")
        

        vif_data = calcular_vif(wines_vif)
        print(vif_data)
        
        max_vif = vif_data["VIF"].max()
        if max_vif < umbral:
            print("\n✅ Todas las variables tienen VIF menor que", umbral)
            break
        

        variable_a_eliminar = vif_data.loc[vif_data["VIF"].idxmax(), "Variable"]
        print(f"\n⚠️ Se eliminará la variable con mayor VIF: {variable_a_eliminar} (VIF={max_vif})")
        
        wines_vif = wines_vif.drop(columns=[variable_a_eliminar])
        
        iteracion += 1
        
    print("\n📊 Proceso finalizado. Variables restantes:")
    print(wines_vif.columns)
    return wines_vif, vif_data

data_final, vif_final = eliminar_variables_vif(wines_vif.drop(["price_usd"], axis=1), umbral=10)


--- Iteración 1 ---
           Variable          VIF
0            points   856.068218
1         avg_abv_%   261.263612
2  avg_serve_temp_c    73.843123
3   taste_dry-sweet    23.711092
4        taste_body    52.478985
5     taste_tannins    24.319699
6     taste_acidity    39.716134
7           vintage  1251.634174

⚠️ Se eliminará la variable con mayor VIF: vintage (VIF=1251.6341736933716)

--- Iteración 2 ---
           Variable         VIF
0            points  279.062722
1         avg_abv_%  192.465820
2  avg_serve_temp_c   73.788523
3   taste_dry-sweet   22.483527
4        taste_body   52.451080
5     taste_tannins   24.289661
6     taste_acidity   32.834957

⚠️ Se eliminará la variable con mayor VIF: points (VIF=279.06272182856196)

--- Iteración 3 ---
           Variable        VIF
0         avg_abv_%  69.958262
1  avg_serve_temp_c  73.219334
2   taste_dry-sweet  18.228289
3        taste_body  52.401812
4     taste_tannins  24.247313
5     taste_acidity  16.141179

⚠️ Se elimina

In [27]:
# Defining the dependent variable:
y_v2 = wines_predictor_v0["price_usd"]

In [28]:
# Defining the independent variables according to VIF:
X_v2 = wines_predictor_v0.drop(columns=["price_usd", "points", "avg_abv_%", "avg_serve_temp_c", "taste_body", "taste_acidity", "vintage"])

X_num_v2 = X_v2.select_dtypes(np.number)     # It takes all "numerical" variables
X_cat_v2 = X_v2.select_dtypes(object)     # It takes all "categorical" variables

In [29]:
# Scaling "numerical" variables:
scaler_num_v2 = MinMaxScaler().fit_transform(X_num_v2)

In [30]:
# Getting dummies for "categorical" variables:
dummies_cat_v2 = pd.get_dummies(X_cat_v2, drop_first=True)

In [31]:
# Concatenating both "X_num" and "X_cat" variables:
X_v2 = np.concatenate((scaler_num_v2, dummies_cat_v2), axis=1)

In [32]:
# Splitting data between "train" and "test":
X_train, X_test, y_train, y_test = train_test_split(X_v2, y_v2, train_size=0.8, random_state=42)

# Linear Regression (Simple)_v2

In [33]:
# Defining the model:
lm = LinearRegression()

# Fitting the model:
lm.fit(X_train, y_train)

# Making predictions:
y_pred_lm = lm.predict(X_test)

# Evaluating performance of the model:
mse_lm = mean_squared_error(y_test, y_pred_lm)
r2_lm = r2_score(y_test, y_pred_lm)

# Printing the performance's results:
print("📘 Linear Regression (Simple):")
print(f"MSE (Mean Squared Error): {mse_lm:.4f}")
print(f"R2 (Coefficient of Determination): {r2_lm:.4f}")

📘 Linear Regression (Simple):
MSE (Mean Squared Error): 1813.5374
R2 (Coefficient of Determination): 0.0328


# Linear Regression (Ridge)_v2

In [34]:
# Defining the parameter grid (check the model's documentation to know the exact names of the hyperparameters):
param_grid_ridge = {
    "alpha": [0.01, 0.1, 1, 10, 100],
    "solver": ["auto", "sag", "lsqr"],
    "max_iter": [1000, 5000, 10000]}

# Creating GridSearchCV:
grid_search_ridge = GridSearchCV(
    estimator=Ridge(),     # Model we want to fit
    param_grid=param_grid_ridge,     # Hyperparameters grid
    cv=5,     # Number of Cross-Validation folds
    scoring="r2",     # Scoring method we'll use to determine the best hyperparameters
	verbose=1)     # It prints the results at each step

# Performing Grid Search:
grid_search_ridge.fit(X_train, y_train)

# Returning Best parameters & Best score:
print("Ridge's Best Parameters:", grid_search_ridge.best_params_)
print("Ridge's Best Score:", grid_search_ridge.best_score_)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Ridge's Best Parameters: {'alpha': 10, 'max_iter': 10000, 'solver': 'sag'}
Ridge's Best Score: 0.03137399208245171


In [35]:
# Defining the model with the best parameters:
ridge = Ridge(alpha=10, max_iter=1000, solver="sag")

# Fitting the model:
ridge.fit(X_train, y_train)

# Making predictions:
y_pred_ridge = ridge.predict(X_test)

# Evaluating performance of the model:
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

# Printing the performance's results:
print("📘 Linear Regression (Ridge):")
print(f"MSE (Mean Squared Error): {mse_ridge:.4f}")
print(f"R2 (Coefficient of Determination): {r2_ridge:.4f}")

📘 Linear Regression (Ridge):
MSE (Mean Squared Error): 1813.5590
R2 (Coefficient of Determination): 0.0328


# Linear Regression (Lasso)_v2

In [36]:
# Defining the parameter grid (check the model's documentation to know the exact names of the hyperparameters):
param_grid_lasso = {
    "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
    "tol": [1e-4, 1e-5, 1e-6],
    "max_iter": [1000, 5000, 10000]}

# Creating GridSearchCV:
grid_search_lasso = GridSearchCV(
    estimator=Lasso(),     # Model we want to fit
    param_grid=param_grid_lasso,     # Hyperparameters grid
    cv=5,     # Number of Cross-Validation folds
    scoring="r2",     # Scoring method we'll use to determine the best hyperparameters
	verbose=1)     # It prints the results at each step

# Performing Grid Search:
grid_search_lasso.fit(X_train, y_train)

# Returning Best parameters & Best score:
print("Lasso's Best Parameters:", grid_search_lasso.best_params_)
print("Lasso's Best Score:", grid_search_lasso.best_score_)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Lasso's Best Parameters: {'alpha': 0.001, 'max_iter': 1000, 'tol': 1e-06}
Lasso's Best Score: 0.03137470788644492


In [37]:
# Defining the model with the best parameters:
lasso = Lasso(alpha=0.001, max_iter=1000, tol=0.000001)

# Fitting the model:
lasso.fit(X_train, y_train)

# Making predictions:
y_pred_lasso = lasso.predict(X_test)

# Evaluating performance of the model:
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

# Printing the performance's results:
print("📘 Linear Regression (Lasso):")
print(f"MSE (Mean Squared Error): {mse_lasso:.4f}")
print(f"R2 (Coefficient of Determination): {r2_lasso:.4f}")

📘 Linear Regression (Lasso):
MSE (Mean Squared Error): 1813.5546
R2 (Coefficient of Determination): 0.0328


# Support Vector Regressor_v2

In [38]:
# Defining the model with the best parameters:
svr = SVR(C=100, epsilon=0.5, kernel="rbf")

# Fitting the model:
svr.fit(X_train, y_train)

# Making predictions:
y_pred_svr = svr.predict(X_test)

# Evaluating performance of the model:
mse_svr = mean_squared_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

# Printing the performance's results:
print("📘 Support Vector Regression:")
print(f'MSE (Mean Squared Error): {mse_svr:.4f}')
print(f'R2 (Coefficient of Determination): {r2_svr:.4f}')

📘 Support Vector Regression:
MSE (Mean Squared Error): 1845.1639
R2 (Coefficient of Determination): 0.0160


# Random Forest Regressor_v2

In [39]:
# Defining the model:
rfr = RandomForestRegressor(n_estimators=500, max_depth=10, min_samples_leaf=4, random_state=42) 

# Fitting the model:
rfr.fit(X_train, y_train)

# Making predictions:
y_pred_rfr = rfr.predict(X_test)

# Evaluating performance of the model:
mse_rfr = mean_squared_error(y_test, y_pred_rfr)
r2_rfr = r2_score(y_test, y_pred_rfr)

# Printing the performance's results:
print("📘 Random Forest Regression:")
print(f"MSE (Mean Squared Error): {mse_rfr:.4f}")
print(f"R2 (Coefficient of Determination): {r2_rfr:.4f}")

📘 Random Forest Regression:
MSE (Mean Squared Error): 1774.6060
R2 (Coefficient of Determination): 0.0536


# Trying a different approach...

In [40]:
wines_filtered = wines_dataset[wines_dataset["price_usd"]<=75]

In [41]:
iqr_filtered = np.percentile(wines_filtered["vintage"], 75)-np.percentile(wines_filtered["vintage"], 25)
lower_limit_filtered = np.percentile(wines_filtered["vintage"], 25)-3*iqr_filtered

In [42]:
wines_predictor_filtered = wines_filtered[wines_filtered["vintage"] >= lower_limit_filtered]

In [43]:
# Defining the dependent variable:
y_v3 = wines_predictor_filtered["price_usd"]

In [44]:
# Defining the independent variables:
X_v3 = wines_predictor_filtered.drop(columns=["price_usd"])

X_num_v3 = X_v3.select_dtypes(np.number)     # It takes all "numerical" variables
X_cat_v3 = X_v3.select_dtypes(object)     # It takes all "categorical" variables

In [45]:
# Scaling "numerical" variables:
scaler_num_v3 = MinMaxScaler().fit_transform(X_num_v3)

In [46]:
# Getting dummies for "categorical" variables:
dummies_cat_v3 = pd.get_dummies(X_cat_v3, drop_first=True)

In [47]:
# Concatenating both "X_num" and "X_cat" variables:
X_v3 = np.concatenate((scaler_num_v3, dummies_cat_v3), axis=1)

In [48]:
# Splitting data between "train" and "test":
X_train, X_test, y_train, y_test = train_test_split(X_v3, y_v3, train_size=0.8, random_state=42)

# Linear Regression (Simple)_v3

In [49]:
# Defining the model:
lm = LinearRegression()

# Fitting the model:
lm.fit(X_train, y_train)

# Making predictions:
y_pred_lm = lm.predict(X_test)

# Evaluating performance of the model:
mse_lm = mean_squared_error(y_test, y_pred_lm)
r2_lm = r2_score(y_test, y_pred_lm)

# Printing the performance's results:
print("📘 Linear Regression (Simple):")
print(f"MSE (Mean Squared Error): {mse_lm:.4f}")
print(f"R2 (Coefficient of Determination): {r2_lm:.4f}")

📘 Linear Regression (Simple):
MSE (Mean Squared Error): 160.0123
R2 (Coefficient of Determination): 0.3869


# Linear Regression (Ridge)_v3

In [50]:
# Defining the parameter grid (check the model's documentation to know the exact names of the hyperparameters):
param_grid_ridge = {
    "alpha": [0.01, 0.1, 1, 10, 100],
    "solver": ["auto", "sag", "lsqr"],
    "max_iter": [1000, 5000, 10000]}

# Creating GridSearchCV:
grid_search_ridge = GridSearchCV(
    estimator=Ridge(),     # Model we want to fit
    param_grid=param_grid_ridge,     # Hyperparameters grid
    cv=5,     # Number of Cross-Validation folds
    scoring="r2",     # Scoring method we'll use to determine the best hyperparameters
	verbose=1)     # It prints the results at each step

# Performing Grid Search:
grid_search_ridge.fit(X_train, y_train)

# Returning Best parameters & Best score:
print("Ridge's Best Parameters:", grid_search_ridge.best_params_)
print("Ridge's Best Score:", grid_search_ridge.best_score_)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Ridge's Best Parameters: {'alpha': 0.1, 'max_iter': 10000, 'solver': 'sag'}
Ridge's Best Score: 0.3779636691090883


In [51]:
# Defining the model with the best parameters:
ridge = Ridge(alpha=1, max_iter=1000, solver="sag")

# Fitting the model:
ridge.fit(X_train, y_train)

# Making predictions:
y_pred_ridge = ridge.predict(X_test)

# Evaluating performance of the model:
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

# Printing the performance's results:
print("📘 Linear Regression (Ridge):")
print(f"MSE (Mean Squared Error): {mse_ridge:.4f}")
print(f"R2 (Coefficient of Determination): {r2_ridge:.4f}")

📘 Linear Regression (Ridge):
MSE (Mean Squared Error): 160.0136
R2 (Coefficient of Determination): 0.3869


# Linear Regression (Lasso)_v3

In [52]:
# Defining the parameter grid (check the model's documentation to know the exact names of the hyperparameters):
param_grid_lasso = {
    "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
    "tol": [1e-4, 1e-5, 1e-6],
    "max_iter": [1000, 5000, 10000]}

# Creating GridSearchCV:
grid_search_lasso = GridSearchCV(
    estimator=Lasso(),     # Model we want to fit
    param_grid=param_grid_lasso,     # Hyperparameters grid
    cv=5,     # Number of Cross-Validation folds
    scoring="r2",     # Scoring method we'll use to determine the best hyperparameters
	verbose=1)     # It prints the results at each step

# Performing Grid Search:
grid_search_lasso.fit(X_train, y_train)

# Returning Best parameters & Best score:
print("Lasso's Best Parameters:", grid_search_lasso.best_params_)
print("Lasso's Best Score:", grid_search_lasso.best_score_)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Lasso's Best Parameters: {'alpha': 0.0001, 'max_iter': 1000, 'tol': 1e-06}
Lasso's Best Score: 0.3779631483107007


In [53]:
# Defining the model with the best parameters:
lasso = Lasso(alpha=0.0001, max_iter=1000, tol=0.000001)

# Fitting the model:
lasso.fit(X_train, y_train)

# Making predictions:
y_pred_lasso = lasso.predict(X_test)

# Evaluating performance of the model:
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

# Printing the performance's results:
print("📘 Linear Regression (Lasso):")
print(f"MSE (Mean Squared Error): {mse_lasso:.4f}")
print(f"R2 (Coefficient of Determination): {r2_lasso:.4f}")

📘 Linear Regression (Lasso):
MSE (Mean Squared Error): 160.0126
R2 (Coefficient of Determination): 0.3869


# Support Vector Regressor_v3

In [54]:
# Defining the model with the best parameters:
svr = SVR(C=100, epsilon=0.5, kernel="rbf")

# Fitting the model:
svr.fit(X_train, y_train)

# Making predictions:
y_pred_svr = svr.predict(X_test)

# Evaluating performance of the model:
mse_svr = mean_squared_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

# Printing the performance's results:
print("📘 Support Vector Regression:")
print(f'MSE (Mean Squared Error): {mse_svr:.4f}')
print(f'R2 (Coefficient of Determination): {r2_svr:.4f}')

📘 Support Vector Regression:
MSE (Mean Squared Error): 146.1026
R2 (Coefficient of Determination): 0.4402


# Random Forest Regressor_v3

In [55]:
# Defining the model:
rfr = RandomForestRegressor(n_estimators=500, max_depth=10, min_samples_leaf=4, random_state=42) 

# Fitting the model:
rfr.fit(X_train, y_train)

# Making predictions:
y_pred_rfr = rfr.predict(X_test)

# Evaluating performance of the model:
mse_rfr = mean_squared_error(y_test, y_pred_rfr)
r2_rfr = r2_score(y_test, y_pred_rfr)

# Printing the performance's results:
print("📘 Random Forest Regression:")
print(f"MSE (Mean Squared Error): {mse_rfr:.4f}")
print(f"R2 (Coefficient of Determination): {r2_rfr:.4f}")

📘 Random Forest Regression:
MSE (Mean Squared Error): 141.4852
R2 (Coefficient of Determination): 0.4579


# Trying a different approach_v2...

In [56]:
wines_filtered_v2 = wines_dataset[wines_dataset["price_usd"]<=50]

In [57]:
iqr_filtered_v2 = np.percentile(wines_filtered_v2["vintage"], 75)-np.percentile(wines_filtered_v2["vintage"], 25)
lower_limit_filtered_v2 = np.percentile(wines_filtered_v2["vintage"], 25)-3*iqr_filtered_v2

In [58]:
wines_predictor_filtered_v2 = wines_filtered_v2[wines_filtered_v2["vintage"] >= lower_limit_filtered_v2]

In [59]:
# Defining the dependent variable:
y_v4 = wines_predictor_filtered_v2["price_usd"]

In [60]:
# Defining the independent variables:
X_v4 = wines_predictor_filtered_v2.drop(columns=["price_usd"])

X_num_v4 = X_v4.select_dtypes(np.number)     # It takes all "numerical" variables
X_cat_v4 = X_v4.select_dtypes(object)     # It takes all "categorical" variables

In [61]:
# Scaling "numerical" variables:
scaler_num_v4 = MinMaxScaler().fit_transform(X_num_v4)

In [62]:
# Getting dummies for "categorical" variables:
dummies_cat_v4 = pd.get_dummies(X_cat_v4, drop_first=True)

In [63]:
# Concatenating both "X_num" and "X_cat" variables:
X_v4 = np.concatenate((scaler_num_v4, dummies_cat_v4), axis=1)

In [64]:
# Splitting data between "train" and "test":
X_train, X_test, y_train, y_test = train_test_split(X_v4, y_v4, train_size=0.8, random_state=42)

# Linear Regression (Simple)_v4

In [65]:
# Defining the model:
lm = LinearRegression()

# Fitting the model:
lm.fit(X_train, y_train)

# Making predictions:
y_pred_lm = lm.predict(X_test)

# Evaluating performance of the model:
mse_lm = mean_squared_error(y_test, y_pred_lm)
r2_lm = r2_score(y_test, y_pred_lm)

# Printing the performance's results:
print("📘 Linear Regression (Simple):")
print(f"MSE (Mean Squared Error): {mse_lm:.4f}")
print(f"R2 (Coefficient of Determination): {r2_lm:.4f}")

📘 Linear Regression (Simple):
MSE (Mean Squared Error): 86.8623
R2 (Coefficient of Determination): 0.3287


# Linear Regression (Ridge)_v4

In [66]:
# Defining the parameter grid (check the model's documentation to know the exact names of the hyperparameters):
param_grid_ridge = {
    "alpha": [0.01, 0.1, 1, 10, 100],
    "solver": ["auto", "sag", "lsqr"],
    "max_iter": [1000, 5000, 10000]}

# Creating GridSearchCV:
grid_search_ridge = GridSearchCV(
    estimator=Ridge(),     # Model we want to fit
    param_grid=param_grid_ridge,     # Hyperparameters grid
    cv=5,     # Number of Cross-Validation folds
    scoring="r2",     # Scoring method we'll use to determine the best hyperparameters
	verbose=1)     # It prints the results at each step

# Performing Grid Search:
grid_search_ridge.fit(X_train, y_train)

# Returning Best parameters & Best score:
print("Ridge's Best Parameters:", grid_search_ridge.best_params_)
print("Ridge's Best Score:", grid_search_ridge.best_score_)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Ridge's Best Parameters: {'alpha': 0.01, 'max_iter': 5000, 'solver': 'sag'}
Ridge's Best Score: 0.33413331962796


In [67]:
# Defining the model with the best parameters:
ridge = Ridge(alpha=0.01, max_iter=5000, solver="sag")

# Fitting the model:
ridge.fit(X_train, y_train)

# Making predictions:
y_pred_ridge = ridge.predict(X_test)

# Evaluating performance of the model:
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

# Printing the performance's results:
print("📘 Linear Regression (Ridge):")
print(f"MSE (Mean Squared Error): {mse_ridge:.4f}")
print(f"R2 (Coefficient of Determination): {r2_ridge:.4f}")

📘 Linear Regression (Ridge):
MSE (Mean Squared Error): 86.8623
R2 (Coefficient of Determination): 0.3287


# Linear Regression (Lasso)_v4

In [68]:
# Defining the parameter grid (check the model's documentation to know the exact names of the hyperparameters):
param_grid_lasso = {
    "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
    "tol": [1e-4, 1e-5, 1e-6],
    "max_iter": [1000, 5000, 10000]}

# Creating GridSearchCV:
grid_search_lasso = GridSearchCV(
    estimator=Lasso(),     # Model we want to fit
    param_grid=param_grid_lasso,     # Hyperparameters grid
    cv=5,     # Number of Cross-Validation folds
    scoring="r2",     # Scoring method we'll use to determine the best hyperparameters
	verbose=1)     # It prints the results at each step

# Performing Grid Search:
grid_search_lasso.fit(X_train, y_train)

# Returning Best parameters & Best score:
print("Lasso's Best Parameters:", grid_search_lasso.best_params_)
print("Lasso's Best Score:", grid_search_lasso.best_score_)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Lasso's Best Parameters: {'alpha': 0.0001, 'max_iter': 1000, 'tol': 1e-06}
Lasso's Best Score: 0.3341317557187241


In [69]:
# Defining the model with the best parameters:
lasso = Lasso(alpha=0.0001, max_iter=1000, tol=0.000001)

# Fitting the model:
lasso.fit(X_train, y_train)

# Making predictions:
y_pred_lasso = lasso.predict(X_test)

# Evaluating performance of the model:
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

# Printing the performance's results:
print("📘 Linear Regression (Lasso):")
print(f"MSE (Mean Squared Error): {mse_lasso:.4f}")
print(f"R2 (Coefficient of Determination): {r2_lasso:.4f}")

📘 Linear Regression (Lasso):
MSE (Mean Squared Error): 86.8612
R2 (Coefficient of Determination): 0.3287


# Support Vector Regressor_v4

In [70]:
# Defining the model with the best parameters:
svr = SVR(C=100, epsilon=0.5, kernel="rbf")

# Fitting the model:
svr.fit(X_train, y_train)

# Making predictions:
y_pred_svr = svr.predict(X_test)

# Evaluating performance of the model:
mse_svr = mean_squared_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

# Printing the performance's results:
print("📘 Support Vector Regression:")
print(f'MSE (Mean Squared Error): {mse_svr:.4f}')
print(f'R2 (Coefficient of Determination): {r2_svr:.4f}')

📘 Support Vector Regression:
MSE (Mean Squared Error): 79.9075
R2 (Coefficient of Determination): 0.3824


# Random Forest Regressor_v4

In [71]:
# Defining the model:
rfr = RandomForestRegressor(n_estimators=500, max_depth=10, min_samples_leaf=4, random_state=42) 

# Fitting the model:
rfr.fit(X_train, y_train)

# Making predictions:
y_pred_rfr = rfr.predict(X_test)

# Evaluating performance of the model:
mse_rfr = mean_squared_error(y_test, y_pred_rfr)
r2_rfr = r2_score(y_test, y_pred_rfr)

# Printing the performance's results:
print("📘 Random Forest Regression:")
print(f"MSE (Mean Squared Error): {mse_rfr:.4f}")
print(f"R2 (Coefficient of Determination): {r2_rfr:.4f}")

📘 Random Forest Regression:
MSE (Mean Squared Error): 77.8404
R2 (Coefficient of Determination): 0.3984
