In [1]:
import warnings
# Suppress every warning category for the remainder of the kernel session.
# NOTE(review): a blanket 'ignore' also hides diagnostics such as pandas'
# SettingWithCopyWarning and scikit-learn's ConvergenceWarning — confirm this is intended,
# or narrow the filter to specific categories.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import math
#from sklearn import linear_model
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
#from sklearn.preprocessing import LabelEncoder   #####
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.feature_selection import VarianceThreshold
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import RFE

#import category_encoders as ce  #####
#from scipy.stats import skew

In [2]:
# Load the wine data through the project-local cleaning helper.
# NOTE(review): the helper's implementation is not visible from this notebook; presumably it
# returns a cleaned pandas DataFrame (later cells call DataFrame methods on the result) — verify.
from WinesDatasetCleaning import wine_dataset_cleaning as wdc
wines_dataset = wdc()

In [3]:
# Columns that are removed from the data before modelling:
columns_to_drop = [
    "country", "apellation", "taster_name", "taster_twitter_handle",
    "title", "variety", "winery", "noble_international",
    "monovarietal", "taste_alcohol", "primary_flavors", "title_new",
]

# The frame is modified in place, so this cell can only be run once per kernel session:
# a second run raises KeyError because the columns are already gone.
wines_dataset.drop(columns=columns_to_drop, inplace=True)

In [4]:
# Disabled experiment (kept for reference, not applied below): cap the data at prices <= 75.
#wines_filtered = wines_dataset[wines_dataset["price_usd"]<=75]

In [5]:
# Lower outlier fence for "vintage": first quartile minus three interquartile ranges.
vintage_q1, vintage_q3 = np.percentile(wines_dataset["vintage"], [25, 75])
iqr = vintage_q3 - vintage_q1
lower_limit = vintage_q1 - 3 * iqr

In [6]:
# Keep only the rows whose vintage is at or above the lower outlier fence.
# The explicit .copy() gives the filtered frame its own data, so downstream cells can modify
# it without pandas' view-vs-copy ambiguity (SettingWithCopyWarning, which the global
# warnings filter would otherwise silence).
wines_predictor_v0 = wines_dataset.loc[wines_dataset["vintage"] >= lower_limit].copy()

In [7]:
# Defining the dependent variable (the regression target):
y_v0 = wines_predictor_v0["price_usd"]

# Remove the target from the predictors by rebinding to the frame returned by drop().
# An in-place drop on a frame obtained by boolean filtering is ambiguous in pandas
# (it may act on a copy and emits SettingWithCopyWarning); rebinding avoids that.
wines_predictor_v0 = wines_predictor_v0.drop(columns=["price_usd"])

In [8]:
# Split the predictors by dtype:
X_num_v0 = wines_predictor_v0.select_dtypes(include=[np.number])   # numeric columns
X_cat_v0 = wines_predictor_v0.select_dtypes(include=[object])      # object-dtype columns, treated as categorical
# NOTE(review): bool / category / datetime columns match neither selection and would be
# silently left out of the features — confirm no such columns remain at this point.

In [9]:
# Min-max scale the numeric columns.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so the test
# rows' minima/maxima influence the training features (data leakage).
minmax_scaler_v0 = MinMaxScaler()
scaler_num_v0 = minmax_scaler_v0.fit_transform(X_num_v0)   # holds the scaled values, not the scaler object

In [10]:
# One-hot encode the categorical columns; the first level of each column is dropped.
dummies_cat_v0 = pd.get_dummies(data=X_cat_v0, drop_first=True)

In [11]:
# Place the scaled numeric block and the dummy block side by side (column-wise).
# The result is a plain NumPy array, so column names are not carried forward.
feature_blocks_v0 = [scaler_num_v0, dummies_cat_v0]
X_v0 = np.concatenate(feature_blocks_v0, axis=1)

In [12]:
# 80 % of the rows go to training, the remainder to testing; the seed fixes the shuffle.
split_v0 = train_test_split(X_v0, y_v0, train_size=0.8, random_state=42)
X_train, X_test, y_train, y_test = split_v0

# Linear Regression (Simple)_v0

In [13]:
# Ordinary least-squares baseline: build and fit in one step (fit() returns the estimator).
lm = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows and score the predictions:
y_pred_lm = lm.predict(X_test)
r2_lm = r2_score(y_test, y_pred_lm)
mse_lm = mean_squared_error(y_test, y_pred_lm)

# Report the test-set metrics:
print(
    "📘 Linear Regression (Simple):",
    f"MSE (Mean Squared Error): {mse_lm:.4f}",
    f"R2 (Coefficient of Determination): {r2_lm:.4f}",
    sep="\n",
)

📘 Linear Regression (Simple):
MSE (Mean Squared Error): 1499.2795
R2 (Coefficient of Determination): 0.2004


# Linear Regression (Ridge)_v0

In [14]:
# Hyperparameter grid for Ridge (keys must match Ridge's constructor arguments):
param_grid_ridge = {
    "alpha": [0.01, 0.1, 1, 10, 100],
    "solver": ["auto", "sag", "lsqr"],
    "max_iter": [1000, 5000, 10000]}

# Creating GridSearchCV:
grid_search_ridge = GridSearchCV(
    # random_state pins the data shuffling of the stochastic "sag" solver, so the
    # selected parameters and scores are reproducible across runs.
    estimator=Ridge(random_state=42),
    param_grid=param_grid_ridge,     # Hyperparameters grid
    cv=5,                            # Number of cross-validation folds
    scoring="r2",                    # Metric used to rank the candidates
    verbose=1)                       # Print progress while fitting

# Performing Grid Search (5 folds x 45 candidates):
grid_search_ridge.fit(X_train, y_train)

# Returning Best parameters & Best score (mean cross-validated R2 of the best candidate):
print("Ridge's Best Parameters:", grid_search_ridge.best_params_)
print("Ridge's Best Score:", grid_search_ridge.best_score_)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Ridge's Best Parameters: {'alpha': 0.1, 'max_iter': 1000, 'solver': 'sag'}
Ridge's Best Score: 0.20778486531774454


In [15]:
# Defining the model with the parameters actually selected by the Ridge grid search.
# Reading them from best_params_ keeps this cell in sync with the search (the previous
# hard-coded max_iter=10000 disagreed with the reported best max_iter=1000).
# random_state makes the stochastic "sag" solver reproducible.
ridge = Ridge(**grid_search_ridge.best_params_, random_state=42)

# Fitting the model:
ridge.fit(X_train, y_train)

# Making predictions on the held-out rows:
y_pred_ridge = ridge.predict(X_test)

# Evaluating performance of the model:
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

# Printing the performance's results:
print("📘 Linear Regression (Ridge):")
print(f"MSE (Mean Squared Error): {mse_ridge:.4f}")
print(f"R2 (Coefficient of Determination): {r2_ridge:.4f}")

📘 Linear Regression (Ridge):
MSE (Mean Squared Error): 1499.2794
R2 (Coefficient of Determination): 0.2004


# Linear Regression (Lasso)_v0

In [16]:
# Search space for Lasso (keys must match Lasso's constructor arguments):
param_grid_lasso = dict(
    alpha=[0.0001, 0.001, 0.01, 0.1, 1],
    tol=[1e-4, 1e-5, 1e-6],
    max_iter=[1000, 5000, 10000],
)

# 5-fold cross-validated grid search ranked by R2; verbose=1 prints progress.
grid_search_lasso = GridSearchCV(Lasso(), param_grid_lasso, scoring="r2", cv=5, verbose=1)

# Run the search on the training split:
grid_search_lasso.fit(X_train, y_train)

# Show the winning combination and its mean cross-validated score:
print("Lasso's Best Parameters:", grid_search_lasso.best_params_)
print("Lasso's Best Score:", grid_search_lasso.best_score_)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Lasso's Best Parameters: {'alpha': 0.0001, 'max_iter': 1000, 'tol': 1e-06}
Lasso's Best Score: 0.20778439478417782


In [17]:
# Lasso configured with the values reported by the grid search, fitted in one step
# (fit() returns the estimator):
lasso = Lasso(alpha=1e-4, tol=1e-6, max_iter=1000).fit(X_train, y_train)

# Predict on the held-out rows and score the predictions:
y_pred_lasso = lasso.predict(X_test)
r2_lasso = r2_score(y_test, y_pred_lasso)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

# Report the test-set metrics:
print(
    "📘 Linear Regression (Lasso):",
    f"MSE (Mean Squared Error): {mse_lasso:.4f}",
    f"R2 (Coefficient of Determination): {r2_lasso:.4f}",
    sep="\n",
)

📘 Linear Regression (Lasso):
MSE (Mean Squared Error): 1499.2809
R2 (Coefficient of Determination): 0.2004


# Support Vector Regressor_v0

In [None]:
# SVR grid search, disabled by default.
# The search was previously switched off by wrapping the code in a string literal, which
# Jupyter echoes as cell output and which hides the code from linters. An explicit flag
# keeps the same default (the search does not run) while leaving real, checkable code.
# Set the flag to True to run all 240 fits (5 folds x 48 candidates).
RUN_SVR_GRID_SEARCH = False

if RUN_SVR_GRID_SEARCH:
    # Hyperparameter grid for SVR (keys must match SVR's constructor arguments):
    param_grid_svr = {
        "C": [0.1, 1, 10, 100],
        "epsilon": [0.001, 0.01, 0.1, 0.5],
        "kernel": ["rbf", "linear", "sigmoid"]}

    # Creating GridSearchCV:
    grid_search_svr = GridSearchCV(
        estimator=SVR(),               # Model we want to fit
        param_grid=param_grid_svr,     # Hyperparameters grid
        cv=5,                          # Number of cross-validation folds
        scoring="r2",                  # Metric used to rank the candidates
        verbose=1)                     # Print progress while fitting

    # Performing Grid Search:
    grid_search_svr.fit(X_train, y_train)

    # Returning Best parameters & Best score:
    print("SVR's Best Parameters:", grid_search_svr.best_params_)
    print("SVR's Best Score:", grid_search_svr.best_score_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [18]:
# RBF-kernel support vector regressor, fitted in one step (fit() returns the estimator).
# NOTE(review): the notebook records no completed SVR grid search, so C=100 / epsilon=0.5
# appear to be hand-picked rather than verified "best" parameters — confirm.
svr = SVR(kernel="rbf", C=100, epsilon=0.5).fit(X_train, y_train)

# Predict on the held-out rows and score the predictions:
y_pred_svr = svr.predict(X_test)
r2_svr = r2_score(y_test, y_pred_svr)
mse_svr = mean_squared_error(y_test, y_pred_svr)

# Report the test-set metrics:
print(
    "📘 Support Vector Regression:",
    f'MSE (Mean Squared Error): {mse_svr:.4f}',
    f'R2 (Coefficient of Determination): {r2_svr:.4f}',
    sep="\n",
)

📘 Support Vector Regression:
MSE (Mean Squared Error): 1421.2532
R2 (Coefficient of Determination): 0.2420


# Random Forest Regressor_v0

In [None]:
# Hyperparameter grid for the random forest (keys must match RandomForestRegressor's arguments):
param_grid_rfr = {
    "n_estimators": [100, 300, 500],
    "max_depth": [10, 20],
    "min_samples_leaf": [1, 2, 4]}

# Creating GridSearchCV:
grid_search_rfr = GridSearchCV(
    # A fixed random_state makes every candidate forest (bootstrap samples, feature
    # sub-sampling) reproducible, so the ranking does not change between runs.
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_rfr,     # Hyperparameters grid
    cv=5,                          # Number of cross-validation folds
    scoring="r2",                  # Metric used to rank the candidates
    n_jobs=-1,                     # Run the 90 fits in parallel on all cores; results are unchanged
    verbose=1)                     # Print progress while fitting

# Performing Grid Search (5 folds x 18 candidates):
grid_search_rfr.fit(X_train, y_train)

# Returning Best parameters & Best score (mean cross-validated R2 of the best candidate):
print("RFR's Best Parameters:", grid_search_rfr.best_params_)
print("RFR's Best Score:", grid_search_rfr.best_score_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [27]:
# Random forest with hand-set hyperparameters (the baseline was n_estimators=100, random_state=42).
# NOTE(review): the recorded output for this cell comes from execution count 27, and its MSE
# is on a different scale from the other models' — re-run on a fresh kernel to confirm it
# was produced from the same train/test split.
rfr = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    max_features="sqrt",
    min_samples_leaf=4,
    min_samples_split=5,
    bootstrap=True,
    random_state=42,
)

# Train on the training split:
rfr.fit(X_train, y_train)

# Predict on the held-out rows and score the predictions:
y_pred_rfr = rfr.predict(X_test)
r2_rfr = r2_score(y_test, y_pred_rfr)
mse_rfr = mean_squared_error(y_test, y_pred_rfr)

# Report the test-set metrics:
print(
    "📘 Random Forest Regression:",
    f'MSE (Mean Squared Error): {mse_rfr:.4f}',
    f'R2 (Coefficient of Determination): {r2_rfr:.4f}',
    sep="\n",
)

📘 Random Forest Regression:
MSE (Mean Squared Error): 140.3977
R2 (Coefficient of Determination): 0.4621
