### Preparing

In [1]:
# !pip install featuretools

In [2]:
# from pandas.core.arrays.sparse.array import NaT
# import matplotlib.pyplot as plt
# import featuretools as ft
import pickle
import numpy as np
import pandas as pd
# from matplotlib.container import BarContainer
# from matplotlib.axes import Axes
# from pandas.core.groupby import DataFrameGroupBy
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, Lasso
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import seaborn as sns
# import plotly.express as px


# np.random.seed(seed=1729)


def save_model_to_drive(model, model_name: str):
    # model_name: str = f"{model=}".split('=')[0]

    filename: str = f"{model_name}.pkl"

    with open(filename, "wb") as file:
        pickle.dump(model, file, pickle.HIGHEST_PROTOCOL)

def load_model(model_name):
    with open(model_name, "rb") as file:
      return pickle.load(file)


def mount_google_drive() -> bool | None:
    """
    Function to mount Google Drive.
    :return: True if mounting is successful, None otherwise.
    """
    try:
        from google.colab.drive import mount

        mount(mountpoint="/content/drive")
        return True

    except Exception as error:
        print(f"Error while mounting Google Drive: {error}")
        raise


def get_google_drive_dataset_path() -> tuple:
    """
    Function to retrieve the path of the raw dataset from Google Drive.
    :return: Path of raw dataset as a string.
    """
    return (
        "/content/drive/MyDrive/Hillel/Machine_Learning_Course/HW5/winequality-red.csv",
        "/content/drive/MyDrive/Hillel/Machine_Learning_Course/HW5/winequality-white.csv"
    )


def get_data_frame(dataset_path: str) -> tuple[pd.DataFrame] | None:
    """
    Function to convert the dataset into a pd.DataFrame.
    :param dataset_path: Path or URL of the dataset.
    :return: pd.DataFrame containing the dataset, or None if errors occur.
    """
    try:
        # return pd.read_csv(filepath_or_buffer=dataset_path)
        return pd.read_csv(filepath_or_buffer=dataset_path, sep=";")

    except Exception as error:
        print(f"Error while converting dataset to NumPy array: {error}")
        raise


def main() -> tuple[pd.DataFrame] | None:
    """
    Main function to start the app.
    :return: pd.DataFrame containing the dataset, or None if errors occur.
    """
    if mount_google_drive():
        import os

        datasets_paths: tuple = get_google_drive_dataset_path()
        if all(map(os.path.exists, datasets_paths)):
            data_frames: tuple = tuple(get_data_frame(dataset_path=path) for path in datasets_paths)

            return data_frames

        else:
            print("Dataset path doesn't exists.")
            raise FileNotFoundError


if __name__ == "__main__":
    raw_data_frames: tuple | None = main()
    if raw_data_frames:
      red_winequality_dataframe: pd.DataFrame = raw_data_frames[0]
      white_winequality_dataframe: pd.DataFrame = raw_data_frames[1]


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def generate_features(features_list: list, dataframe: pd.DataFrame) -> pd.DataFrame:
  featured_dataframe: pd.DataFrame = dataframe.copy(deep=True)
  new_columns: list = []

  for feature1 in features_list:
    for feature2 in features_list:
      if feature1 == feature2:
        continue
      new_columns.append(pd.Series(data=(featured_dataframe[feature1] + featured_dataframe[feature2]), name=f"{feature1}+{feature2}"))
      new_columns.append(pd.Series(data=(featured_dataframe[feature1] * featured_dataframe[feature2]), name=f"{feature1}*{feature2}"))
      new_columns.append(pd.Series(data=(featured_dataframe[feature1] - featured_dataframe[feature2]), name=f"{feature1}_-_{feature2}"))
      new_columns.append(pd.Series(data=(featured_dataframe[feature1] / featured_dataframe[feature2]), name=f"{feature1}/{feature2}"))
      new_columns.append(pd.Series(data=(featured_dataframe[feature1].astype(bool) & featured_dataframe[feature2].astype(bool)), name=f"{feature1}_AND_{feature2}"))
      new_columns.append(pd.Series(data=(featured_dataframe[feature1].astype(bool) | featured_dataframe[feature2].astype(bool)), name=f"{feature1}_OR_{feature2}"))

  new_cols_df: pd.DataFrame = pd.concat(new_columns, axis=1)
  featured_dataframe: pd.DataFrame = pd.concat([featured_dataframe, new_cols_df], axis=1)

  return featured_dataframe


Для датасету

https://archive.ics.uci.edu/ml/datasets/wine+quality

побудувати **модель лінійної регресії**

Обов'язкові кроки:

*   первинний аналіз даних (відстуність пропусків, наявність категоріальних фіч, ...)  (+)
*   фича інжиніринг (побудувати 1-2 нові фічі) *5-10  (+)
*   масштабування фіч  (+)
*   поділ датасету на: (+)
  1. тренувальну,
  2. валідаційну
  3. тестову частини
*   тренування базової моделі із дефолтними гіперпараметрами (кожну модель) (+)
*   підбір гіперпараметрів (кожну модель) (+)
*   оцінка результатів (порівняння всіх на тестовій частині) !!!
*   порівняти коефіцієнти на стохастичному град спуску, і на алгебраїчному рішенні (+)
*   побудувати модель різними способами (GD, SGD, MBGD... perceptron, ...) (+)
*   метрики (+)



### Red Wine

In [4]:
print("'"*100)
# red_winequality_dataframe.isnull().values.any()
red_winequality_dataframe.info()
print("'"*100)
red_winequality_dataframe.head(10)

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
'''''''''''''''''''''''''

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [5]:
main_columns: list = list(red_winequality_dataframe.columns.values)

### Создаэмо новий датасет з нагенерованими фічами:
new_red_wine_df: pd.DataFrame = generate_features(features_list=main_columns[:-1], dataframe=red_winequality_dataframe)
new_columns: list = list(new_red_wine_df.columns.values)

### Перевіряємо кореляцію, і видідяємо найзначніші фічі:
# correlations: pd.Series = new_red_wine_df.corr()["quality"]
# sorted_correlations: pd.Series = correlations.abs().sort_values(ascending=False)
# top_features: pd.Index = sorted_correlations.index[:11]

### визначили найзначніші фічі:
important_columns: list = ["volatile acidity_-_alcohol", "alcohol+sulphates", "citric acid+alcohol",
                           "alcohol/pH", "pH_-_alcohol", "pH/alcohol", "alcohol*density", "chlorides_-_alcohol", "alcohol+density"]

### залишаємо тільки дефолні, та нові найзначніші фічі які більш всього корелюють на таргет, залишок видаляємо.
columns_to_delete: list = set(new_columns) - (set(main_columns + important_columns))
new_red_wine_df: pd.DataFrame = new_red_wine_df.drop(labels=columns_to_delete, axis=1)
new_red_wine_df.info()

### Heatmap
new_red_wine_df_matrix = new_red_wine_df.corr()
# sns.heatmap(data=new_red_wine_df_matrix)
# sns.heatmap(data=new_red_wine_df_matrix, annot=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   fixed acidity               1599 non-null   float64
 1   volatile acidity            1599 non-null   float64
 2   citric acid                 1599 non-null   float64
 3   residual sugar              1599 non-null   float64
 4   chlorides                   1599 non-null   float64
 5   free sulfur dioxide         1599 non-null   float64
 6   total sulfur dioxide        1599 non-null   float64
 7   density                     1599 non-null   float64
 8   pH                          1599 non-null   float64
 9   sulphates                   1599 non-null   float64
 10  alcohol                     1599 non-null   float64
 11  quality                     1599 non-null   int64  
 12  volatile acidity_-_alcohol  1599 non-null   float64
 13  citric acid+alcohol         1599 

In [37]:
### Split dataset
red_wine_df_train_part, red_wine_df_test_validation_part = train_test_split(new_red_wine_df, test_size=0.2)
red_wine_df_validation_part, red_wine_df_test_part = train_test_split(red_wine_df_test_validation_part, test_size=0.33)

# red_wine_df_train_part.info()
# red_wine_df_validation_part.info()
# red_wine_df_test_part.info()

In [38]:
### FEATURES (X), and TARGET (Y)
train_features_X: pd.DataFrame = red_wine_df_train_part.drop("quality", axis=1)
train_target_y: pd.Series = red_wine_df_train_part["quality"]

validate_features_X: pd.DataFrame = red_wine_df_validation_part.drop("quality", axis=1)
validate_target_y: pd.Series = red_wine_df_validation_part["quality"]

test_features_X: pd.DataFrame = red_wine_df_test_part.drop("quality", axis=1)
test_target_y: pd.Series = red_wine_df_test_part["quality"]

In [39]:
### Common function to create models

def create_model(regressor, name:str, scaler, partial: bool = False):
  print("'"*50)
  print(f"         {name}")

  if partial:
    model = regressor.partial_fit(X=train_features_X, y=train_target_y)

  else:
    model: Pipeline = make_pipeline(scaler, regressor)
    model.fit(X=train_features_X, y=train_target_y)

  ## Attributes
  print(f"                     Score: {model.score(X=train_features_X, y=train_target_y)}")
  print(f"                     ''''''''''''''''''''''''''")

  ## Predict
  model_predict = model.predict(X=validate_features_X)

  ## Metrics
  model_r2: np.float64 = r2_score(y_true=validate_target_y, y_pred=model_predict)
  print(f"                        R2: {model_r2}")

  model_mean_squared_error: np.float64 = mean_squared_error(y_true=validate_target_y, y_pred=model_predict)
  print(f"  Mean_squared_error (MSE): {model_mean_squared_error}")

  model_mean_absolute_error: np.float64 = mean_absolute_error(y_true=validate_target_y, y_pred=model_predict)
  print(f" Mean_absolute_error (MAE): {model_mean_absolute_error}")

  print("'"*50)
  return model

  # save_model_to_drive(model=model, model_name=f"{str(model_r2)[:6]}{name}")


In [40]:
def value_results(model, name:str):
  print("'"*50)
  print(name)
  model_predict = model.predict(X=test_features_X)

  ## Metrics
  model_r2: np.float64 = r2_score(y_true=test_target_y, y_pred=model_predict)
  print(f"                        R2: {model_r2}")

  model_mean_squared_error: np.float64 = mean_squared_error(y_true=test_target_y, y_pred=model_predict)
  print(f"  Mean_squared_error (MSE): {model_mean_squared_error}")

  model_mean_absolute_error: np.float64 = mean_absolute_error(y_true=test_target_y, y_pred=model_predict)
  print(f" Mean_absolute_error (MAE): {model_mean_absolute_error}")
  print("'"*50)

In [41]:
## SCALER
red_wine_min_max_scaler: MinMaxScaler = MinMaxScaler().fit(X=red_wine_df_train_part)
# red_wine_standard_scaler: StandardScaler = StandardScaler().fit(X=red_wine_df_train_part)


In [42]:
### ****** LINEARREGRESSION. FOR RED WINE ******
red_wine_linear_clean_regressor: LinearRegression = create_model(
    regressor=LinearRegression(),
    name="red_wine_linear_clean_regressor",
    scaler=red_wine_min_max_scaler
)

### ****** LINEARREGRESSION WITH HYPER PARAMS. FOR RED WINE ******
red_wine_linear_hyper_params_regressor: LinearRegression = create_model(
    # regressor=LinearRegression(n_jobs=-1),
    # regressor=LinearRegression(positive=True),
    regressor=LinearRegression(fit_intercept=False),
    name="red_wine_linear_hyper_params_regressor",
    scaler=red_wine_min_max_scaler
)

### ****** SGDRegressor. FOR RED WINE ******
red_wine_clean_sgdegressor: SGDRegressor = create_model(
    regressor=SGDRegressor(),
    name="red_wine_clean_sgdegressor",
    scaler=red_wine_min_max_scaler
)

### ****** SGDRegressor WITH HYPER PARAMS. FOR RED WINE ******
red_wine_hyper_params_sgdegressor: SGDRegressor = create_model(
    regressor=SGDRegressor(loss="squared_epsilon_insensitive", penalty="elasticnet", tol=1e-5, n_iter_no_change=15), # better than clean
    name="red_wine_hyper_params_sgdegressor",
    scaler=red_wine_min_max_scaler
)

### ****** MINI BATCH SGDRegressor. FOR RED WINE ******
red_wine_clean_mb_sgdegressor: SGDRegressor = create_model(
    regressor=SGDRegressor(),
    name="red_wine_clean_mb_sgdegressor",
    scaler=red_wine_min_max_scaler,
    partial=True
)

### ****** MINI BATCH SGDRegressor WITH HYPER PARAMS. FOR RED WINE ******
red_wine_hyper_params_mb_sgdegressor: SGDRegressor = create_model(
    regressor=SGDRegressor(loss="squared_epsilon_insensitive", penalty="elasticnet", tol=1e-5, n_iter_no_change=15),
    name="red_wine_hyper_params_mb_sgdegressor",
    scaler=red_wine_min_max_scaler,
    partial=True
)

### ****** ElasticNet. FOR RED WINE ******
red_wine_clean_elastic_net: ElasticNet = create_model(
    regressor=ElasticNet(),
    name="red_wine_clean_elastic_net",
    scaler=red_wine_min_max_scaler
)

### ****** ElasticNet WITH HYPER PARAMS. FOR RED WINE ******
red_wine_hyper_params_elastic_net: ElasticNet = create_model(
    regressor=ElasticNet(tol=1e-8, l1_ratio=0.2),
    name="red_wine_hyper_params_elastic_net",
    scaler=red_wine_min_max_scaler
)

### ****** Lasso. FOR RED WINE ******
red_wine_clean_l1: Lasso = create_model(
    regressor=Lasso(),
    name="red_wine_clean_l1",
    scaler=red_wine_min_max_scaler
)

### ****** Lasso WITH HYPER PARAMS. FOR RED WINE ******
red_wine_hyper_params_l1: Lasso = create_model(
    regressor=Lasso(alpha=0.5, tol=1e-6),
    name="red_wine_hyper_params_l1",
    scaler=red_wine_min_max_scaler
)

''''''''''''''''''''''''''''''''''''''''''''''''''
         red_wine_linear_clean_regressor
                     Score: 0.3627738574857109
                     ''''''''''''''''''''''''''
                        R2: 0.3664815104131467
  Mean_squared_error (MSE): 0.5117284471743575
 Mean_absolute_error (MAE): 0.5348276869158879
''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''
         red_wine_linear_hyper_params_regressor
                     Score: 0.36224433473168616
                     ''''''''''''''''''''''''''
                        R2: 0.36249929195695196
  Mean_squared_error (MSE): 0.5149451085668711
 Mean_absolute_error (MAE): 0.5356797278001729
''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''
         red_wine_clean_sgdegressor
                     Score: 0.2580285130810921
                     ''''''''''''''''''''''''''
                        R2: 0.29015800629054034
  M

In [12]:
### Оцінка результатів
value_results(model=red_wine_linear_clean_regressor, name="red_wine_linear_clean_regressor")
value_results(model=red_wine_linear_hyper_params_regressor, name="red_wine_linear_hyper_params_regressor")
value_results(model=red_wine_clean_sgdegressor, name="red_wine_clean_sgdegressor")
value_results(model=red_wine_hyper_params_sgdegressor, name="red_wine_hyper_params_sgdegressor")
value_results(model=red_wine_clean_mb_sgdegressor, name="red_wine_clean_mb_sgdegressor")
value_results(model=red_wine_hyper_params_mb_sgdegressor, name="red_wine_hyper_params_mb_sgdegressor")
value_results(model=red_wine_clean_elastic_net, name="red_wine_clean_elastic_net")
value_results(model=red_wine_hyper_params_elastic_net, name="red_wine_hyper_params_elastic_net")
value_results(model=red_wine_clean_l1, name="red_wine_clean_l1")
value_results(model=red_wine_hyper_params_l1, name="red_wine_hyper_params_l1")

''''''''''''''''''''''''''''''''''''''''''''''''''
red_wine_linear_clean_regressor
                        R2: 0.3984867888283543
  Mean_squared_error (MSE): 0.44460370405664984
 Mean_absolute_error (MAE): 0.4891288325471698
''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''
red_wine_linear_hyper_params_regressor
                        R2: 0.3951767260981085
  Mean_squared_error (MSE): 0.44705031058697126
 Mean_absolute_error (MAE): 0.4899935267119793
''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''
red_wine_clean_sgdegressor
                        R2: 0.31239976794827007
  Mean_squared_error (MSE): 0.5082342405829137
 Mean_absolute_error (MAE): 0.5120568300898956
''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''
red_wine_hyper_params_sgdegressor
                        R2: 0.40102223077767063
  Mean_squared_error (MSE): 0.4427296

### White Wine

In [13]:
print("'"*100)
# white_winequality_dataframe.isnull().values.any()
white_winequality_dataframe.info()
print("'"*100)
white_winequality_dataframe.head(10)

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
'''''''''''''''''''''''''

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
5,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
6,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6,6
7,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
8,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
9,8.1,0.22,0.43,1.5,0.044,28.0,129.0,0.9938,3.22,0.45,11.0,6


In [14]:
w_main_columns: list = list(white_winequality_dataframe.columns.values)

### Создаэмо новий датасет з нагенерованими фічами:
new_white_wine_df: pd.DataFrame = generate_features(features_list=w_main_columns[:-1], dataframe=white_winequality_dataframe)
new_columns: list = list(new_white_wine_df.columns.values)

### Перевіряємо кореляцію, і видідяємо найзначніші фічі:
# correlations: pd.Series = new_white_wine_df.corr()["quality"]
# sorted_correlations: pd.Series = correlations.abs().sort_values(ascending=False)
# top_features: pd.Index = sorted_correlations.index[:25]
# print(top_features)

# визначили найзначніші фічі:
w_important_columns: list = ["alcohol_-_volatile acidity", "sulphates+alcohol", "alcohol+pH",
                           "chlorides_-_alcohol", "density*alcohol", "alcohol+citric acid", "density+alcohol"]


### залишаємо тільки дефолні, та нові найзначніші фічі які більш всього корелюють на таргет, залишок видаляємо.
columns_to_delete: list = set(new_columns) - (set(w_main_columns + w_important_columns))
new_white_wine_df: pd.DataFrame = new_white_wine_df.drop(labels=columns_to_delete, axis=1)
new_white_wine_df.info()

### Heatmap
new_white_wine_df_matrix = new_white_wine_df.corr()
# sns.heatmap(data=new_white_wine_df_matrix)
# sns.heatmap(data=new_white_wine_df_matrix, annot=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   fixed acidity               4898 non-null   float64
 1   volatile acidity            4898 non-null   float64
 2   citric acid                 4898 non-null   float64
 3   residual sugar              4898 non-null   float64
 4   chlorides                   4898 non-null   float64
 5   free sulfur dioxide         4898 non-null   float64
 6   total sulfur dioxide        4898 non-null   float64
 7   density                     4898 non-null   float64
 8   pH                          4898 non-null   float64
 9   sulphates                   4898 non-null   float64
 10  alcohol                     4898 non-null   float64
 11  quality                     4898 non-null   int64  
 12  chlorides_-_alcohol         4898 non-null   float64
 13  density+alcohol             4898 

In [32]:
### Split dataset
white_wine_df_train_part, white_wine_df_test_validation_part = train_test_split(new_white_wine_df, test_size=0.1)
white_wine_df_validation_part, white_wine_df_test_part = train_test_split(white_wine_df_test_validation_part, test_size=0.33)

# white_wine_df_train_part.info()
# white_wine_df_validation_part.info()
# white_wine_df_test_part.info()

In [33]:
### FEATURES (X), and TARGET (Y)
w_train_features_X: pd.DataFrame = white_wine_df_train_part.drop("quality", axis=1)
w_train_target_y: pd.Series = white_wine_df_train_part["quality"]

w_validate_features_X: pd.DataFrame = white_wine_df_validation_part.drop("quality", axis=1)
w_validate_target_y: pd.Series = white_wine_df_validation_part["quality"]

w_test_features_X: pd.DataFrame = white_wine_df_test_part.drop("quality", axis=1)
w_test_target_y: pd.Series = white_wine_df_test_part["quality"]

In [34]:
### Common function to create models

def w_create_model(regressor, name:str, scaler, partial: bool = False):
  print("'"*50)
  print(f"         {name}")

  if partial:
    model = regressor.partial_fit(X=w_train_features_X, y=w_train_target_y)

  else:
    model: Pipeline = make_pipeline(scaler, regressor)
    model.fit(X=w_train_features_X, y=w_train_target_y)

  ## Attributes
  print(f"                     Score: {model.score(X=w_train_features_X, y=w_train_target_y)}")
  print(f"                     ''''''''''''''''''''''''''")

  ## Predict
  model_predict = model.predict(X=w_validate_features_X)
  ## Metrics
  model_r2: np.float64 = r2_score(y_true=w_validate_target_y, y_pred=model_predict)
  print(f"                        R2: {model_r2}")

  model_mean_squared_error: np.float64 = mean_squared_error(y_true=w_validate_target_y, y_pred=model_predict)
  print(f"  Mean_squared_error (MSE): {model_mean_squared_error}")

  model_mean_absolute_error: np.float64 = mean_absolute_error(y_true=w_validate_target_y, y_pred=model_predict)
  print(f" Mean_absolute_error (MAE): {model_mean_absolute_error}")

  print("'"*50)
  return model

  # save_model_to_drive(model=model, model_name=f"{str(model_r2)[:6]}{name}")

In [35]:
## SCALER
white_wine_min_max_scaler: MinMaxScaler = MinMaxScaler().fit(X=white_wine_df_train_part)
# white_wine_standard_scaler: StandardScaler = StandardScaler().fit(X=white_wine_df_train_part)


In [36]:
### ****** LINEARREGRESSION. FOR white WINE ******
white_wine_linear_clean_regressor: LinearRegression = w_create_model(
    regressor=LinearRegression(),
    name="white_wine_linear_clean_regressor",
    scaler=white_wine_min_max_scaler
)

### ****** LINEARREGRESSION WITH HYPER PARAMS. FOR white WINE ******
white_wine_linear_hyper_params_regressor: LinearRegression = w_create_model(
    # regressor=LinearRegression(n_jobs=-1),
    # regressor=LinearRegression(positive=True),
    regressor=LinearRegression(fit_intercept=False),
    name="white_wine_linear_hyper_params_regressor",
    scaler=white_wine_min_max_scaler
)

### ****** SGDRegressor. FOR white WINE ******
white_wine_clean_sgdegressor: SGDRegressor = w_create_model(
    regressor=SGDRegressor(),
    name="white_wine_clean_sgdegressor",
    scaler=white_wine_min_max_scaler
)

### ****** SGDRegressor WITH HYPER PARAMS. FOR white WINE ******
white_wine_hyper_params_sgdegressor: SGDRegressor = w_create_model(
    regressor=SGDRegressor(loss="squared_epsilon_insensitive", penalty="elasticnet", tol=1e-5, n_iter_no_change=15), # better than clean
    name="white_wine_hyper_params_sgdegressor",
    scaler=white_wine_min_max_scaler
)

### ****** MINI BATCH SGDRegressor. FOR white WINE ******
white_wine_clean_mb_sgdegressor: SGDRegressor = w_create_model(
    regressor=SGDRegressor(),
    name="white_wine_clean_mb_sgdegressor",
    scaler=white_wine_min_max_scaler,
    partial=True
)

### ****** MINI BATCH SGDRegressor WITH HYPER PARAMS. FOR white WINE ******
white_wine_hyper_params_mb_sgdegressor: SGDRegressor = w_create_model(
    regressor=SGDRegressor(loss="squared_epsilon_insensitive", penalty="elasticnet", tol=1e-5, n_iter_no_change=15),
    name="white_wine_hyper_params_mb_sgdegressor",
    scaler=white_wine_min_max_scaler,
    partial=True
)

### ****** ElasticNet. FOR white WINE ******
white_wine_clean_elastic_net: ElasticNet = w_create_model(
    regressor=ElasticNet(),
    name="white_wine_clean_elastic_net",
    scaler=white_wine_min_max_scaler
)

### ****** ElasticNet WITH HYPER PARAMS. FOR white WINE ******
white_wine_hyper_params_elastic_net: ElasticNet = w_create_model(
    regressor=ElasticNet(tol=1e-8, l1_ratio=0.2),
    name="white_wine_hyper_params_elastic_net",
    scaler=white_wine_min_max_scaler
)

### ****** Lasso. FOR white WINE ******
white_wine_clean_l1: Lasso = w_create_model(
    regressor=Lasso(),
    name="white_wine_clean_l1",
    scaler=white_wine_min_max_scaler
)

### ****** Lasso WITH HYPER PARAMS. FOR white WINE ******
white_wine_hyper_params_l1: Lasso = w_create_model(
    regressor=Lasso(alpha=0.5, tol=1e-6),
    name="white_wine_hyper_params_l1",
    scaler=white_wine_min_max_scaler
)

''''''''''''''''''''''''''''''''''''''''''''''''''
         white_wine_linear_clean_regressor
                     Score: 0.2798382098807787
                     ''''''''''''''''''''''''''
                        R2: 0.32315987804187085
  Mean_squared_error (MSE): 0.5473846342505478
 Mean_absolute_error (MAE): 0.573706650152439
''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''
         white_wine_linear_hyper_params_regressor
                     Score: 0.2790062591703607
                     ''''''''''''''''''''''''''
                        R2: 0.32507312707834235
  Mean_squared_error (MSE): 0.5458373218349816
 Mean_absolute_error (MAE): 0.5725361968909619
''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''
         white_wine_clean_sgdegressor
                     Score: 0.21746154037129273
                     ''''''''''''''''''''''''''
                        R2: 0.283897757641923

In [20]:
def w_value_results(model, name:str):
  print("'"*50)
  print(name)
  model_predict = model.predict(X=w_test_features_X)

  ## Metrics
  model_r2: np.float64 = r2_score(y_true=w_test_target_y, y_pred=model_predict)
  print(f"                        R2: {model_r2}")

  model_mean_squared_error: np.float64 = mean_squared_error(y_true=w_test_target_y, y_pred=model_predict)
  print(f"  Mean_squared_error (MSE): {model_mean_squared_error}")

  model_mean_absolute_error: np.float64 = mean_absolute_error(y_true=w_test_target_y, y_pred=model_predict)
  print(f" Mean_absolute_error (MAE): {model_mean_absolute_error}")
  print("'"*50)

In [21]:
### Оцінка результатів
w_value_results(model=white_wine_linear_clean_regressor, name="white_wine_linear_clean_regressor")
w_value_results(model=white_wine_linear_hyper_params_regressor, name="white_wine_linear_hyper_params_regressor")
w_value_results(model=white_wine_clean_sgdegressor, name="white_wine_clean_sgdegressor")
w_value_results(model=white_wine_hyper_params_sgdegressor, name="white_wine_hyper_params_sgdegressor")
w_value_results(model=white_wine_clean_mb_sgdegressor, name="white_wine_clean_mb_sgdegressor")
w_value_results(model=white_wine_hyper_params_mb_sgdegressor, name="white_wine_hyper_params_mb_sgdegressor")
w_value_results(model=white_wine_clean_elastic_net, name="white_wine_clean_elastic_net")
w_value_results(model=white_wine_hyper_params_elastic_net, name="white_wine_hyper_params_elastic_net")
w_value_results(model=white_wine_clean_l1, name="white_wine_clean_l1")
w_value_results(model=white_wine_hyper_params_l1, name="white_wine_hyper_params_l1")

''''''''''''''''''''''''''''''''''''''''''''''''''
white_wine_linear_clean_regressor
                        R2: 0.31138512734406587
  Mean_squared_error (MSE): 0.5191112879372428
 Mean_absolute_error (MAE): 0.5669367283950617
''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''
white_wine_linear_hyper_params_regressor
                        R2: 0.31404628762152464
  Mean_squared_error (MSE): 0.5171051762572685
 Mean_absolute_error (MAE): 0.5648479069211355
''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''
white_wine_clean_sgdegressor
                        R2: 0.245007916521906
  Mean_squared_error (MSE): 0.5691496486637178
 Mean_absolute_error (MAE): 0.5935501603275207
''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''
white_wine_hyper_params_sgdegressor
                        R2: 0.2989873287148943
  Mean_squared_error (MSE): 0.52