# Orange brix Analysis

## Import libraries

In [190]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from enum import Enum

In [191]:
plt.style.use("fivethirtyeight")

In [192]:
GENERATE_PLOTS: bool = False

## Create main variables

In [193]:
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    RobustScaler,
    StandardScaler,
)


class Scaler(Enum):
    Raw_ = None
    MaxAbsScaler_ = MaxAbsScaler
    MinMaxScaler_ = MinMaxScaler
    Normalizer_ = Normalizer
    RobustScaler_ = RobustScaler
    StandardScaler_ = StandardScaler


class Dataset:
    def __init__(
        self,
        name: str,
        dataframe: pd.DataFrame,
        scaler_type: Scaler.__name__ = Scaler.Raw_.name,
    ):
        self.name: str = name
        self.dataframe: pd.DataFrame = dataframe
        self.categorical_cols: list[str] = []
        self.numeric_cols: list[str] = []
        self.scaler_type: str = Scaler.Raw_.name

        self.detect_categorical_numeric_cols()

    def get_name(self) -> str:
        return self.name

    def get_dataframe(self):
        return self.dataframe

    def detect_categorical_numeric_cols(self):
        if len(self.dataframe) == 0:
            raise Exception("No records found!")

        self.numeric_cols = []
        self.categorical_cols = []

        for col in self.dataframe.columns:
            try:
                float(self.dataframe[col][0])
                self.numeric_cols.append(col)
            except:
                self.categorical_cols.append(col)

    def concat_dataframe(self, additional_dataframe: pd.DataFrame):
        if len(additional_dataframe) == len(self.dataframe):
            self.dataframe = pd.concat([self.dataframe, additional_dataframe], axis=1)
            self.detect_categorical_numeric_cols()

    def get_scaled_dataframe(self, scaler: Scaler) -> pd.DataFrame:
        if self.scaler_type.value:
            print("The dataframe already has been scaled!")
        else:
            if scaler.value:
                scaled_dataframe = pd.DataFrame(
                    scaler.value().fit_transform(X=self.dataframe[self.numeric_cols]),
                    columns=self.numeric_cols,
                )
                scaled_dataframe[self.categorical_cols] = self.dataframe[
                    self.categorical_cols
                ]

                return scaled_dataframe
            else:
                print("Cannot revert the scaling!")

    def get_one_hot_vectorized_dataset(
        self, categorical_col: str, print_name: bool = True
    ):
        if categorical_col in self.categorical_cols:
            dummies_dataframe = pd.get_dummies(self.dataframe[categorical_col]).astype(
                "int"
            )
            dummies_dataframe = pd.concat([dummies_dataframe, self.dataframe], axis=1)
            new_dataset_name = self.name + "_ohv_" + categorical_col
            if print_name:
                print(new_dataset_name)
            return Dataset(
                name=new_dataset_name,
                dataframe=dummies_dataframe,
                scaler_type=Scaler.Raw_.name,
            )
        elif categorical_col in self.numeric_cols:
            print("Records for input column name are not categorical!")
        else:
            print("Input column name does not exists!")

In [194]:
# !pip install lightgbm
# !pip install xgboost

In [195]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error


class ModelType(Enum):
    LINEAR_REGRESSION: dict = {}
    SVR: dict = {
        "kernel": "rbf",
        "degree": 3,
        "gamma": "scale",
        "coef0": 0,
        "tol": 1e-3,
        "C": 1.0,
        "epsilon": 0.1,
    }
    DECISION_TREE: dict = {
        "criterion": "squared_error",
        "splitter": "best",
        "max_depth": None,
        "min_samples_split": 2,
        "min_samples_leaf": 1,
        "min_weight_fraction_leaf": 0.0,
        "max_features": None,
        "random_state": 42,
        "max_leaf_nodes": None,
        "min_impurity_decrease": 0,
    }
    RANDOM_FOREST: dict = {
        "criterion": "squared_error",
        "max_depth": None,
        "min_samples_split": 2,
        "min_samples_leaf": 1,
        "min_weight_fraction_leaf": 0.0,
        "max_features": 1.0,
        "max_leaf_nodes": None,
        "min_impurity_decrease": 0.0,
    }
    LIGHT_GBM: dict = {
        "boosting_type": "gbdt",
        "num_leaves": 31,
        "max_depth": -1,
        "learning_rate": 0.1,
        "n_estimator": 100,
    }
    XGB: dict = {"objective": "reg:linear", "n_estimators": 10, "seed": 123}


class Model:
    def __init__(
        self,
        model_type: ModelType,
        name: str,
        x_variable: list,
        y_variable: list,
        dataframe: pd.DataFrame,
        scaler=None,
    ):
        self.model_type: ModelType = model_type
        self.name: str = name
        self.x_variable: list[str] = x_variable
        self.y_variable: list[str] = y_variable
        self.dataframe: pd.DataFrame = dataframe
        self.inner_model = None
        self.scaler = scaler

    def fit(self, X=None, Y=None):
        model_family = self.model_type.__str__().split(".")[-1]
        if X is None:
            X = np.array(self.dataframe[self.x_variable])
            Y = np.array(self.dataframe[self.y_variable])
        Y = np.squeeze(Y)
        params = self.model_type.value

        if self.scaler is not None:
            self.scaler.fit(X)
            X = self.scaler.transform(X)

        if model_family == "SVR":
            self.inner_model = SVR(**params).fit(X, Y)
        if model_family == "LINEAR_REGRESSION":
            self.inner_model = LinearRegression(**params).fit(X, Y)
        if model_family == "DECISION_TREE":
            self.inner_model = DecisionTreeRegressor(**params).fit(X, Y)
        if model_family == "RANDOM_FOREST":
            self.inner_model = RandomForestRegressor(**params).fit(X, Y)
        if model_family == "LIGHT_GBM":
            self.inner_model = LGBMRegressor(**params).fit(X, Y)
        if model_family == "XGB":
            self.inner_model = XGBRegressor(**params).fit(X, Y)

    def predict(self, X):
        if self.scaler is not None:
            X = self.scaler.transform(X)
        return self.inner_model.predict(X)

    def evaluate(self, X=None, Y=None, method="LOOCV"):
        if X is None:
            # print(self.dataframe[x_variable])
            X = np.array(self.dataframe[self.x_variable])
            Y = np.array(self.dataframe[self.y_variable])
        if method == "LOOCV":
            preds = [0] * len(Y)
            total = len(Y)
            kf = KFold(n_splits=total)
            kf.get_n_splits(Y)

        with tqdm(total=total) as pbar:
            for i, (train_index, valid_index) in enumerate(kf.split(X, Y)):
                X_train = X[train_index]
                Y_train = Y[train_index]
                X_valid = X[valid_index]
                Y_valid = Y[valid_index]
                self.fit(X_train, Y_train)

                Y_valid_pred = self.predict(X_valid)
                for j in range(len(valid_index)):
                    index = valid_index[j]
                    value = Y_valid_pred[j]
                    preds[index] = value
                pbar.update(1)

        r2 = r2_score(Y, preds)
        mse = mean_squared_error(Y, preds)
        print(f"MSE: {mse:.4f}", f"R2: {r2:.4f}")

    def save(self, filename: str):
        pass

    def load(self, filename: str):
        pass

    def set_params(self, params: dict):
        pass

In [196]:
my_modeltype = ModelType.XGB
a = my_modeltype.value["depth"] = 1000
my_modeltype

<ModelType.XGB: {'objective': 'reg:linear', 'n_estimators': 10, 'seed': 123, 'depth': 1000}>

In [197]:
list_dataset: dict[str, Dataset] = dict()
list_model: dict = dict()

## Create functions

In [198]:
def list_all_dataset_names():
    return list_dataset.keys()

In [199]:
def list_all_model_names():
    return list_model.keys()

In [200]:
def plot_along_dataset(
    dataset: Dataset,
    nrow_subplot: int,
    ncol_subplot: int,
    figsize: tuple[int, int],
):
    numeric_df = dataset.get_dataframe()[dataset.numeric_cols]
    x = np.arange(len(numeric_df))
    figs, axes = plt.subplots(nrow_subplot, ncol_subplot, figsize=figsize)
    for i in range(nrow_subplot):
        for j in range(ncol_subplot):
            ax = axes[i][j]
            index = i * ncol_subplot + j
            ax.plot(x, numeric_df.iloc[:, index])
            ax.set_title(f"{numeric_df.columns[index]} along dataframe", size=15)
            ax.xaxis.set_tick_params(labelsize=12)
            ax.xaxis.set_ticks(np.arange(0, len(numeric_df), 10))
            ax.yaxis.set_tick_params(labelsize=12)

In [201]:
def plot_hist_kde_box_all_columns(dataset: Dataset):

    number_of_numeric_col = len(dataset.numeric_cols)
    figs, axes = plt.subplots(
        number_of_numeric_col, 3, figsize=(30, number_of_numeric_col * 10)
    )

    numeric_dataframe = dataset.get_dataframe()[dataset.numeric_cols]

    for i in range(number_of_numeric_col):
        axes[i][0].set_title(f"{numeric_dataframe.columns[i]} histogram", size=20)
        axes[i][0].yaxis.set_tick_params(labelsize=15)
        axes[i][0].xaxis.set_tick_params(labelsize=15)
        sns.histplot(ax=axes[i][0], data=numeric_dataframe.iloc[:, i])

        axes[i][1].set_title(f"{numeric_dataframe.columns[i]} KDE", size=20)
        axes[i][1].xaxis.set_tick_params(labelsize=15)
        axes[i][1].yaxis.set_tick_params(labelsize=15)
        sns.kdeplot(ax=axes[i][1], data=numeric_dataframe.iloc[:, i])

        axes[i][2].set_title(f"{numeric_dataframe.columns[i]} Boxplot", size=20)
        axes[i][2].xaxis.set_tick_params(labelsize=15)
        axes[i][2].yaxis.set_tick_params(labelsize=15)
        sns.boxplot(ax=axes[i][2], data=numeric_dataframe.iloc[:, i])

In [202]:
def plot_all_columns_group_by_a_categorical_col(dataset: Dataset, categorical_col: str):

    dataframe = dataset.get_dataframe()
    numeric_dataframe = dataframe[dataset.numeric_cols]
    number_of_numeric_col = len(dataset.numeric_cols)
    figs, axes = plt.subplots(
        number_of_numeric_col, 2, figsize=(20, number_of_numeric_col * 10)
    )
    groups = dataframe[categorical_col].unique()

    for i in range(number_of_numeric_col):
        axes[i][0].set_title(f"{numeric_dataframe.columns[i]} KDE", size=20)
        axes[i][0].xaxis.set_tick_params(labelsize=15)
        axes[i][0].yaxis.set_tick_params(labelsize=15)
        for group in groups:
            sns.kdeplot(
                ax=axes[i][0],
                data=numeric_dataframe[dataframe[categorical_col] == group].iloc[:, i],
                label=f"Type {group}",
            )
        axes[i][0].legend()

        axes[i][1].set_title(f"{numeric_dataframe.columns[i]} Boxplot", size=20)
        axes[i][1].xaxis.set_tick_params(labelsize=15)
        axes[i][1].yaxis.set_tick_params(labelsize=15)
        sns.boxplot(
            ax=axes[i][1],
            hue="type",
            y=numeric_dataframe.columns[i],
            data=dataframe,
        )

In [203]:
def divide_dataframe_with_categorical_col(
    dataset: Dataset, categorical_col: str
) -> dict[str, pd.DataFrame]:
    dataframe = dataset.get_dataframe()
    groups = dataframe[categorical_col].unique()

    result = {}
    for group in groups:
        result[group] = dataframe[dataframe[categorical_col] == group]

    return result

In [204]:
# def calculate_p_values_between_partitions(dataframes, columns_to_compare):
#     p_values = {}
#     for key1, df1 in dataframes.items():
#         for key2, df2 in dataframes.items():
#             if key1 != key2:
#                 for col in columns_to_compare:
#                     if col in df1 and col in df2:
#                         t_stat, p_val = stats.ttest_ind(df1[col], df2[col])
#                         p_values[f"{col}_{key1}-{col}_{key2}"] = p_val
#     return p_values

## Import data

In [205]:
raw_df = pd.read_excel("orange_data.xlsx", sheet_name=0)

In [206]:
raw_df.head(10)

Unnamed: 0,Name,p,C NaOH,Khối lượng(g),Đường kính ngang(mm),Đường kính dọc(mm),Độ đường (Brix %),TA (%),Brix:TA,BrmTA
0,A1,1.9,0.4,255,257,262,10.9,4.867141,2.239508,6.032859
1,A2,1.2,0.4,325,288,278,9.8,3.073984,3.188045,6.726016
2,A3,2.3,0.4,185,235,237,11.0,5.891803,1.867001,5.108197
3,A4,1.4,0.4128,203,245,245,9.0,3.701077,2.431725,5.298923
4,A5,1.2,0.4,229,253,249,10.9,3.073984,3.545887,7.826016
5,A7,1.3,0.4128,357,305,290,10.7,3.436714,3.113439,7.263286
6,A8,2.0,0.4,309,275,285,11.2,5.123307,2.186088,6.076693
7,A9,1.9,0.4128,321,285,280,9.2,5.02289,1.831615,4.17711
8,A10,1.45,0.4082,245,255,259,9.5,3.790542,2.506238,5.709458
9,A11,2.0,0.4128,275,267,270,10.5,5.287252,1.985909,5.212748


In [207]:
raw_df.columns

Index(['Name', 'p', 'C NaOH', 'Khối lượng(g)', 'Đường kính ngang(mm)',
       'Đường kính dọc(mm)', 'Độ đường (Brix %)', 'TA (%)', 'Brix:TA',
       'BrmTA'],
      dtype='object')

In [208]:
raw_df.shape

(81, 10)

In [209]:
new_column_names = {
    "Name": "name",
    "p": "v_naoh",
    "C NaOH": "c_naoh",
    "Khối lượng(g)": "mass",
    "Đường kính ngang(mm)": "h_diameter",
    "Đường kính dọc(mm)": "v_diameter",
    "Độ đường (Brix %)": "brix",
    "TA (%)": "ta",
    "Brix:TA": "brix_ta",
    "BrmTA": "brm_ta",
}

std_col_name_df = raw_df.rename(columns=new_column_names)

In [210]:
std_col_name_df.head(10)

Unnamed: 0,name,v_naoh,c_naoh,mass,h_diameter,v_diameter,brix,ta,brix_ta,brm_ta
0,A1,1.9,0.4,255,257,262,10.9,4.867141,2.239508,6.032859
1,A2,1.2,0.4,325,288,278,9.8,3.073984,3.188045,6.726016
2,A3,2.3,0.4,185,235,237,11.0,5.891803,1.867001,5.108197
3,A4,1.4,0.4128,203,245,245,9.0,3.701077,2.431725,5.298923
4,A5,1.2,0.4,229,253,249,10.9,3.073984,3.545887,7.826016
5,A7,1.3,0.4128,357,305,290,10.7,3.436714,3.113439,7.263286
6,A8,2.0,0.4,309,275,285,11.2,5.123307,2.186088,6.076693
7,A9,1.9,0.4128,321,285,280,9.2,5.02289,1.831615,4.17711
8,A10,1.45,0.4082,245,255,259,9.5,3.790542,2.506238,5.709458
9,A11,2.0,0.4128,275,267,270,10.5,5.287252,1.985909,5.212748


In [211]:
std_col_name_dataset = Dataset("std_col_name", std_col_name_df, Scaler.Raw_.name)

In [212]:
print(std_col_name_dataset.categorical_cols)
print(std_col_name_dataset.numeric_cols)
std_col_name_dataset.dataframe

['name']
['v_naoh', 'c_naoh', 'mass', 'h_diameter', 'v_diameter', 'brix', 'ta', 'brix_ta', 'brm_ta']


Unnamed: 0,name,v_naoh,c_naoh,mass,h_diameter,v_diameter,brix,ta,brix_ta,brm_ta
0,A1,1.90,0.4000,255,257,262,10.9,4.867141,2.239508,6.032859
1,A2,1.20,0.4000,325,288,278,9.8,3.073984,3.188045,6.726016
2,A3,2.30,0.4000,185,235,237,11.0,5.891803,1.867001,5.108197
3,A4,1.40,0.4128,203,245,245,9.0,3.701077,2.431725,5.298923
4,A5,1.20,0.4000,229,253,249,10.9,3.073984,3.545887,7.826016
...,...,...,...,...,...,...,...,...,...,...
76,C24,1.60,0.4016,177,225,216,12.5,4.115040,3.037638,8.384960
77,C25,1.60,0.4016,173,216,220,11.1,4.115040,2.697422,6.984960
78,C26,0.95,0.4016,126,195,197,12.8,2.443305,5.238806,10.356695
79,C28,1.70,0.4016,181,216,232,11.8,4.372230,2.698852,7.427770


In [213]:
list_dataset["std_col_name"] = std_col_name_dataset

In [214]:
list_dataset["std_col_name"].get_dataframe().head(10)

Unnamed: 0,name,v_naoh,c_naoh,mass,h_diameter,v_diameter,brix,ta,brix_ta,brm_ta
0,A1,1.9,0.4,255,257,262,10.9,4.867141,2.239508,6.032859
1,A2,1.2,0.4,325,288,278,9.8,3.073984,3.188045,6.726016
2,A3,2.3,0.4,185,235,237,11.0,5.891803,1.867001,5.108197
3,A4,1.4,0.4128,203,245,245,9.0,3.701077,2.431725,5.298923
4,A5,1.2,0.4,229,253,249,10.9,3.073984,3.545887,7.826016
5,A7,1.3,0.4128,357,305,290,10.7,3.436714,3.113439,7.263286
6,A8,2.0,0.4,309,275,285,11.2,5.123307,2.186088,6.076693
7,A9,1.9,0.4128,321,285,280,9.2,5.02289,1.831615,4.17711
8,A10,1.45,0.4082,245,255,259,9.5,3.790542,2.506238,5.709458
9,A11,2.0,0.4128,275,267,270,10.5,5.287252,1.985909,5.212748


In [215]:
if GENERATE_PLOTS:
    plot_along_dataset(list_dataset["std_col_name"], 3, 3, (12, 12))

#### From the shape of lines in graph "mass", "h_diameter", "v_diameter" and "brix", we can see the lines could be divided into 3 levels along dataset. We assume that there are 3 types of data in the dataset. We could double check the name / label of dataset.

In [216]:
list_dataset["std_col_name"].get_dataframe().columns

Index(['name', 'v_naoh', 'c_naoh', 'mass', 'h_diameter', 'v_diameter', 'brix',
       'ta', 'brix_ta', 'brm_ta'],
      dtype='object')

In [217]:
print(list_dataset["std_col_name"].get_dataframe()["name"].to_list())

['A1', 'A2', 'A3', 'A4', 'A5', 'A7', 'A8', 'A9', 'A10', 'A11', 'A13', 'A14', 'A15', 'A16', 'A17', 'A18', 'A19', 'A20', 'A21', 'A24', 'A25', 'A26', 'A27', 'A28', 'A29', 'A30', 'B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B9', 'B10', 'B11', 'B12', 'B13', 'B14', 'B15', 'B16', 'B17', 'B18', 'B19', 'B20', 'B21', 'B22', 'B23', 'B24', 'B25', 'B26', 'B27', 'B28', 'B29', 'B30', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C8', 'C9', 'C10', 'C11', 'C12', 'C14', 'C15', 'C16', 'C17', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C28', 'C29']


We count the number of type to see whether the amount is equivalent to the shape of lines in the following graphs: "mass", "h_diameter", "v_diameter" and "brix"

In [218]:
import re

data_type = []
for name in list_dataset["std_col_name"].get_dataframe()["name"].str.upper():
    data_type.append(re.findall("[A-Z]", name)[0])

data_type = pd.DataFrame(data_type, columns=["type"])
data_type["count"] = 1
data_type.groupby(["type"]).count()["count"]

type
A    26
B    30
C    25
Name: count, dtype: int64

In [219]:
list_dataset["std_col_name"].concat_dataframe(data_type["type"])

In [220]:
list_dataset["std_col_name"].get_dataframe().head(10)

Unnamed: 0,name,v_naoh,c_naoh,mass,h_diameter,v_diameter,brix,ta,brix_ta,brm_ta,type
0,A1,1.9,0.4,255,257,262,10.9,4.867141,2.239508,6.032859,A
1,A2,1.2,0.4,325,288,278,9.8,3.073984,3.188045,6.726016,A
2,A3,2.3,0.4,185,235,237,11.0,5.891803,1.867001,5.108197,A
3,A4,1.4,0.4128,203,245,245,9.0,3.701077,2.431725,5.298923,A
4,A5,1.2,0.4,229,253,249,10.9,3.073984,3.545887,7.826016,A
5,A7,1.3,0.4128,357,305,290,10.7,3.436714,3.113439,7.263286,A
6,A8,2.0,0.4,309,275,285,11.2,5.123307,2.186088,6.076693,A
7,A9,1.9,0.4128,321,285,280,9.2,5.02289,1.831615,4.17711,A
8,A10,1.45,0.4082,245,255,259,9.5,3.790542,2.506238,5.709458,A
9,A11,2.0,0.4128,275,267,270,10.5,5.287252,1.985909,5.212748,A


## Analyze quantitative data

### Plots for all rows

In [221]:
if GENERATE_PLOTS:
    plot_hist_kde_box_all_columns(list_dataset["std_col_name"])

In [222]:
list_dataset["std_col_name"].get_dataframe().describe()

Unnamed: 0,v_naoh,c_naoh,mass,h_diameter,v_diameter,brix,ta,brix_ta,brm_ta
count,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0
mean,1.477778,0.406857,222.197531,241.777778,241.358025,11.169259,3.849815,3.024935,7.319445
std,0.297069,0.007092,50.582215,23.652167,22.587225,1.362702,0.774895,0.78652,1.587514
min,0.8,0.4,126.0,195.0,197.0,7.4,2.05752,1.795715,4.076693
25%,1.3,0.4016,179.0,222.0,223.0,10.4,3.370111,2.512846,6.148368
50%,1.4,0.4048,228.0,242.0,242.0,11.1,3.701077,2.920633,7.332125
75%,1.6,0.4082,255.0,257.0,255.0,12.3,4.229802,3.371688,8.199505
max,2.3,0.423,357.0,305.0,290.0,14.0,5.891803,6.609899,11.54248


In [223]:
if GENERATE_PLOTS:
    plot_all_columns_group_by_a_categorical_col(list_dataset["std_col_name"], "type")

#### From above plots, we see that "mass", "h_diameter", "v_diameter" and "brm_ta" plots have clearly patterns corresponding with types.

## Analyze qualitative data

In [224]:
# partitions_dataframe = divide_dataframe_with_categorical_col(list_dataset["std_col_name"], "type")

### Use Welch's t-test to see whether the data could be divided into 3 parts A, B, and C.

In [225]:
# for group in partitions_dataframe.keys():
#     print(f"{group}: {partitions_dataframe[group].shape}")

In [226]:
# partitions_dataframe["A"].head(10)

In [227]:
# columns_to_compare = ["mass", "h_diameter", "v_diameter"]

In [228]:
# calculate_p_values_between_partitions(partitions_dataframe, columns_to_compare)

### All p-values smaller than $ \alpha $ = 0.05
### => Conclusion from Welch's t-test: The dataset could be divided into 3 parts A, B and C

## Matrix plots

### Pairplots

In [229]:
if GENERATE_PLOTS:
    plt.figure(figsize=(16, 16))
    sns.pairplot(list_dataset["std_col_name"].get_dataframe())

#### Some potential features to explain brix are "mass", "h_diameter", "v_diameter", "brix_ta", "brm_ta".

### Heatmap

In [230]:
if GENERATE_PLOTS:
    plt.figure(figsize=(12, 12))
    dataframe = list_dataset["std_col_name"].get_dataframe()
    numeric_cols = list_dataset["std_col_name"].numeric_cols
    dataset_corr = dataframe[numeric_cols].corr()
    sns.heatmap(dataset_corr, annot=True)

#### => Some features that could be used for model: **mass, h_diameter, v_diameter, brix_ta, brm_ta** (quantitative) and **type** (qualitative) to explain **brix**. However, **brix_ta** and **brm_ta** are directly related to **brix**, then they will not be considered.

## Build Models

### Create "ohv_std_col_name" dataframe: ["mass", "h_diameter", "v_diameter", "type", "brix", one-hot-vector-of-type]

In [232]:
ohv_dataset = list_dataset["std_col_name"].get_one_hot_vectorized_dataset("type")
list_dataset["std_col_name_ohv_type"] = ohv_dataset
list_dataset["std_col_name_ohv_type"].get_dataframe().head(10)

std_col_name_ohv_type


Unnamed: 0,A,B,C,name,v_naoh,c_naoh,mass,h_diameter,v_diameter,brix,ta,brix_ta,brm_ta,type
0,1,0,0,A1,1.9,0.4,255,257,262,10.9,4.867141,2.239508,6.032859,A
1,1,0,0,A2,1.2,0.4,325,288,278,9.8,3.073984,3.188045,6.726016,A
2,1,0,0,A3,2.3,0.4,185,235,237,11.0,5.891803,1.867001,5.108197,A
3,1,0,0,A4,1.4,0.4128,203,245,245,9.0,3.701077,2.431725,5.298923,A
4,1,0,0,A5,1.2,0.4,229,253,249,10.9,3.073984,3.545887,7.826016,A
5,1,0,0,A7,1.3,0.4128,357,305,290,10.7,3.436714,3.113439,7.263286,A
6,1,0,0,A8,2.0,0.4,309,275,285,11.2,5.123307,2.186088,6.076693,A
7,1,0,0,A9,1.9,0.4128,321,285,280,9.2,5.02289,1.831615,4.17711,A
8,1,0,0,A10,1.45,0.4082,245,255,259,9.5,3.790542,2.506238,5.709458,A
9,1,0,0,A11,2.0,0.4128,275,267,270,10.5,5.287252,1.985909,5.212748,A


working: create model

In [None]:
y_variable = list_dataset["ohv_std_col_name"].get_dataframe()["brix"]
y_variable.head(10)

0    10.9
1     9.8
2    11.0
3     9.0
4    10.9
5    10.7
6    11.2
7     9.2
8     9.5
9    10.5
Name: brix, dtype: float64

In [None]:
x_variable = list_dataset["ohv_std_col_name"].get_dataframe().drop(columns="brix")
x_variable.head(10)

Unnamed: 0,A,B,C,name,v_naoh,c_naoh,mass,h_diameter,v_diameter,ta,brix_ta,brm_ta,type
0,True,False,False,A1,1.9,0.4,255,257,262,4.867141,2.239508,6.032859,A
1,True,False,False,A2,1.2,0.4,325,288,278,3.073984,3.188045,6.726016,A
2,True,False,False,A3,2.3,0.4,185,235,237,5.891803,1.867001,5.108197,A
3,True,False,False,A4,1.4,0.4128,203,245,245,3.701077,2.431725,5.298923,A
4,True,False,False,A5,1.2,0.4,229,253,249,3.073984,3.545887,7.826016,A
5,True,False,False,A7,1.3,0.4128,357,305,290,3.436714,3.113439,7.263286,A
6,True,False,False,A8,2.0,0.4,309,275,285,5.123307,2.186088,6.076693,A
7,True,False,False,A9,1.9,0.4128,321,285,280,5.02289,1.831615,4.17711,A
8,True,False,False,A10,1.45,0.4082,245,255,259,3.790542,2.506238,5.709458,A
9,True,False,False,A11,2.0,0.4128,275,267,270,5.287252,1.985909,5.212748,A


### Linear regression models

In [None]:
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from typing import Optional, Callable


def summary_linear_model(x: list, y, dataset, transformer: Optional[Callable] = None):
    x_ = sm.add_constant(dataset[x])
    y_ = dataset[y]
    results = sm.OLS(y_, x_).fit()
    Intercept = results.params.iloc[0]
    Slope = results.params.iloc[1:]
    P_values = results.pvalues[1:]
    R_squared = results.rsquared
    MSE = mean_squared_error(y_, results.predict())
    print("Intercept\n", Intercept, end="\n\n")
    print("Slope\n", Slope, end="\n\n")
    print("P_values\n", P_values, end="\n\n")
    print("R_squared\n", R_squared, end="\n\n")
    print("MSE\n", MSE, end="\n\n")
    return {
        "Intercept": Intercept,
        "Slope": Slope,
        "P_values": P_values,
        "R_squared": R_squared,
        "MSE": MSE,
    }

#### Model 2: $ brix \sim \beta_0 + \beta_1* $ h.diameter

In [None]:
summary_linear_model(x=["h_diameter"], y="brix", dataset=model_dataset)

NameError: name 'model_dataset' is not defined

#### Model 3: $ brix \sim \beta_0 + \beta_1* $ v.diameter

In [None]:
summary_linear_model(x=["v_diameter"], y="brix", dataset=model_dataset)

#### Model 4: $ brix \sim \beta_0 + \beta_1* $ mass

In [None]:
summary_linear_model(x=["mass"], y="brix", dataset=model_dataset)

#### Model 5: brix $ \sim \beta_0 $ + $ \beta_1 $ * type_a + $ \beta_2 $ * type_b + $ \beta_3 $ * type_c + $ \beta_4 $ * h.diameter + $ \beta_5 $ * v.diameter + $ \beta_6 $ * mass

In [None]:
model_no_scale = summary_linear_model(
    x=["A", "B", "C", "h_diameter", "v_diameter", "mass"],
    y="brix",
    dataset=model_dataset.drop(columns="type").astype("float"),
)
model_no_scale

In [None]:
intercept_a = model_no_scale["Slope"]["A"] + model_no_scale["Intercept"]
intercept_b = model_no_scale["Slope"]["B"] + model_no_scale["Intercept"]
intercept_c = model_no_scale["Slope"]["C"] + model_no_scale["Intercept"]

x = np.arange(model_dataset["mass"].min(), model_dataset["mass"].max())
ya = x * model_no_scale["Slope"]["mass"] + intercept_a
yb = x * model_no_scale["Slope"]["mass"] + intercept_b
yc = x * model_no_scale["Slope"]["mass"] + intercept_c
plt.plot(x, ya, linewidth=2, label="type_A")
plt.plot(x, yb, linewidth=2, label="type_B")
plt.plot(x, yc, linewidth=2, label="type_C")
plt.legend()

#### => Keep "type" predictor variable since typeC is significantly different from others.

## Scale data

In [None]:
model_dataset.sample(10)

In [None]:
model_no_scale

### Data preparation

### No scale

In [None]:
model_dataset_scale["NoScale"] = model_dataset
all_models["NoScale"] = model_no_scale

### MaxAbsScaler

In [None]:
scaler = MaxAbsScaler

scaler_name = scaler.__name__
model_dataset_scale[scaler_name], all_models[scaler_name] = get_model_with_scaler(
    scaler, categorical_columns, numeric_columns, "brix", model_dataset
)

### MinMaxScaler

In [None]:
scaler = MinMaxScaler

scaler_name = scaler.__name__
model_dataset_scale[scaler_name], all_models[scaler_name] = get_model_with_scaler(
    scaler, categorical_columns, numeric_columns, "brix", model_dataset
)

### Normalizer

In [None]:
scaler = Normalizer

scaler_name = scaler.__name__
model_dataset_scale[scaler_name], all_models[scaler_name] = get_model_with_scaler(
    scaler, categorical_columns, numeric_columns, "brix", model_dataset
)

### RobustScaler

In [None]:
scaler = RobustScaler

scaler_name = scaler.__name__
model_dataset_scale[scaler_name], all_models[scaler_name] = get_model_with_scaler(
    scaler, categorical_columns, numeric_columns, "brix", model_dataset
)

### StandardScaler

In [None]:
scaler = StandardScaler

scaler_name = scaler.__name__
model_dataset_scale[scaler_name], all_models[scaler_name] = get_model_with_scaler(
    scaler, categorical_columns, numeric_columns, "brix", model_dataset
)

## Test models

### Leave one out cross validation

Unnamed: 0,mass,h_diameter,v_diameter,type,brix,A,B,C
0,255,257,262,A,10.9,True,False,False
1,325,288,278,A,9.8,True,False,False
2,185,235,237,A,11.0,True,False,False
3,203,245,245,A,9.0,True,False,False
4,229,253,249,A,10.9,True,False,False
...,...,...,...,...,...,...,...,...
76,177,225,216,C,12.5,False,False,True
77,173,216,220,C,11.1,False,False,True
78,126,195,197,C,12.8,False,False,True
79,181,216,232,C,11.8,False,False,True


In [None]:
list_dataset["data_v1"].dataframe[["mass", "h_diameter",	"v_diameter", "type", "A", "B",	"C"]]

Unnamed: 0,mass,h_diameter,v_diameter,type,A,B,C
0,255,257,262,A,True,False,False
1,325,288,278,A,True,False,False
2,185,235,237,A,True,False,False
3,203,245,245,A,True,False,False
4,229,253,249,A,True,False,False
...,...,...,...,...,...,...,...
76,177,225,216,C,False,False,True
77,173,216,220,C,False,False,True
78,126,195,197,C,False,False,True
79,181,216,232,C,False,False,True


In [None]:
model.x_variable

['mass', 'h_diameter', 'v_diameter', 'A', 'B', 'C']

In [None]:
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    RobustScaler,
    StandardScaler,
)
from tqdm import tqdm

"""
   SVR: dict = {"kernel": "rbf", "degree": 3, "gamma": "scale", "coef0": 0, "tol": 1e-3, "C": 1.0, "epsilon": 0.1}
    DECISION_TREE: dict = {"criterion": "squared_error", "splitter": "best", "max_depth" : None,  "min_samples_split": 2, "min_samples_leaf": 1,  "min_weight_fraction_leaf": 0.0,  "max_features": None, "random_state": 42, "max_leaf_nodes": None, "min_impurity_decrease": 0}
    RANDOM_FOREST: dict = {"criterion": "squared_error", "max_depth": None, "min_samples_split": 2, "min_samples_leaf": 1, "min_weight_fraction_leaf": 0.0, "max_features": 1.0, "max_leaf_nodes": None, "min_impurity_decrease": 0.0}
    LIGHT_GBM: dict = {"boosting_type": "gbdt", "num_leaves" : 31, "max_depth":-1, "learning_rate" :0.1, "n_estimator": 100}
    XGB
"""

model = Model(
      model_type = ModelType.LIGHT_GBM,
        name = "model_1",
        x_variable = ["mass", "h_diameter",	"v_diameter", "A", "B",	"C"],
        y_variable =  ["brix"],
        dataframe =  list_dataset["data_v1"].dataframe,
        scaler = Normalizer())

model.evaluate()

  1%|█                                                                                  | 1/81 [00:00<00:39,  2.04it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000658 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.172625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.186375
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 115
[

  9%|███████▏                                                                           | 7/81 [00:00<00:05, 13.49it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.196375
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.172625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from sc

 19%|███████████████▏                                                                  | 15/81 [00:00<00:02, 24.03it/s]

[LightGBM] [Info] Start training from score 11.177625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.167625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.193875
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000031 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of use

 28%|███████████████████████▎                                                          | 23/81 [00:01<00:01, 29.61it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.190125
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000056 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.166375
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from sc

 33%|███████████████████████████▎                                                      | 27/81 [00:01<00:01, 30.86it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.170125
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.201375
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from sc

 38%|███████████████████████████████▍                                                  | 31/81 [00:01<00:01, 31.31it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000054 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.167625


 43%|███████████████████████████████████▍                                              | 35/81 [00:01<00:02, 22.06it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.168875
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.153875
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from sc

 52%|██████████████████████████████████████████▌                                       | 42/81 [00:01<00:01, 25.49it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.186375
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.150125
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from sc

 62%|██████████████████████████████████████████████████▌                               | 50/81 [00:02<00:01, 28.91it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.172625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.182625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000037 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from sc

 67%|██████████████████████████████████████████████████████▋                           | 54/81 [00:02<00:00, 27.64it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.153250
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000034 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.161375
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from sc

 74%|████████████████████████████████████████████████████████████▋                     | 60/81 [00:02<00:00, 25.82it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000037 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.148875
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.166375
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000144 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from sc

 78%|███████████████████████████████████████████████████████████████▊                  | 63/81 [00:02<00:00, 23.42it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.155125
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000069 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.152625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000049 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, num

 81%|██████████████████████████████████████████████████████████████████▊               | 66/81 [00:02<00:00, 20.31it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000085 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.141375
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000060 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.170000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000031 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from sc

 90%|█████████████████████████████████████████████████████████████████████████▉        | 73/81 [00:03<00:00, 24.04it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000049 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.162625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.157625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from sc

100%|██████████████████████████████████████████████████████████████████████████████████| 81/81 [00:03<00:00, 23.51it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000093 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.152625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000023 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from score 11.170125
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000023 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 6
[LightGBM] [Info] Start training from sc




In [None]:
model.evaluate()

<__main__.Model at 0x1cd35713e20>

In [None]:
"""
def evaluate(method = "LOOCV")
    preds = []
    labels = []
    for x_train, y_train, x_valid, y_valid in LOOCV(dataset):
        self.scaler.fit(x_train)
        x_train_transform = self.scaler.transform(x_train)
        self.model.fit(x_train_transform, y_train)

        x_valid_transform = self.scaler.transform(x_valid)
        pred = self.model.predict(x_test)

        preds.append(pred)
        labels.append(y_valid)
    
    r2 = sklearn.metrics.r_squared(preds, labels)
    mse = sklearn.metrics.mean_square_error(preds, labels)
    print(r2, mse)
"""