In [1]:

from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
# from VGBoost import VGBClassifier

In [2]:
import numpy as np
from pandas import DataFrame, concat
from numba import prange
from concurrent.futures import ThreadPoolExecutor
# Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, f1_score
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, BaggingRegressor, ExtraTreesRegressor
from sklearn.svm import NuSVR, SVC
from sklearn.base import BaseEstimator
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge, ElasticNet, SGDRegressor, LassoLars, Lasso, Ridge, ARDRegression, RANSACRegressor, HuberRegressor, TheilSenRegressor, LassoLarsIC
from sklearn.kernel_ridge import KernelRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.impute import KNNImputer
from time import perf_counter
import collections.abc
from copy import deepcopy


class VGBRegressor(BaseEstimator):
    """_summary_
    Args:
        object (_type_): _description_
    """

    def __init__(self):
        """ Initialize VGBRegressor Object
        """
        self._ensemble = []

    def _metrics(self, vt, vp, model, time=None):
        """_summary_
        Args:
            vt (_type_): _description_
            vp (_type_): _description_
            model (_type_): _description_
            time (_type_, optional): _description_. Defaults to None.
        Returns:
            _type_: _description_
        """
        if self.custom_loss_metrics:
            return {'model': model, 'time': time, 'loss': self.custom_loss_metrics(vt, vp)}
        return {"model": model, "time": time, "loss": mean_squared_error(vt, vp)}

    def _create_model(self, X, y, model_name, time_it: bool = False):
        """_summary_
        Args:
            X (_type_): _description_
            y (_type_): _description_
            model_name (_type_): _description_
            time_it (bool, optional): _description_. Defaults to False.
        Returns:
            _type_: _description_
        """
        model = model_name()
        if time_it:
            begin = perf_counter()
            model.fit(X, y)
            end = perf_counter()
            return (model, end - begin)
        return (model.fit(X, y), None)

    def _get_metrics(self, model_name):
        """_summary_
        Args:
            model_name (_type_): _description_
        Returns:
            _type_: _description_
        """
        try:
            Xt, Xv, yt, yv = train_test_split(self._X, self._y)
            results = self._create_model(Xt, yt, model_name, time_it=False)
            model, time = results[0], results[1]
            return self._metrics(yv,
                                 model.predict(Xv), model, time)
        except Exception:
            return None

    def _get_results(self, X, y) -> list:
        """_summary_
        Args:
            X (_type_): _description_
            y (_type_): _description_
        Returns:
            list: _description_
        """
        results = []
        # self._X = self._minimax.fit_transform(self._robust.fit_transform(
        #         KNNImputer(weights='distance').fit_transform(X)))
        self._X = X
        self._y = y
        with ThreadPoolExecutor(max_workers=len(self._models)) as executor:
            res = executor.map(self._get_metrics, self._models)
            results = [i for i in res if i]
        return results

    def fit(
        self, X_train, y_train,
        early_stopping: bool = False,
        early_stopping_min_delta: float = 0.001,
        early_stopping_patience: int = 10,
        custom_models: list = None,
        learning_rate: float = 0.05,
        n_estimators: int = 100,
        warm_start: bool = False,
        complexity: bool = False,
        light: bool = True,
        custom_loss_metrics: object = False,
    ):
        """_summary_
        Args:
            X_train (_type_): _description_
            y_train (_type_): _description_
            early_stopping (bool, optional): _description_. Defaults to False.
            early_stopping_min_delta (float, optional): _description_. Defaults to 0.001.
            early_stopping_patience (int, optional): _description_. Defaults to 10.
            custom_models (list, optional): _description_. Defaults to None.
            learning_rate (float, optional): _description_. Defaults to 0.05.
            n_estimators (int, optional): _description_. Defaults to 100.
            warm_start (bool, optional): _description_. Defaults to False.
            complexity (bool, optional): _description_. Defaults to False.
            custom_loss_metrics (object, optional): _description_. Defaults to False.
        Returns:
            _type_: _description_
        """
        if custom_models:
            self._models = custom_models
        self.custom_loss_metrics = custom_loss_metrics
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.early_stopping = early_stopping
        self.early_stopping_min_delta = early_stopping_min_delta
        self.early_stopping_patience = early_stopping_patience
        if custom_models:
            
            self._models = custom_models
        else:
            if complexity:
                self._models = (DecisionTreeRegressor, LinearRegression, BayesianRidge, KNeighborsRegressor, HistGradientBoostingRegressor,
                                ElasticNet, LassoLars, Lasso, GradientBoostingRegressor, ExtraTreesRegressor,
                                BaggingRegressor, NuSVR, XGBRegressor, SGDRegressor, KernelRidge, MLPRegressor, LGBMRegressor,
                                Ridge, ARDRegression, RANSACRegressor, HuberRegressor, TheilSenRegressor, LassoLarsIC)
            elif light:
                self._models = (LGBMRegressor, ExtraTreesRegressor,
                                BaggingRegressor, RANSACRegressor, LassoLarsIC, BayesianRidge)
            else:
                self._models = (DecisionTreeRegressor, LinearRegression, BayesianRidge, KNeighborsRegressor, LGBMRegressor,
                                ElasticNet, LassoLars, Lasso, SGDRegressor, BaggingRegressor, ExtraTreesRegressor,
                                Ridge, ARDRegression, RANSACRegressor, LassoLarsIC)
        X_train = KNNImputer(weights='distance',
                             n_neighbors=10).fit_transform(deepcopy(X_train))
        self._y_mean = y_train.mean()
        # base model: mean
        # computer residuals: y - y hat
        # for n_estimators: a) y = prev residuals && residuals * learning rate
        # add early stopping
        # restore best weights
        # ada boost and adaptive scaling for learning rates

        preds = DataFrame(
            data={'yt': y_train, 'p0': np.full((len(y_train)), y_train - self._y_mean)})
        residuals = DataFrame(
            data={'r0': y_train - self._y_mean})
        errors = []
        if not early_stopping:
            if warm_start:
                for i in prange(1, self.n_estimators + 1):
                    y = residuals[f'r{i - 1}']
                    results = self._get_results(X_train, y)
                    min_loss = min(results, key=lambda x: x.get(
                        "loss", float('inf')))["loss"]  # https://stackoverflow.com/a/19619294
                    min_model = [i['model']
                                 for i in results if min_loss >= i['loss']][0]
                    preds[f'p{i}'] = residuals.sum(axis=1) + min_model.predict(
                        X_train) * self.learning_rate
                    residuals[f'r{i}'] = preds['yt'] - preds[f'p{i}']
                    if i % 3 == 0:
                        X_train[f"r{i}"] = residuals[f'r{i}'].copy()
                    try:
                        errors.append(mean_squared_error(
                            preds['yt'], preds[f'p{i}']))
                    except Exception:
                        df = concat(
                            [preds['yt'], preds[f'p{i - 1}']], axis=1).dropna()
                        errors.append(mean_squared_error(
                            df['yt'], df[f"p{i - 1}"]))
                    self._ensemble.append(min_model)
            else:
                for i in prange(1, self.n_estimators + 1):
                    y = residuals[f'r{i - 1}']
                    results = self._get_results(X_train, y)
                    min_loss = min(results, key=lambda x: x.get(
                        "loss", float('inf')))["loss"]  # https://stackoverflow.com/a/19619294
                    min_model = [i['model']
                                 for i in results if min_loss >= i['loss']][0]
                    preds[f'p{i}'] = residuals.sum(axis=1) + min_model.predict(
                        X_train) * self.learning_rate
                    residuals[f'r{i}'] = preds['yt'] - preds[f'p{i}']
                    errors.append(mean_squared_error(
                        preds['yt'], preds[f'p{i}']))
                    self._ensemble.append(min_model)
                    if errors[i - 1] == 0:
                        break
        else:
            return "TODO"
        min_error = min(errors)
        min_error_i = [i for i in prange(
            len(errors)) if errors[i] == min_error][0]
        self._ensemble, errors = self._ensemble[:
                                                min_error_i], errors[:min_error_i]
        residuals = residuals[:len(errors)]
        return self._ensemble, (residuals, errors)

    def predict(self, X_test):
        """_summary_
        Args:
            X_test (_type_): _description_
        Returns:
            _type_: _description_
        """
        try:
            val = self._ensemble[0]
        except Exception:
            return "Please train the model first"
        # X_test = self._robust.transform(self._minimax.transform(deepcopy(X_test)))
        preds = DataFrame(
            data={'p0': np.full((len(X_test)), self._y_mean)})
        for i in prange(len(self._ensemble)):
            preds[f"p{i}"] = self._ensemble[i].predict(X_test)
        preds_ = preds.sum(axis=1)
        return preds_

    def score(self, X_test, y_true):
        """
        Args:
            X_test (Iterable)
            y_true (Iterable)
        Returns:
            float: R2 Score for y_true and y_predicted
        """
        return r2_score(y_true, self.predict(X_test))

    def get_params(self):
        return self.__dict__


class VGBClassifier(BaseEstimator):
    """_summary_
    Args:
        object (_type_): _description_
    """

    def __init__(self):
        """ Initialize VGBRegressor Object
        """
        self._ensemble = []

    def _metrics(self, vt, vp, model, time=None):
        """_summary_
        Args:
            vt (_type_): _description_
            vp (_type_): _description_
            model (_type_): _description_
            time (_type_, optional): _description_. Defaults to None.
        Returns:
            _type_: _description_
        """
        if self.custom_loss_metrics:
            return {'model': model, 'time': time, 'loss': self.custom_loss_metrics(vt, vp)}
        return {"model": model, "time": time, "loss": mean_squared_error(vt, vp)}

    def _create_model(self, X, y, model_name, time_it: bool = False):
        """_summary_
        Args:
            X (_type_): _description_
            y (_type_): _description_
            model_name (_type_): _description_
            time_it (bool, optional): _description_. Defaults to False.
        Returns:
            _type_: _description_
        """
        model = model_name()
        if time_it:
            begin = perf_counter()
            model.fit(X, y)
            end = perf_counter()
            return (model, end - begin)
        return (model.fit(X, y), None)

    def _get_metrics(self, model_name):
        """_summary_
        Args:
            model_name (_type_): _description_
        Returns:
            _type_: _description_
        """
        try:
            Xt, Xv, yt, yv = train_test_split(self._X, self._y)
            results = self._create_model(Xt, yt, model_name, time_it=False)
            model, time = results[0], results[1]
            return self._metrics(yv,
                                 model.predict(Xv), model, time)
        except Exception:
            return None

    def _get_results(self, X, y) -> list:
        """_summary_
        Args:
            X (_type_): _description_
            y (_type_): _description_
        Returns:
            list: _description_
        """
        results = []
        # self._X = self._minimax.fit_transform(self._robust.fit_transform(
        #         KNNImputer(weights='distance').fit_transform(X)))
        self._X = X
        self._y = y
        with ThreadPoolExecutor(max_workers=len(self._models)) as executor:
            res = executor.map(self._get_metrics, self._models)
            results = [i for i in res if i]
        return results

    def fit(
        self, X_train, y_train,
        early_stopping: bool = False,
        early_stopping_min_delta: float = 0.001,
        early_stopping_patience: int = 10,
        custom_models: list = None,
        learning_rate: float = 0.05,
        n_estimators: int = 100,
        warm_start: bool = False,
        complexity: bool = False,
        light: bool = True,
        custom_loss_metrics: object = False,
    ):
        """_summary_
        Args:
            X_train (_type_): _description_
            y_train (_type_): _description_
            early_stopping (bool, optional): _description_. Defaults to False.
            early_stopping_min_delta (float, optional): _description_. Defaults to 0.001.
            early_stopping_patience (int, optional): _description_. Defaults to 10.
            custom_models (list, optional): _description_. Defaults to None.
            learning_rate (float, optional): _description_. Defaults to 0.05.
            n_estimators (int, optional): _description_. Defaults to 100.
            warm_start (bool, optional): _description_. Defaults to False.
            complexity (bool, optional): _description_. Defaults to False.
            custom_loss_metrics (object, optional): _description_. Defaults to False.
        Returns:
            _type_: _description_
        """
        if custom_models:
            self._models = custom_models
        self.custom_loss_metrics = custom_loss_metrics
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.early_stopping = early_stopping
        self.early_stopping_min_delta = early_stopping_min_delta
        self.early_stopping_patience = early_stopping_patience
        if custom_models:
            self._models = custom_models
        else:
            if complexity:
                self._models = (DecisionTreeRegressor, LinearRegression, BayesianRidge, KNeighborsRegressor, HistGradientBoostingRegressor,
                                ElasticNet, LassoLars, Lasso, GradientBoostingRegressor, ExtraTreesRegressor,
                                BaggingRegressor, NuSVR, XGBRegressor, SGDRegressor, KernelRidge, MLPRegressor, LGBMRegressor,
                                Ridge, ARDRegression, RANSACRegressor, HuberRegressor, TheilSenRegressor, LassoLarsIC)
            elif light:
                self._models = (LGBMRegressor, ExtraTreesRegressor,
                                BaggingRegressor, RANSACRegressor, LassoLarsIC, BayesianRidge)
            else:
                self._models = (DecisionTreeRegressor, LinearRegression, BayesianRidge, KNeighborsRegressor, LGBMRegressor,
                                ElasticNet, LassoLars, Lasso, SGDRegressor, BaggingRegressor, ExtraTreesRegressor,
                                Ridge, ARDRegression, RANSACRegressor, LassoLarsIC)
        X_train = KNNImputer(weights='distance',
                             n_neighbors=10).fit_transform(deepcopy(X_train))
        self._y_mean = y_train.mean()
        # base model: mean
        # computer residuals: y - y hat
        # for n_estimators: a) y = prev residuals && residuals * learning rate
        # add early stopping
        # restore best weights
        # ada boost and adaptive scaling for learning rates

        preds = DataFrame(
            data={'yt': y_train, 'p0': np.full((len(y_train)), y_train - self._y_mean)})
        residuals = DataFrame(
            data={'r0': y_train - self._y_mean})
        errors = []
        if not early_stopping:
            if warm_start:
                for i in prange(1, self.n_estimators + 1):
                    y = residuals[f'r{i - 1}']
                    results = self._get_results(X_train, y)
                    min_loss = min(results, key=lambda x: x.get(
                        "loss", float('inf')))["loss"]  # https://stackoverflow.com/a/19619294
                    min_model = [i['model']
                                 for i in results if min_loss >= i['loss']][0]
                    preds[f'p{i}'] = residuals.sum(axis=1) + min_model.predict(
                        X_train) * self.learning_rate
                    residuals[f'r{i}'] = preds['yt'] - preds[f'p{i}']
                    if i % 3 == 0:
                        X_train[f"r{i}"] = residuals[f'r{i}'].copy()
                    try:
                        errors.append(mean_squared_error(
                            preds['yt'], preds[f'p{i}']))
                    except Exception:
                        df = concat(
                            [preds['yt'], preds[f'p{i - 1}']], axis=1).dropna()
                        errors.append(mean_squared_error(
                            df['yt'], df[f"p{i - 1}"]))
                    self._ensemble.append(min_model)
            else:
                for i in prange(1, self.n_estimators + 1):
                    y = residuals[f'r{i - 1}']
                    results = self._get_results(X_train, y)
                    min_loss = min(results, key=lambda x: x.get(
                        "loss", float('inf')))["loss"]  # https://stackoverflow.com/a/19619294
                    min_model = [i['model']
                                 for i in results if min_loss >= i['loss']][0]
                    preds[f'p{i}'] = residuals.sum(axis=1) + min_model.predict(
                        X_train) * self.learning_rate
                    residuals[f'r{i}'] = preds['yt'] - preds[f'p{i}']
                    errors.append(mean_squared_error(
                        preds['yt'], preds[f'p{i}']))
                    self._ensemble.append(min_model)
                    if errors[i - 1] == 0:
                        break
        else:
            return "TODO"
        min_error = min(errors)
        min_error_i = [i for i in prange(
            len(errors)) if errors[i] == min_error][0]
        self._ensemble, errors = self._ensemble[:
                                                min_error_i], errors[:min_error_i]
        residuals = residuals[:len(errors)]
        return self._ensemble, (residuals, errors)

    def predict(self, X_test):
        """_summary_
        Args:
            X_test (_type_): _description_
        Returns:
            _type_: _description_
        """
        try:
            val = self._ensemble[0]
        except Exception:
            return "Please train the model first"
        # X_test = self._robust.transform(self._minimax.transform(deepcopy(X_test)))
        preds = DataFrame(
            data={'p0': np.full((len(X_test)), self._y_mean)})
        for i in prange(len(self._ensemble)):
            preds[f"p{i}"] = self._ensemble[i].predict(X_test)
        preds_ = DataFrame(data={'preds': MinMaxScaler().fit_transform(preds.sum(axis=1).values.reshape(-1, 1)).reshape(1, -1)[0]}).values.reshape(1, -1)[0]
        
        @np.vectorize
        def quantize(x):
            if x > 0.5:
                return 1
            else:
                return 0
        return quantize(preds_)

    def score(self, X_test, y_true):
        """
        Args:
            X_test (Iterable)
            y_true (Iterable)
        Returns:
            float: R2 Score for y_true and y_predicted
        """
        return r2_score(y_true, self.predict(X_test))

    def get_params(self):
        return self.__dict__

In [18]:
%%time
X, y = make_classification(n_samples=15000)

CPU times: total: 31.2 ms
Wall time: 26 ms


In [19]:
# @np.vectorize
# def quantize(x):
#     if x > .5: return 0.8
#     else: return .2

In [20]:
# y = quantize(y)

In [21]:
# pd.DataFrame(data={'y': (y)}).values.reshape(1, -1)[0]

In [22]:
from sklearn.preprocessing import MinMaxScaler
# MinMaxScaler().fit_transform(y.reshape(-1, 1)).reshape(1, -1)[0]

In [23]:
%%time
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=.7, stratify=y
)


CPU times: total: 15.6 ms
Wall time: 8.99 ms


In [24]:
from sklearn.metrics import f1_score

In [25]:
%%time
from xgboost import XGBClassifier
clf = XGBClassifier()
clf.fit(X_train, y_train)
(f1_score(y_val, clf.predict(X_val)), f1_score(y_train, clf.predict(X_train)))

CPU times: total: 9.61 s
Wall time: 1.59 s


(0.8982115257231177, 0.9983832620066572)

In [26]:
%%time
clf = VGBClassifier()
_ = clf.fit(X_train, y_train,)
(f1_score(y_val, clf.predict(X_val)), f1_score(y_train, clf.predict(X_train)))

CPU times: total: 5min 38s
Wall time: 1min 26s


(0.9046261784696338, 0.9773223265964512)

In [27]:
%%time
import VGBoost
clf = VGBoost.VGBClassifier()
_ = clf.fit(X_train, y_train,)
(f1_score(y_val, clf.predict(X_val)), f1_score(y_train, clf.predict(X_train)))

CPU times: total: 5min 34s
Wall time: 1min 24s


(0.9065897492788996, 0.9469890943575154)

In [28]:
%%time
from lightgbm import LGBMClassifier
clf = LGBMClassifier()
clf.fit(X_train, y_train)
(f1_score(y_val, clf.predict(X_val)), f1_score(y_train, clf.predict(X_train)))


CPU times: total: 1.52 s
Wall time: 276 ms


(0.9043824701195219, 0.9653348446207615)

In [29]:
0.9539897755056679 > 0.9545251894783773

False