In [7]:
from sklearn.datasets import make_friedman3, make_regression, make_friedman2, fetch_california_housing, load_diabetes, make_friedman1, make_sparse_uncorrelated, make_classification
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold, cross_val_score
from sklearn import datasets
from sklearn.metrics import log_loss, f1_score, auc, roc_auc_score, mean_squared_log_error
import numpy as np
import random
import cython
%load_ext Cython
# Seed NumPy's global RNG and the stdlib `random` module with a fixed value
# so that code in this session relying on either generator is repeatable.
np.random.seed(7)
random.seed(7)

In [18]:
%%cython
import numpy as np
from MSBoost import MSBoostClassifier
from sklearn.datasets import make_friedman3, make_regression, make_friedman2, fetch_california_housing, load_diabetes, make_friedman1, make_sparse_uncorrelated, make_classification
cimport numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold, cross_val_score
from sklearn import datasets
from sklearn.metrics import log_loss, f1_score, auc, roc_auc_score, mean_squared_log_error

def subsample(X, y, num_samples=1000, random_state=7):
    """
    Draw a random subset of rows (without replacement) from X and y.

    Parameters:
    - X: Input array; must support indexing with an integer index array
    - y: Target array, aligned element-for-row with X
    - num_samples: Maximum number of samples to retain
    - random_state: Seed for the random number generator (default is 7).
      Pass None to draw from NumPy's global random state instead.

    Returns:
    - Subsampled X and y arrays. When y already has num_samples elements or
      fewer, the inputs are returned unchanged (same objects, no copy).
    """
    if len(y) <= num_samples:
        return X, y

    if random_state is None:
        # No seed requested: fall back to the module-level generator.
        rng = np.random
    else:
        # A private RandomState yields the same draws as seeding the global
        # generator with this value, but leaves the caller's global RNG
        # state untouched.
        rng = np.random.RandomState(random_state)

    indices = rng.choice(len(y), num_samples, replace=False)

    return X[indices], y[indices]

def get_card_split(df, cols, n=11):
    """
    Partition columns into low- and high-cardinality groups.

    The cardinality of a column is its number of unique values as reported
    by ``DataFrame.nunique``.
    (Source: https://github.com/shankarpandala/lazypredict/blob/dev/lazypredict/Supervised.py#L114)

    Parameters
    ----------
    df : Pandas DataFrame
        DataFrame from which the cardinality of the columns is calculated.
    cols : list-like
        Columns to partition. Must support boolean-mask indexing
        (e.g. a pandas Index), since it is indexed with the mask directly.
    n : int, optional (default=11)
        Threshold used to split the columns.

    Returns
    -------
    card_low : list-like
        Columns with cardinality <= n
    card_high : list-like
        Columns with cardinality > n

    """
    unique_counts = df[cols].nunique()
    high_mask = unique_counts > n
    low_mask = ~high_mask
    return cols[low_mask], cols[high_mask]

def append_row(df, data):
    """
    Add one row to a DataFrame in place.

    The row is written under the label ``len(df)``, i.e. the current row
    count is used as the new index label.

    Parameters :
        - df (pandas.DataFrame): The DataFrame that receives the row; it is modified in place.
        - data (list): The values for the new row, one element per column of the DataFrame.

    Returns:
        None
    """
    next_label = len(df)
    df.loc[next_label] = data
    
import os
import contextlib
import sys

@contextlib.contextmanager
def suppress_stdout():
    """
    Context manager that discards everything written to ``sys.stdout``.

    While the ``with`` body runs, ``sys.stdout`` points at the OS null
    device. On exit -- including exit via an exception -- the stream that
    was active on entry is put back, and the null-device handle is closed.
    """
    # redirect_stdout swaps sys.stdout on entry and restores the previous
    # object on exit; the enclosing open() closes the sink afterwards.
    with open(os.devnull, 'w') as sink, contextlib.redirect_stdout(sink):
        yield
            
def score_clf(model, X, y):
    """
    Score a fitted classifier by the log loss of its predicted probabilities.

    Parameters:
    - model: Fitted estimator exposing ``predict_proba``
    - X: Feature array to predict on
    - y: True labels for X

    Returns:
    - The value of ``log_loss(y, model.predict_proba(X))``
    """
    probabilities = model.predict_proba(X)
    return log_loss(y, probabilities)

def pre_process_y(y):
    """
    Encode target labels with a freshly fitted ``LabelEncoder``.

    Parameters:
    - y: Target labels

    Returns:
    - The result of ``LabelEncoder().fit_transform(y)``
    """
    encoder = LabelEncoder()
    return encoder.fit_transform(y)

def cv_evaluate(model, X, y, scoring="neg_mean_squared_error", n_jobs=1,
                n_estimators=100, n_folds=3, random_state=7):
    """
    Cross-validate an estimator class and summarise its scores as text.

    Output written to stdout/stderr in this process while the estimator is
    built and cross-validated is discarded; the streams that were active on
    entry are always restored afterwards, even if an exception is raised.

    Parameters:
    - model: Estimator class (not an instance); called as
      ``model(n_estimators=n_estimators)``
    - X: Feature array
    - y: Target array
    - scoring: Scoring name or callable forwarded to ``cross_val_score``
    - n_jobs: Number of parallel jobs forwarded to ``cross_val_score``
    - n_estimators: Value passed to the estimator constructor (default 100)
    - n_folds: Number of shuffled K-fold splits (default 3)
    - random_state: Seed for the K-fold shuffling (default 7)

    Returns:
    - A string of the form "<mean> ± <std>" of the per-fold scores, each
      formatted with four decimal places
    """
    # A single null-device handle serves both streams and is closed on exit.
    # redirect_stdout/redirect_stderr put back whatever streams were active
    # before the call rather than forcing sys.__stdout__/sys.__stderr__,
    # which would detach notebook or capture streams.
    with open(os.devnull, 'w') as devnull, \
            contextlib.redirect_stdout(devnull), \
            contextlib.redirect_stderr(devnull):
        estimator = model(n_estimators=n_estimators)
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
        cv_results = cross_val_score(estimator, X, y, cv=kf, scoring=scoring, n_jobs=n_jobs)

    mean_score = np.mean(cv_results)
    std_score = np.std(cv_results)
    # \u00b1 is the plus-minus sign; written as an escape so the separator
    # cannot be corrupted by a source-encoding mismatch.
    return f"{mean_score:.4f} \u00b1 {std_score:.4f}"

# Synthetic binary classification problem: 1000 samples, 5 features, all informative.
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_informative=5,
    n_redundant=0,
    n_repeated=0,
    random_state=7,
)

# Cross-validate each classifier with score_clf as the scoring callable
# and print one summary line per model.
for label, estimator_cls in (
    ("MSBoost Clf:", MSBoostClassifier),
    ("GBDT:", GradientBoostingClassifier),
):
    print(label, cv_evaluate(estimator_cls, X, y, score_clf))