In [30]:
from huggingface_hub import hf_hub_download
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from hyperopt import hp, tpe, fmin, Trials
from hyperopt.pyll.base import scope
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import r2_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# NOTE(review): `!` runs the command in a child shell, so this `set` presumably does not
# change the kernel's own environment (and `set VAR=value` is Windows cmd syntax).
# Setting the variable with `%env` / `os.environ` before huggingface_hub is imported is
# the usual way to make it take effect -- TODO confirm and move above the import cell.
!set HF_HUB_DISABLE_SYMLINKS_WARNING=true

In [3]:
# Hugging Face Hub dataset repository every CSV below is downloaded from.
REPO_ID = "inria-soda/tabular-benchmark"

# File names inside the repo's `clf_cat/` folder (see the classification search cell).
clf_cat = [
    'albert.csv',
    'compas-two-years.csv',
    'covertype.csv',
    'default-of-credit-card-clients.csv',
    'electricity.csv',
    'eye_movements.csv',
    'road-safety.csv',
]

# Classification file names; presumably the repo's `clf_num/` folder -- TODO confirm.
clf_num = [
    'Bioresponse.csv',
    'Diabetes130US.csv',
    'Higgs.csv',
    'MagicTelescope.csv',
    'MiniBooNE.csv',
    'bank-marketing.csv',
    'california.csv',
    'covertype.csv',
    'credit.csv',
    'default-of-credit-card-clients.csv',
    'electricity.csv',
    'eye_movements.csv',
    'heloc.csv',
    'house_16H.csv',
    'jannis.csv',
    'pol.csv',
]

# File names inside the repo's `reg_cat/` folder (see the regression search cell).
reg_cat = [
    'Airlines_DepDelay_1M.csv',
    'Allstate_Claims_Severity.csv',
    'Bike_Sharing_Demand.csv',
    'Brazilian houses.csv',
    'Mercedes_Benz_Greener_Manufacturing.csv',
    'SGEMM_GPU_kernel_performance.csv',
    'abalone.csv',
    'analcatdata_supreme.csv',
    'delays_zurich_transport.csv',
    'diamonds.csv',
    'house_sales.csv',
    'medical_charges.csv',
    'nyc-taxi-green-dec-2016.csv',
    'particulate-matter-ukair-2017.csv',
    'seattlecrime6.csv',
    'topo_2_1.csv',
    'visualizing_soil.csv',
]

# Regression file names; presumably the repo's `reg_num/` folder -- TODO confirm.
reg_num = [
    'Ailerons.csv',
    'Bike_Sharing_Demand.csv',
    'Brazilian houses.csv',
    'MiamiHousing2016.csv',
    'abalone.csv',
    'cpu_act.csv',
    'delays_zurich_transport.csv',
    'diamonds.csv',
    'elevators.csv',
    'house_16H.csv',
    'house_sales.csv',
    'houses.csv',
    'medical_charges.csv',
    'nyc-taxi-green-dec-2016.csv',
    'pol.csv',
    'sulfur.csv',
    'superconduct.csv',
    'wine_quality.csv',
    'yprop_4_1.csv',
]

In [31]:
def remove_pseudo_categorical(X, y, threshold=10):
    """Drop low-cardinality ("pseudo-categorical") columns from X.

    A column is dropped when it holds fewer than ``threshold`` distinct values
    (``DataFrame.nunique`` ignores missing values by default).

    Parameters:
    X (pandas.DataFrame): Feature table.
    y: Target; passed through untouched.
    threshold (int): Minimum number of distinct values a column needs to be kept.

    Returns:
    tuple: (X without the low-cardinality columns, y).
    """
    pseudo_categorical_cols_mask = X.nunique() < threshold
    print("Removed {} columns with pseudo-categorical values on {} columns".format(
        pseudo_categorical_cols_mask.sum(), X.shape[1]))
    X = X.drop(X.columns[pseudo_categorical_cols_mask], axis=1)
    return X, y

def remove_rows_with_missing_values(X, y):
    """Drop every sample whose feature row contains at least one missing value.

    The same row filter is applied to X and to y so both stay aligned.

    Returns:
    tuple: (filtered X, filtered y).
    """
    has_gap = pd.isnull(X).any(axis=1)
    n_dropped, n_total = has_gap.sum(), X.shape[0]
    print("Removed {} rows with missing values on {} rows".format(n_dropped, n_total))

    complete = ~has_gap
    return X[complete], y[complete]

def remove_missing_values(X, y, threshold=0.7):
    """Drop sparse columns first, then drop rows that still contain a missing value.

    Parameters:
    X (pandas.DataFrame): Feature table.
    y: Target, filtered with the same row mask as X.
    threshold (float): A column is dropped when its fraction of missing values
        is strictly greater than this value.

    Returns:
    tuple: (filtered X, filtered y).
    """
    # Column pass: fraction of nulls per column compared against the threshold.
    null_fraction = pd.isnull(X).mean(axis=0)
    sparse_cols = null_fraction > threshold
    print("Removed {} columns with missing values on {} columns".format(sparse_cols.sum(), X.shape[1]))
    X = X.drop(columns=X.columns[sparse_cols])

    # Row pass: computed on the already-reduced frame.
    incomplete = pd.isnull(X).any(axis=1)
    print("Removed {} rows with missing values on {} rows".format(incomplete.sum(), X.shape[0]))
    keep = ~incomplete

    return X[keep], y[keep]
    
def remove_high_cardinality(X, y, categorical_mask, threshold=20):
    """Drop categorical columns that have more than ``threshold`` distinct values.

    Parameters:
    X (pandas.DataFrame): Feature table.
    y: Target; passed through untouched.
    categorical_mask (sequence of bool): One flag per column of X, True where
        the column is treated as categorical.
    threshold (int): Maximum number of distinct values a categorical column
        may have and still be kept.

    Returns:
    tuple: (X without the high-cardinality categorical columns, y).
    """
    # Coerce both masks to boolean arrays so they can be combined with `&`;
    # multiplying an array by a plain list turns into a float array (and an
    # invalid index) when X has no columns left.
    categorical_mask = np.asarray(categorical_mask, dtype=bool)
    high_cardinality_mask = np.asarray(X.nunique() > threshold, dtype=bool)
    drop_mask = categorical_mask & high_cardinality_mask

    print("high cardinality columns: {}".format(X.columns[drop_mask]))
    X = X.drop(X.columns[drop_mask], axis=1)
    print("Removed {} high-cardinality categorical features".format(drop_mask.sum()))

    return X, y

def balance(X, y):
    """Keep only the samples that belong to the two most frequent classes of y.

    Note that this only filters classes; it does not resample the two remaining
    classes to equal size.

    Parameters:
    X (pandas.DataFrame or numpy.ndarray): Features, one row per sample.
    y (pandas.Series or numpy.ndarray): Class label per sample.

    Returns:
    tuple: (filtered X, filtered y), filtered with the same positional row mask.
    """
    # Work on a positional copy of the labels so the mask lines up with X
    # whatever index y carries.
    labels = pd.Series(np.asarray(y))
    # value_counts() sorts by descending frequency, so the first two index
    # entries are the two most frequent classes.
    top_classes = labels.value_counts().index[:2]
    keep = labels.isin(top_classes).to_numpy()
    return X[keep], y[keep]
    
def transform_target(y, keyword):
    """Apply the named transformation to a regression target.

    Parameters:
    y (array-like): Target values.
    keyword: "log" for a sign-preserving log transform
        ``sign(y) * log(1 + |y|)``; "none", None or NaN to return y unchanged.

    Returns:
    The transformed target (or y itself when no transform is requested).

    Raises:
    ValueError: If ``keyword`` is not one of the supported values. Previously
        an unknown keyword silently produced None.
    """
    if keyword == "log":
        # log1p keeps precision for targets close to zero.
        return np.sign(y) * np.log1p(np.abs(y))
    elif keyword == "none":
        return y
    elif pd.isnull(keyword):
        return y
    raise ValueError("Unknown target transformation: {!r}".format(keyword))
    
def is_heavy_tailed(data, skew_threshold=0, kurtosis_threshold=3):
    """
    Checks if the distribution of the given data is heavy-tailed.

    The data is flagged as soon as either statistic exceeds its threshold.
    ``kurtosis`` is called with SciPy's defaults, i.e. it returns the excess
    (Fisher) kurtosis, for which a normal distribution scores 0.

    Parameters:
    data (array-like): The data for which to check the tail behavior.
    skew_threshold (float): Skewness above this value counts as heavy-tailed.
    kurtosis_threshold (float): Kurtosis above this value counts as heavy-tailed.

    Returns:
    bool: True if the distribution is heavy-tailed, False otherwise.
    """
    skewness = skew(data)
    kurt = kurtosis(data)

    return skewness > skew_threshold or kurt > kurtosis_threshold
    
    
def preprocess_data(X, y, isCategorical=False):
    """Prepare the target and clean the feature table.

    Steps, in order:
    1. Target: label-encode it when ``isCategorical`` is true; otherwise
       log-transform it when ``is_heavy_tailed`` flags it.
    2. Drop columns that are mostly missing, then rows that still contain a
       missing value.
    3. Drop low-cardinality ("pseudo-categorical") columns.
    4. Drop categorical columns with too many distinct values.

    Parameters:
    X (pandas.DataFrame): Feature table.
    y (pandas.Series): Target column.
    isCategorical (bool): True when the target holds class labels.

    Returns:
    tuple: (cleaned X, processed y restricted to the rows kept in X).
    """
    if isCategorical:
        le = LabelEncoder()
        y = le.fit_transform(y)
    elif is_heavy_tailed(y):
        y = transform_target(y, keyword='log')

    # Columns must be filtered before rows: dropping every incomplete row first
    # leaves no missing values behind, so the column filter could never fire and
    # a single sparse column would wipe out most of the samples.
    X, y = remove_missing_values(X, y)
    X, y = remove_pseudo_categorical(X, y)
    # A column counts as categorical when it holds strings or few distinct values.
    categorical_mask = [(X[col].dtype == 'object' or len(X[col].unique()) < 20) for col in X.columns]
    X, y = remove_high_cardinality(X, y, categorical_mask)

    return X, y

def split_data(X, y, isCategorical=None):
    """Shuffle the data and split it into train / validation / test parts.

    70% of the samples go to the training set; the remaining 30% is split again
    into 30% validation and 70% test. Validation and test sets are each capped
    at 50000 samples. When ``isCategorical`` is truthy both splits are
    stratified on the target.

    Parameters:
    X (pandas.DataFrame or numpy.ndarray): Features, one row per sample.
    y (pandas.Series or numpy.ndarray): Target, one value per sample.
    isCategorical (bool or None): Whether to stratify the splits on y.

    Returns:
    tuple: ((X_train, y_train), (X_val, y_val), (X_test, y_test)) as arrays.
    """
    # Shuffle the data
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)

    # pandas objects expose their array through `.values`; plain arrays are
    # used as they are.
    X = getattr(X, "values", X)
    y = getattr(y, "values", y)
    X = X[indices]
    y = y[indices]

    stratify = y if isCategorical else None
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=0.7, random_state=42, stratify=stratify)
    stratify = y_temp if isCategorical else None
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.7, random_state=42, stratify=stratify)

    num_val = min(X_val.shape[0], 50000)
    num_test = min(X_test.shape[0], 50000)

    X_val, y_val = X_val[:num_val], y_val[:num_val]
    # Both test arrays are cut with num_test so they keep the same length.
    X_test, y_test = X_test[:num_test], y_test[:num_test]

    return (X_train, y_train), (X_val, y_val), (X_test, y_test)


def plot_distribution(data):
    """
    Draws a 30-bin histogram of the given data and shows the figure.

    Parameters:
    data (array-like): The values whose distribution is displayed.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(data, bins=30, color='blue', alpha=0.7)
    ax.set_title('Distribution Plot')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    plt.show()
    
def affine_renormalization_classification(results):
    """
    Perform affine renormalization on classification results.

    Each accuracy is mapped linearly so that the 10th-percentile accuracy
    becomes 0 and the best accuracy becomes 1. Accuracies below the 10th
    percentile are clipped to 0 so the output stays in the documented range.

    Parameters:
    results (list): List of original classification results between 0 and 1.

    Returns:
    list: List of renormalized classification results between 0 and 1
        (empty when ``results`` is empty).
    """
    scores = np.asarray(results, dtype=float)
    if scores.size == 0:
        return []

    # Find the top-performing model's accuracy
    top_accuracy = scores.max()

    # Find the accuracy corresponding to the 10th percentile
    quantile_accuracy = np.percentile(scores, 10)

    # Calculate the range of accuracies for renormalization
    range_accuracy = top_accuracy - quantile_accuracy

    if range_accuracy == 0:
        # Degenerate case (e.g. all results equal): avoid dividing by zero.
        # Results that reach the top score map to 1, anything lower to 0.
        return (scores >= top_accuracy).astype(float).tolist()

    # Perform affine renormalization for each accuracy
    renormalized_results = (scores - quantile_accuracy) / range_accuracy

    return np.clip(renormalized_results, 0.0, 1.0).tolist()

In [5]:
# Download the first file listed in `reg_cat` from the Hub dataset repository and
# read it into a DataFrame.
df = pd.read_csv(
    hf_hub_download(repo_id=REPO_ID, filename=f'reg_cat/{reg_cat[0]}', repo_type="dataset")
)
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,Month,DayofMonth,CRSDepTime,CRSArrTime,Distance,DepDelay
0,10,11,1300.0,1535.0,2556.0,2.197225
1,10,10,2035.0,2110.0,100.0,-1.386294
2,10,26,1200.0,1446.0,2475.0,1.945910
3,10,9,1145.0,1512.0,2586.0,0.693147
4,10,16,930.0,1149.0,2399.0,0.000000
...,...,...,...,...,...,...
999995,7,30,835.0,940.0,317.0,-0.693147
999996,7,29,1225.0,1633.0,843.0,0.693147
999997,7,30,1515.0,1735.0,350.0,-1.098612
999998,7,25,1335.0,1646.0,900.0,2.197225


In [41]:
# Random-forest hyperparameter search on every dataset listed in `clf_cat`.
# For each dataset: 15 independent TPE searches of 100 evaluations each, scored by
# accuracy on the validation split. One list of per-trial accuracies is stored per search.
# NOTE(review): the stored output of this cell shows a 400-trial progress bar, so it
# appears to come from an earlier version of the cell (max_evals is 100 here).
clf_cat_random_forest_scores = []
# NOTE(review): never filled in this cell -- presumably reserved for a later
# gradient-boosting run.
clf_cat_gradient_boosting_scores = []
for dataset in clf_cat :
    print(f'Evaluating {dataset}...')
    
    df = pd.read_csv(
        hf_hub_download(repo_id=REPO_ID, filename=f'clf_cat/{dataset}', repo_type="dataset")
    )
    # The last column is used as the target, every other column as a feature.
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    X, y = preprocess_data(X, y, isCategorical=True)
    # The split is done once per dataset and shared by all 15 searches.
    # NOTE(review): X_test / y_test are not used anywhere in this cell.
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = split_data(X, y, isCategorical=True)
    for idx, iter_ in enumerate(range(15)) :
        
        #Random Forest
        # Search space; 'sqrt' appears twice in max_features, which doubles its
        # sampling weight under hp.choice.
        hyperparameter_space = {
            'criterion': hp.choice('criterion', ['gini', 'entropy']),
            'n_estimators': scope.int(hp.loguniform('n_estimators', np.log(9.5), np.log(3000.5))),
            'max_depth': hp.choice('max_depth', [None, 2, 3, 4]),
            'min_samples_split': hp.choice('min_samples_split', [2, 3]),
            'min_samples_leaf': scope.int(hp.loguniform('min_samples_leaf', np.log(1.5), np.log(50.5))),
            'bootstrap': hp.choice('bootstrap', [True, False]),
            'max_features': hp.choice('max_features', ['sqrt', 'sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
            'min_impurity_decrease': hp.choice('min_impurity_decrease', [0.0, 0.01, 0.02, 0.05])
        }

        def objective_function(hyperparameters):
            # `iteration_n` is a notebook-level counter reset before every search.
            global iteration_n
            # The first evaluation of each search ignores the sampled values and
            # scores the default model.
            if iteration_n == 0:
                model = RandomForestClassifier()
            else :
                model = RandomForestClassifier(**hyperparameters)
            iteration_n += 1
            # score = -np.mean(cross_val_score(model, train_data.iloc[:, :-1], train_data.iloc[:, -1], cv=5, scoring='accuracy'))
            model.fit(X_train, y_train)
            y_predict = model.predict(X_val)
            # fmin minimizes, so the validation accuracy is negated.
            score = -accuracy_score(y_val, y_predict)

            return score

        trials = Trials()
        iteration_n = 0
        best_hyperparameters = fmin(objective_function, hyperparameter_space, algo=tpe.suggest, max_evals=100, trials=trials)
        # Flip the sign back to get the accuracy of every trial.
        scores_round = -1 * np.array(trials.losses())
        print(f'Best accuracy : {scores_round.max()}')
        
        # Each search is appended as a one-element list wrapping its score list.
        clf_cat_random_forest_scores.append([scores_round.tolist()])
        
    
# Persist the scores of all searches once every dataset has been processed.
np.save('clf_cat_random_forest_scores.npy', clf_cat_random_forest_scores)

Evaluating albert.csv...
Removed 0 rows with missing values on 58252 rows
Removed 0 columns with missing values on 31 columns
Removed 0 rows with missing values on 58252 rows
Removed 7 columns with pseudo-categorical values on 31 columns
high cardinality columns: Index([], dtype='object')
Removed 0 high-cardinality categorical features
 16%|███████▏                                    | 65/400 [14:55<1:16:56, 13.78s/trial, best loss: -0.6501335368180083]



KeyboardInterrupt



In [40]:
# Random-forest hyperparameter search on every dataset listed in `reg_cat`.
# For each dataset: 15 independent TPE searches of 100 evaluations each, scored by
# R2 on the validation split. One list of per-trial scores is stored per search.
reg_cat_random_forest_scores = []
# reg_cat_gradient_boosting_scores = []
for dataset in reg_cat :
    print(f'Evaluating {dataset}...')

    df = pd.read_csv(
        hf_hub_download(repo_id=REPO_ID, filename=f'reg_cat/{dataset}', repo_type="dataset")
    )
    # The last column is used as the target, every other column as a feature.
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    # Regression target: it must not be label-encoded, and the splits must not be
    # stratified on a continuous value.
    X, y = preprocess_data(X, y, isCategorical=False)
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = split_data(X, y, isCategorical=False)
    for idx, iter_ in enumerate(range(15)) :
        # Search space; 'sqrt' appears twice in max_features, which doubles its
        # sampling weight under hp.choice.
        hyperparameter_space = {
            'criterion': hp.choice('criterion', ['squared_error', 'absolute_error']),
            'n_estimators': scope.int(hp.loguniform('n_estimators', np.log(9.5), np.log(3000.5))),
            'max_depth': hp.choice('max_depth', [None, 2, 3, 4]),
            'min_samples_split': hp.choice('min_samples_split', [2, 3]),
            'min_samples_leaf': scope.int(hp.loguniform('min_samples_leaf', np.log(1.5), np.log(50.5))),
            'bootstrap': hp.choice('bootstrap', [True, False]),
            'max_features': hp.choice('max_features', ['sqrt', 'sqrt', 'log2', None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
            'min_impurity_decrease': hp.choice('min_impurity_decrease', [0.0, 0.01, 0.02, 0.05])
        }

        def objective_function(hyperparameters):
            # `iteration_n` is a notebook-level counter reset before every search.
            global iteration_n
            # The first evaluation of each search ignores the sampled values and
            # scores the default model.
            if iteration_n == 0:
                model = RandomForestRegressor()
            else :
                model = RandomForestRegressor(**hyperparameters)
            iteration_n += 1
            model.fit(X_train, y_train)
            y_predict = model.predict(X_val)
            # fmin minimizes, so the validation R2 is negated.
            score = -r2_score(y_val, y_predict)

            return score

        trials = Trials()
        iteration_n = 0
        best_hyperparameters = fmin(objective_function, hyperparameter_space, algo=tpe.suggest, max_evals=100, trials=trials)
        # Flip the sign back to get the R2 score of every trial.
        scores_round = -1 * np.array(trials.losses())
        print(f'Best R2 score : {scores_round.max()}')

        # Same nesting as the classification cell: a one-element list per search.
        reg_cat_random_forest_scores.append([scores_round.tolist()])

# Persist the scores of all searches once every dataset has been processed.
np.save('reg_cat_random_forest_scores.npy', reg_cat_random_forest_scores)

[[[0.652422739412438, 0.6327737504769172]]]