In [54]:
from huggingface_hub import hf_hub_download
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from hyperopt import hp, tpe, fmin, Trials
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import r2_score, accuracy_score

In [13]:
!set HF_HUB_DISABLE_SYMLINKS_WARNING=true

In [5]:
REPO_ID = "inria-soda/tabular-benchmark"
clf_cat = [
    'albert.csv', 
   'compas-two-years.csv', 
   'covertype.csv', 
   'default-of-credit-card-clients.csv',
   'electricity.csv',
   'eye_movements.csv',
   'road-safety.csv'
]

clf_num = [
    'Bioresponse.csv',
    'Diabetes130US.csv',
    'Higgs.csv',
    'MagicTelescope.csv',
    'MiniBooNE.csv',
    'bank-marketing.csv',
    'california.csv',
    'covertype.csv',
    'credit.csv',
    'default-of-credit-card-clients.csv',
    'electricity.csv',
    'eye_movements.csv',
    'heloc.csv',
    'house_16H.csv',
    'jannis.csv',
    'pol.csv'
]

reg_cat = [
    'Airlines_DepDelay_1M.csv',
    'Allstate_Claims_Severity.csv',
    'Bike_Sharing_Demand.csv',
    'Brazilian houses.csv',
    'Mercedes_Benz_Greener_Manufacturing.csv',
    'SGEMM_GPU_kernel_performance.csv',
    'abalone.csv',
    'analcatdata_supreme.csv',
    'delays_zurich_transport.csv',
    'diamonds.csv',
    'house_sales.csv',
    'medical_charges.csv',
    'nyc-taxi-green-dec-2016.csv',
    'particulate-matter-ukair-2017.csv',
    'seattlecrime6.csv',
    'topo_2_1.csv',
    'visualizing_soil.csv'
]

reg_num = [
    'Ailerons.csv',
    'Bike_Sharing_Demand.csv',
    'Brazilian houses.csv',
    'MiamiHousing2016.csv',
    'abalone.csv',
    'cpu_act.csv',
    'delays_zurich_transport.csv',
    'diamonds.csv',
    'elevators.csv',
    'house_16H.csv',
    'house_sales.csv',
    'houses.csv',
    'medical_charges.csv',
    'nyc-taxi-green-dec-2016.csv',
    'pol.csv',
    'sulfur.csv',
    'superconduct.csv',
    'wine_quality.csv',
    'yprop_4_1.csv'
]

In [17]:
def preprocess_data(df):
    df = df.copy()
    for col in df.select_dtypes(include='object').columns:
        if df[col].nunique() > 20:
            df = df.drop(col, axis=1)
    
    # Removing numerical features with less than 10 unique values
    for col in df.select_dtypes(include=['int', 'float']).columns:
        if df[col].nunique() < 10:
            df.drop(col, axis=1, inplace=True)
        elif df[col].nunique() == 2:  # Converting numerical features with 2 unique values to categorical
            df[col] = df[col].astype('category')

    return df

def split_data(df, isCategorical, training_size=0.8):
    if isCategorical and df.iloc[:, -1].nunique() > 2:
        # Get the two most numerous classes
        top_classes = df.iloc[:, -1].value_counts().nlargest(2).index.tolist()
        
        # Keep half of samples in each class
        df = df[df.iloc[:, -1].isin(top_classes)]
        df = df.groupby(df.columns[-1]).head(len(df) // 2)
        
    if len(df) > 10000:
        train_data = df.sample(n=10000, random_state=42)
        temp_data = df.drop(train_data.index)
    else :
        train_data, temp_data = train_test_split(df, test_size=1-training_size, random_state=42)
    
    # Further split temp data into test and val data
    test_data, val_data = train_test_split(temp_data, test_size=0.5, random_state=42)
    
    return train_data, test_data, val_data

def is_heavy_tailed(data):
    """
    Checks if the distribution of the given data is heavy-tailed.
    
    Parameters:
    data (array-like): The data for which to check the tail behavior.
    
    Returns:
    bool: True if the distribution is heavy-tailed, False otherwise.
    """
    skewness = skew(data)
    kurt = kurtosis(data)
    
    # Thresholds for skewness and kurtosis to determine if distribution is heavy-tailed
    skew_threshold = 0
    kurtosis_threshold = 3
    
    return skewness > skew_threshold or kurt > kurtosis_threshold

def plot_distribution(data):
    """
    Plots the distribution of the given data.
    
    Parameters:
    data (array-like): The data to be plotted.
    """
    plt.figure(figsize=(8, 5))
    plt.hist(data, bins=30, color='blue', alpha=0.7)
    plt.title('Distribution Plot')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.show()
    
def affine_renormalization_classification(results):
    """
    Perform affine renormalization on classification results.
    
    Parameters:
    results (list): List of original classification results between 0 and 1.
    
    Returns:
    list: List of renormalized classification results between 0 and 1.
    """
    # Find the top-performing model's accuracy
    top_accuracy = max(results)
    
    # Find the accuracy corresponding to the 10th percentile
    quantile_accuracy = np.percentile(results, 10)
    
    # Calculate the range of accuracies for renormalization
    range_accuracy = top_accuracy - quantile_accuracy
    
    # Perform affine renormalization for each accuracy
    renormalized_results = [(accuracy - quantile_accuracy) / range_accuracy for accuracy in results]
    
    return renormalized_results

In [40]:
df = pd.read_csv(
    hf_hub_download(repo_id=REPO_ID, filename=f'reg_cat/{reg_cat[0]}', repo_type="dataset")
)
df

reg_cat/Airlines_DepDelay_1M.csv:   0%|          | 0.00/41.0M [00:00<?, ?B/s]

Unnamed: 0,Month,DayofMonth,CRSDepTime,CRSArrTime,Distance,DepDelay
0,10,11,1300.0,1535.0,2556.0,2.197225
1,10,10,2035.0,2110.0,100.0,-1.386294
2,10,26,1200.0,1446.0,2475.0,1.945910
3,10,9,1145.0,1512.0,2586.0,0.693147
4,10,16,930.0,1149.0,2399.0,0.000000
...,...,...,...,...,...,...
999995,7,30,835.0,940.0,317.0,-0.693147
999996,7,29,1225.0,1633.0,843.0,0.693147
999997,7,30,1515.0,1735.0,350.0,-1.098612
999998,7,25,1335.0,1646.0,900.0,2.197225


In [42]:
preprocessed_data = preprocess_data(df.iloc[:, :-1])
preprocessed_data = pd.concat((preprocessed_data, df.iloc[:, -1]), axis=1)
preprocessed_data

Unnamed: 0,Month,DayofMonth,CRSDepTime,CRSArrTime,Distance,DepDelay
0,10,11,1300.0,1535.0,2556.0,2.197225
1,10,10,2035.0,2110.0,100.0,-1.386294
2,10,26,1200.0,1446.0,2475.0,1.945910
3,10,9,1145.0,1512.0,2586.0,0.693147
4,10,16,930.0,1149.0,2399.0,0.000000
...,...,...,...,...,...,...
999995,7,30,835.0,940.0,317.0,-0.693147
999996,7,29,1225.0,1633.0,843.0,0.693147
999997,7,30,1515.0,1735.0,350.0,-1.098612
999998,7,25,1335.0,1646.0,900.0,2.197225


In [43]:
train_data, test_data, val_data = split_data(preprocessed_data, True)
print(f'Training Data : {train_data.shape}')
print(f'Testing Data : {test_data.shape}')
print(f'Validation Data : {val_data.shape}')

Training Data : (10000, 6)
Testing Data : (99650, 6)
Validation Data : (99650, 6)


In [None]:
clf_cat_random_forest_scores = []
for dataset in clf_cat :
    print(f'Evaluating {dataset}...')
    
    df = pd.read_csv(
        hf_hub_download(repo_id=REPO_ID, filename=f'clf_cat/{dataset}', repo_type="dataset")
    )
    preprocessed_df = preprocess_data(df.iloc[:, :-1])
    preprocessed_df = pd.concat((preprocessed_df, df.iloc[:, -1]), axis=1)
    train_data, test_data, val_data = split_data(preprocessed_df, isCategorical=True)
    
    hyperparameter_space = {
        'n_estimators': hp.choice('n_estimators', np.arange(10, 1000, dtype=int)),
        'max_depth': hp.choice('max_depth', np.arange(1, 20, dtype=int)),
        'min_samples_split': hp.uniform('min_samples_split', 0.1, 1.0),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0.1, 0.5)
    }
    
    def objective_function(hyperparameters):
        global iteration_n
        if iteration_n == 0:
            model = RandomForestClassifier()
        else :
            model = RandomForestClassifier(**hyperparameters)
        iteration_n += 1
        # score = -np.mean(cross_val_score(model, train_data.iloc[:, :-1], train_data.iloc[:, -1], cv=5, scoring='accuracy'))
        model.fit(train_data.iloc[:, :-1], train_data.iloc[:, -1])
        y_predict = model.predict(val_data.iloc[:, :-1])
        score = -accuracy_score(val_data.iloc[:, -1], y_predict)

        return score
    
    trials = Trials()
    iteration_n = 0
    best_hyperparameters = fmin(objective_function, hyperparameter_space, algo=tpe.suggest, max_evals=100, trials=trials)
    clf_cat_random_forest_scores.append(-1 * np.array(trials.losses()))
    

Evaluating albert.csv...
100%|█████████████████████████████████████████████| 100/100 [04:05<00:00,  2.46s/trial, best loss: -0.6471027107684656]
Evaluating compas-two-years.csv...
100%|█████████████████████████████████████████████| 100/100 [01:11<00:00,  1.39trial/s, best loss: -0.6639839034205232]
Evaluating covertype.csv...
  8%|███▊                                            | 8/100 [00:19<04:17,  2.80s/trial, best loss: -0.828490620769677]