In [6]:
# GOAL: run all methods after fixing some conformal procedures
# standard library
import os
import warnings
from copy import deepcopy

# standard import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

# sklearn
from sklearn.base import clone
from sklearn.linear_model import (
    RidgeCV,
    LassoCV,
    ElasticNetCV,
    QuantileRegressor
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    AdaBoostRegressor,
)

# miscellaneous models
from xgboost import XGBRegressor
from quantile_forest import RandomForestQuantileRegressor
from joblib import Parallel, delayed

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# methods
from pcs_UQ import PCS_UQ
from gamma_algo import *
from conformal_UQ import *
# gamma_algo is star-imported a second time, so on any name clash its
# definitions take precedence over those from conformal_UQ.
from gamma_algo import *

In [7]:
# Gamma settings (same keys/values as the ones used for PCS calibration in
# run_all_methods_with_subgroup).
gamma_params = {
    "selection_mode": "multiplicative",
    "fit_mode": "vanilla",
    "threshold": None,
    "clip_mode": "no_scale",
}

# Single seed shared by every estimator that accepts `random_state`.
MODEL_SEED = 777

# Hyper-parameters shared by the cross-validated sparse linear models.
_CV_LINEAR_KWARGS = dict(max_iter=5000, random_state=MODEL_SEED)

# Hyper-parameters shared by the two bagged tree ensembles.
_BAGGED_TREE_KWARGS = dict(
    min_samples_leaf=5,
    max_features=0.33,
    n_estimators=100,
    random_state=MODEL_SEED,
)

# Models with groups
models = {
    "linear": {
        "Ridge": RidgeCV(),
        "Lasso": LassoCV(**_CV_LINEAR_KWARGS),
        "ElasticNet": ElasticNetCV(**_CV_LINEAR_KWARGS),
    },
    "bagging": {
        "ExtraTrees": ExtraTreesRegressor(**_BAGGED_TREE_KWARGS),
        "RandomForest": RandomForestRegressor(**_BAGGED_TREE_KWARGS),
    },
    "boosting": {
        "XGBoost": XGBRegressor(random_state=MODEL_SEED),
        "AdaBoost": AdaBoostRegressor(random_state=MODEL_SEED),
    },
}

# Models with just names for majority vote.
# Derived from `models` so the two registries cannot drift apart; each entry is
# an independent copy, so fitting one registry never touches the other.
models_flat = {
    name: deepcopy(estimator)
    for group in models.values()
    for name, estimator in group.items()
}

In [8]:
# Load and preprocess data: read the raw CSV, discard its first two columns,
# and parse the 'start' / 'end' columns as datetimes.
# NOTE(review): if 'Unnamed: 0' is one of the two dropped columns, the rename
# below matches nothing and is silently a no-op — confirm against the CSV.
data = (
    pd.read_csv('Jan_0529_raw.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[:2]))
    .rename(columns={'Unnamed: 0': 'id'})
    .assign(
        start=lambda frame: pd.to_datetime(frame['start']),
        end=lambda frame: pd.to_datetime(frame['end']),
    )
)

def select_data_within_range(df, start_column, start_date, end_date, inclusive="both"):
    """Return the rows of ``df`` whose ``start_column`` lies between two dates.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    start_column : str
        Name of the column compared against the bounds.
    start_date, end_date
        Lower and upper bound; each is converted with ``pd.to_datetime``.
    inclusive : {"both", "left", "right", "neither"}, default "both"
        Which bounds are closed. The default keeps the original behaviour
        (``start_date <= value <= end_date``). Use ``"left"`` / ``"right"`` to
        build half-open windows so that two adjacent windows sharing a boundary
        instant do not both contain the rows at that instant.

    Returns
    -------
    pandas.DataFrame
        The matching rows, selected with a boolean mask.

    Raises
    ------
    ValueError
        If ``inclusive`` is not one of the four accepted values.
    """
    if inclusive not in ("both", "left", "right", "neither"):
        raise ValueError(
            "inclusive must be one of 'both', 'left', 'right', 'neither'; "
            f"got {inclusive!r}"
        )

    lower = pd.to_datetime(start_date)
    upper = pd.to_datetime(end_date)
    stamps = df[start_column]

    # Closed or open comparison on each side, depending on `inclusive`.
    after_lower = (stamps >= lower) if inclusive in ("both", "left") else (stamps > lower)
    before_upper = (stamps <= upper) if inclusive in ("both", "right") else (stamps < upper)
    return df[after_lower & before_upper]

# Window boundaries, compared against the 'start' column.
# NOTE(review): train_end and eval_start are the same instant, and the eval
# window is not used anywhere in this cell — confirm this is intended.
train_start = "2022-01-01 00:00:00"
train_end = "2022-01-12 22:30:00"
eval_start = "2022-01-12 22:30:00"
eval_end = "2022-01-31 23:59:00"

training_data = select_data_within_range(data, 'start', train_start, train_end)

# Predictor columns; the two categorical ones are one-hot encoded below.
features = training_data[[
    'prev_tput', 'prev_rtt', 'prev_retx_rate',
    'interval', 'prev_size', 'size_ratio',
    'size_bin', 'ip_subnet', 'overlap',
]]
encoded_features = pd.get_dummies(
    features,
    columns=['size_bin', 'ip_subnet'],
    drop_first=True,
)

# The target is the natural log of 'tput', as a flat array.
target = np.log(training_data['tput']).to_numpy().ravel()

# Un-encoded grouping columns, split alongside X / y so they stay row-aligned.
bin_df = features[['ip_subnet', 'size_bin', 'overlap']].copy()

# First split: hold out 20% of the rows as the test set.
(X_train_val, X_test,
 y_train_val, y_test,
 bin_train_val, bin_test) = train_test_split(
    encoded_features,
    target,
    bin_df,
    test_size=0.2,
    random_state=77,
)

# Second split: 40% of the remaining rows become the validation set.
(X_train, X_val,
 y_train, y_val,
 bin_train, bin_val) = train_test_split(
    X_train_val,
    y_train_val,
    bin_train_val,
    test_size=0.4,
    random_state=77,
)

In [12]:
# ========================
# Run all UQ methods + subgroup PCS
# ========================
def run_all_methods_with_subgroup(alpha=0.1, n_boot=20, subgroup_feature='size_bin',
                                  file_name='Jan_0529', gamma_params=None):
    """Run every UQ method on the notebook's split and gather their metrics.

    Reads the module-level ``X_train``, ``y_train``, ``X_val``, ``y_val``,
    ``X_test``, ``y_test``, ``bin_val``, ``bin_test``, ``models`` and
    ``models_flat``. Called with no arguments it performs exactly the original
    run (alpha=0.1, 20 bootstraps, subgroups by 'size_bin').

    Parameters
    ----------
    alpha : float, default 0.1
        Forwarded as ``alpha`` to each conformal method's ``train_predict``.
        The PCS calls in this function do not receive it.
    n_boot : int, default 20
        Forwarded as ``n_boot`` to ``PCS_UQ.train``.
    subgroup_feature : str, default 'size_bin'
        Column of ``bin_val`` / ``bin_test`` used for subgroup calibration and
        subgroup evaluation; also written to the 'feature' column of the
        subgroup results.
    file_name : str, default 'Jan_0529'
        Forwarded as ``file_name`` to ``PCS_UQ.train``.
    gamma_params : dict or None, default None
        Forwarded to the PCS calibration calls. ``None`` selects the
        notebook's standard settings.

    Returns
    -------
    tuple
        ``(overall, subgroup)``: the per-method ``evaluate`` results and the
        per-method ``evaluate_subgroups`` results, each concatenated with
        ``pd.concat(..., ignore_index=True)`` in the order PCS, PCS (subgroup
        cal), Vanilla, Locally Weighted, Quantile Regression, Quantile Forest,
        Majority Vote.
    """
    if gamma_params is None:
        gamma_params = {
            "selection_mode": "multiplicative",
            "fit_mode": "vanilla",
            "threshold": None,
            "clip_mode": "no_scale"
        }

    overall_parts = []
    subgroup_parts = []

    def collect(uq, method):
        # Evaluate one fitted method and stash both labelled result tables.
        overall_part, subgroup_part = _evaluate_and_label(
            uq, y_test, bin_test[subgroup_feature], method, subgroup_feature)
        overall_parts.append(overall_part)
        subgroup_parts.append(subgroup_part)

    # --- PCS UQ ---
    pcs = PCS_UQ(models)
    pcs.train(X_train, y_train, X_val, y_val, n_boot=n_boot, file_name=file_name)
    pcs.calibrate(x_val=X_val, y_val=y_val, gamma_params=gamma_params, best="all")
    pcs.predict(x_test=X_test)
    collect(pcs, 'PCS')

    # --- PCS UQ with subgroup calibration ---
    # NOTE(review): this trains a second PCS_UQ with the same data, models and
    # file_name as above — confirm whether the first fit could be reused.
    pcs_sg = PCS_UQ(models)
    pcs_sg.train(X_train, y_train, X_val, y_val, n_boot=n_boot, file_name=file_name)
    pcs_sg.calibrate_by_subgroup(X_val, y_val, bin_val[subgroup_feature], gamma_params)
    pcs_sg.predict_by_subgroup(X_test, bin_test[subgroup_feature])
    collect(pcs_sg, 'PCS (subgroup cal)')

    # --- Conformal family ---
    # Factories (not instances) so each method is still constructed right
    # before it is trained, in the same order as before.
    conformal_factories = [
        ('Vanilla Conformal',
         lambda: Conformal_UQ(model=clone(models_flat['RandomForest']))),
        ('Locally Weighted Conformal',
         lambda: Conformal_Locally_Weighted(mean_model=clone(models_flat['RandomForest']),
                                            sd_model=clone(models_flat['RandomForest']))),
        ('Quantile Regression Conformal',
         lambda: Conformal_Quantiles(model=QuantileRegressor())),
        ('Quantile Forest Conformal',
         lambda: Conformal_Quantiles(model=RandomForestQuantileRegressor())),
        ('Majority Vote Conformal',
         lambda: Conformal_Majority_Vote(models=models_flat)),
    ]
    for method, build in conformal_factories:
        uq = build()
        uq.train_predict(X_train, y_train, X_val, y_val, X_test, alpha=alpha)
        collect(uq, method)

    # Combine overall results
    overall = pd.concat(overall_parts, ignore_index=True)
    subgroup = pd.concat(subgroup_parts, ignore_index=True)
    return overall, subgroup


def _evaluate_and_label(uq, y_true, groups, method, feature):
    """Evaluate one fitted UQ object overall and per subgroup, tagging both results.

    Calls ``uq.evaluate(y_true)`` and ``uq.evaluate_subgroups(y_true, groups)``,
    sets a 'method' entry on both results and a 'feature' entry on the subgroup
    result, and returns ``(overall, by_group)``.
    Presumably both results are DataFrames (they are later passed to
    ``pd.concat``) — TODO confirm against the UQ classes.
    """
    overall = uq.evaluate(y_true)
    overall['method'] = method
    by_group = uq.evaluate_subgroups(y_true, groups)
    by_group['method'] = method
    by_group['feature'] = feature
    return overall, by_group


In [None]:
# Run the full comparison, then write both result tables to CSV without the index.
overall_df, subgroup_df = run_all_methods_with_subgroup()
for _frame, _csv_path in (
    (overall_df, "uq_overall_with_subgroup.csv"),
    (subgroup_df, "uq_subgroup_results.csv"),
):
    _frame.to_csv(_csv_path, index=False)


In [None]:
# Display the overall results concatenated across all UQ methods.
overall_df

In [None]:
# Display the per-subgroup results concatenated across all UQ methods.
subgroup_df