In [7]:
# standard import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import pyreadr

# sklearn
from sklearn.linear_model import (
    LinearRegression,
    RidgeCV,
    LassoCV,
    ElasticNetCV,
    # HuberRegressor,
    # QuantileRegressor,
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    AdaBoostRegressor,
)
from sklearn.utils import resample
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.datasets import make_regression
from sklearn import datasets

# miscellaneous models
from xgboost import XGBRegressor

# from quantile_forest import RandomForestQuantileRegressor
# from imodels import get_clean_dataset
from joblib import Parallel, delayed

# from exp_utils import *

# from methods import *
import time
from scipy.stats import multivariate_normal
import pickle
import blosc
import os

from gamma_algo import *
import itertools
from copy import deepcopy
import warnings

In [8]:
# methods
from pcs_UQ import PCS_UQ

In [9]:
# Candidate estimators, grouped by model family. The outer keys name the
# family and the inner keys name the individual estimator handed to PCS_UQ.
models = dict(
    linear=dict(
        Ridge=RidgeCV(),
        Lasso=LassoCV(max_iter=5000, random_state=777),
        ElasticNet=ElasticNetCV(max_iter=5000, random_state=777),
    ),
    bagging=dict(
        ExtraTrees=ExtraTreesRegressor(
            n_estimators=100,
            min_samples_leaf=5,
            max_features=0.33,
            random_state=777,
        ),
        RandomForest=RandomForestRegressor(
            n_estimators=100,
            min_samples_leaf=5,
            max_features=0.33,
            random_state=777,
        ),
    ),
    boosting=dict(
        XGBoost=XGBRegressor(random_state=777),
        AdaBoost=AdaBoostRegressor(random_state=777),
    ),
)

In [10]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted values. Compared with ``y_true`` position by position.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_true - y_pred) ** 2))`` taken over all elements.

    Notes
    -----
    Inputs are converted to float NumPy arrays first, so plain Python lists
    are accepted and pandas index labels are ignored (comparison is
    positional). When both inputs are vector-like with the same number of
    elements but different shapes, e.g. ``(n,)`` versus ``(n, 1)``, they are
    flattened; without that the subtraction would broadcast to an ``(n, n)``
    matrix and silently return a wrong error value.
    """
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)

    same_size = true_arr.size == pred_arr.size
    both_vector_like = true_arr.squeeze().ndim <= 1 and pred_arr.squeeze().ndim <= 1
    if true_arr.shape != pred_arr.shape and same_size and both_vector_like:
        # Column/row vectors versus flat vectors: align element-wise instead
        # of letting NumPy broadcast them against each other.
        true_arr = true_arr.ravel()
        pred_arr = pred_arr.ravel()

    return np.sqrt(np.mean((true_arr - pred_arr) ** 2))

---

#### testing methods

In [11]:
# Synthetic regression problem used to exercise the PCS_UQ pipeline end to end.
X, y = make_regression(
    n_samples=700,
    n_features=5,
    noise=0.4,
    random_state=42,
)

# Hold out 20% of the rows as the test set ...
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, random_state=77, test_size=0.2
)
# ... then split the remaining rows 60/40 into training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, random_state=77, test_size=0.4
)


In [15]:
# Settings passed to pcs_uq.calibrate through its `gamma_params` argument.
gamma_params = {
    "selection_mode": "multiplicative",
    "fit_mode": "vanilla",
    "threshold": 0.1,
    "clip_mode": "no_scale"
    }

# Give this experiment its own copy of the estimators. `models` is a
# module-level dict that a later experiment also passes to PCS_UQ; fitting the
# shared instances here would leave it holding already-fitted estimators.
# NOTE(review): harmless if PCS_UQ already clones its estimators — confirm.
pcs_uq = PCS_UQ(deepcopy(models))

results = pcs_uq.train(
    x_train=X_train, 
    x_val=X_val, 
    y_train=y_train, 
    y_val=y_val, 
    file_name='test', 
)

Evaluation complete, here's the result
  model_group         model        rmse        mae        r2
0      linear         Ridge    0.415525   0.330065  0.999993
1      linear         Lasso    0.492071   0.395093  0.999990
2      linear    ElasticNet   14.514980  11.390004  0.991077
3     bagging    ExtraTrees  121.693175  94.529954  0.372804
4     bagging  RandomForest   74.845813  56.151688  0.762750
5    boosting       XGBoost   50.101189  38.283539  0.893692
6    boosting      AdaBoost   66.037484  50.358840  0.815306
Bootstrapping


In [16]:
# Calibrate on the validation split, using every trained model (best="all").
val_results_df = pcs_uq.calibrate(
    x_val=X_val, y_val=y_val, gamma_params=gamma_params, best="all"
)

# Predict on the held-out test features, then score against the test targets.
test_results_df = pcs_uq.predict(x_test=X_test)
pcs_evaluation_results = pcs_uq.evaluate(y_test=y_test)

processing data


In [17]:
# Display the table returned by pcs_uq.evaluate for the synthetic-data run.
pcs_evaluation_results

Unnamed: 0,coverage,avg_length,median_length,range_y_test,alpha,scaled_avg_length,scaled_median_length
0,0.842857,41.902847,38.773972,810.111366,0.1,0.051725,0.047863


---

#### testing on networking data

In [18]:
# Load the raw networking measurements, discard the first two columns of the
# file, and rename a remaining 'Unnamed: 0' column (if present) to 'id'.
data = pd.read_csv('Jan_0529_raw.csv')
data = data.drop(columns=data.columns[:2]).rename(columns={'Unnamed: 0': 'id'})

In [19]:
# Parse both interval boundary columns into pandas datetimes in one step.
data = data.assign(
    start=pd.to_datetime(data['start']),
    end=pd.to_datetime(data['end']),
)

# Function to filter data within a given range
def select_data_within_range(df, start_column, start_date, end_date, inclusive="both"):
    """Return the rows of ``df`` whose ``start_column`` timestamp lies in a range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is never modified.
    start_column : column label
        Column holding the timestamps; converted with ``pd.to_datetime`` if
        it is not already a datetime dtype.
    start_date, end_date : str or datetime-like
        Range boundaries; anything ``pd.to_datetime`` accepts.
    inclusive : {"both", "left", "right", "neither"}, default "both"
        Which boundaries belong to the range. The default keeps both
        endpoints. Use ``"left"`` or ``"right"`` to build adjacent windows
        that do not share their boundary row.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, in their original order and
        with their original index. ``start_column`` has a datetime dtype in
        the result.

    Raises
    ------
    ValueError
        If ``inclusive`` is not one of the four accepted values.
    """
    if inclusive not in ("both", "left", "right", "neither"):
        raise ValueError(
            "inclusive must be one of 'both', 'left', 'right', 'neither'; "
            f"got {inclusive!r}"
        )

    timestamps = df[start_column]
    needs_conversion = not pd.api.types.is_datetime64_any_dtype(timestamps)
    if needs_conversion:
        # Convert a local Series instead of writing back into the caller's
        # frame, so filtering has no side effect on the input.
        timestamps = pd.to_datetime(timestamps)

    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    if inclusive in ("both", "left"):
        after_start = timestamps >= start_date
    else:
        after_start = timestamps > start_date
    if inclusive in ("both", "right"):
        before_end = timestamps <= end_date
    else:
        before_end = timestamps < end_date
    in_range = after_start & before_end

    # Explicit copy: later column assignments on the result must not trigger
    # SettingWithCopyWarning.
    selected = df.loc[in_range].copy()
    if needs_conversion:
        # Positional assignment keeps the result's column as datetimes.
        selected[start_column] = timestamps.loc[in_range].array
    return selected

# Define training and evaluation time periods
train_start, train_end = "2022-01-01 00:00:00", "2022-01-12 22:30:00"
eval_start, eval_end = "2022-01-12 22:30:00", "2022-01-31 23:59:00"

# Split dataset into training and evaluation sets
training_data = select_data_within_range(data, 'start', train_start, train_end)
evaluation_data = select_data_within_range(data, 'start', eval_start, eval_end)

# The range filter keeps both endpoints and train_end == eval_start, so a row
# stamped exactly at the boundary would land in BOTH sets. Keep such rows in
# the training set only, so the two windows are disjoint.
evaluation_data = evaluation_data[evaluation_data['start'] > pd.to_datetime(train_end)]
assert training_data.index.intersection(evaluation_data.index).empty, \
    "training and evaluation windows share rows"

# Select features and target variable
features = training_data[['prev_tput', 'prev_rtt', 'prev_retx_rate', 'interval', 'prev_size', 'size_ratio']]

# The target is log-throughput. Zero or negative throughput would turn into
# -inf/NaN, so fail here with a clear message rather than inside model fitting.
assert (training_data['tput'] > 0).all(), "tput must be strictly positive to take its log"
target = np.log(training_data['tput']).values.ravel()


In [22]:
# Split the dataset into training, validation, and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(features, target, test_size=0.2, random_state=77)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.4, random_state=77)

# Define gamma parameters
gamma_params = {
    "selection_mode": "multiplicative",
    "fit_mode": "vanilla",
    "threshold": 0.1,
    "clip_mode": "no_scale"
    }

pcs_uq = PCS_UQ(models)

# Train PCS_UQ model
results = pcs_uq.train(
    x_train=X_train, 
    x_val=X_val, 
    y_train=y_train, 
    y_val=y_val, 
    file_name='Jan_0529'
)

# Perform calibration
val_results_df = pcs_uq.calibrate(
    x_val=X_val,
    y_val=y_val,
    gamma_params=gamma_params,
    best="all"
)

# Make predictions on the test set
test_results_df = pcs_uq.predict(x_test=X_test)

# Evaluate the model
pcs_evaluation_results = pcs_uq.evaluate(y_test=y_test)

Evaluation complete, here's the result
  model_group         model          rmse            mae            r2
0      linear         Ridge  1.201309e+06  140055.052935 -2.503989e+12
1      linear         Lasso  4.889038e-01       0.307635  5.852658e-01
2      linear    ElasticNet  4.889038e-01       0.307635  5.852658e-01
3     bagging    ExtraTrees  7.162757e-01       0.523813  1.098087e-01
4     bagging  RandomForest  2.166541e-01       0.104303  9.185565e-01
5    boosting       XGBoost  2.136768e-01       0.086262  9.207795e-01
6    boosting      AdaBoost  8.394945e-01       0.780932 -2.228092e-01
Bootstrapping
processing data


In [23]:
# Display the table returned by pcs_uq.evaluate for the networking-data run.
pcs_evaluation_results

Unnamed: 0,coverage,avg_length,median_length,range_y_test,alpha,scaled_avg_length,scaled_median_length
0,0.870696,0.607568,0.559539,10.898026,0.1,0.05575,0.051343
