# Reference Run Ranking Performance Testing & Development Notebook

In [None]:
# Analysis / plotting stack used by the cells below.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

# Extend the module search path so the local `refrunrank` package can be imported.
# The path is relative, so it is resolved against the kernel's working directory.
import sys
sys.path.append("../src/")
import refrunrank as rrr

# Same approach for `dqmexplore`, which is imported from a directory three levels up.
sys.path.append("../../../DQMExplore/DQMExploreDEV/src/")
import dqmexplore as dqme

from cmsdials.filters import OMSFilter, OMSPage

# DIALS client object; it is handed to `rrr.omsdata.OMSData` in the OMS section.
# NOTE(review): judging by the helper's name this presumably starts a device-auth login — confirm.
dials = dqme.utils.setupdials.setup_dials_object_deviceauth()

## CertHelper

In [None]:
# Re-execute the already-imported `rrr.certhelper` module so that edits to its
# source are picked up without restarting the kernel.
import importlib
importlib.reload(rrr.certhelper)

In [None]:
# Directory holding the JSON inputs, and the two files handed to CHRunData.
json_dir = "/eos/home-i02/r/rcruzcan/SWAN_projects/RefRunRank/RefRunRank/jsons/"
ch_refrunjson = "ch_refrunjson.json"
# Other golden JSON files that have been used here:
#   "Express-Collisions2023_pixel-good_strip-good_track-good.json"
#   "rr_golden.json"
rr_goldenjson = "Express-Collisions-2022-2023-2024_pixel-strip-track-good.json"

# Resolve both file names against json_dir, keeping the (reference-run JSON, golden JSON)
# argument order expected by the constructor call below.
ch_json_path, golden_json_path = (
    os.path.join(json_dir, fname) for fname in (ch_refrunjson, rr_goldenjson)
)
chdata = rrr.certhelper.CHRunData(ch_json_path, golden_json_path)

# Last expression of the cell, so its return value is what the notebook displays.
chdata.getGoodRuns()

In [None]:
# Selection passed to chdata.applyFilter. Only the reconstruction type is active;
# the following keys have been experimented with and are currently disabled:
#   "run_number": [(352493, 355101)]
#   "reference_run_number": 312727
#   "reference_run_reconstruction_type": "express"
#   "dataset": "/Express/Collisions*/*"
filters = {"run_reconstruction_type": "express"}

# Last expression of the cell: the filtered result is displayed by the notebook.
chdata.applyFilter(filters=filters)

## OMS

In [None]:
# Re-execute the already-imported `rrr.omsdata` module so that edits to its
# source are picked up without restarting the kernel.
import importlib
importlib.reload(rrr.omsdata)

In [None]:
# Run numbers that survive the CertHelper filter defined above.
filtered_runs = chdata.applyFilter(filters=filters)
runnbs = filtered_runs["run_number"].to_list()

# OMS accessor built on top of the DIALS client.
omsdata = rrr.omsdata.OMSData(dials)

# Only the first 100 selected runs are requested.
omsdata.setRuns(runnbs[:100])

# Download the requested data for both tables used by the ranking step.
omsdata.fetchData("runs")
omsdata.fetchData("lumisections")

## Ranking

In [None]:
# Re-execute the already-imported `rrr.ranking` module so that edits to its
# source are picked up without restarting the kernel.
import importlib
importlib.reload(rrr.ranking)

In [None]:
# Reference lists kept from the original cell (presumably the available lumisection
# columns and the available summary statistics — confirm against OMSData):
#   'delivered_lumi_per_lumisection', 'recorded_lumi_per_lumisection',
#   'init_lumi', 'recorded_lumi', 'end_lumi', 'lumisection_number',
#   'pileup', 'delivered_lumi', 'fill_number', 'prescale_index',
#   'run_number'
#   'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'
ranker = rrr.ranking.RunRanker(omsdata)

# Run whose closest reference candidates are being ranked.
target = 356075

# Run-level features ("b_field" is available but currently left out).
run_ftrs = [
    "recorded_lumi",
    "delivered_lumi",
    "energy",
    "hlt_physics_rate",
    "fill_number",
    "run_number",
    "hlt_physics_counter",
]

# Every listed statistic is taken over the same three lumisection columns;
# each statistic gets its own list object, as in a hand-written literal.
ls_columns = ["init_lumi", "recorded_lumi_per_lumisection", "pileup"]
ls_stats = ("mean", "std", "min", "max", "50%")

ftrs_dict = {
    "runs": run_ftrs,
    "lumisections": {stat: list(ls_columns) for stat in ls_stats},
}

ranker.setFeatures(ftrs_dict)
ranker.constructFeatures()
rslts, wghts = ranker.refrank_pca(target, n_components=2)

In [None]:
# Plot the PCA feature weights returned by `ranker.refrank_pca` and tabulate them.
#
# `wghts` is used as a mapping of feature name -> weight. The (weight, name) pairs
# are sorted ascending by weight (ties broken by name) so the bars grow left to right.
# `keys` / `values` are derived from `wghts` here rather than relying on leftovers
# from an earlier kernel state, so the cell runs on a fresh kernel.
sorted_items = sorted(zip(wghts.values(), wghts.keys()))  # Sort by values
values, keys = zip(*sorted_items)  # Unzip into sorted values and keys

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(keys, values)
ax.set_xlabel("Feature")
ax.set_ylabel("Weight")
ax.set_title("PCA Feature Weights")
# Rotate the existing tick labels instead of overwriting them with set_xticklabels,
# which expects the tick positions to have been fixed first.
ax.tick_params(axis="x", labelrotation=90)
plt.show()

# Same information as a table, largest weight first, with a clean 0..n-1 index
# (later cells rely on index label == row position).
wghts_df = pd.DataFrame(list(wghts.items()), columns=["Feature", "Weight"])
wghts_df = wghts_df.sort_values(by="Weight", ascending=False).reset_index(drop=True)
wghts_df

In [None]:
# Scatter-matrix of every constructed feature against every other one.
import seaborn as sns
# corner=True draws only the lower triangle of the grid.
sns.pairplot(ranker.ftrsDF, corner=True)
# The title is attached to the current pyplot figure, so it must follow the pairplot call;
# y=1.02 places it slightly above the top of the figure.
plt.suptitle("Feature Correlation Grid", y=1.02)
plt.show()

In [None]:
# Pairwise correlation between the constructed features. Entries for which no
# correlation could be computed (NaN) are replaced by 0.
corr_mtrx = ranker.ftrsDF.corr().fillna(0)
corr_mtrx

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, leaves_list

# Cluster the features, using each row of the correlation matrix as that feature's
# observation vector, and read off the left-to-right leaf order of the resulting tree.
linkage_matrix = linkage(corr_mtrx, method="complete")
order = leaves_list(linkage_matrix)

# Apply the same permutation to rows and columns so similar features end up adjacent.
ordered_corr_mtrx = corr_mtrx.iloc[order].iloc[:, order]

# Heatmap settings: fixed [-1, 1] colour range with small per-cell annotations.
heatmap_style = {
    "annot": True,
    "vmin": -1,
    "vmax": 1,
    "cmap": "BrBG",
    "cbar_kws": {"label": "Correlation"},
    "annot_kws": {"size": 8},
}

# Large canvas so the annotated cells stay legible.
plt.figure(figsize=(20, 14))
sns.heatmap(ordered_corr_mtrx, **heatmap_style)
plt.title("Clustered Correlation Matrix", fontsize=16)

plt.xticks(rotation=90, ha="right", fontsize=10)
plt.yticks(fontsize=10)
plt.tight_layout()
plt.show()

We now use hierarchical clustering analysis to weed out highly correlated features.
- `1-corr_mtrx.abs()` -> we turn the correlation matrix into a distance matrix, where a lower distance implies a stronger (absolute) correlation (closer in "correlation space"), while a higher distance implies a weaker correlation (further apart in "correlation space")
- `linkage` -> provides the complete history of how the clusters were merged at each step, which is the basis for building a hierarchical tree (dendrogram); each number represents a cluster location
- `linkage_mtrx` -> column 1 = index of the first cluster being merged, column 2 = index of the second cluster being merged, column 3 = distance between the clusters being merged, column 4 = total number of original items in the new cluster formed by the merge

Notes:
- Hierarchical clustering orders rows and/or columns based on similarity
- Makes it easy to see correlation in the data
- At each clustering step, we cluster based on how similarly each feature correlates with the same other features. The features whose correlation patterns are most similar are clustered together, and this cluster is then treated as a feature itself in the next clustering step.
- More abstractly, those features which are closest in correlation space are clustered together and this cluster is treated as a feature itself, with its distance to the other clusters given by (in the case of "complete") the longest distance between their members. Distance here is measured using, for instance and typically, the Euclidean distance.

Documentation
- `linkage` -> https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html

In [None]:
# Turn correlations into distances: |corr| = 1 maps to distance 0, |corr| = 0 to distance 1.
corr_dist = 1-corr_mtrx.abs()
corr_dist

In [None]:
# Hierarchical cluster analysis on the correlation-derived distances
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

# Distance between two features: 0 for |corr| = 1, 1 for |corr| = 0.
corr_dist = 1 - corr_mtrx.abs()

# linkage expects the condensed form: the strictly-upper-triangular entries
# (k=1 skips the diagonal) of the square distance matrix, flattened row by row.
upper_rows, upper_cols = np.triu_indices_from(corr_dist, k=1)
dist_condensed = corr_dist.values[upper_rows, upper_cols]
linkage_mtrx = linkage(dist_condensed, method="complete")

# Tree view of the merge history, one leaf per feature.
plt.figure(figsize=(8, 5))
dendrogram(linkage_mtrx, labels=corr_dist.columns, leaf_rotation=90)
plt.xlabel("Features")
plt.ylabel("Distance")
plt.show()

In [None]:
# Cut the tree into flat clusters at a fixed height.
# NOTE(review): despite its name, this value is applied as a threshold on the
# distance 1 - |corr|, not on the correlation itself. With the complete-linkage
# tree built above, a cut at 0.7 groups features whose pairwise |corr| is >= 0.3.
# Confirm that this (rather than a cut at 1 - 0.7) is what is intended.
corr_threshold = 0.7
clusters = fcluster(linkage_mtrx, t=corr_threshold, criterion="distance")

# Group feature names by the cluster id assigned to them; the i-th cluster id
# belongs to the i-th column of the correlation matrix.
clustered_ftrs = {}
for feature, cluster_id in zip(corr_mtrx.columns, clusters):
    clustered_ftrs.setdefault(cluster_id, []).append(feature)
clustered_ftrs

In [None]:
# Keep one representative feature per cluster: the only member of a singleton
# cluster, otherwise the member with the largest PCA weight in `wghts_df`.
#
# Feature *names* are collected in both branches. Mixing names (singleton case)
# with integer row labels (multi-feature case) and resolving them afterwards
# through `wghts_df.iloc[...]` fails as soon as a singleton cluster exists,
# because positional indexing rejects strings.
selected_ftrs = []
for features in clustered_ftrs.values():
    if len(features) == 1:
        selected_ftrs.append(features[0])
        continue
    # Rows of the weight table that belong to this cluster.
    cluster_wghts = wghts_df[wghts_df["Feature"].isin(features)]
    # idxmax returns an index label, so look the name up by label with .loc.
    top_label = cluster_wghts["Weight"].idxmax()
    selected_ftrs.append(wghts_df.loc[top_label, "Feature"])
selected_ftrs

In [None]:
# Repeat the PCA ranking with a reduced, hand-written feature set.
ranker = rrr.ranking.RunRanker(omsdata)

# Run whose closest reference candidates are being ranked.
target = 356075

# Run-level features.
run_ftrs = ["delivered_lumi", "fill_number", "hlt_physics_rate"]

# Lumisection-level features, keyed by the summary statistic to take.
ls_ftrs = {
    "mean": ["init_lumi"],
    "min": ["init_lumi", "pileup"],
}

ftrs_dict = {"runs": run_ftrs, "lumisections": ls_ftrs}

ranker.setFeatures(ftrs_dict)
ranker.constructFeatures()

rslts, wghts = ranker.refrank_pca(target, n_components=2)

In [None]:
# Display the first object returned by `ranker.refrank_pca` for the reduced feature set.
rslts