In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import importlib
import numpy as np
import os, sys
import json
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
sys.path.append(os.path.abspath('..'))
sys.path.append(os.path.abspath('../utils/'))

# local modules
import get_oms_data
# importlib.reload(get_oms_data)  # not needed unless making live changes to the modules
from get_oms_data import get_oms_api, get_oms_data, get_oms_response_attribute

import json_utils as jsonu
import plot_utils as pu

import mplhep as hep
hep.style.use("CMS")

import OMS_RR_utils
# importlib.reload(OMS_RR_utils) 
import json_utils
import OMS_RR_utils as omsu
from refruns_utils import get_reference_run as RRfetch
import refrank_utils as rrr

In [None]:
importlib.reload(rrr)

# Getting the Data
We start by instantiating the OMS API. The OMS API will be used to get Run and LS level data and will be the main source of information for the reference run ranking (RRR) system.

In [None]:
omsapi = get_oms_api()

In [None]:
# Run level attributes
run_attribs = [
    'components', 
    'init_lumi',
    'recorded_lumi', 
    'delivered_lumi',
    'start_time',
    'end_time',
    'energy', 
    'end_lumi', 
    'hlt_physics_rate',
    'fill_number', 
    'l1_hlt_mode', 
    'trigger_mode',
    'l1_key_stripped', 
    'fill_type_party2',
    'fill_type_party1', 
    'initial_prescale_index',
    'sequence', 
    'hlt_physics_size', 
    'fill_type_runtime',
    'last_lumisection_number', # refers to the last LSs where cms_active is True
    'l1_rate',
    'l1_menu', 
    'run_number',
    'stable_beam',
    'hlt_physics_counter',
#     'peak_pileup' <-- Find actual name
]

# Lumisecion (LS) level attributes
ls_attribs = [
    'fill_number',
    "run_number",
    'lumisection_number',
    "physics_flag",
    "cms_active",
    'bpix_ready',
    'fpix_ready',
    'tecm_ready',
    'tecp_ready',
    'tibtid_ready',
    'tob_ready',
    'pileup',
    'delivered_lumi',
    'recorded_lumi',
    "init_lumi",
    'end_lumi',
    'beam1_stable',
    'beam2_stable',
    'beam2_present',
    'beam1_present',
#     "l1_rate" 
]

In [None]:
# Run we wish to certify and for which we will find a correspond reference run
target = 316201
newest_run = 316201
oldest_run = 314206 # -2000

# Range of runs of interest
run_range = (oldest_run, newest_run)
print(run_range)

In [None]:
target - oldest_run

In [None]:
# Load run level data into JSON
run_json = get_oms_data( # This function is good enough to use as is, no need to circumvent entry limit
    omsapi, 
    'runs', 
    run_range, 
    limit_entries = 5_000,
    attributes = run_attribs
)

In [None]:
print("Number of runs: ", len(run_json['data']))
print("Number of lumisections will be: ~", 100 * len(run_json['data']))

In [None]:
run_df = omsu.makeDF(run_json).convert_dtypes()
run_df.info()

In [None]:
# Load lumisecion level data into JSON
ls_json = get_oms_data(
    omsapi, 
    'lumisections', 
    run_range, 
    limit_entries=100_000,
    attributes=ls_attribs
)

In [None]:
len(ls_json['data'])

Loading data into dataframes

In [None]:
# Convert JSON into DF
run_df = omsu.makeDF(run_json).convert_dtypes()
ls_df = omsu.makeDF(ls_json).convert_dtypes()

In [None]:
run_df.head()

In [None]:
ls_df.head()

In [None]:
# Optional: Save to parquet
run_df.to_parquet('/eos/user/r/rcruzcan/SWAN_projects/RefRunRank/data/run_df.parquet')
ls_df.to_parquet('/eos/user/r/rcruzcan/SWAN_projects/RefRunRank/data/ls_df.parquet')

In [None]:
# Optional: Load from parquet
run_df = pd.read_parquet('/eos/user/r/rcruzcan/SWAN_projects/RefRunRank/data/run_df.parquet')
ls_df = pd.read_parquet('/eos/user/r/rcruzcan/SWAN_projects/RefRunRank/data/ls_df.parquet')

In [None]:
# Quick check of the loaded runs and LSs
print(len(run_df))
print(len(ls_df))

In [None]:
run_df[[
    "run_number", #
    "init_lumi", #
    "recorded_lumi",
    "energy",
    "end_lumi", #
    "hlt_physics_rate",
    "fill_number",
    "initial_prescale_index",
    "last_lumisection_number",
    "l1_rate",
    "hlt_physics_counter",
]].info()

Some of the rows have `fill_number = NaN`. This is problematic, so we split the rows into those that have this problem (`run_df_NaN`) and the rest (`run_df`). We also add a `Fill Location` column and make the `run_number` column the index.

<font color='red'> INVESTIGATE WHY THESE RUNS/LSs HAVE NaN VALUES. FIGURE WHERE THESE LSs TEND TO HAPPEN (START OR END OF THE RUN). </font>

In [None]:
# Filter out runs that have NaN in fill number
run_df, run_df_NaN = omsu.has_fill(run_df)
print(
    '''
    Runs kept in run dataframe: {}
    Runs filtered out: {}
    Lowest run # kept: {}
    Highest run # kept: {}
    '''.format(
        len(run_df['run_number'].unique()), 
        len(run_df_NaN['run_number'].unique()),
        run_df['run_number'].min(),
        run_df['run_number'].max())
)

run_df = omsu.add_loc_wrt_fill(run_df)

run_df.set_index(['run_number'], inplace=True)
run_df.sort_index(level=['run_number'], inplace=True)

In [None]:
ls_df, ls_df_NaN = omsu.has_fill(ls_df)
print(
    '''
    Runs kept in lumisection dataframe: {}
    Runs filtered out: {}
    Lowest run # kept: {}
    Highest run # kept: {}
    '''.format(
        len(ls_df['run_number'].unique()),   
        len(ls_df_NaN['run_number'].unique()),
        ls_df['run_number'].min(),
        ls_df['run_number'].max()
    )
)

ls_df = omsu.add_loc_wrt_fill(ls_df)
ls_df = ls_df.convert_dtypes()

ls_df.set_index(['run_number', 'lumisection_number'], inplace=True)
ls_df.sort_index(level=['run_number', 'lumisection_number'], inplace=True)

We can now take a look at the data that was filtered out and the run data that is left

In [None]:
run_df.info()

In [None]:
ls_df.info()

In [None]:
# Make the run-level and LS-level dataframes cover exactly the same set of runs:
# any run that shows up in only one of them is removed from that dataframe.

rundf_runs = run_df.index.to_list()
lsdf_runs = ls_df.index.get_level_values(0)

# Symmetric difference: runs present in exactly one of the two dataframes
missing_runs = list(set(rundf_runs) ^ set(lsdf_runs))
print(
    '''
    List of runs which are in one of the dataframes, but not the other (and viceversa): \n    {}
    '''.format(missing_runs)
)

# Each such run lives in exactly one dataframe; drop it from whichever one holds it
for orphan_run in missing_runs:
    holder_df = run_df if orphan_run in rundf_runs else ls_df
    holder_df.drop(orphan_run, inplace=True)

In [None]:
# Sanity check that the run filtering step worked

rundf_runs = run_df.index.to_list()
lsdf_runs = ls_df.index.get_level_values(0)

# Finding symmetric different
missing_runs = list(set(rundf_runs) ^ set(lsdf_runs))
print(
    '''
    List of runs which are in one of the dataframes, but not the other (and viceversa): \n    {}
    '''.format(missing_runs)
)

In [None]:
print(
    '''
    Number of runs in the run dataframe: {}
    Number of runs in the lumisection dataframe: {}
    Are the runs contained in these dataframes exactly the same?: {}
    '''.format(
        len(run_df.index.unique()),
        len(ls_df.index.get_level_values(0).unique()),
        run_df.index.unique().tolist() == ls_df.index.get_level_values(0).unique().tolist()
    )
)

In [None]:
run_df.head()

In [None]:
ls_df.head()

# Filtering Good & Collisions Runs/LSs

Note: It might be necessary to generate a new golden JSON in RR depending on the range of runs chosen for ranking.

In [None]:
# Relevant golden jsons available in the json directory
golden_files = [
    'json_GOLDEN_2017.json', 
    'json_GOLDEN_2018.json',
    'json_GOLDEN_RRRdev.json',
    'json_TRK_PromptReco_314324to316201.json',
]

A generated golden JSON such as `json_GOLDEN_RRRdev.json` is made in RR using input similar to:
```json
{
  "and": [
    {">=": [{"var": "run.oms.run_number"}, 315190]},
    {"<=": [{"var": "run.oms.run_number"}, 316201]},
    {"==": [{"var": "lumisection.rr.tracker-pixel"}, "GOOD"]},
    {"==": [{"var": "lumisection.rr.tracker-strip"}, "GOOD"]},
    {"==": [{"var": "lumisection.rr.tracker-track"}, "GOOD"]}
  ]
}
```


In [None]:
# We now get collision runs/LSs
runcoll_df, lscoll_df, runnotcoll_df, lsnotcoll_df = omsu.get_collisions(run_df.reset_index(), ls_df.reset_index())

In [None]:
print(
    '''
        We originally had this amount of runs:    {}
        Amount of these runs that are collisions: {}
    '''.format(len(runcoll_df['run_number'].tolist()) + len(runnotcoll_df['run_number'].tolist()),
              len(runcoll_df['run_number'].unique()))
)
    

In [None]:
print(
    '''
    Number of runs in collision dataframe:                   {}
    Number of runs in LS dataframe:                          {}
    Runs that are in one, but not the other (and viceversa): {}
    '''.format(
        len(runcoll_df['run_number'].unique()),
        len(lscoll_df['run_number'].unique()),
        list(set(runcoll_df['run_number'].tolist()) ^ set(lscoll_df['run_number'].unique().tolist()))
    )
)

In [None]:
# Dividing good runs/LSs from bad runs/LSs
good_mask = json_utils.injson(
    np.array(lscoll_df['run_number']), 
    np.array(lscoll_df['lumisection_number']), 
    '/eos/user/r/rcruzcan/SWAN_projects/RefRunRank/jsons/'+golden_files[3])

lscollgood_df = lscoll_df[good_mask]
lscollbad_df = lscoll_df[~good_mask]

In [None]:
runcollgood_df = runcoll_df[runcoll_df['run_number'].isin(lscollgood_df['run_number'].unique())]
runcollbad_df = runcoll_df[~runcoll_df['run_number'].isin(lscollgood_df['run_number'].unique())]

In [None]:
print(
    '''
        Number of good collision LSs: {}
        Number of bad collisions LSs: {}
    '''.format(len(lscollgood_df), len(lscollbad_df))
)

In [None]:
# These are run-level counts (rows of the run dataframes), not lumisection counts
print(
    '''
        Number of good collision runs: {}
        Number of bad collision runs: {}
    '''.format(len(runcollgood_df), len(runcollbad_df))
)

<h3>Data Exploration</h3>

We create some plots to better understand what this data looks like.

In [None]:
# Luminosity-related LS columns used by the exploratory plots below.
# Take an explicit copy: a later cell adds a `run_lumisection` column to `lumi_info`,
# and writing to a bare slice of `lscollgood_df` triggers pandas' SettingWithCopyWarning.
lumi_info = lscollgood_df[['run_number',
                           'init_lumi',
                           'end_lumi',
                           'pileup',
                           'recorded_lumi',
                           'delivered_lumi',
                           'lumisection_number',]].copy()
lumi_info

In [None]:
run_begin = 315357
run_end = 315366

lumi_info['run_lumisection'] = list(zip(lumi_info['run_number'], lumi_info['lumisection_number']))

filtered_lumi_info = lumi_info[(lumi_info['run_number'] >= run_begin) & (lumi_info['run_number'] <= run_end)]

fig, ax = plt.subplots(dpi=200)

current_run = None
for idx, (run, lumisection) in enumerate(filtered_lumi_info['run_lumisection']):
    if run != current_run:
        ax.axvline(x=idx, color='gray', linestyle='--', linewidth=0.5)  # Add a vertical line
        
        # Add a label for the run number
        if current_run is not None:  # Skip label for the very first line
            label_x_position = idx - 1  # Adjust this as needed to position the label correctly
            ax.text(label_x_position, 0.95, str(current_run), transform=ax.get_xaxis_transform(), 
                    horizontalalignment='right', verticalalignment='top', fontsize=6, color='gray', rotation='vertical')
            
        current_run = run
        
        
features_to_plot = [
    'init_lumi', 
    'pileup', 
    'recorded_lumi', 
    'delivered_lumi'
]

filtered_lumi_info.plot(x='run_lumisection', y=features_to_plot, secondary_y='pileup', ax=ax)
ax.tick_params(axis='both', which='major', labelsize=6)
ax.set_yscale('log')
plt.title(f"Run {run_begin} to {run_end}")
plt.show()

In [None]:
runcollgood_df[runcollgood_df['run_number']==315357]

In [None]:
lscollgood_df[lscollgood_df['run_number']==315357].head(50)

In [None]:
lumi_info[(lumi_info['run_number'] >= run_begin) & (lumi_info['run_number'] <= run_end)].set_index('run_number')

In [None]:

# lscoll_df[lscoll_df['run_number']].plot(y=['recorded_lumi','delivered_lumi','pileup'],style='-',figsize=(20,9),logy=False,logx=False,secondary_y="pileup")
lscollgood_df[lscollgood_df['run_number'] < 315270].plot(y=['recorded_lumi','delivered_lumi','pileup'],style='-',figsize=(20,9),logy=False,logx=False,secondary_y="pileup")
# plt.xlim((0,3800))
plt.show()
# lsdf_collisions.plot(y=['init_lumi','end_lumi'],x='run_number' ,figsize=(19,9),logy=False,logx=False)

In [None]:
lscollgood_df['run_number'].unique()

In [None]:
fig, ax = plt.subplots(dpi=150)
lscollgood_df[lscollgood_df['run_number'] <= 315264].reset_index().plot(y=['recorded_lumi', 'delivered_lumi', 'pileup'], ax=ax)

plt.show()

In [None]:
lscollgood_df[lscollgood_df['run_number'] == 315259][['recorded_lumi', 'delivered_lumi', 'pileup']]

In [None]:
lscollgood_df[lscollgood_df['run_number'] <= 315265].reset_index()

In [None]:
fig, ax = plt.subplots(dpi=200)

runcollgood_df.plot(y=['recorded_lumi',"delivered_lumi"], x="run_number",
                kind="bar", figsize=(19,9), logy=False, logx=False, ax=ax);

# Previously Developed Ranking Systems

In [None]:
pd.options.mode.chained_assignment = None

<h2> Pre-existing RRR systems </h2>

The implementations previously developed used a dictionary as input where the elements of the dictionary were the run and LS dataframes. Therefore, we create such a dictionary.

In [None]:
# Making the dictionary out of dataframes
data_dict = {'runs': runcollgood_df.reset_index(), 'lumisections': lscollgood_df.reset_index()}

In [None]:
# # Run we wish to certify and for which we will find a correspond reference run
# target = 316201
# oldest_run = target - 1011

# # Range of runs of interest
# run_range = (oldest_run, target)
# print(run_range)

We also fetch the actual RR used for the certification of the target run.

In [None]:
targetRR = RRfetch(target, jsonfile='/eos/user/r/rcruzcan/SWAN_projects/RefRunRank/utils/json_allRunsRefRuns.json')
print('Actual RR used: ' + str(targetRR))

<h3> Version 1 </h3>
Version 1 of the RRR system assigns a rank to each run using the following equation. (Note: the equation in the originally proposed version 1 is slightly different. The equation shown here is the one found in the actual implementation.)

$$
    G_1 = 0.5 *(\text{inst lumi delta }\%)  + 0.25 *(\text{pileup delta }\%) + 0.25 *(\text{run number delta } \%)
$$

where
- $\text{quantity %} = \frac{\text{possible ref quantity}-\text{target quantity}}{\text{target quantity}} * 100$

In [None]:
# V1 ranking: order candidates by the absolute value of the rank, then of each delta column
v1_ranking = (
    omsu.ref_rank(data_dict, target, Trim=False, ver='V1')
    .set_index('run_number')
    .sort_values(
        by=["Run_Rank", "inst_lumi_delta %", 'pileup_delta %', 'run_number_delta'],
        key=lambda col: col.abs(),
        ascending=True,
    )
    .reset_index()
)
v1_ranking

We now evaluate how well this ranking system performs by checking where the run that was actually used as reference lands in the ranking.

In [None]:
print('Ranking of actual RR used for target:', list(np.where(v1_ranking['run_number'] == targetRR))[0][0])

<h3> Version 2 </h3>
The equation used in this version to compute the ranking is given by
$$
    G_2 = 0.5 * \frac{\text{(inst lumi %)} * \text{(run inst lumi)}}{100 * \text{(ave inst lumi)}} + 0.25 * \frac{\text{(pileup %)}}{\text{(run pileup)}} + 0.25 * \frac{\text{(run number %)}}{\text{(run num)}}
$$

In [None]:
# V2 ranking: order candidates by the absolute value of the rank, then of each delta column
v2_ranking = (
    omsu.ref_rank(data_dict, target, Trim=False, ver='V2')
    .set_index('run_number')
    .sort_values(
        by=["Run_Rank", "inst_lumi_delta %", 'pileup_delta %', 'run_number_delta'],
        key=lambda col: col.abs(),
        ascending=True,
    )
    .reset_index()
)
v2_ranking

In [None]:
print('Ranking of actual RR used for target:', list(np.where(v2_ranking['run_number'] == targetRR))[0][0])

<h3> Version 3 </h3>
The equation used for this version is given by
$$
    G_3 = \frac{\text{inst lumi delta %}}{100} + \frac{\text{pileup delta %}}{100} + \frac{\text{run num delta}}{100} + \frac{\text{num of lumi delta %}}{100}
$$

In [None]:
# V3 ranking: order candidates by the absolute value of the rank, then of each delta column
v3_ranking = (
    omsu.ref_rank(data_dict, target, Trim=False, ver='V3')
    .set_index('run_number')
    .sort_values(
        by=["Run_Rank", "inst_lumi_delta %", 'pileup_delta %', 'run_number_delta'],
        key=lambda col: col.abs(),
        ascending=True,
    )
    .reset_index()
)
v3_ranking

In [None]:
print('Ranking of actual RR used for target:', list(np.where(v3_ranking['run_number'] == targetRR))[0][0])

# Reference Run Ranking Using PCA

We first look at the features available for PCA, find how they are correlated and their weights (given by the coefficients of the first PC), and then try using PCA to rank runs.

In [None]:
print("Run level features: \n{}\n".format(runcollgood_df.iloc[0]))
print("LS level features: \n{}".format(lscollgood_df.iloc[0]))

## Run Level Features
PCA compatible features that can be considered are
- `run_number`
- `init_lumi`
- `recorded_lumi`
- `energy`
- `end_lumi`
- `hlt_physics_rate`
- `fill_number`
- `initial_prescale_index`
- `last_lumisection_number`
- `l1_rate`
- `hlt_physics_counter`

Features that we could calculate
- `delta_lumi` (change in lumi)
- `temp_dist` (how far back in time a run is with respect to the target run)

In this section we study these features.

In [None]:
runcollgood_df.tail(3)

In [None]:
# Getting total change in lumi
runcollgood_df = rrr.comp_delta_totallumi(runcollgood_df)
# Getting temporal distance of each run with respect to the target run
runcollgood_df = rrr.comp_temp_dist(runcollgood_df, 316062) # target run chosen as example
# Getting duration
runcollgood_df = rrr.comp_duration(runcollgood_df)
runcollgood_df.head(3)

In [None]:
runcollgood_df.tail(3)

We will now see how all of these variables are correlated and will also determine the weights of each of them relative to each other using the first component of PCA.

In [None]:
# Run level features that are compatible with PCA
runfeaturelst = [
    "run_number", #
    "init_lumi", #
    "recorded_lumi",
    "energy",
    "end_lumi", #
    "hlt_physics_rate",
    "fill_number",
    "initial_prescale_index",
    "last_lumisection_number",
    "l1_rate",
    "hlt_physics_counter",
    "delta_totallumi", #
    "temp_dist", #
    "delivered_lumi" #
]

In [None]:
run_features = runcollgood_df[runfeaturelst].astype(float).fillna(0)
run_features.info()

In [None]:
# Get the weights by taking the first principal component
weights = rrr.get_weights(run_features, plot=True)

In [None]:
# Organizing features by order of importance.
# Pair each run-level feature name with its first-PC weight, then sort descending.
featureweights_dict = {feature: round(weight, 4) for feature, weight in zip(runfeaturelst, weights)}
featureweights_df = (
    pd.DataFrame(list(featureweights_dict.items()), columns=["Feature", "Weight"])
    .sort_values("Weight", ascending=False)
    .reset_index(drop=True)
)
featureweights_df

Observations:
- `temp_dist`, `run_number` and `fill_number` are highly correlated. 
- `recorded_lumi` and `delivered_lumi` are highly correlated
- `energy`, `l1_rate`, `hlt_physics_counter` are very unimportant
- `last_lumisection_number` just tells us how long a run is. This is not relevant at this stage, so it is ignored for ranking.

In [None]:
# Looking only at relevant and non-highly correlated features
reducedfeaturelist = ["init_lumi", "end_lumi", "delta_totallumi", "hlt_physics_rate"]
# Index with the list defined just above (the same name the next cell zips against)
weights_reduced = rrr.get_weights(run_features[reducedfeaturelist], plot=True)

In [None]:
# Organizing features by order of imporant
featureweightsreduced_dict = {feature: round(weight, 4) for feature, weight in zip(reducedfeaturelist, weights_reduced)}
featureweightsreduced_df = pd.DataFrame(list(featureweightsreduced_dict.items()), columns=["Feature", "Weight"]).sort_values("Weight", ascending=False).reset_index(drop=True)
featureweightsreduced_df

## LS Level Features

Features of interest from LS data:
- average and std of `init_lumi`
- average and std of `end_lumi`
- average and std of `pileup`

In [None]:
# Numerical LS level features of interest
lsfeaturelst = [
    'run_number', 
    'lumisection_number',
    'init_lumi', 
    'end_lumi', 
    'pileup'
]

In [None]:
lss = lscollgood_df[lsfeaturelst]
lss.head(5)

In [None]:
# Computing the init_lumi, end_lumi and pileup std and mean over the run
names = ['init_lumi', 'end_lumi', 'pileup']
# Expand to all '_std' names followed by all '_mean' names, which matches the
# `stds+means` concatenation order used inside the loop below.
names = [name + suffix for suffix in ['_std', '_mean'] for name in names ]

lsstats_dict = {}  # run number -> {statistic name: value}
for run in lss['run_number'].unique():
    # Summary statistics over the lumisections belonging to this run only
    runlsstats = lss[lss['run_number']==run].describe()
    # [2:] skips the first two described columns -- presumably run_number and
    # lumisection_number, given the column order of `lsfeaturelst`; TODO confirm
    stds = runlsstats.loc['std'].to_list()[2:]
    means = runlsstats.loc['mean'].to_list()[2:]
    lsstats_dict[run] = {name: stat for name, stat in zip(names, stds+means)}    
    
# Constructing a lumisection dataframe to hold these features of interest.
# The transpose puts one row per run and one column per statistic.
ls_features = pd.DataFrame(lsstats_dict).T
ls_features.head(5)

In [None]:
# Finding weights for these features
lsweights = rrr.get_weights(ls_features, plot=True)

In [None]:
# Organizing features by order of importance
lsfeaturelst = ls_features.columns.to_list()

# Use `lsweights` (computed from `ls_features`), not the run-level `weights`
lsfeatureweights_dict = {feature: round(weight, 4) for feature, weight in zip(lsfeaturelst, lsweights)}
lsfeatureweights_df = (
    pd.DataFrame(list(lsfeatureweights_dict.items()), columns=["Feature", "Weight"])
    .sort_values("Weight", ascending=False)
    .reset_index(drop=True)
)
lsfeatureweights_df

## Run + LS Features (Not finished)

In [None]:
features = pd.concat([run_features.set_index('run_number'), ls_features], axis=1).reset_index()
features.rename(columns = {'index':'run_number'}, inplace=True)
features

In [None]:
features.columns.to_list()

In [None]:
# `get_weights` is only available through the `rrr` module alias in this notebook
weights3 = rrr.get_weights(features)
print('Feature weights: ')
{feature: weight for feature, weight in zip(features.columns.to_list(), weights3)}

In [None]:
# Without run number and temporal distance
lumi_feature_cols = [
    'init_lumi',
    'end_lumi',
    'delta_totallumi',
    'delivered_lumi',
    'init_lumi_std',
    'end_lumi_std',
    'pileup_std',
    'init_lumi_mean',
    'end_lumi_mean',
    'pileup_mean'
]
# `get_weights` is only available through the `rrr` module alias in this notebook
weights3 = rrr.get_weights(features[lumi_feature_cols])
print('Feature weights: ')
# Same column list for the weights and their labels, so the two cannot drift apart
{feature: weight for feature, weight in zip(lumi_feature_cols, weights3)}

- Features of importance after testing different combinations:
    - `run_number`
    - `temp_dist`
    - `init_lumi`
    - `end_lumi`
    - `init_lumi_mean`
    - `end_lumi_mean`
- Including features related to the standard deviation of an LS-level quantity worsened the performance of the ranking considerably.
- Including `pileup_mean`, `delta_totallumi` and/or `delivered_lumi` makes the ranking performance worse, but not by much if only one of these is included at a time.

In [None]:
features_to_use = ['run_number', 'temp_dist', 'init_lumi', 'end_lumi', 'init_lumi_mean', 'end_lumi_mean']

In [None]:
features.set_index('run_number').loc[:315267]

# Ranking With PCA

In [None]:
importlib.reload(rrr)

In [None]:
# Features dataframe
features = pd.concat([run_features.set_index('run_number'), ls_features], axis=1).reset_index()
features.rename(columns = {'index':'run_number'}, inplace=True)
features

In [None]:
# Features that will be considered in the ranking
features_to_use = [
    'run_number',
    'init_lumi',
#     'recorded_lumi',
#     'energy',
    'end_lumi',
#     'hlt_physics_rate',
#     'fill_number',
#     'initial_prescale_index',
#     'last_lumisection_number',
#     'l1_rate',
#     'hlt_physics_counter',
    'delta_totallumi',
#     'temp_dist',
    'delivered_lumi',
    'init_lumi_std',
    'end_lumi_std',
    'pileup_std',
    'init_lumi_mean',
    'end_lumi_mean',
    'pileup_mean'
]

In [None]:
# Reformatting df for use in ranking
features['run'] = features['run_number'].astype(int)
features.set_index('run', inplace=True)
features.tail()

In [None]:
# Initial test that ranking system actually ranks
target = 316082
targetRR = RRfetch(target, jsonfile='/eos/user/r/rcruzcan/SWAN_projects/RefRunRank/jsons/json_allRunsRefRuns.json')
rankings = rrr.refrank_pca(features[features_to_use].loc[:target], target, n_components=2)
print('Ranking of RR used: ', rankings.index[rankings['run']==targetRR][0])
rankings = pd.merge(rankings.set_index("run"), features[features_to_use], left_index=True, right_index=True, how='left').reset_index()
rankings.head(10)

In [None]:
# Testing a case where there will not be enough candidate runs to apply ranking
target = 314472 # Oldest run in features dataframe, so nothing to compare it to
targetRR = RRfetch(target, jsonfile='/eos/user/r/rcruzcan/SWAN_projects/RefRunRank/jsons/json_allRunsRefRuns.json')
rankings = rrr.refrank_pca(features[features_to_use], target, n_components=1)
rankings

In [None]:
def test_ranking(features, comparison_num=60, n_components=1, print_stats=True, dpi=200):
    """Check where the actual reference run lands in the PCA ranking, over many targets.

    Every run in ``features`` after the first ``comparison_num + 1`` is used as a
    target. For each target, the last ``comparison_num`` rows up to and including the
    target label are passed to ``rrr.refrank_pca``, and the position of the reference
    run returned by ``RRfetch`` is looked up in the resulting ranking. Targets with no
    ranking, or whose reference run is not in the ranking, are skipped.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table whose index is named ``run``. It is sub-selected with the
        notebook-level ``features_to_use`` list, which must be defined beforehand.
    comparison_num : int
        Number of trailing rows handed to the ranking for each target.
    n_components : int
        Forwarded to ``rrr.refrank_pca``.
    print_stats : bool
        If True, print ``describe()`` of the collected ranks.
    dpi : int
        Resolution of the histogram figure.

    Returns
    -------
    None
        Results are printed and shown as a histogram.
    """
    results = {}

    # Loop over some of the runs that are available
    for targ in list(features.reset_index()['run'].unique())[comparison_num+1:]:
        # Get reference run
        targetRR = RRfetch(targ, jsonfile='/eos/user/r/rcruzcan/SWAN_projects/RefRunRank/jsons/json_allRunsRefRuns.json')

        rankings = rrr.refrank_pca(features[features_to_use].loc[:targ].iloc[-comparison_num:], targ, n_components=n_components)

        # No candidate runs for this target: nothing to evaluate
        if rankings is None:
            continue

        # Position of the actual reference run in the ranking; empty when that
        # run is not among the ranked candidates
        ref_positions = rankings.index[rankings['run'] == targetRR]
        if len(ref_positions) == 0:
            continue

        actualrefrank = ref_positions[0]
        results[targ] = {'ActualRef': targetRR, 'ActualRefRank': actualrefrank}

    # Without any evaluated target there is no 'ActualRefRank' column to summarise
    if not results:
        print("No target run had its actual reference run among the ranked candidates.")
        return

    results = pd.DataFrame(results).T

    if print_stats:
        print(results['ActualRefRank'].describe())

    fig, ax = plt.subplots(dpi=dpi)
    results['ActualRefRank'].hist(bins=30, ax=ax)
    ax.set_title("RRR results for n={}, comparison_num={}".format(n_components, comparison_num))
    ax.set_xlabel("Rank of actual reference run")

    # Vertical marker at rank 10
    ax.plot([10]*10, range(0,10))

    plt.show()

In [None]:
# Features that will be considered in the ranking
features_to_use = [
    'run_number',
    'init_lumi',
#     'recorded_lumi',
#     'energy',
    'end_lumi',
#     'hlt_physics_rate',
#     'fill_number',
#     'initial_prescale_index',
#     'last_lumisection_number',
#     'l1_rate',
#     'hlt_physics_counter',
    'delta_totallumi',
    'temp_dist',
    'delivered_lumi',
    'init_lumi_std',
    'end_lumi_std',
    'pileup_std',
    'init_lumi_mean',
    'end_lumi_mean',
    'pileup_mean'
]

In [None]:
# Trying with temporal features
for i in range(1, 5):
    test_ranking(features[features_to_use], n_components=i, print_stats=True, dpi=100)

In [None]:
# Re-testing, but excluding runs that have low number of LSs
# Flag each run by whether it has more than 700 rows (lumisections) in `lss`
isrunlong = {
    run_id: len(lss[lss['run_number'] == run_id]) > 700
    for run_id in lss['run_number'].unique()
}

In [None]:
# Keep only the runs flagged as long in `isrunlong`.
# Filtering by membership (instead of set_index('run_number') + boolean Series) keeps
# the 'run' index that the evaluation loop below reads, and does not depend on
# aligning a mask keyed by run number against the frame's index.
long_runs = [run_id for run_id, is_long in isrunlong.items() if is_long]
features_longruns = features[features['run_number'].astype(int).isin(long_runs)]
features_longruns

In [None]:
# Testing over many target runs to obtain better idea of performance
RRranks = []
results = {}

# Loop over some of the runs that are available.
# Run numbers are read from the `run_number` column, so this works whether or not
# `features_longruns` carries a 'run' index.
for targ in list(features_longruns['run_number'].astype(int).unique())[34:]:
    # Reference run actually used for this target. Kept in a loop-local name so the
    # notebook-level `targetRR` (which later cells pair with `target`) is not overwritten.
    targ_ref = RRfetch(targ, jsonfile='/eos/user/r/rcruzcan/SWAN_projects/RefRunRank/utils/json_allRunsRefRuns.json')

    rankings = rrr.refrank_pca(features_longruns[features_to_use], targ, n_components=1)

    # No candidate runs for this target: nothing to evaluate
    if rankings is None:
        continue

    # Position of the actual reference run in the ranking; empty when that run is
    # not among the ranked candidates
    ref_positions = rankings.index[rankings['run']==targ_ref]
    if len(ref_positions) == 0:
        continue

    actualrefrank = ref_positions[0]
    RRranks.append(actualrefrank)
    results[targ] = {'ActualRef': targ_ref, 'ActualRefRank': actualrefrank}

if results:
    results = pd.DataFrame(results).T
    print(results['ActualRefRank'].describe())

    # Fraction of targets whose actual reference run was ranked at position 10 or better
    ranksyseff = len(results[results['ActualRefRank'] <= 10])/len(results)
    print('Rank system efficiency: ', ranksyseff)

    fig, ax = plt.subplots(dpi=100)
    results['ActualRefRank'].hist(bins=30, ax=ax)

    # Vertical marker at rank 10
    ax.plot([10]*10, range(0,10))

    plt.show()
else:
    # Nothing was evaluated, so there is no 'ActualRefRank' column and no efficiency to compute
    print('No target run had its actual reference run among the ranked candidates.')

# # for i in range(len(features_to_use)):
# rankings = []
# results = {}

# for targ in list(features_longruns['run_number'].unique()):
#     targetRR = RRfetch(targ, jsonfile='/eos/user/r/rcruzcan/SWAN_projects/RefRunRank/utils/json_allRunsRefRuns.json')
#     features_PC = rrr.refrank_pca(features_longruns[features_to_use], targ, n_components=2)
#     actualrefrank = features_PC.index[features_PC['run_number']==targetRR]
#     if len(actualrefrank) > 0:
#         rankings.append(actualrefrank[0])
#         results[targ] = {'ActualRef': targetRR, 'ActualRefRank': actualrefrank[0]}
    
# results = pd.DataFrame(results).T
# print(results['ActualRefRank'].describe())

# ranksyseff = len(results[results['ActualRefRank'] <= 10])/len(results)
# print('Rank system efficiency: ', ranksyseff)

# fig, ax = plt.subplots(dpi=150)
# ax = results['ActualRefRank'].hist(bins=30)

# ax.plot([10]*10, range(0,10))

# plt.show()

Trying with n > 1

In [None]:
# Standardize the selected feature columns; the result gets a fresh default row index
scaler = StandardScaler()
selected_features = features[features_to_use]
features_scaled = pd.DataFrame(
    scaler.fit_transform(selected_features),
    columns=selected_features.columns,
)

In [None]:
# # PCA
# n_components = 1
# pca = PCA(n_components=n_components)
# pca.fit(features_scaled)

# features_PC = pd.DataFrame(pca.transform(features_scaled), columns=['PC'+str(i+1) for i in range(len(pca.components_))])
# features_PC = pd.concat([features['run_number'], features_PC], axis=1).set_index('run_number')

# # Getting distances
# dist = np.sqrt(((features_PC - features_PC.loc[target])**2).sum(axis=1))
# features_PC = pd.concat([dist, features_PC], axis=1)
# features_PC.rename(columns = {0:'dist'}, inplace=True)

# # Sorting by distance
# features_PC = features_PC.sort_values(by='dist', ascending=True).reset_index()

In [None]:
# print('Ranking of RR used: ', features_PC.index[features_PC['run_number']==targetRR][0])

In [None]:
# PCA n=1
pca = PCA(n_components=1)
pca.fit(features_scaled)

# Projecting data to sub-space.
# `features_scaled` has a default 0..N-1 index while `features` is indexed by run, so the
# projected rows are labelled positionally with the run numbers; pd.concat(axis=1) would
# align the two on index labels and produce misaligned, NaN-filled rows.
run_index = pd.Index(features['run_number'].to_numpy(), name='run_number')
features_red1 = pd.DataFrame(pca.transform(features_scaled), columns=['PC1'], index=run_index)

# Getting distances (absolute difference to the target run along PC1)
dist_n1 = (features_red1 - features_red1.loc[target]).abs()
dist_n1.rename(columns = {'PC1':'dist'}, inplace=True)
features_red1 = pd.concat([dist_n1, features_red1], axis=1)

# Sorting by distance; after reset_index the row position is the rank
features_red1 = features_red1.sort_values(by='dist', ascending=True).reset_index()

In [None]:
features_red1

With the rankings (i.e. the index of `features_red1`) for all the runs under consideration, we can get the rank of the reference run that was actually used, which is contained in `targetRR`.

In [None]:
print('Ranking of RR used: ', features_red1.index[features_red1['run_number']==targetRR][0])

We now try with $N=2$

In [None]:
# PCA n=2
pca = PCA(n_components=2)
pca.fit(features_scaled)

# Projecting data to sub-space.
# `features_scaled` has a default 0..N-1 index while `features` is indexed by run, so the
# projected rows are labelled positionally with the run numbers; pd.concat(axis=1) would
# align the two on index labels and produce misaligned, NaN-filled rows.
run_index = pd.Index(features['run_number'].to_numpy(), name='run_number')
features_red2 = pd.DataFrame(pca.transform(features_scaled), columns=['PC1', 'PC2'], index=run_index)

# Computing Euclidean distances to the target run in PC space
dist_n2 = np.sqrt(((features_red2 - features_red2.loc[target])**2).sum(axis=1))
features_red2 = pd.concat([dist_n2, features_red2], axis=1)
# The unnamed distance Series arrives as column 0
features_red2.rename(columns = {0:'dist'}, inplace=True)
# Sorting by distance; after reset_index the row position is the rank
features_red2 = features_red2.sort_values(by='dist', ascending=True).reset_index()

In [None]:
features_red2

In [None]:
print('Ranking of RR used: ', features_red2.index[features_red2['run_number']==targetRR][0])

Trying $N=3$...

In [None]:
# PCA n=3
pca = PCA(n_components=3)
pca.fit(features_scaled)

# Projecting data to sub-space.
# `features_scaled` has a default 0..N-1 index while `features` is indexed by run, so the
# projected rows are labelled positionally with the run numbers; pd.concat(axis=1) would
# align the two on index labels and produce misaligned, NaN-filled rows.
run_index = pd.Index(features['run_number'].to_numpy(), name='run_number')
features_red3 = pd.DataFrame(pca.transform(features_scaled), columns=['PC1', 'PC2', 'PC3'], index=run_index)

# Computing Euclidean distances to the target run in PC space
dist_n3 = np.sqrt(((features_red3 - features_red3.loc[target])**2).sum(axis=1))
features_red3 = pd.concat([dist_n3, features_red3], axis=1)
# The unnamed distance Series arrives as column 0
features_red3.rename(columns = {0:'dist'}, inplace=True)
# Sorting by distance; after reset_index the row position is the rank
features_red3 = features_red3.sort_values(by='dist', ascending=True).reset_index()

In [None]:
features_red3

In [None]:
print('Ranking of RR used: ', features_red3.index[features_red3['run_number']==targetRR][0])