# Chart proxyhistograms

Using `nodetool proxyhistograms` output, create graphs to easily see latency across nodes.

Two modes are available:
- If the output file contains output from only a single run of proxyhistograms (e.g., from a diagnostic tarball), then set `has_timestamps` to `False` to analyze.
- If the output file contains the output of multiple runs, each preceded by a timestamp (e.g., collected periodically by a script), then set `has_timestamps` to `True` to analyze. The expected file format is as follows:

```
10.10.10.10
2024-03-11_11:29:35
proxy histograms
Percentile       Read Latency      Write Latency      Range Latency   CAS Read Latency  CAS Write Latency View Write Latency
                     (micros)           (micros)           (micros)           (micros)           (micros)           (micros)
50%                    790.56              84.95             180.22               0.00               0.00               0.00
75%                   1132.13             155.56             360.45               0.00               0.00               0.00
...
2024-03-11_11:44:35
proxy histograms
Percentile       Read Latency      Write Latency      Range Latency   CAS Read Latency  CAS Write Latency View Write Latency
                     (micros)           (micros)           (micros)           (micros)           (micros)           (micros)
50%                    790.56              84.95             180.22               0.00               0.00               0.00
75%                   1132.13             155.56             360.45               0.00               0.00               0.00
...
```

The timestamp is produced with `date +"%Y-%m-%d_%H:%M:%S"`.

In addition, when timestamps are present, the output file is expected to have the node's IP address on line 1. This allows output files from multiple nodes to be analyzed together.

In [None]:
# GLOBAL VARIABLES
# Directory that holds the proxyhistograms output files; every entry of `files`
# is resolved relative to it when the files are read.
file_path = '/path/to/histogramfiles/'
# Prepare file list by getting the list of proxyhistogram files: `ls | grep proxyhisto | pbcopy`
# Then use Neovim, or sed, to add the single quotes and commas
# Commands:
# - :%s/^/'/g
# - :%s/$\n/',/g
# Names of the files (inside `file_path`) to parse and chart.
files = ['file1','file2']
# True: files use the multi-run format (IP line, then a timestamp line before each table).
# False: each file holds the output of a single proxyhistograms run.
has_timestamps = True

In [None]:
# IMPORTS
import datetime as datetime
import os
import re

import pandas as pd
import plotly.express as px

In [None]:
def parse_timestamp(line):
    """
    Extract the `YYYY-MM-DD_HH:MM:SS` timestamp contained in a line.

    Args:
        line: Text containing a timestamp in the format produced by
            `date +"%Y-%m-%d_%H:%M:%S"`.

    Returns:
        A naive `datetime.datetime` built from the first timestamp found in the line.

    Raises:
        ValueError: If the line contains no timestamp of the expected shape, or
            the matched digits do not form a valid date/time.
    """
    # Match the whole timestamp shape instead of any run of digits/punctuation, so
    # stray characters earlier in the line (e.g. a "ts:" prefix) are not picked up.
    match = re.search(r'\d{4}-\d{1,2}-\d{1,2}_\d{1,2}:\d{1,2}:\d{1,2}', line)
    if match is None:
        raise ValueError("No timestamp found in line: {!r}".format(line))

    return datetime.datetime.strptime(match.group(0), '%Y-%m-%d_%H:%M:%S')

def parse_nodetool_proxyhistograms(output, has_timestamps):
    """
    Parses the output of `nodetool proxyhistograms` into a DataFrame.

    Lines are classified in this order: table headers (`Percentile...`,
    `proxy...`) are skipped, timestamp lines update the current timestamp
    (only when `has_timestamps` is set), lines containing an IPv4-looking
    address update the current IP, and any remaining line that splits into
    exactly seven whitespace-separated fields is treated as a data row.

    Args:
        output: Raw text holding one or more proxyhistograms tables.
        has_timestamps: When True, each row is tagged with the most recently
            seen timestamp and its percentile label is prefixed with the most
            recently seen IP ("<ip> - <percentile>").

    Returns:
        A DataFrame with one row per data line. Columns are
        ["Timestamp", "IP - Percentile", <six latency columns>] when
        `has_timestamps` is True, otherwise ["Percentile", <six latency columns>].
        Latency columns are converted with `pd.to_numeric(errors='coerce')`, so
        non-numeric values become NaN. Input without any data rows yields an
        empty DataFrame that still carries these columns.
    """
    metric_columns = ["Read Latency (micros)", "Write Latency (micros)",
                      "Range Latency (micros)", "CAS Read Latency (micros)",
                      "CAS Write Latency (micros)", "View Write Latency (micros)"]
    # Columns depend only on the mode, so define them up front; this keeps the
    # function from failing when the input contains no "Percentile" header.
    if has_timestamps:
        columns = ["Timestamp", "IP - Percentile"] + metric_columns
    else:
        columns = ["Percentile"] + metric_columns

    ip_pattern = re.compile(r'(([0-9]{1,3}\.){3}[0-9]{1,3})')
    # Recognise timestamp lines by shape rather than by a hard-coded year prefix.
    timestamp_pattern = re.compile(r'\d{4}-\d{1,2}-\d{1,2}_\d{1,2}:\d{1,2}:\d{1,2}')

    data = []
    current_timestamp = ""
    current_ip = ""
    for line in output.strip().splitlines():
        if line.startswith(("Percentile", "proxy")):
            # Table title / header line: carries no data
            continue

        if has_timestamps and timestamp_pattern.match(line):
            current_timestamp = parse_timestamp(line)
            continue

        ip_match = ip_pattern.search(line)
        if ip_match is not None:
            current_ip = ip_match.group(0)
            continue

        parts = line.split()
        if len(parts) != 7:
            # Not a data row (e.g. the "(micros)" units line or a "..." filler)
            continue

        if has_timestamps:
            parts[0] = "{} - {}".format(current_ip, parts[0])
            data.append([current_timestamp] + parts)
        else:
            data.append(parts)

    df = pd.DataFrame(data, columns=columns)
    # Convert numeric columns, handling "NaN" strings and other non-numeric values
    for col in metric_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

In [None]:
# Sample input in the timestamped multi-run format: the node IP on the first
# line, followed by two runs that are each introduced by a timestamp line.
# Not read by the other cells shown here (they load from `files`); it can be
# passed to parse_nodetool_proxyhistograms with has_timestamps=True.
proxyhistograms = """
10.10.10.10
2024-03-11_11:29:35
proxy histograms
Percentile       Read Latency      Write Latency      Range Latency   CAS Read Latency  CAS Write Latency View Write Latency
                     (micros)           (micros)           (micros)           (micros)           (micros)           (micros)
50%                    790.56              84.95             180.22               0.00               0.00               0.00
75%                   1132.13             155.56             360.45               0.00               0.00               0.00
95%                   2070.94             482.06            5242.88               0.00               0.00               0.00
98%                   4643.69             730.26            5767.17               0.00               0.00               0.00
99%                  50331.65            1337.38            5767.17               0.00               0.00               0.00
Min                      0.00               0.00               0.00               0.00               0.00               0.00
Max                  83886.08           25165.82            6291.46               0.00               0.00               0.00
2024-03-13_15:53:05
proxy histograms
Percentile       Read Latency      Write Latency      Range Latency   CAS Read Latency  CAS Write Latency View Write Latency
                     (micros)           (micros)           (micros)           (micros)           (micros)           (micros)
50%                    790.56              84.95             180.22               0.00               0.00               0.00
75%                   1132.13             155.56             360.45               0.00               0.00               0.00
95%                   2070.94             482.06            5242.88               0.00               0.00               0.00
98%                   4643.69             730.26            5767.17               0.00               0.00               0.00
99%                  50331.65            1337.38            5767.17               0.00               0.00               0.00
Min                      0.00               0.00               0.00               0.00               0.00               0.00
Max                  83886.08           25165.82            6291.46               0.00               0.00               0.00
"""

In [None]:
# Sample input for the single-run format: one proxyhistograms table with no
# IP or timestamp lines. Not read by the other cells shown here; it can be
# passed to parse_nodetool_proxyhistograms with has_timestamps=False.
proxyhistogram = """
proxy histograms
Percentile       Read Latency      Write Latency      Range Latency   CAS Read Latency  CAS Write Latency View Write Latency
                     (micros)           (micros)           (micros)           (micros)           (micros)           (micros)
50%                    790.56              84.95             180.22               0.00               0.00               0.00
75%                   1132.13             155.56             360.45               0.00               0.00               0.00
95%                   2070.94             482.06            5242.88               0.00               0.00               0.00
98%                   4643.69             730.26            5767.17               0.00               0.00               0.00
99%                  50331.65            1337.38            5767.17               0.00               0.00               0.00
Min                      0.00               0.00               0.00               0.00               0.00               0.00
Max                  83886.08           25165.82            6291.46               0.00               0.00               0.00
"""

In [None]:
# Parse every file listed in `files` and stack the results into one DataFrame `df`.
frames = []

for file in files:
    # Read the file contents; os.path.join works whether or not `file_path`
    # ends with a path separator.
    with open(os.path.join(file_path, file), 'r') as open_file:
        nodetool_output = open_file.read()

    current_df = parse_nodetool_proxyhistograms(nodetool_output, has_timestamps)
    frames.append(current_df)

# Concatenate once instead of re-copying the accumulated frame on every
# iteration; fall back to an empty frame when `files` is empty.
df = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Draw one line chart per latency column of `df`.
# With timestamps: x-axis is time and each "IP - Percentile" value gets its own line.
# Without timestamps: x-axis is the percentile label of the single run.
if has_timestamps:
    line_options = {"x": "Timestamp", "color": "IP - Percentile"}
else:
    line_options = {"x": "Percentile"}

latency_metrics = (
    "Read Latency (micros)",
    "Write Latency (micros)",
    "Range Latency (micros)",
    "CAS Read Latency (micros)",
    "CAS Write Latency (micros)",
    "View Write Latency (micros)",
)

for metric in latency_metrics:
    chart_title = "Proxy Histograms - {}".format(metric)
    fig = px.line(df, y=metric, title=chart_title, **line_options)
    fig.show()