In [None]:
import os
import json
import sys
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, FuncFormatter


In [None]:
# Lift every pandas display limit so wide/long DataFrames are rendered in full.
# Bounded alternatives that were tried: display.width=1000, display.max_colwidth=150.
# To go back to the pandas defaults, call pd.reset_option(<option name>) for
# 'display.max_columns', 'display.max_rows' and 'display.max_colwidth'.
for _display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

# gprof

In [None]:
def parse_gprof_report(filepath, cpu_freq=4.7):
    """Parse the flat-profile table of a gprof report into a DataFrame.

    The table is read from the text between a line ending in '=====' and the
    next line starting with '====='. Each non-empty line in between becomes
    one row.

    Parameters
    ----------
    filepath : str
        Path of the report file to read.
    cpu_freq : float, default 4.7
        Clock frequency in cycles per nanosecond (i.e. GHz) used to convert
        the ns/call columns into cycles/call.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: '% time', 'cumulative seconds', 'self seconds',
        'calls', 'self ns/call', 'total ns/call', 'self cycles/call',
        'total cycles/call', 'name'. Values that cannot be parsed as numbers
        become NaN/<NA>.

    Raises
    ------
    ValueError
        If `cpu_freq` is not positive or the opening '=====' delimiter is
        missing from the file.
    """
    if cpu_freq <= 0:
        raise ValueError(f"cpu_freq must be positive, got {cpu_freq!r}")

    with open(filepath, 'r') as file:
        content = file.read()

    # Find the table portion between the delimiters
    delimiter = '====='
    opening = content.find(delimiter + '\n')
    if opening == -1:
        # Without this check find() returns -1 and an arbitrary slice of the
        # file would be parsed as if it were the table.
        raise ValueError(f"no '{delimiter}' table delimiter found in {filepath!r}")
    start_idx = opening + len(delimiter) + 1  # Skip the first delimiter line
    # Start one character early so the newline that ends the opening delimiter
    # can also begin the closing one (empty table).
    end_idx = content.find('\n' + delimiter, start_idx - 1)
    if end_idx == -1:
        # No closing delimiter: the table runs to the end of the file.
        end_idx = len(content)
    table_content = content[start_idx:end_idx]

    # Prepare the DataFrame columns
    columns = ['% time', 'cumulative seconds', 'self seconds', 'calls', 'self ns/call', 'total ns/call', 'name']
    data = []

    for line in table_content.split('\n'):
        tokens = line.split()
        if not tokens:  # Ensure the line is not empty
            continue
        if len(tokens) >= 7 and tokens[3].isdigit():
            # Full row: six numeric columns, the rest of the line is the name
            # (which may itself contain spaces).
            row = line.split(None, 6)
        elif len(tokens) > 3:
            # Row without calls / per-call columns (e.g. main()): everything
            # after the three time columns is the name. Checking that the
            # fourth token is a call count keeps multi-word names from being
            # mistaken for numeric columns.
            row = tokens[:3] + [None, None, None] + [' '.join(tokens[3:])]
        else:
            # Short line: pad the missing columns with None.
            row = tokens + [None] * (7 - len(tokens))
        data.append(row)

    # Convert list to DataFrame
    df = pd.DataFrame(data, columns=columns)

    # Handle data types: everything except 'name' is numeric
    for col in columns[:-1]:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df['calls'] = df['calls'].astype('Int64')

    # cycles = ns / period, truncated towards zero; missing values stay missing
    cpu_period = 1 / cpu_freq
    for scope in ('self', 'total'):
        cycles = df[f'{scope} ns/call'] / cpu_period
        df[f'{scope} cycles/call'] = cycles.apply(lambda x: int(x) if not pd.isna(x) else x).astype('Int64')

    # move column 'name' to the end
    cols = [c for c in df.columns if c != 'name']
    df = df[cols + ['name']]

    return df

In [None]:
#filepath = 'analysis.gprof'
#df = parse_gprof_report(filepath)
#df = df.sort_values(by='self ns/call', ascending=False)
#df.head(12)

In [None]:
#filepath = '/home/alek/dev/ama-riscv-sim/src/analysis_alu-hash_1_no-exec-check.gprof'
#df = parse_gprof_report(filepath)
#df = df.sort_values(by='self ns/call', ascending=False)
#df.head(12)

# JSON log profiling

# test bit

In [None]:
def _test_paths(run_name, dump_name):
    """Return the (JSON profiler log, trace, disassembly dump) paths of one test.

    `run_name` is the file-name stem used under ../src, `dump_name` the
    '<test dir>/<binary>' part of the dump path under ../sw/baremetal.
    """
    return (
        f"../src/{run_name}_inst_profiler.json",
        f"../src/{run_name}_trace.bin",
        f"../sw/baremetal/{dump_name}.dump",
    )


t_json_log_1, t_trace_1, t_rv_dasm_1 = _test_paths("prime_numbers_prime_numbers", "prime_numbers/prime_numbers")
t_json_log_2, t_trace_2, t_rv_dasm_2 = _test_paths("vector_mac_vm_uint8", "vector_mac/vm_uint8")
t_json_log_3, t_trace_3, t_rv_dasm_3 = _test_paths("asm_test_asm_test", "asm_test/asm_test")
t_json_log_4, t_trace_4, t_rv_dasm_4 = _test_paths("fibonacci_n_18", "fibonacci/n_18")
t_json_log_5, t_trace_5, t_rv_dasm_5 = _test_paths("factorial_n_20", "factorial/n_20")
t_json_log_6, t_trace_6, t_rv_dasm_6 = _test_paths("gcd_lcm_gcd_lcm", "gcd_lcm/gcd_lcm")

# vector_basic variant under test (numeric format / operation); "int16" was the
# previously selected format.
nf = "float32"
op = "add"
t_json_log_7, t_trace_7, t_rv_dasm_7 = _test_paths(f"vector_basic_{op}_{nf}", f"vector_basic/{op}_{nf}")

t_json_log_8, t_trace_8, t_rv_dasm_8 = _test_paths("uart_loopback_uart_loopback", "uart_loopback/uart_loopback")

# sorting variant under test; nf/op keep these values after the cell has run
nf = "uint32"
op = "quick"
t_json_log_9, t_trace_9, t_rv_dasm_9 = _test_paths(f"sorting_{op}_{nf}", f"sorting/{op}_{nf}")

In [None]:
# Instruction groups passed to --highlight: mnemonics inside a group are
# comma-separated, groups are separated by a single space.
_highlight_groups = (
    ("lw", "lh", "lb", "lhu", "lbu", "sw", "sh", "sb"),
    ("bne", "beq", "blt", "bge", "bgeu", "bltu"),
    ("jal", "jalr"),
)
hl_test = " ".join(",".join(group) for group in _highlight_groups)
#hl_test = "sb bne jal,jalr"

# JSON file handed to perf_est.py as its second argument
hw_perf_metrics_path = "hw_perf_metrics.json"


In [None]:
#%run analyze_profiling_log.py -i {t_json_log_4} --highlight {hl_test}
# Active invocation: run analyze_profiling_log.py on the test-9 trace (-t) with
# its disassembly dump (--dasm), the hl_test highlight groups, a PC time-series
# limit of 60000 and --save_csv. The commented lines around it are alternative
# argument sets kept for reference.
%run analyze_profiling_log.py -t {t_trace_9} \
                           --dasm {t_rv_dasm_9} --highlight {hl_test} --pc_time_series_limit 60000 --save_csv
#                           --silent --symbols_only --save_symbols
#                           --pc_begin 0x80000094 --pc_end 0x800000ec
#                           --dasm {t_rv_dasm_2} --pc_time_series_limit 60000
#                           --dasm {t_rv_dasm_2} --highlight {hl_test}  --symbols_only

#%run analyze_profiling_log.py -i {t_json_log_2} --highlight {hl_test} --estimate_perf
#%run analyze_profiling_log.py -i {t_json_log_1} --highlight {hl_test}
#%run analyze_profiling_log.py -i {t_json_log_1} --highlight "addi add" --estimate_perf
#%run analyze_profiling_log.py -i {t_json_log_1} --highlight "bne,beq,blt,bge,bgeu jal" --estimate_perf
#%run analyze_profiling_log.py --inst_dir ../test/temp_prof/ --highlight {hl_test} --estimate_perf

#%run analyze_profiling_log.py -p {t_trace_2} --dasm {t_rv_dasm_2} --pc_time_series_limit 171000
#%run analyze_profiling_log.py -p {t_trace_2}

#%run analyze_profiling_log.py -p {t_trace_2} --dasm {t_rv_dasm_2}
#%run analyze_profiling_log.py -p {t_trace_2} --highlight {hl_test}
#%run analyze_profiling_log.py -p {t_trace_2} --dasm {t_rv_dasm_2} --highlight "bne,beq,blt,bge,bgeu,bltu jal,jalr"

#%run analyze_profiling_log.py -p {t_trace_2} --dasm {t_rv_dasm_2} --save_png --silent --save_pdf --save_csv --save_svg
#%run ../script/analyze_profiling_log.py --inst_dir ../test --highlight {hl_test} --allow_zero --combined_only --estimate_perf --save_png --save_pdf --save_csv
#%run analyze_profiling_log.py -i {t_json_log_1} {t_json_log_2} --highlight {hl_test} --estimate_perf

#%run analyze_profiling_log.py -p {t_trace_1} --dasm {t_rv_dasm_1} --save_pdf --silent
#%run analyze_profiling_log.py -p {t_trace_1} {t_trace_2}
#%run analyze_profiling_log.py --pc_dir ../src


In [None]:
# Run perf_est.py on the test-9 JSON profiler log, passing the HW performance
# metrics file as the second argument.
# NOTE(review): the output stored below names a different test than
# t_json_log_9 currently points to, so it may be stale — re-run to confirm.
%run perf_est.py {t_json_log_9} {hw_perf_metrics_path}

vector_basic_add_float32_inst_profiler.json
Branches total: 620 out of 3641 total instructions (17.0% branches)
    Taken: 355, Forwards: 221, Backwards: 134
    Not taken: 265, Forwards: 143, Backwards: 122
    Predicted: 277, Mispredicted: 343, Accuracy: 44.7%
    Cycles: Original/With prediction: 4475/4198 (277 cycles saved)
Potential app speedup: 6.2%
Estimated HW performance at 125MHz with 4475 cycles executed: CPI=1.23, exec time=35.8us, MIPS=101.7
Estimated HW performance at 125MHz with 4198 cycles executed: CPI=1.15, exec time=33.6us, MIPS=108.4

# complete analysis

In [None]:
# Run analysis_all.py with "sorting" as its first argument.
# NOTE(review): %run forwards everything after the script name to the script as
# arguments and does not appear to perform shell-style redirection, so
# "> sorting_analysis.log" is probably received by the script instead of
# capturing its output — TODO confirm (alternatives: a %%capture cell, or
# `!python analysis_all.py sorting > sorting_analysis.log`).
%run analysis_all.py sorting > sorting_analysis.log
# produces csvs also in the ../src (or wherever the source traces are)

## prime bool
prime_numbers_prime_numbers_inst_profiler.json
Branches total: 11333 out of 64608 total instructions (17.5% branches)
    Taken: 11036, Forwards: 117, Backwards: 10919
    Not taken: 297, Forwards: 194, Backwards: 103
    Predicted: 11113, Mispredicted: 220, Accuracy: 98.1%
    Cycles: Original/With prediction: 76057/64944 (11113 cycles saved)
Potential app speedup: 14.6%
Estimated HW performance at 125MHz with 76057 cycles executed: CPI=1.18, exec time=608.5us, MIPS=106.2
Estimated HW performance at 125MHz with 64944 cycles executed: CPI=1.01, exec time=519.6us, MIPS=124.4

## O0
factorial_factorial_inst_profiler.json
Branches total: 985 out of 5279 total instructions (18.7% branches)
    Taken: 763, Forwards: 329, Backwards: 434
    Not taken: 222, Forwards: 194, Backwards: 28
    Predicted: 628, Mispredicted: 357, Accuracy: 63.8%
    Cycles: Original/With prediction: 6369/5741 (628 cycles saved)
Potential app speedup: 9.9%
Estimated HW performance at 125MHz with 6369 cycles executed: CPI=1.21, exec time=51.0us, MIPS=103.6
Estimated HW performance at 125MHz with 5741 cycles executed: CPI=1.09, exec time=45.9us, MIPS=114.9

## O1
factorial_factorial_inst_profiler.json
Branches total: 983 out of 4880 total instructions (20.1% branches)
    Taken: 762, Forwards: 329, Backwards: 433
    Not taken: 221, Forwards: 194, Backwards: 27
    Predicted: 627, Mispredicted: 356, Accuracy: 63.8%
    Cycles: Original/With prediction: 5966/5339 (627 cycles saved)
Potential app speedup: 10.5%
Estimated HW performance at 125MHz with 5966 cycles executed: CPI=1.22, exec time=47.7us, MIPS=102.2
Estimated HW performance at 125MHz with 5339 cycles executed: CPI=1.09, exec time=42.7us, MIPS=114.3

## O2
factorial_factorial_inst_profiler.json
Branches total: 629 out of 2581 total instructions (24.4% branches)
    Taken: 440, Forwards: 170, Backwards: 270
    Not taken: 189, Forwards: 155, Backwards: 34
    Predicted: 425, Mispredicted: 204, Accuracy: 67.6%
    Cycles: Original/With prediction: 3281/2856 (425 cycles saved)
Potential app speedup: 13.0%
Estimated HW performance at 125MHz with 3281 cycles executed: CPI=1.27, exec time=26.2us, MIPS=98.3
Estimated HW performance at 125MHz with 2856 cycles executed: CPI=1.11, exec time=22.8us, MIPS=113.0

# GCD

In [None]:
# Product of prime powers, equal to 3 * 5**4 * 17**4; displayed as the cell result.
# NOTE(review): presumably an operand for the gcd_lcm test — TODO confirm.
17**3 * 5**2 * 17 * 5 * 3*5

In [None]:
# Product of prime powers 17**3 * 2**5 * 3; displayed as the cell result.
# NOTE(review): presumably the second gcd_lcm operand — TODO confirm.
17**3 * 2**5 * 3

# vector basic O0 vs O2 compare

In [None]:
# Pair of counts per vector_basic test, keyed by operation and numeric format.
# NOTE(review): presumably (O0, O2) executed-instruction counts, going by the
# section title — TODO confirm against the profiler logs.
_vector_basic_counts = {
    "add": {
        "float32": (4990, 3641), "float64": (6103, 4822),
        "int16": (1903, 646), "int32": (1839, 646), "int64": (2415, 1030), "int8": (1839, 646),
        "uint16": (1903, 646), "uint32": (1839, 646), "uint64": (2415, 1030), "uint8": (1839, 646),
    },
    "div": {
        "float32": (28230, 26881), "float64": (75330, 74049),
        "int16": (3700, 2348), "int32": (4263, 2943), "int64": (26787, 25403), "int8": (3657, 2369),
        "uint16": (3747, 2428), "uint32": (4138, 2818), "uint64": (25481, 24097), "uint8": (3715, 2460),
    },
    "mul": {
        "float32": (15759, 14410), "float64": (50037, 48756),
        "int16": (6186, 4834), "int32": (6125, 4805), "int64": (18694, 17310), "int8": (5425, 4137),
        "uint16": (4764, 3381), "uint32": (4734, 3414), "uint64": (12671, 11287), "uint8": (3314, 1995),
    },
    "sub": {
        "float32": (5160, 3811), "float64": (5910, 4629),
        "int16": (1903, 646), "int32": (1839, 646), "int64": (2415, 1030), "int8": (1839, 646),
        "uint16": (1903, 646), "uint32": (1839, 646), "uint64": (2415, 1030), "uint8": (1839, 646),
    },
}

# Relative reduction (first - second) / first for every "<op>_<format>" test,
# printed as a percentage with one blank line between operations.
_vector_basic_reduction = {}
for _group_idx, (_op_name, _per_format) in enumerate(_vector_basic_counts.items()):
    if _group_idx:
        print()
    for _format_name, (_first, _second) in _per_format.items():
        _test_key = f"{_op_name}_{_format_name}"
        _vector_basic_reduction[_test_key] = (_first - _second)/_first
        print(f"{_test_key} {_vector_basic_reduction[_test_key]*100:.01f}%")

# Keep the individual names (add_float32, div_int8, ...) available for any cell
# that refers to them directly.
globals().update(_vector_basic_reduction)


# branch acc analysis

In [None]:
# Load every matching *_perf_est.csv from perf_est_dir into one DataFrame,
# drop the constant HW-parameter columns, shorten the test names and order the
# rows by operation and numeric format.
perf_est_dir = "../src"
# File-name pattern selecting the tests to load; other patterns used so far:
#perf_est_ext = "sorting*_perf_est.csv"
#perf_est_ext = "*_perf_est.csv"
perf_est_ext = "vector*_perf_est.csv"
# glob for all perf_est_ext files in perf_est_dir
perf_est_pattern = f'{perf_est_dir}/*{perf_est_ext}'
perf_est_files = sorted(glob.glob(perf_est_pattern))
if not perf_est_files:
    # pd.concat() on an empty list only reports "No objects to concatenate";
    # name the pattern so the missing input is obvious.
    raise FileNotFoundError(f"no perf_est CSV files match {perf_est_pattern!r}")

# create a single df from all perf_est_ext files
df = pd.concat([pd.read_csv(f) for f in perf_est_files], ignore_index=True)
df = df.drop(columns=["mispredict_penalty", "prediction_resolution", "pipeline_latency", "cpu_frequency_mhz", "cpu_period"])
# regex=False: the suffix contains '.', which must be matched literally
# regardless of the pandas version's default for `regex`.
df['name'] = df['name'].str.replace("_inst_profiler.json", "", regex=False)
display(df.head())

# Sort by operation (second-to-last '_' field of the name, in op_order, reversed)
# and then by numeric format (last field). The helper columns are temporary.
op_order = ['add', 'sub', 'mul', 'div', 'bubble', 'insertion', 'selection', 'merge', 'quick', 'heap']
name_fields = df['name'].str.split('_')
df['op_sort'] = pd.Categorical(name_fields.str[-2], op_order)
df['nf'] = name_fields.str[-1]
df = df.sort_values(by=['op_sort', 'nf'], ascending=[False, True])
df = df.drop(columns=['op_sort', 'nf'])

# instructions that are not branches
df['non_b_inst'] = df.inst_total - df.b_inst

df.head()

In [None]:
# Inspect the dtype pandas assigned to each column of df (shown as the cell output)
df.dtypes

In [None]:
# Shared colour palette; both maps below assign these colours in the same order.
# Other colours tried for "mul": "#f4a261" (orange) and "#3d91dc" (blue).
_palette = {
    "turquoise": "#3ECCBB",
    "peach yellow": "#EED595",
    "gray": "#979797",
    "red": "#e76f51",
    "purple": "#9c86ae",
    "blue": "#3d91dc",
}

# Bar colour per operation name
hcl_mac = {
    key: _palette[color]
    for key, color in (
        ("add", "turquoise"),
        ("sub", "peach yellow"),
        ("mul", "gray"),
        ("div", "red"),
        ("root", "purple"),
    )
}

# Bar colour per sorting algorithm name
hcl_sort = {
    key: _palette[color]
    for key, color in (
        ("bubble", "turquoise"),
        ("insertion", "peach yellow"),
        ("selection", "gray"),
        ("merge", "red"),
        ("quick", "purple"),
        ("heap", "blue"),
    )
}

In [None]:
# List the columns available in df (shown as the cell output)
df.columns

In [None]:
# One horizontal bar chart per metric group, one bar (or pair of bars) per test.
# A group with two metrics is drawn as two thinner bars side by side; the second
# series is marked with a dotted hatch.

perc_metrics = ['acc', 'branches_perc', 'speedup']  # metrics shown on a 0-100 % axis
metric_groups = [['acc'], ['total_cycles', 'saved_cycles'], ['speedup'], ['non_b_inst', 'b_inst'], ['branches_perc'], ['original_cpi', 'new_cpi']]
test_names = df['name'].tolist()
y_axis_numeric = np.arange(df.index.size)

for metric in metric_groups: # TODO: bar chart combined
    paired = len(metric) == 2
    # paired bars are half as thick and the figure twice as tall
    factor = 1 if paired else 2
    offset = [0.25, -0.25] if paired else [0]
    fig, ax = plt.subplots(figsize=(12, df.index.size/(2*factor)))
    box = [
        ax.barh(y_axis_numeric-o, df[m], height=0.45*factor)
        for m, o in zip(metric, offset)
    ]

    # annotate the y-axis with the test names
    ax.set_yticks(y_axis_numeric)
    ax.set_yticklabels(test_names)

    label = metric[0] if len(metric) == 1 else f"{metric[0]} vs {metric[1]}"
    ax.set_xlabel(label)
    ax.set_title(label)
    #ax.set_xscale('log')

    fmt = '%.0f'
    # fix the x-axis to 0-100 for percentage metrics
    if metric[0] in perc_metrics:
        ax.set_xlim(0, 100)
        ax.xaxis.set_major_locator(MultipleLocator(10))
        ax.set_xlabel(f"{label} (%)")
        # %-style string instead of a callable: same labels, and also accepted
        # by matplotlib versions older than 3.7
        fmt = '%.1f%%'

    if "cpi" in label:
        fmt = '%.2f'

    ax.set_ylabel("Test")
    ax.margins(y=0.01)
    ax.grid(axis='x')
    for series_idx, bars in enumerate(box):
        ax.bar_label(bars, padding=3, fmt=fmt)
        for rect, test_name in zip(bars, test_names):
            if not ("vector_basic" in test_name or "sorting" in test_name):
                continue
            # names end in "..._<operation>_<numeric format>"; dedicated names
            # so the notebook-level `op`/`nf` are not overwritten
            name_fields = test_name.split('_')
            bar_op = name_fields[-2]
            bar_nf = name_fields[-1]
            if bar_op in hcl_mac:
                rect.set_color(hcl_mac[bar_op])
            elif bar_op in hcl_sort:
                rect.set_color(hcl_sort[bar_op])

            # won't actually do anything alone, but hatch will be black, if added
            rect.set_edgecolor("black")
            rect.set_linewidth(0.0)

            #if bar_nf.startswith("int"):
            #    rect.set_hatch("//")
            if "float" in bar_nf:
                rect.set_alpha(0.7)

            # second series of a pair
            if series_idx == 1:
                rect.set_hatch("...")

    plt.show()

In [None]:
# Mean of every numeric column of df, taken from the 'mean' row of describe()
# (shown as the cell output)
df.describe().loc['mean']

In [None]:
# Column means over the tests whose name contains "float"
is_float_test = df['name'].str.contains("float")
float_means = df[is_float_test].describe().loc['mean']


In [None]:
# Column means over the tests whose name contains "int" (a substring match, so
# "uint..." names are included as well)
is_int_test = df['name'].str.contains("int")
int_means = df[is_int_test].describe().loc['mean']

In [None]:
# Horizontal bars comparing float_means and int_means metric by metric, on a
# logarithmic x-axis.
fig, ax = plt.subplots(figsize=(12, 10))
bar_width = 0.35
y_axis_numeric = np.arange(float_means.size)

# float bars sit just above each tick, int bars just below
bar1, bar2 = (
    ax.barh(y_axis_numeric + shift, means, bar_width, label=tag)
    for shift, means, tag in ((.2, float_means, 'float'), (-.2, int_means, 'int'))
)

ax.set_yticks(y_axis_numeric)
ax.set_yticklabels(float_means.index)
ax.set_ylabel('Mean')
# set x axis as log
ax.set_xscale('log')
# full-strength major grid lines, thinner and lighter minor ones
for which, line_style in (('major', {}), ('minor', {'linewidth': 0.4, 'alpha': 0.8})):
    ax.grid(axis='x', which=which, **line_style)
ax.legend()


In [None]:
# Print "<metric>: <float mean> vs <int mean>" for each entry, pairing the two
# Series by position and labelling with the float_means index.
for metric_name, float_mean, int_mean in zip(float_means.index, float_means, int_means):
    print(f"{metric_name}: {float_mean:.2f} vs {int_mean:.2f}")

# GS linear

In [None]:
# Two sample points per curve; the x values are offset by 8.
# Earlier three-point variant:
#   x = np.array([9, 19, 47]) + 8
#   y1 = [0.3, 0.2, 0.1]
#   y2 = [0.85, 0.9, 0.93]
x = np.array([9, 47]) + 8
y1 = [0.3, 0.08]
y2 = [0.85, 0.94]

# one panel per curve, each with a grid
fig, ax = plt.subplots(1,2, figsize=(7,3))
for panel, series, tag in zip(ax, (y1, y2), ('No exec check', 'Exec check')):
    panel.plot(x, series, label=tag)
    panel.grid()

In [None]:
prec = 4  # decimals kept for slopes, intercepts and evaluated values


def _rounded_line(xs, ys):
    """Slope and intercept of the line through the first two points.

    The slope is rounded to `prec` decimals first; the intercept is then
    computed from the rounded slope and rounded as well.
    """
    slope = round((ys[1] - ys[0]) / (xs[1] - xs[0]),prec)
    intercept = round(ys[0] - slope * xs[0],prec)
    return slope, intercept


m1, b1 = _rounded_line(x, y1)
m2, b2 = _rounded_line(x, y2)

for val in (m1, b1, m2, b2):
    print(f"{val}")

# Define the linear functions
def linear_y1(x):
    """Evaluate the y1 line at x, rounded to `prec` decimals."""
    return round(m1 * x + b1,prec)

def linear_y2(x):
    """Evaluate the y2 line at x, rounded to `prec` decimals."""
    return round(m2 * x + b2,prec)

# Example usage:
for x_val in (20, 30, 40, 50):
    print(f"For x = {x_val}, y1 = {linear_y1(x_val)}, y2 = {linear_y2(x_val)}")

In [None]:
# Define the linear functions
# These unrounded versions replace any linear_y1/linear_y2 defined earlier in
# the notebook; m1, b1, m2, b2 are looked up when the functions are called.
def linear_y1(x):
    """Evaluate the line m1 * x + b1."""
    return m1 * x + b1

def linear_y2(x):
    """Evaluate the line m2 * x + b2."""
    return m2 * x + b2

# Example usage:
# x_val rather than x as the loop variable: a `for x in ...` loop would leave
# the notebook-level array `x` rebound to 50 and break re-running the fit cell.
for x_val in [20, 30, 40, 50]:
    print(f"For x = {x_val}, y1 = {linear_y1(x_val):.2f}, y2 = {linear_y2(x_val):.2f}")

In [None]:
# Hard-coded (slope, intercept) pairs; they override the values computed above
# for any later call of linear_y1 / linear_y2.
m1, b1 = -0.005, 0.385
m2, b2 = 0.002, 0.816