In [1]:
%matplotlib inline
import json
import collections
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from statsmodels.nonparametric.kernel_regression import KernelReg

import seaborn as sns
import matplotlib.patches as mpatches
from matplotlib.ticker import FuncFormatter
from matplotlib import font_manager

from scipy.stats import gaussian_kde

In [2]:
# Set your directories accordingly
# Please adjust the following paths as needed
data_dir = "/mnt/hdd1/chenyang/benchmark_data/matrix_resource/data-all" 
# mat_dir = "/u/3/yibo/exp-result/0808-old/solver-mat" # mat data
mat_dir = "/mnt/hdd1/chenyang/benchmark_data/matrix_resource/solver-mat-0906"
result_dir = "/u/1/chenyang/benchmark_data/exp-result-yibo/20250617"  # experiment metrics apart from mat
save_dir = "/u/1/chenyang/benchmark/analysis/output_test"
# solver_list = ["Pardiso", "Trilinos", "Trilinos-nullspace", "AMGCL", "Hypre"]
# solver_list = ["Pardiso", "AMGCL", "Hypre"]
solver_list = ["Hypre", "AMGCL", "Eigen::PardisoLDLT"]

if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
# solver_list = [solver_list[2]]
# print(solver_list)

### Extract data

In [3]:
def get_mat_sz(fp):
    try:
        # the first 8 values are: dim, is_spd, is_sequence, nrow, ncol, nnz, outer_sz, inner_sz
        # nnz, number of non-zero element
        meta = np.fromfile(fp, dtype=np.int32, count=8, offset=0)
        n_rows = meta[3]
        return n_rows
    except:
        print("mat size exception")
        return None

def get_nnz(fp):
    try:
        meta = np.fromfile(fp, dtype=np.int32, count=8, offset=0)
        nnz = meta[5]
        return nnz
    except:
        print("nnz exception")
        return None

def get_density(nnz, mat_sz):
    if nnz is None or mat_sz is None:
        return None
    else:
        nnz =  np.float128(nnz)
        mat_sz =  np.float128(mat_sz)
        return (nnz + 1e-10) / (mat_sz * mat_sz + 1e-10)

def get_sparsity(nnz, mat_sz):
    if nnz is None or mat_sz is None:
        return None
    else:
        nnz =  np.float128(nnz)
        mat_sz =  np.float128(mat_sz)
        return (mat_sz * mat_sz - nnz) / (mat_sz * mat_sz)

def extract_data(fname):
    if not os.path.exists(fname):
        return None
    time_li = []
    mem_li = []
    with open(fname, 'r') as f:
        for line in f.readlines():
            time = float(line.split(" ")[4])
            mem = float(line.split(" ")[5])
            time_li.append(time)
            mem_li.append(mem)
    if len(time_li) == 0:
        return None
    time_avg = sum(time_li) / len(time_li)
    mem_avg = sum(mem_li) / len(mem_li)
    return {"time": time_avg, "mem": mem_avg}    

def extract_data_trilinos(fname):
    if not os.path.exists(fname):
        return None
    time_li = []
    mem_li = []
    with open(fname, 'r') as f:
        for line in f.readlines():
            if "[BOS]" in line:
                time = float(line.split(" ")[0].strip("[BOS]").strip("[EOS]"))
                mem = float(line.split(" ")[1])
                time_li.append(time)
                mem_li.append(mem)
    if len(time_li) == 0:
        return None
    time_avg = sum(time_li) / len(time_li)
    mem_avg = sum(mem_li) / len(mem_li)
    return {"time": time_avg, "mem": mem_avg}


In [4]:
def sequence_data(fname):
    if not os.path.exists(fname):
        print(fname)
        return None    
    seq = []
    with open(fname, 'r') as f:
        for line in f.readlines():
            try:
                time = float(line.split(" ")[4])
                mem = float(line.split(" ")[5])
                seq.append([time, mem])
            except:
                seq.append([-1, -1])
                continue
    if len(seq) == 0:
        return None
    return seq

def sequence_data_trillinos(fname):
    if not os.path.exists(fname):
        return None    
    seq = []
    with open(fname, 'r') as f:
        for line in f.readlines():
            if "[BOS]" in line:
                time = float(line.split(" ")[0].strip("[BOS]").strip("[EOS]"))
                mem = float(line.split(" ")[1])
                seq.append([time, mem])
            elif "Aborted (core dumped)" in line:
                seq.append([-1, -1])

    if len(seq) == 0:
        return None
    return seq

def sequence_data_hypre(fname):
    if not os.path.exists(fname):
        return None    
    seq = []
    with open(fname, 'r') as f:
        for line in f.readlines():
            if "hypre error" in line:
                seq.append([-1, -1])
                continue
            if "hypre" not in line:
                try:
                    time = float(line.split(" ")[4])
                    mem = float(line.split(" ")[5])
                    seq.append([time, mem])
                except:
                    continue

    if len(seq) == 0:
        return None
    return seq

def check_bin_log(bin_path, log_file):
    start_i = 1
    start_j = 1
    end_i = 10000
    end_j = 10000

    
#     if os.path.exists(log_file):
#         with open(log_file,"r")as f:
#             last_line=f.readlines()[-1]
#             last_i=int(last_line.split(" ")[1])
#             last_j=int(last_line.split(" ")[2])
#         start_i=last_i+1
#         start_j=last_j+1
    
    search_i = 1
    while(search_i < 10000):
        if os.path.exists(os.path.join(bin_path,"%d_%d_A.bin"%(search_i,1))):
            break
        search_i += 1

    start_i = search_i

    seq = []    
    i = start_i
    j = start_j
    fp_a=os.path.join(bin_path,"%d_%d_A.bin"%(i,j))
    fp_b=os.path.join(bin_path,"%d_%d_b.bin"%(i,j))
    
    while(os.path.exists(fp_a) and i < end_i):
        while(os.path.exists(fp_a) and j < end_j):
#     while(i < end_i):
#         while(j < end_j):
            if (os.path.exists(fp_a)):
                seq.append([i,j,fp_a, fp_b])
            j += 1
            fp_a=os.path.join(bin_path,"%d_%d_A.bin"%(i,j))
            fp_b=os.path.join(bin_path,"%d_%d_b.bin"%(i,j))
        j = start_j
        i += 1
        fp_a=os.path.join(bin_path,"%d_%d_A.bin"%(i,j))
        fp_b=os.path.join(bin_path,"%d_%d_b.bin"%(i,j))
        
        
    return seq


In [5]:
with open(os.path.join(data_dir, "filenames.json"), "r") as f:
    filenames = json.load(f)
    prenames = filenames["prenames"] # experiment name
    fpaths = filenames["fpaths"] # corresponding json file

    data_all = []
#     for solver in solver_list:
#         data_all[solver] = []

    for fpath, prename in zip(fpaths, prenames):
        
        for solver in solver_list:           
            fp = "%s/%s/%s.log" % (result_dir, solver, prename)
            if solver == "Trilinos" or solver == "Trilinos-nullspace":
                seq_data = sequence_data_trillinos(fp)
            elif solver == "Hypre":
                seq_data = sequence_data_hypre(fp)
            else:
                seq_data = sequence_data(fp)

            if seq_data == None:
                print("no log for this prename: ", prename, solver)
                break
#             print("seq_data lens: ", len(seq_data), "prename: ",prename )
            
            bin_path = os.path.join(mat_dir, prename)
            bin_logged = check_bin_log(bin_path, fp)
#             print("bin_logged lens: ", len(bin_logged), "prename: ",prename )

            if len(seq_data) != len(bin_logged):
                print(solver, prename, "lens mismatching", "seq_data: ", len(seq_data), " bin_logged: ", len(bin_logged))
                print(bin_path)
#                 break
#             else:
#                 print(solver, prename, "lens matching", "seq_data: ", len(seq_data), " bin_logged: ", len(bin_logged))
                
            for [time, mem], bin_file in zip(seq_data , bin_logged):
                single_trial = {}
                single_trial["fpath"] = fpath
                single_trial["prename"] = prename
                single_trial["outer_i"] = bin_file[0]
                single_trial["inner_j"] = bin_file[1]
                single_trial["solver"] = solver
                single_trial["time"] = time * 1000 #unit mili seconds
                single_trial["mem"] = mem
                single_trial["mat_sz"] = get_mat_sz(bin_file[2])
                single_trial["nnz"] = get_nnz(bin_file[2])
                single_trial["density"] = get_density(single_trial["nnz"], single_trial["mat_sz"])
                single_trial["sparsity"] = get_sparsity(single_trial["nnz"], single_trial["mat_sz"])
                data_all.append(single_trial)
                
#         for solver in solver_list:           
#             fp = "%s/%s/%s.log" % (result_dir, solver, prename)
#             if solver == "Trilinos" or solver == "Trilinos-nullspace":
#                 avg_data = extract_data_trilinos(fp)
#             elif solver == "Hypre":
#                 avg_data = extract_data_hypre(fp)
#             else:
#                 avg_data = extract_data(fp)
#             avg_data = extract_data(fp)
#             if avg_data is None:
#                 avg_data = {'time': 0.0, 'mem': 0.0}
#             avg_data["fpath"] = fpath
#             avg_data["prename"] = prename
#             avg_data["time"] = avg_data["time"] * 1000  # convert sec to ms
#             avg_data["mat_sz"] = get_mat_sz(os.path.join(mat_dir, prename, "1_1_A.bin"))
#             print(os.path.join(mat_dir, prename, "1_1_A.bin"))
#             avg_data["nnz"] = get_nnz(os.path.join(mat_dir, prename, "1_1_A.bin"))
#             avg_data["density"] = get_density(avg_data["nnz"], avg_data["mat_sz"])
#             avg_data["sparsity"] = get_sparsity(avg_data["nnz"], avg_data["mat_sz"])
#             data_all[solver].append(avg_data)

    columns = ["fpath",     
               "prename", 
               "outer_i",
               "inner_j",
               "solver",
               "time",
               "mem",
               "mat_sz",
               "nnz",
               "density",
               "sparsity"]

    writer = pd.ExcelWriter(os.path.join(save_dir, 'profiling.xlsx'))
#     for solver in solver_list:
#         df0 = pd.DataFrame(data_all[solver], columns=columns)
#         df0.to_excel(writer, sheet_name=solver)
    df_all = pd.DataFrame(data_all, columns=columns)
    
    columns_log = [
       "time",
       "mem",
       "mat_sz",
       "nnz",
       "density",
       "sparsity"]

    mask = (df_all[columns_log] > 0).all(axis=1)
    df_all_clean = df_all[mask].copy()
    
    for col in columns_log:
        df_all[col + "_log"] = np.log10(df_all_clean[col] + 1e-10)  # 防止 log(0)
#         df_all[col + "_log"] = np.log10(df_all[col].clip(lower=1e-10))
        
    print(df_all[[col + "_log" for col in columns_log]].head())    
    
    df_all.to_excel(writer)
    writer.close()

#     return data_all

no log for this prename:  friction-slope Hypre
AMGCL 3D-card-house lens mismatching seq_data:  798  bin_logged:  888
/mnt/hdd1/chenyang/benchmark_data/matrix_resource/solver-mat-0906/3D-card-house
no log for this prename:  kick Hypre
Hypre sphere-mat lens mismatching seq_data:  807  bin_logged:  809
/mnt/hdd1/chenyang/benchmark_data/matrix_resource/solver-mat-0906/sphere-mat
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz exception
mat size exception
nnz excep

### Plot data, time vs mat_size

In [6]:
print(len(df_all))
# print(df_all.iloc[0])

144385


In [7]:
matrix_size_list = df_now["mat_sz"].unique()
print(matrix_size_list)
print(sorted(matrix_size_list))


# filtered_df = df_now[df_now["mat_sz"] == 10462]
filtered_df = df_now[df_now["mat_sz"] == 131817]


filtered_df = filtered_df[filtered_df["time"]>0]

min_time_row = filtered_df.loc[filtered_df["time"].idxmin()]

max_time_row = filtered_df.loc[filtered_df["time"].idxmax()]

# Show results
print("Row with smallest time:\n", min_time_row)
print(min_time_row["fpath"])
print(min_time_row["time"])
print("\nRow with largest time:\n", max_time_row)
print(max_time_row["fpath"])
print(max_time_row["time"])


print(filtered_df["prename"].unique())
df_solver_select = df_now[df_now["prename"] == 'golf-ball-doformable-wall']
print(df_solver_select["mat_sz"].unique() )

NameError: name 'df_now' is not defined

Use the exp 'golf-ball-doformable-wall' to discover what factors that influence the time.

### Time vs mat size, all solver

In [None]:
df_now = df_all.copy()
# df_filtered = df_now[df_now["mat_sz"] >= 100].copy()
# df_now["mat_sz_log_rounded"] = df_filtered["mat_sz_log"].round(1)


colors = plt.cm.tab10.colors  # 或者 plt.cm.Set1, plt.cm.Paired 等等
solvers = df_now["solver"].unique()


def log_tick_formatter(val, pos=None):
    return r"$10^{%.1f}$" % val

plt.figure(figsize=(8, 6))

for i, solver in enumerate(solvers):
    group = df_now[df_now["solver"] == solver]
    plt.scatter(
        group["mat_sz_log"], group["time_log"],
        alpha=0.7, s=5,
        color=colors[i % len(colors)],
        label=solver
    )
plt.gca().xaxis.set_major_formatter(FuncFormatter(log_tick_formatter))
plt.gca().yaxis.set_major_formatter(FuncFormatter(log_tick_formatter))

plt.xlabel("Matrix Size (log scale)")
plt.ylabel("Time (log scale) ms")
plt.title("Time vs Matrix Size (log-log), 1 threads, wall clock time, (AMGCL not accelerated by CUDA)")
plt.grid(True, which='both', linestyle='--', linewidth=0.5)

plt.legend(title="Solver")
plt.tight_layout()
plt.show()
