# 0. Sequential run 

## Config

In [2]:
input = "llama_3.bin"
steps = "500"


### Running this Notebook

To run this notebook, make sure you've installed the dependencies listed in `requirements.txt`:

```bash
pip install -r requirements.txt
```

You also need to download the model binary:

```bash
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin
```


## Generate measurements

In [1]:
import subprocess,os
import datetime
from pathlib import Path

commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip()
env = os.environ.copy()
print("GIT_COMMIT_HASH", commit_hash)
env["GIT_COMMIT_HASH"] = commit_hash


Path("measurements").mkdir(exist_ok=True)

date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f"measurements/output_{input}_seq_{date}.json"

subprocess.run(["make", "cliftinstr"], check=True)


with open(output_path, "w") as f:
    subprocess.run(["./cliftinstr", "-m", "llama3.2_1b.bin", "-n", "20", "-p", "The son applies for a credit card with an identity already stolen. He takes the card and spends almost all"], stdout=f, check=True,env=env)

file = output_path

GIT_COMMIT_HASH a949d894c60358eab2f04032e87af816e4a56262
gcc -O3 -ffast-math -fno-omit-frame-pointer -Wall -o clift clift.c -lm -DINSTR


clift.c: In function ‘transformer_forward’:
  536 |   double st = 0;
      |          ^~
clift.c: In function ‘transformer_forward_inlined’:
  802 | #pragma omp single
      | 
  814 | #pragma omp single
      | 
  839 | #pragma omp for collapse(2) nowait
      | 
In file included from clift.c:21:
  852 |     STOP_3(st, MATMUL_QKV, omp_get_thread_num(), past);
      |                            ^~~~~~~~~~~~~~~~~~
instrumentor.h:93:49: note: in definition of macro ‘STOP_3’
   93 | #define STOP_3(st, i1, i2, i3) tab_values_3[i1][i2][i3] += get_time() - st
      |                                                 ^~
  856 | #pragma omp for collapse(2) nowait
      | 
  872 | #pragma omp for collapse(2) nowait
      | 
  886 | #pragma omp barrier
      | 
  888 | #pragma omp for collapse(3) nowait
      | 
  904 | #pragma omp for collapse(3) nowait
      | 
  923 | #pragma omp for collapse(3) nowait
      | 
  970 | #pragma omp for collapse(3) nowait
      | 
  982 | #pragma omp barrier
    

CalledProcessError: Command '['make', 'cliftinstr']' returned non-zero exit status 2.

In [6]:
import json

def print_json_schema(data, indent=0):
    prefix = '  ' * indent
    if isinstance(data, dict):
        for key, value in data.items():
            print(f"{prefix}{key}: {type(value).__name__}")
            print_json_schema(value, indent + 1)
    elif isinstance(data, list):
        print(f"{prefix}- list of {len(data)} items")
        if data:
            print_json_schema(data[0], indent + 1)
    else:
        # leaf node
        pass

with open(file) as f:
    data = json.load(f)

print_json_schema(data)

JSONDecodeError: Extra data: line 46 column 1 (char 5145)

In [5]:
import json
import pandas as pd
import plotly.express as px

with open(file) as f:
    data = json.load(f)

# On récupère le sous-dictionnaire JSON_1
json1 = data["JSON_1"]

# Création dynamique du DataFrame
df = pd.DataFrame(list(json1.items()), columns=["phase", "time (ms)"])

# Affichage en bar chart
fig = px.bar(
    df,
    x="phase",
    y="time (ms)",
    title="TIming of different sections",
    width=600
)
fig.show()


In [6]:
import json
import pandas as pd
import plotly.express as px

with open(file) as f:
    data = json.load(f)

json2 = data["JSON_2"]

df = pd.DataFrame({
    "component": list(json2.keys()),
    "time (ms)": [values[0] if isinstance(values, list) and values else None for values in json2.values()]
})

df = df.dropna()

fig = px.bar(
    df,
    x="component",
    y="time (ms)",
    title="Component time for PROMPT PROCESSING",

    width=800,
    text_auto=".2f"
)
fig.show()


In [7]:
df_2 = pd.DataFrame({
    "component": list(json2.keys()),
    "time (ms)": [values[1] if isinstance(values, list) and values else None for values in json2.values()]
})
df = df.dropna()

fig = px.bar(
    df,
    x="component",
    y="time (ms)",
    title="Component time for CACHE LOADING",
    log_y=True,
    width=800,
    text_auto=".2f"
)
fig.show()

In [None]:
import json
import pandas as pd
import plotly.express as px

with open(file) as f:
    data = json.load(f)

json2 = data["JSON_2"]

df = pd.DataFrame({
    "component": list(json2.keys()),
    "time (ms)": [
        sum(values[2:]) / len(values[2:]) if isinstance(values, list) and len(values) > 2 else None
        for values in json2.values()
    ]
})

df = df.dropna()

fig = px.bar(
    df,
    x="component",
    y="time (ms)",
    title="Average component time for TOKEN GENERATION",

    text_auto='.2f',
    width=800
)
fig.show()


In [9]:
total = sum(values[0] for values in json2.values() if isinstance(values, list) and values)
print(total)

4342.627600000001


In [10]:
import json
import pandas as pd
import plotly.express as px

with open(file) as f:
    data = json.load(f)

json2 = data["JSON_2"]

rows = []
for component, values in json2.items():
    if not isinstance(values, list):
        continue
    cleaned = [v for v in values[2:] if v != 0]
    for i, v in enumerate(cleaned):
        rows.append({
            "step": i,
            "time": v,
            "component": component
        })

df = pd.DataFrame(rows)

fig = px.line(
    df,
    x="step",
    y="time",
    color="component",
    markers=True,
    labels={"time": "time (ms)"},
    title="Per-step time per component during token generation"
)

fig.show()


In [11]:
import json
import numpy as np
import pandas as pd
import plotly.express as px

with open(file) as f:
    data = json.load(f)

forward = data["forward_instr"]
keys = [
    "rmsnorm_first_time",
    "matmul_qkv_time",
    "rope_time",
    "multihead_time",
    "matmul_output_attention_time",
    "FFN_rmsnorm_time",
    "matmul_FFN_time",
    "swiGLU_time",
    "matmul_output_FFN_time",
    "rmsnorm_final_time",
    "matmul_logits_time"
]

rows = []
for k in keys:
    values = [v for v in forward[k] if v != 0]
    rows += [(k, v) for v in values]

df = pd.DataFrame(rows, columns=["phase",  "time (in ms)"])

px.box(df, x="phase", y="time (in ms)",log_y = False,title = "Boxplot of difference phase per execution")


KeyError: 'forward_instr'

In [None]:
import json
import numpy as np
import pandas as pd
import plotly.express as px

with open(file) as f:
    data = json.load(f)

json2 = data["JSON_2"]

first_values = {
    k: v[0] for k, v in json2.items()
    if isinstance(v, list) and len(v) > 0
}
total_first = sum(first_values.values())
percent_first = {
    k: v / total_first * 100 for k, v in first_values.items()
}

df_first = pd.DataFrame({
    "component": list(percent_first.keys()),
    "percentage": list(percent_first.values())
}).sort_values("percentage", ascending=False)

fig1 = px.bar(
    df_first,
    x="component",
    y="percentage",
    title="Percentage of time for PROMPT_PROCESSING",
    text_auto=".2f"
)
fig1.show()

main_values = {
    k: sum(v[2:]) for k, v in json2.items()
    if isinstance(v, list) and len(v) > 2
}
total_main = sum(main_values.values())
percent_main = {
    k: v / total_main * 100 for k, v in main_values.items()
}

df_main = pd.DataFrame({
    "component": list(percent_main.keys()),
    "percentage": list(percent_main.values())
}).sort_values("percentage", ascending=False)

fig2 = px.bar(
    df_main,
    x="component",
    y="percentage",
    title="Percentage of time for TOKEN_GENERATION",
    text_auto=".2f"
)
fig2.show()


# 1. Multi thread comparison

## 2. ROPE Comparaison

In [None]:
file1 = "measurements/output_llama_3.bin_seq_20250617_165939.json"
file2 = "measurements/output_llama_3.bin_seq_20250618_174751.json"

In [None]:
import json
import pandas as pd
import numpy as np
import plotly.express as px


def extract_metrics(path):
    with open(path) as f:
        data = json.load(f)
        rope = data["JSON_2"]["ROPE"]
        return {"PROMPT_PROCESSING": rope[0], "TOKEN_GENERATION": np.mean(rope[2:])}


metrics1 = extract_metrics(file1)
metrics2 = extract_metrics(file2)

speedups = {"stage": [], "speedup (%)": [], "delta" : []}

for key in ["PROMPT_PROCESSING", "TOKEN_GENERATION"]:
    ratio = 100 * metrics1[key] / metrics2[key]
    speedups["stage"].append(key)
    speedups["speedup (%)"].append(ratio)
    speedups["delta"].append(metrics2[key] - metrics1[key])

df = pd.DataFrame(speedups)

fig = px.bar(df, x="stage", y="speedup (%)", title="ROPE speedup with pre-computed Rope", width=500)
fig.show()

In [None]:
df

Unnamed: 0,stage,speedup (%),delta
0,PROMPT_PROCESSING,749.676106,-3.5102
1,TOKEN_GENERATION,280.97827,-0.064743


## 2. Parralel Execution

In [40]:
import subprocess, os
import datetime
from pathlib import Path

commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip()
env_base = os.environ.copy()
env_base["GIT_COMMIT_HASH"] = commit_hash

Path("measurements").mkdir(exist_ok=True)
subprocess.run(["make", "ompinstr"], check=True)

thread_counts = [1, 2, 4, 8, 10, 12]
date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f"measurements/output_concat_{date}.jsons"  # <- extension explicite

prompt = "The son applies for a credit card with an identity already stolen. He takes the card and spends almost all"
model_path = "llama3.2_1b.bin"
prompt_file = ["prompt/file_16.txt","prompt/file_32.txt","prompt/file_64.txt","prompt/file_128.txt","prompt/file_256.txt","prompt/file_512.txt","prompt/file_1024.txt"]
with open(output_path, "w") as f:
    for file in prompt_file:
        for n_threads in thread_counts:
            proc = subprocess.run(
                ["./cliftinstr", "-m", model_path, "-n", "20", "-f", file,"-t", str(n_threads)],
                stdout=subprocess.PIPE,
                check=True,
                env=env
            )

            f.write(proc.stdout.decode().strip())
            f.write("\n---\n")
file = output_path


gcc -O3 -DINSTR -ffast-math -fno-omit-frame-pointer -Wall -fopenmp -march=native clift.c  -lm  -o cliftinstr


clift.c: In function ‘transformer_forward’:
  540 |   double st = 0;
      |          ^~
clift.c: In function ‘generate’:
 1762 |           char* piece = tokenizer_decode(tokenizer, current, next);
      |                 ^~~~~
 1792 |           char* piece = tokenizer_decode(tokenizer, current, next);
      |                 ^~~~~
clift.c: In function ‘main’:
 1962 |     fread(file_prompt, 1, fsize, pf);
      |     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
transformer_t configuration:
- embedding_dim:  2048
- hidden_dim:     8192
- layer_count:    16
- q_head_count:   32
- kv_head_count:  8
- vocabulary_len: 128256
- context_len:    2048
runtime setup:
- num_threads:    1
Sequence (34 tokens):
128000 2356 220 81101 320 8921 1764 2840 3105 8 34388 49831 406 3869 326 529 541 265 951 220 68641 78 83649 83 2552 1880 3869 1208 67628 951 2211 102843 62306 5512 
[1;34mLe lama (Lama glama) appartient à l’ordre des artiodactyles et à la famille des camélidés[0mPrompt processing (prefill): 34 tokens 

CalledProcessError: Command '['./cliftinstr', '-m', 'llama3.2_1b.bin', '-n', '20', '-f', 'prompt/file_1024.txt', '-t', '1']' died with <Signals.SIGSEGV: 11>.

In [1]:
file = "measurements/output_concat_20250624_132950.jsons"

In [44]:
import json



with open(file, "r") as f:
    content = f.read()

chunks = content.split("\n---\n")
parsed = [json.loads(chunk) for chunk in chunks if chunk.strip()]
print_json_schema(parsed[0])

commit: str
JSON_1_COMPT: dict
  PROMPT_LEN: float
  TOKEN_GENERATED: float
  NUM_THREADS: float
JSON_1: dict
  GENERATE_TIME: float
  PROMPT_PROCESSING_TIME: float
  TOKEN_GENERATION_TIME: float
JSON_2: dict
  RMSNORM_INIT: list
    - list of 54 items
  FFN_RMSNORM: list
    - list of 54 items
  FINAL_RMSNORM: list
    - list of 54 items
JSON_3: dict
  MATMUL_QKV: list
    - list of 1 items
      - list of 54 items
  ROPE: list
    - list of 1 items
      - list of 54 items
  ATTENTION_COMPUTATION: list
    - list of 1 items
      - list of 54 items
  MATMUL_OUTPUT_ATTENTION: list
    - list of 1 items
      - list of 54 items
  MATMUL_FFN: list
    - list of 1 items
      - list of 54 items
  SwiGLU: list
    - list of 1 items
      - list of 54 items
  MATMUL_OUTPUT_FFN: list
    - list of 1 items
      - list of 54 items
  MATMUL_LOGITS: list
    - list of 1 items
      - list of 54 items


In [19]:
import json, pandas as pd, plotly.express as px

with open(file) as f:
    parsed = [json.loads(x) for x in f.read().split("\n---\n") if x.strip()]

parsed.sort(key=lambda d: d["JSON_1_COMPT"]["NUM_THREADS"])
threads = [d["JSON_1_COMPT"]["NUM_THREADS"] for d in parsed]

gen = [d["JSON_1"]["GENERATE_TIME"] for d in parsed]
prompt = [d["JSON_1"]["PROMPT_PROCESSING_TIME"] for d in parsed]
token = [d["JSON_1"]["TOKEN_GENERATION_TIME"] for d in parsed]

df = pd.DataFrame({
    "Threads": threads,
    "GENERATE_TIME": [gen[0]/x for x in gen],
    "PROMPT_PROCESSING_TIME": [prompt[0]/x for x in prompt],
    "TOKEN_GENERATION_TIME": [token[0]/x for x in token],
    "Ideal": threads
})

px.line(df, x="Threads", y=["GENERATE_TIME", "PROMPT_PROCESSING_TIME", "TOKEN_GENERATION_TIME", "Ideal"],title="Global Speedup", markers=True).show()


In [30]:
file = "measurements/output_concat_20250624_094845.jsons"

In [26]:
import json, pandas as pd, plotly.express as px

with open(file) as f:
    parsed = [json.loads(x) for x in f.read().split("\n---\n") if x.strip()]

data = []
for entry in parsed:
    t = int(entry["JSON_1_COMPT"]["NUM_THREADS"])
    for phase, matrix in entry["JSON_3"].items():
        for i in range(t):
            val = matrix[i][0]
            data.append({"Phase": phase, "Time (ms)": val, "Threads": f"{t} threads"})

df = pd.DataFrame(data)
fig = px.box(df, x="Phase", y="Time (ms)", color="Threads", points="all", title="Prompt Processing Time per Phase")
fig.show()


In [34]:
import json, pandas as pd, plotly.express as px



with open(file) as f:
    parsed = [json.loads(x) for x in f.read().split("\n---\n") if x.strip()]

phase_mean_times = {}

for entry in parsed:
    t = int(entry["JSON_1_COMPT"]["NUM_THREADS"])
    for phase, matrix in entry["JSON_3"].items():
        mean_time = sum(matrix[i][0] for i in range(t))/t
        phase_mean_times.setdefault(phase, []).append((t, mean_time))

records = []
for phase, values in phase_mean_times.items():
    values.sort()
    baseline = values[0][1]
    for t, time in values:
        speedup = baseline / time if time != 0 else None
        records.append({"Phase": phase, "Threads": t, "Speedup": speedup})

df = pd.DataFrame(records)
fig = px.line(df, x="Threads", y="Speedup", color="Phase", markers=True, title="Speedup (max thread time) per Phase")
fig.show()


In [15]:
import json, pandas as pd, plotly.express as px

def line_tokens_per_thread(file):
    with open(file) as f:
        parsed = [json.loads(x) for x in f.read().split("\n---\n") if x.strip()]

    all_data = []
    for entry in parsed:
        prompt_len = entry["JSON_1_COMPT"]["PROMPT_LEN"]
        tokens = entry["JSON_1_COMPT"]["PROMPT_LEN"]
        threads = int(entry["JSON_1_COMPT"]["NUM_THREADS"])
        for phase, matrix in entry["JSON_3"].items():
            if phase != "MATMUL_LOGITS":
                val = sum(matrix[i][0] for i in range(threads)) / threads
                tokens_per_s = (tokens / val) * 1e3 if val != 0 else 0
                all_data.append({
                    "Prompt length": prompt_len,
                    "Phase": phase,
                    "Threads": threads,
                    "Tokens/s": tokens_per_s
                })

    df = pd.DataFrame(all_data)

    # Calcul du gain en % par rapport à la plus petite longueur de prompt (par Threads et Phase)
    df["Gain (%)"] = 0.0
    for (threads, phase), group in df.groupby(["Threads", "Phase"]):
        group_sorted = group.sort_values("Prompt length")
        ref_val = group_sorted.iloc[0]["Tokens/s"]
        df.loc[group_sorted.index, "Gain (%)"] = (group_sorted["Tokens/s"] / ref_val - 1) * 100

    for thread_value in sorted(df["Threads"].unique()):
        df_thread = df[df["Threads"] == thread_value]
        fig = px.line(
            df_thread,
            x="Prompt length",
            y="Gain (%)",
            color="Phase",
            markers=True,
            title=f"Gain en % vs prompt minimal — {thread_value} thread(s)"
        )
        fig.write_image(f"measurements/omp/gain_token_s_{thread_value}_thread.png")
        fig.show()


In [16]:
line_tokens_per_thread(file)

In [9]:
import json, pandas as pd, plotly.express as px

def Speed_up_prompt(file):
    with open(file) as f:
        parsed = [json.loads(x) for x in f.read().split("\n---\n") if x.strip()]

    grouped = {}
    for entry in parsed:
        prompt_len = entry["JSON_1_COMPT"]["PROMPT_LEN"]
        threads = int(entry["JSON_1_COMPT"]["NUM_THREADS"])
        for phase, matrix in entry["JSON_3"].items():
            val = sum(matrix[i][0] for i in range(threads)) / threads
            grouped.setdefault((phase, prompt_len), []).append((threads, val))

    records = []
    for (phase, prompt_len), values in grouped.items():
        values.sort()
        baseline = values[0][1]
        for t, time in values:
            speedup = baseline / time if time != 0 else None
            records.append({
                "Phase": phase,
                "Prompt length": prompt_len,
                "Threads": t,
                "Speedup": speedup
            })

    df = pd.DataFrame(records)

    for prompt_len in sorted(df["Prompt length"].unique()):
        sub_df = df[df["Prompt length"] == prompt_len].copy()

        ideal = pd.DataFrame({
            "Threads": sorted(sub_df["Threads"].unique()),
            "Speedup": sorted(sub_df["Threads"].unique()),
            "Phase": ["Ideql"] * len(sub_df["Threads"].unique())
        })

        full_df = pd.concat([sub_df, ideal], ignore_index=True)

        fig = px.line(
            full_df,
            x="Threads",
            y="Speedup",
            color="Phase",
            markers=True,
            title=f"Speedup vs Threads — Prompt Length {prompt_len}"
        )
        fig.write_image(f"measurements/omp/speedup_{prompt_len}.png")
        fig.show()


In [10]:
Speed_up_prompt(file)

In [2]:
import json, pandas as pd, plotly.express as px

def tokens_line_chart(file):
    with open(file) as f:
        parsed = [json.loads(x) for x in f.read().split("\n---\n") if x.strip()]

    data = []
    for entry in parsed:
        prompt_len = entry["JSON_1_COMPT"]["PROMPT_LEN"]
        threads = int(entry["JSON_1_COMPT"]["NUM_THREADS"])
        tokens = entry["JSON_1_COMPT"]["PROMPT_LEN"]
        generate_time = entry["JSON_1"]["PROMPT_PROCESSING_TIME"]  # en ms
        tokens_per_s = (tokens / generate_time) * 1e3 if generate_time > 0 else 0

        data.append({
            "Prompt length": prompt_len,
            "Threads": f"{threads} threads",
            "Tokens/s": tokens_per_s
        })

    df = pd.DataFrame(data)
    fig = px.line(
        df,
        x="Prompt length",
        y="Tokens/s",
        color="Threads",
        markers=True,
        title="Tokens/s (Prompt Processing) selon la taille du prompt et le nombre de threads"
    )
    fig.write_image(f"measurements/omp/tokens_line_chart.png")
    fig.show()


In [7]:
!! plotly_get_chrome -y

['Installing Chrome for Plotly...',
 'Chrome installed successfully.',
 'The Chrome executable is now located at: /home/emorel/shared/clift/.venv/lib/python3.12/site-packages/choreographer/cli/browser_exe/chrome-linux64/chrome']

In [8]:
tokens_line_chart(file)