# 0. Sequential run 

## Config

In [2]:
# Model label; in the cells visible here it is only interpolated into the
# measurement output file name.
# NOTE(review): this name shadows Python's built-in `input()` for the whole kernel.
input = "llama_3.bin"
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
steps = "500"


### Running this Notebook

To run this notebook, make sure you've installed the dependencies listed in `requirements.txt`:

```bash
pip install -r requirements.txt
```

You also need to download the model binary:

```bash
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin
```


## Generate measurements

In [1]:
import subprocess,os
import datetime
from pathlib import Path

# Tag the run with the current commit; the hash is handed to the binary
# through the GIT_COMMIT_HASH environment variable.
raw_hash = subprocess.check_output(["git", "rev-parse", "HEAD"])
commit_hash = raw_hash.decode().strip()
print("GIT_COMMIT_HASH", commit_hash)
env = {**os.environ, "GIT_COMMIT_HASH": commit_hash}

# Timestamped output file under measurements/.
Path("measurements").mkdir(exist_ok=True)
date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f"measurements/output_{input}_seq_{date}.json"

# Build the instrumented binary; check=True aborts the cell when the build fails.
subprocess.run(["make", "cliftinstr"], check=True)

# NOTE(review): the model passed to -m is hard-coded and differs from the
# `input` label used in the output file name - confirm this is intended.
run_cmd = [
    "./cliftinstr",
    "-m", "llama3.2_1b.bin",
    "-n", "20",
    "-p", "The son applies for a credit card with an identity already stolen. He takes the card and spends almost all",
]
with open(output_path, "w") as f:
    # The program's stdout is redirected straight into the measurement file.
    subprocess.run(run_cmd, stdout=f, check=True, env=env)

# Later cells read the measurement through `file`.
file = output_path

GIT_COMMIT_HASH a949d894c60358eab2f04032e87af816e4a56262
gcc -O3 -ffast-math -fno-omit-frame-pointer -Wall -o clift clift.c -lm -DINSTR


clift.c: In function ‘transformer_forward’:
  536 |   double st = 0;
      |          ^~
clift.c: In function ‘transformer_forward_inlined’:
  802 | #pragma omp single
      | 
  814 | #pragma omp single
      | 
  839 | #pragma omp for collapse(2) nowait
      | 
In file included from clift.c:21:
  852 |     STOP_3(st, MATMUL_QKV, omp_get_thread_num(), past);
      |                            ^~~~~~~~~~~~~~~~~~
instrumentor.h:93:49: note: in definition of macro ‘STOP_3’
   93 | #define STOP_3(st, i1, i2, i3) tab_values_3[i1][i2][i3] += get_time() - st
      |                                                 ^~
  856 | #pragma omp for collapse(2) nowait
      | 
  872 | #pragma omp for collapse(2) nowait
      | 
  886 | #pragma omp barrier
      | 
  888 | #pragma omp for collapse(3) nowait
      | 
  904 | #pragma omp for collapse(3) nowait
      | 
  923 | #pragma omp for collapse(3) nowait
      | 
  970 | #pragma omp for collapse(3) nowait
      | 
  982 | #pragma omp barrier
    

CalledProcessError: Command '['make', 'cliftinstr']' returned non-zero exit status 2.

In [12]:
import json

def print_json_schema(data, indent=0):
    """Print an indented outline of a decoded JSON value.

    For a dict, each key is printed together with the type name of its value,
    and the value is then outlined one level deeper. For a list, its length is
    printed and only its first element (when there is one) is outlined. Any
    other value prints nothing.

    Args:
        data: The value to outline.
        indent: Current nesting depth; each level adds two spaces of padding.
    """
    pad = '  ' * indent
    if isinstance(data, list):
        print(f"{pad}- list of {len(data)} items")
        if len(data) > 0:
            # Only the first element is inspected.
            print_json_schema(data[0], indent + 1)
        return
    if not isinstance(data, dict):
        # Leaf value: nothing to print.
        return
    for name in data:
        child = data[name]
        print(f"{pad}{name}: {type(child).__name__}")
        print_json_schema(child, indent + 1)

# Load the measurement file written by the measurement cell and outline its structure.
# NOTE(review): the output recorded for this cell is a JSONDecodeError ("Extra data"),
# i.e. the file did not contain a single JSON document on that run.
with open(file) as f:
    data = json.load(f)

print_json_schema(data)

JSONDecodeError: Extra data: line 48 column 1 (char 2675)

In [5]:
import json
import pandas as pd
import plotly.express as px

with open(file) as f:
    data = json.load(f)

# Pull out the JSON_1 sub-dictionary.
json1 = data["JSON_1"]

# One row per (key, value) pair of JSON_1.
phase_rows = [(phase, elapsed) for phase, elapsed in json1.items()]
df = pd.DataFrame(phase_rows, columns=["phase", "time (ms)"])

# Show the result as a bar chart.
bar_options = dict(
    x="phase",
    y="time (ms)",
    title="TIming of different sections",
    width=600,
)
fig = px.bar(df, **bar_options)
fig.show()


In [6]:
import json
import pandas as pd
import plotly.express as px

with open(file) as f:
    data = json.load(f)

json2 = data["JSON_2"]

# Keep the first entry of every non-empty list; anything else becomes None
# and is removed by dropna() below.
first_entries = []
for values in json2.values():
    if isinstance(values, list) and values:
        first_entries.append(values[0])
    else:
        first_entries.append(None)

df = pd.DataFrame({"component": list(json2.keys()), "time (ms)": first_entries})
df = df.dropna()

fig = px.bar(
    df,
    x="component",
    y="time (ms)",
    title="Component time for PROMPT PROCESSING",
    width=800,
    text_auto=".2f",
)
fig.show()


In [7]:
# Second entry of every JSON_2 list, plotted under the "CACHE LOADING" title.
# Non-list values and lists with fewer than two entries become None and are
# dropped (indexing [1] on a one-element list would raise IndexError).
df_2 = pd.DataFrame({
    "component": list(json2.keys()),
    "time (ms)": [
        values[1] if isinstance(values, list) and len(values) > 1 else None
        for values in json2.values()
    ],
})
# Clean and plot df_2 itself. The earlier version cleaned and plotted the
# previous cell's `df`, so this chart repeated the first-entry data.
df_2 = df_2.dropna()

fig = px.bar(
    df_2,
    x="component",
    y="time (ms)",
    title="Component time for CACHE LOADING",
    log_y=True,
    width=800,
    text_auto=".2f"
)
fig.show()

In [None]:
import json
import pandas as pd
import plotly.express as px

with open(file) as f:
    data = json.load(f)

json2 = data["JSON_2"]

# Mean of every entry from index 2 onwards. Non-list values and lists with at
# most two entries give None and are removed by dropna() below.
averages = []
for values in json2.values():
    if isinstance(values, list) and len(values) > 2:
        averages.append(sum(values[2:]) / len(values[2:]))
    else:
        averages.append(None)

df = pd.DataFrame({"component": list(json2.keys()), "time (ms)": averages})
df = df.dropna()

fig = px.bar(
    df,
    x="component",
    y="time (ms)",
    title="Average component time for TOKEN GENERATION",
    text_auto='.2f',
    width=800,
)
fig.show()


In [9]:
# Sum of the first entry of every non-empty list in JSON_2.
leading_entries = [
    values[0]
    for values in json2.values()
    if isinstance(values, list) and values
]
total = sum(leading_entries)
print(total)

4342.627600000001


In [10]:
import json
import pandas as pd
import plotly.express as px

with open(file) as f:
    data = json.load(f)

json2 = data["JSON_2"]

# One row per non-zero entry from index 2 onwards of every list in JSON_2.
# `step` counts the entries that remain after the zeros are removed.
rows = [
    {"step": step, "time": value, "component": component}
    for component, values in json2.items()
    if isinstance(values, list)
    for step, value in enumerate(v for v in values[2:] if v != 0)
]

df = pd.DataFrame(rows)

line_options = dict(
    x="step",
    y="time",
    color="component",
    markers=True,
    labels={"time": "time (ms)"},
    title="Per-step time per component during token generation",
)
fig = px.line(df, **line_options)

fig.show()


In [11]:
import json
import numpy as np
import pandas as pd
import plotly.express as px

with open(file) as f:
    data = json.load(f)

# NOTE(review): the output recorded for this cell is KeyError: 'forward_instr',
# so the measurement file loaded on that run had no such key.
forward = data["forward_instr"]
keys = [
    "rmsnorm_first_time",
    "matmul_qkv_time",
    "rope_time",
    "multihead_time",
    "matmul_output_attention_time",
    "FFN_rmsnorm_time",
    "matmul_FFN_time",
    "swiGLU_time",
    "matmul_output_FFN_time",
    "rmsnorm_final_time",
    "matmul_logits_time"
]

# One (phase, value) row per non-zero sample of each listed key.
rows = [(k, v) for k in keys for v in forward[k] if v != 0]

df = pd.DataFrame(rows, columns=["phase",  "time (in ms)"])

# Left as the last expression so the notebook renders the figure.
px.box(
    df,
    x="phase",
    y="time (in ms)",
    log_y=False,
    title="Boxplot of difference phase per execution",
)


KeyError: 'forward_instr'

In [None]:
import json
import numpy as np
import pandas as pd
import plotly.express as px

with open(file) as f:
    data = json.load(f)

json2 = data["JSON_2"]

# --- Share of each component, based on the first entry of its list ---
first_values = {}
for k, v in json2.items():
    if isinstance(v, list) and len(v) > 0:
        first_values[k] = v[0]
total_first = sum(first_values.values())
percent_first = {}
for k, v in first_values.items():
    percent_first[k] = v / total_first * 100

df_first = pd.DataFrame(
    list(percent_first.items()), columns=["component", "percentage"]
)
df_first = df_first.sort_values("percentage", ascending=False)

fig1 = px.bar(
    df_first,
    x="component",
    y="percentage",
    title="Percentage of time for PROMPT_PROCESSING",
    text_auto=".2f",
)
fig1.show()

# --- Share of each component, based on the sum of entries from index 2 onwards ---
main_values = {}
for k, v in json2.items():
    if isinstance(v, list) and len(v) > 2:
        main_values[k] = sum(v[2:])
total_main = sum(main_values.values())
percent_main = {}
for k, v in main_values.items():
    percent_main[k] = v / total_main * 100

df_main = pd.DataFrame(
    list(percent_main.items()), columns=["component", "percentage"]
)
df_main = df_main.sort_values("percentage", ascending=False)

fig2 = px.bar(
    df_main,
    x="component",
    y="percentage",
    title="Percentage of time for TOKEN_GENERATION",
    text_auto=".2f",
)
fig2.show()


# 1. Multi thread comparison

## 2. ROPE Comparison

In [None]:
# Two recorded sequential runs compared in the next cell.
# NOTE(review): presumably file1 is the run without pre-computed RoPE and file2 the
# run with it (the next cell plots file1 / file2 as the "speedup") - confirm.
file1 = "measurements/output_llama_3.bin_seq_20250617_165939.json"
file2 = "measurements/output_llama_3.bin_seq_20250618_174751.json"

In [None]:
import json
import pandas as pd
import numpy as np
import plotly.express as px


def extract_metrics(path):
    """Summarize the ROPE entry of JSON_2 stored in the JSON file at `path`.

    Returns:
        A dict with two keys: "PROMPT_PROCESSING" holds the first ROPE value
        and "TOKEN_GENERATION" holds the numpy mean of the ROPE values from
        index 2 onwards.
    """
    with open(path) as handle:
        payload = json.load(handle)
    rope_series = payload["JSON_2"]["ROPE"]
    summary = {}
    summary["PROMPT_PROCESSING"] = rope_series[0]
    summary["TOKEN_GENERATION"] = np.mean(rope_series[2:])
    return summary


metrics1 = extract_metrics(file1)
metrics2 = extract_metrics(file2)

# "speedup (%)" is the ratio file1 / file2 expressed in percent;
# "delta" is file2 - file1.
stages = ["PROMPT_PROCESSING", "TOKEN_GENERATION"]
speedups = {
    "stage": list(stages),
    "speedup (%)": [100 * metrics1[stage] / metrics2[stage] for stage in stages],
    "delta": [metrics2[stage] - metrics1[stage] for stage in stages],
}

df = pd.DataFrame(speedups)

fig = px.bar(
    df,
    x="stage",
    y="speedup (%)",
    title="ROPE speedup with pre-computed Rope",
    width=500,
)
fig.show()

In [None]:
# Display the stage / speedup (%) / delta table built in the previous cell.
df

Unnamed: 0,stage,speedup (%),delta
0,PROMPT_PROCESSING,749.676106,-3.5102
1,TOKEN_GENERATION,280.97827,-0.064743


## 2. Parallel Execution

In [19]:
import subprocess, os
import datetime
from pathlib import Path

# Tag every run with the current commit through the GIT_COMMIT_HASH environment variable.
commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip()
env_base = {**os.environ, "GIT_COMMIT_HASH": commit_hash}

Path("measurements").mkdir(exist_ok=True)
# Build first; check=True aborts the cell when the build fails.
subprocess.run(["make", "ompinstr"], check=True)

thread_counts = [1, 2, 4, 8, 10, 12]
date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f"measurements/output_concat_{date}.jsons"

# NOTE(review): `prompt` is not used below; every run reads its prompt from a file via -f.
prompt = "The son applies for a credit card with an identity already stolen. He takes the card and spends almost all"
model_path = "llama3.2_1b.bin"
prompt_file = [str(p) for p in Path("prompt").glob("file_*.txt")]
print(prompt_file)

# One run per (prompt file, thread count). Each run's stdout is appended to the
# output file, followed by a "---" separator line.
with open(output_path, "w") as f:
    for file in prompt_file:
        for n_threads in thread_counts:
            run_cmd = [
                "./cliftinstr",
                "-m", model_path,
                "-n", "20",
                "-f", file,
                "-t", str(n_threads),
            ]
            proc = subprocess.run(
                run_cmd,
                stdout=subprocess.PIPE,
                check=True,
                env=env_base,
            )
            f.write(proc.stdout.decode().strip() + "\n---\n")

# `file` was used as the loop variable above; from here on it names the measurement file.
file = output_path


gcc -O3 -DINSTR -ffast-math -fno-omit-frame-pointer -Wall -fopenmp -march=native clift.c  -lm  -o cliftinstr


clift.c: In function ‘transformer_forward’:
  541 |   double st = 0;
      |          ^~
clift.c: In function ‘generate’:
 1763 |           char* piece = tokenizer_decode(tokenizer, current, next);
      |                 ^~~~~
 1793 |           char* piece = tokenizer_decode(tokenizer, current, next);
      |                 ^~~~~
clift.c: In function ‘main’:
 1963 |     fread(file_prompt, 1, fsize, pf);
      |     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


['prompt/file_64.txt', 'prompt/file_450.txt', 'prompt/file_512.txt', 'prompt/file_128.txt', 'prompt/file_400.txt', 'prompt/file_256.txt', 'prompt/file_16.txt', 'prompt/file_32.txt', 'prompt/file_350.txt']


transformer_t configuration:
- embedding_dim:  2048
- hidden_dim:     8192
- layer_count:    16
- q_head_count:   32
- kv_head_count:  8
- vocabulary_len: 128256
- context_len:    2048
runtime setup:
- num_threads:    1
Sequence (119 tokens):
128000 2356 220 81101 11 10065 74522 2428 271 2356 220 81101 320 8921 1764 2840 3105 8 34388 49831 406 3869 326 529 541 265 951 220 68641 78 83649 83 2552 1880 3869 1208 67628 951 2211 102843 62306 5512 11 15890 3900 1826 514 5636 6800 2109 43711 12021 61512 294 529 32 102843 34395 3930 27430 9952 6316 62321 12844 8065 7515 4744 409 220 16 11 914 296 50026 5019 653 97418 23536 406 55398 91240 220 6330 21647 13 24703 83 10065 74522 2428 71925 6960 653 51625 53819 220 96893 978 5019 3625 28800 125800 409 1208 8474 64 220 75328 409 40852 82694 2249 2503 96559 68 9465 220 18 8 
[1;34mLe lama, animal domestique

Le lama (Lama glama) appartient à l’ordre des artiodactyles et à la famille des camélidés, dont il est le plus grand représentant d’Amérique 

In [1]:
# Point `file` at a previously recorded multi-thread run, replacing the path
# set by the measurement cell above.
file = "measurements/output_concat_20250624_132950.jsons"

In [13]:
import json



# The .jsons file holds one JSON document per run, separated by "---" lines.
with open(file, "r") as f:
    content = f.read()

chunks = content.split("\n---\n")
parsed = []
for chunk in chunks:
    if chunk.strip():
        parsed.append(json.loads(chunk))

# Outline the first run only.
print_json_schema(parsed[0])

commit: str
JSON_1_COMPT: dict
  PROMPT_LEN: float
  TOKEN_GENERATED: float
  NUM_THREADS: float
JSON_1: dict
  GENERATE_TIME: float
  PROMPT_PROCESSING_TIME: float
  TOKEN_GENERATION_TIME: float
JSON_2: dict
  RMSNORM_INIT: list
    - list of 21 items
  FFN_RMSNORM: list
    - list of 21 items
  FINAL_RMSNORM: list
    - list of 21 items
JSON_3: dict
  MATMUL_QKV: list
    - list of 1 items
      - list of 21 items
  ROPE: list
    - list of 1 items
      - list of 21 items
  ATTENTION_COMPUTATION: list
    - list of 1 items
      - list of 21 items
  MATMUL_OUTPUT_ATTENTION: list
    - list of 1 items
      - list of 21 items
  MATMUL_FFN: list
    - list of 1 items
      - list of 21 items
  SwiGLU: list
    - list of 1 items
      - list of 21 items
  MATMUL_OUTPUT_FFN: list
    - list of 1 items
      - list of 21 items
  MATMUL_LOGITS: list
    - list of 1 items
      - list of 21 items


In [7]:
# Show which measurement file the cells below will read.
# NOTE(review): the recorded output names a different file than the one pinned
# earlier in the notebook, so the cells were not run in document order.
print(file)

measurements/output_concat_20250625_105746.jsons


## Question: Does the speedup depend on the size of the prompt?

In [49]:
import json, pandas as pd, plotly.express as px

def plot_speedup_prompt_processing(file):
    """Plot prompt-processing speedup against thread count, one line per prompt length.

    The file is read as text and split on "---" separator lines; every
    non-blank chunk is parsed as JSON. Within each prompt length, the run with
    the fewest threads is the baseline, and a run's speedup is the baseline
    time divided by that run's time (None when the run's time is 0). A y = x
    reference line labelled "Idéal" is added to the chart.

    Args:
        file: Path to the concatenated measurement file.

    Returns:
        None. The figure is displayed with fig.show().
    """
    with open(file) as f:
        chunks = f.read().split("\n---\n")
    parsed = [json.loads(chunk) for chunk in chunks if chunk.strip()]

    df = pd.DataFrame([
        {
            "Prompt length": int(entry["JSON_1_COMPT"]["PROMPT_LEN"]),
            "Threads": int(entry["JSON_1_COMPT"]["NUM_THREADS"]),
            "Time (ms)": entry["JSON_1"]["PROMPT_PROCESSING_TIME"],
        }
        for entry in parsed
    ])

    speedup_records = []
    for prompt_len, runs in df.groupby("Prompt length"):
        runs = runs.sort_values("Threads")
        # The first row after sorting is the run with the fewest threads.
        baseline = runs.iloc[0]["Time (ms)"]
        for _, run in runs.iterrows():
            elapsed = run["Time (ms)"]
            speedup_records.append({
                "Prompt length": prompt_len,
                "Threads": run["Threads"],
                "Speedup": None if elapsed == 0 else baseline / elapsed,
            })

    speedup_df = pd.DataFrame(speedup_records)

    # Reference series where the speedup equals the thread count.
    ideal_threads = sorted(speedup_df["Threads"].unique())
    ideal = pd.DataFrame({
        "Prompt length": ["Idéal" for _ in ideal_threads],
        "Threads": ideal_threads,
        "Speedup": ideal_threads,
    })

    full_df = pd.concat([speedup_df, ideal], ignore_index=True)

    fig = px.line(
        full_df,
        x="Threads",
        y="Speedup",
        color="Prompt length",
        markers=True,
        title="Speedup (Prompt Processing Time) vs Threads (avec ligne idéale)",
    )
    fig.show()


In [50]:
# Draw the speedup chart for the measurement file currently held in `file`.
plot_speedup_prompt_processing(file)

### Answer: The speedup varies with the prompt size. Prompts longer than ~150 tokens seem to show a better speedup (~20% difference).

## Question: Is the performance of the prompt processing (tokens/s handled) impacted by the size of the prompt?

In [42]:
import json, pandas as pd, plotly.express as px

def tokens_line_chart(file):
    """Plot prompt-processing throughput (tokens/s) against prompt length, one line per thread count.

    The file is read as text and split on "---" separator lines; every
    non-blank chunk is parsed as JSON. Throughput is PROMPT_LEN divided by
    PROMPT_PROCESSING_TIME (treated as milliseconds), scaled to seconds; it is
    0 when the recorded time is not positive.

    Args:
        file: Path to the concatenated measurement file.

    Returns:
        None. The figure is saved to measurements/omp/tokens_line_chart.png
        and displayed with fig.show().
    """
    import os  # local import so the function does not rely on another cell's imports

    with open(file) as f:
        parsed = [json.loads(x) for x in f.read().split("\n---\n") if x.strip()]

    data = []
    for entry in parsed:
        # PROMPT_LEN is both the x-axis value and the token count used for the throughput.
        prompt_len = entry["JSON_1_COMPT"]["PROMPT_LEN"]
        threads = int(entry["JSON_1_COMPT"]["NUM_THREADS"])
        generate_time = entry["JSON_1"]["PROMPT_PROCESSING_TIME"]  # in ms
        tokens_per_s = (prompt_len / generate_time) * 1e3 if generate_time > 0 else 0

        data.append({
            "Prompt length": prompt_len,
            "Threads": f"{threads} threads",
            "Tokens/s": tokens_per_s
        })

    # Sort by prompt length so each line is drawn left to right.
    data_sorted = sorted(data, key=lambda x: x["Prompt length"])
    df = pd.DataFrame(data_sorted)

    fig = px.line(
        df,
        x="Prompt length",
        y="Tokens/s",
        color="Threads",
        markers=True,
        title="Tokens/s (Prompt Processing) selon la taille du prompt et le nombre de threads"
    )
    # write_image does not create missing directories, so make sure the target exists.
    output_file = "measurements/omp/tokens_line_chart.png"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    fig.write_image(output_file)
    fig.show()

# Draw (and save) the tokens/s chart for the measurement file currently held in `file`.
tokens_line_chart(file)

## Question: How is the performance of each phase impacted by the prompt size?

In [24]:
import json, pandas as pd, plotly.express as px

def line_tokens_per_thread(file):
    """Plot per-phase throughput (tokens/s) against prompt length, one figure per thread count.

    The file is read as text and split on "---" separator lines; every
    non-blank chunk is parsed as JSON. For every JSON_3 phase except
    MATMUL_LOGITS, the value at index 1 of each of the first `threads` rows is
    averaged, and the throughput is PROMPT_LEN divided by that average
    (treated as milliseconds), scaled to seconds; it is 0 when the average is 0.

    Args:
        file: Path to the concatenated measurement file.

    Returns:
        None. One figure per thread count is displayed with fig.show().
    """
    with open(file) as f:
        parsed = [json.loads(x) for x in f.read().split("\n---\n") if x.strip()]

    all_data = []
    for entry in parsed:
        # PROMPT_LEN is both the x-axis value and the token count used for the throughput.
        prompt_len = entry["JSON_1_COMPT"]["PROMPT_LEN"]
        threads = int(entry["JSON_1_COMPT"]["NUM_THREADS"])
        for phase, matrix in entry["JSON_3"].items():
            if phase != "MATMUL_LOGITS":
                # Average of the index-1 value across the per-thread rows.
                val = sum(matrix[i][1] for i in range(threads)) / threads
                tokens_per_s = (prompt_len / val) * 1e3 if val != 0 else 0
                all_data.append({
                    "Prompt length": prompt_len,
                    "Phase": phase,
                    "Threads": threads,
                    "Tokens/s": tokens_per_s
                })

    data_sorted = sorted(all_data, key=lambda x: x["Prompt length"])
    df = pd.DataFrame(data_sorted)

    # Gain in % relative to the smallest prompt length, per (Threads, Phase).
    # The earlier version stored the raw Tokens/s here and never used the
    # reference value. The column is not plotted below; it is only available
    # on `df` for inspection.
    df["Gain (%)"] = 0.0
    for (threads, phase), group in df.groupby(["Threads", "Phase"]):
        group_sorted = group.sort_values("Prompt length")
        ref_val = group_sorted.iloc[0]["Tokens/s"]
        if ref_val != 0:
            df.loc[group_sorted.index, "Gain (%)"] = (
                group_sorted["Tokens/s"] / ref_val - 1
            ) * 100

    for thread_value in sorted(df["Threads"].unique()):
        df_thread = df[df["Threads"] == thread_value]
        fig = px.line(
            df_thread,
            x="Prompt length",
            y="Tokens/s",
            color="Phase",
            markers=True,

            title=f"Token/s vs prompt_size — {thread_value} thread(s)"
        )
        #fig.write_image(f"measurements/omp/gain_token_s_{thread_value}_thread.png")
        fig.show()

## Does the breakdown between phases depend on the prompt size?
## Can we see the effect of the cache filling up as the prompt grows?
line_tokens_per_thread(file)
##df

### Answer: Regardless of the number of threads, every phase stays equally performant (in tokens/s handled) as the prompt grows.
**Exception**: attention computation. This was expected: as the prompt gets larger, the context needed to produce the attention scores gets bigger.  
Larger prompts could not be tested because they trigger a seg fault.

## Question : Can we see that at some point data is too big to fit in the cache ? 

In [30]:
import json
import pandas as pd
import plotly.express as px

def plot_phase_time_evolution(file):
    """Plot the mean per-phase time against prompt length, one figure per thread count.

    The file is read as text and split on "---" separator lines; every
    non-blank chunk is parsed as JSON. For each JSON_3 phase, the value at
    index 1 of each of the first `threads` rows is averaged. A phase is
    skipped when it has fewer rows than threads or when one of those rows has
    fewer than two values.

    Args:
        file: Path to the concatenated measurement file.

    Returns:
        None. One figure per thread count is displayed with fig.show().
    """
    with open(file, "r") as f:
        content = f.read()
    parsed = [json.loads(chunk) for chunk in content.split("\n---\n") if chunk.strip()]

    data = []
    for entry in parsed:
        counters = entry["JSON_1_COMPT"]
        prompt_len = int(counters["PROMPT_LEN"])
        threads = int(counters["NUM_THREADS"])
        for phase, matrix in entry["JSON_3"].items():
            # Skip phases that do not provide an index-1 value for every thread.
            if len(matrix) < threads or any(len(m) <= 1 for m in matrix[:threads]):
                continue
            mean_time = sum(matrix[i][1] for i in range(threads)) / threads
            data.append({
                "Prompt length": prompt_len,
                "Phase": phase,
                "Threads": threads,
                "Mean time (ms)": mean_time,
            })

    # Sort by prompt length so each line is drawn left to right.
    data.sort(key=lambda record: record["Prompt length"])
    df = pd.DataFrame(data)

    df["Prompt length"] = df["Prompt length"].astype(int)

    for thread_value in sorted(df["Threads"].unique()):
        per_thread_df = df[df["Threads"] == thread_value]
        fig = px.line(
            per_thread_df,
            x="Prompt length",
            y="Mean time (ms)",
            color="Phase",
            markers=True,
            title=f"Mean time (2ᵉ valeur) vs Prompt Length — {thread_value} thread(s)",
        )
        fig.show()
# Draw the per-phase mean-time charts for the measurement file currently held in `file`.
plot_phase_time_evolution(file)

### Answer: No — the data is probably too small to show it. Larger prompts could not be tested because they trigger a seg fault.

## Question : Do all the phases benefit from the multithreading ? 

In [None]:
import json, pandas as pd, plotly.express as px

def Speed_up_prompt(file):
    """Plot the per-phase speedup against thread count, one figure per prompt length.

    The file is read as text and split on "---" separator lines; every
    non-blank chunk is parsed as JSON. For each JSON_3 phase, the value at
    index 1 of each of the first `threads` rows is averaged. Within each
    (phase, prompt length) pair, the run with the fewest threads is the
    baseline, and a run's speedup is the baseline time divided by that run's
    time (None when the run's time is 0). A y = x reference line labelled
    "Ideal" is added to every chart.

    Args:
        file: Path to the concatenated measurement file.

    Returns:
        None. Each figure is saved to measurements/omp/speedup_<prompt length>.png
        and displayed with fig.show().
    """
    import os  # local import so the function does not rely on another cell's imports

    with open(file) as f:
        parsed = [json.loads(x) for x in f.read().split("\n---\n") if x.strip()]

    # (phase, prompt length) -> list of (threads, mean time)
    grouped = {}
    for entry in parsed:
        prompt_len = entry["JSON_1_COMPT"]["PROMPT_LEN"]
        threads = int(entry["JSON_1_COMPT"]["NUM_THREADS"])
        for phase, matrix in entry["JSON_3"].items():
            val = sum(matrix[i][1] for i in range(threads)) / threads
            grouped.setdefault((phase, prompt_len), []).append((threads, val))

    records = []
    for (phase, prompt_len), values in grouped.items():
        # Tuples sort by thread count first, so values[0] is the run with the fewest threads.
        values.sort()
        baseline = values[0][1]
        for t, time in values:
            speedup = baseline / time if time != 0 else None
            records.append({
                "Phase": phase,
                "Prompt length": prompt_len,
                "Threads": t,
                "Speedup": speedup
            })

    df = pd.DataFrame(records)

    # write_image does not create missing directories, so make sure the target exists.
    output_dir = "measurements/omp"
    os.makedirs(output_dir, exist_ok=True)

    for prompt_len in sorted(df["Prompt length"].unique()):
        sub_df = df[df["Prompt length"] == prompt_len].copy()

        # Reference series where the speedup equals the thread count.
        thread_values = sorted(sub_df["Threads"].unique())
        ideal = pd.DataFrame({
            "Threads": thread_values,
            "Speedup": thread_values,
            "Phase": ["Ideal"] * len(thread_values)
        })

        full_df = pd.concat([sub_df, ideal], ignore_index=True)

        fig = px.line(
            full_df,
            x="Threads",
            y="Speedup",
            color="Phase",
            markers=True,
            title=f"Speedup vs Threads — Prompt Length {prompt_len}"
        )
        fig.write_image(f"measurements/omp/speedup_{prompt_len}.png")
        fig.show()


In [33]:
# Draw (and save) the per-phase speedup charts for the measurement file currently held in `file`.
Speed_up_prompt(file)

## Answer: Not all phases benefit equally from multithreading. The matmuls scale the worst: we use a very naive matmul implementation, and a better kernel should fix this. No phase seems to be memory-bound, as none of them seems to reach a ceiling.

In [None]:
import json, pandas as pd, plotly.express as px

def tokens_line_chart(file):
    """Plot prompt-processing throughput (tokens/s) against prompt length, one line per thread count.

    NOTE(review): this re-defines `tokens_line_chart`, which is already defined
    earlier in the notebook; keep a single definition.

    The file is read as text and split on "---" separator lines; every
    non-blank chunk is parsed as JSON. Throughput is PROMPT_LEN divided by
    PROMPT_PROCESSING_TIME (treated as milliseconds), scaled to seconds; it is
    0 when the recorded time is not positive.

    Args:
        file: Path to the concatenated measurement file.

    Returns:
        None. The figure is saved to measurements/omp/tokens_line_chart.png
        and displayed with fig.show().
    """
    import os  # local import so the function does not rely on another cell's imports

    with open(file) as f:
        parsed = [json.loads(x) for x in f.read().split("\n---\n") if x.strip()]

    data = []
    for entry in parsed:
        # PROMPT_LEN is both the x-axis value and the token count used for the throughput.
        prompt_len = entry["JSON_1_COMPT"]["PROMPT_LEN"]
        threads = int(entry["JSON_1_COMPT"]["NUM_THREADS"])
        generate_time = entry["JSON_1"]["PROMPT_PROCESSING_TIME"]  # in ms
        tokens_per_s = (prompt_len / generate_time) * 1e3 if generate_time > 0 else 0

        data.append({
            "Prompt length": prompt_len,
            "Threads": f"{threads} threads",
            "Tokens/s": tokens_per_s
        })

    # Sort by prompt length so each line is drawn left to right.
    data_sorted = sorted(data, key=lambda x: x["Prompt length"])
    df = pd.DataFrame(data_sorted)

    fig = px.line(
        df,
        x="Prompt length",
        y="Tokens/s",
        color="Threads",
        markers=True,
        title="Tokens/s (Prompt Processing) selon la taille du prompt et le nombre de threads"
    )
    # write_image does not create missing directories, so make sure the target exists.
    output_file = "measurements/omp/tokens_line_chart.png"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    fig.write_image(output_file)
    fig.show()

# Draw (and save) the tokens/s chart for the measurement file currently held in `file`.
tokens_line_chart(file)