In [None]:
import measure
import statistics
from tqdm import tqdm

In [None]:
# C source of the benchmark program that later cells pass to
# measure.PerfMeasure().create(code=code, ...).
#
# The program allocates three N x N int matrices (N is #defined as 100), fills
# `a` and `b` with rand() % 10 values seeded from time(NULL), times a single
# multiply_matrices() call with clock(), prints "Time taken: %f seconds", and
# frees everything.
#
# NOTE(review): the result matrix `c` is never read after the multiplication,
# so an optimizing build may be allowed to drop some or all of the work --
# confirm that the measured times still reflect the multiplication.
# NOTE(review): malloc() results are not checked for NULL.
code = r"""
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 100  // You can adjust this value to make the computation more intensive

void multiply_matrices(int **a, int **b, int **c, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            c[i][j] = 0;
            for (int k = 0; k < n; k++) {
                c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
}

int main() {
    // Allocate and initialize matrices
    int **a = malloc(N * sizeof(int*));
    int **b = malloc(N * sizeof(int*));
    int **c = malloc(N * sizeof(int*));
    for (int i = 0; i < N; i++) {
        a[i] = malloc(N * sizeof(int));
        b[i] = malloc(N * sizeof(int));
        c[i] = malloc(N * sizeof(int));
    }

    // Initialize matrices with random values
    srand(time(NULL));
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i][j] = rand() % 10;
            b[i][j] = rand() % 10;
        }
    }

    // Measure start time
    clock_t start = clock();

    // Perform matrix multiplication
    multiply_matrices(a, b, c, N);

    // Measure end time
    clock_t end = clock();

    // Calculate and print the elapsed time in seconds
    double elapsed_time = (double)(end - start) / CLOCKS_PER_SEC;
    printf("Time taken: %f seconds\n", elapsed_time);

    // Free allocated memory
    for (int i = 0; i < N; i++) {
        free(a[i]);
        free(b[i]);
        free(c[i]);
    }
    free(a);
    free(b);
    free(c);

    return 0;
}
"""

In [None]:
# Reference configuration: create the benchmark with an empty pass sequence,
# take 16 measurements, and keep their arithmetic mean in r0 (the numerator of
# the ratios computed further down).
# Presumably each measure() call returns a runtime as a number -- confirm
# against the measure module.
cid = measure.PerfMeasure().create(code=code, pass_sequence=[])

runtimes = [measure.PerfMeasure().measure(cid) for _ in tqdm(range(16))]
r0 = statistics.fmean(runtimes)
print(r0)

In [None]:
# Same 16-measurement procedure with "-O3" as the only pass-sequence entry;
# the mean is stored in r1.
cid = measure.PerfMeasure().create(code=code, pass_sequence=["-O3"])

runtimes = [measure.PerfMeasure().measure(cid) for _ in tqdm(range(16))]
r1 = statistics.fmean(runtimes)
print(r1)

In [None]:
# Same 16-measurement procedure with an explicit 16-entry pass sequence
# (listed one per line, order unchanged); the mean is stored in r2.
cid = measure.PerfMeasure().create(
    code=code,
    pass_sequence=[
        '-LoopRotatePass',
        '-SROAPass',
        '-AggressiveDCEPass',
        '-FunctionInliningPass',
        '-LoopDeletionPass',
        '-IndVarSimplifyPass',
        '-LoopUnswitchPass',
        '-DeadStoreEliminationPass',
        '-SCCPPass',
        '-InstCombinePass',
        '-GVNPass',
        '-LoopUnrollPass',
        '-GlobalDCEPass',
        '-PromoteMemoryToRegisterPass',
        '-ReassociatePass',
        '-TailCallEliminationPass',
    ],
)

runtimes = [measure.PerfMeasure().measure(cid) for _ in tqdm(range(16))]
r2 = statistics.fmean(runtimes)
print(r2)

In [None]:
# Same 16-measurement procedure with a 9-entry pass sequence written as
# flag-style names; the mean is stored in r3.
# NOTE(review): some entries (e.g. '-nee-weave', '-flatten-multicfg') do not
# look like standard pass names -- verify that measure accepts them.
cid = measure.PerfMeasure().create(
    code=code,
    pass_sequence=[
        '-lower-widenable-condition',
        '-lower-constant-intrinsics',
        '-flatten-multicfg',
        '-callsite-splitting',
        '-nee-weave',
        '-add-discriminators',
        '-embed-bitcode',
        '-gvn-sink',
        '-simple-loop-unswitch',
    ],
)

runtimes = [measure.PerfMeasure().measure(cid) for _ in tqdm(range(16))]
r3 = statistics.fmean(runtimes)
print(r3)

In [None]:
# Show the four mean measurements side by side, in the order the
# configurations were run above (r0, r1, r2, r3).
print(r0, r1, r2, r3)

In [None]:
# Ratio of the empty-pass-sequence mean (r0) to the "-O3" mean (r1);
# a value above 1 means the "-O3" configuration measured lower.
r0 / r1

In [None]:
# Ratio of the empty-pass-sequence mean (r0) to the explicit-pass-list mean
# (r2); a value above 1 means the explicit pass list measured lower.
r0 / r2

In [None]:
from datasets import load_dataset

# Load the train split of the "C++" configuration of bigcode/the-stack-v2-dedup
# (no streaming=True here, unlike the later cells).
# NOTE(review): the benchmark program above is C while this selects "C++" --
# confirm which language is intended. `ds` is reassigned by the cells below.
ds = load_dataset("bigcode/the-stack-v2-dedup", "C++", split="train")

In [None]:
import os
import boto3
from smart_open import open
from datasets import load_dataset

# Build a boto3 session from credentials in the environment and derive the S3
# client that download_contents() passes to smart_open.
# os.environ[...] raises KeyError if either variable is unset.
session = boto3.Session(
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"])
s3 = session.client("s3")

def download_contents(blob_id, src_encoding):
    """Read one blob from the ``softwareheritage`` S3 bucket and decode it.

    The object at ``s3://softwareheritage/content/<blob_id>`` is opened in
    binary mode with ``compression=".gz"`` through the ``open`` imported from
    smart_open in this cell, using the module-level ``s3`` client.

    Args:
        blob_id: Identifier appended to the bucket's ``content/`` prefix.
        src_encoding: Codec name passed to ``.decode()`` on the data read.

    Returns:
        dict: ``{"content": <decoded text>}``.
    """
    s3_url = f"s3://softwareheritage/content/{blob_id}"
    with open(s3_url, "rb", compression=".gz", transport_params={"client": s3}) as blob:
        raw = blob.read()
    return {"content": raw.decode(src_encoding)}

# Stream the train split and attach each row's file content via
# download_contents(); the map callback receives the row's "blob_id" and
# "src_encoding" fields.
ds = load_dataset(
    "bigcode/the-stack-v2-dedup",
    split="train",
    streaming=True,
).map(
    lambda record: download_contents(record["blob_id"], record["src_encoding"])
)

# Print the content of the first row only, then stop iterating.
for row in ds:
    print(row["content"])
    break


In [None]:
from datasets import load_dataset

# Stream the train split from the "data/c" directory of bigcode/the-stack.
ds = load_dataset("bigcode/the-stack", data_dir="data/c", streaming=True, split="train")

# Print sample contents until the 1-based counter exceeds 10. The check runs
# after the print, so 11 samples are shown in total.
i = 0  # stays 0 if the stream yields nothing
for i, sample in enumerate(ds, start=1):
    print(sample["content"])
    if i > 10:
        break

    