In [7]:
# ===== COLAB BENCHMARK =====
import os, time, hashlib
import matplotlib.pyplot as plt
import pandas as pd

sizes_mb = [1, 10, 100, 500, 1024, 2048, 4096]
os.makedirs("colab_files", exist_ok=True)

for size in sizes_mb:
    fname = f"colab_files/file_{size}MB.bin"
    if not os.path.exists(fname):
        print(f"Creating {fname} ...")
        with open(fname, "wb") as f:
            f.write(os.urandom(size * 1024 * 1024))

CHUNK = 1024*1024  # 1 MB read blocks
LIST_CHUNK = 1024**3  # 1 GiB logical chunks

def naive_hash(filepath):
    h = hashlib.sha256()
    with open(filepath, "rb") as f:
        while block := f.read(CHUNK):
            h.update(block)
    return h.digest()

def list_based_hash(filepath):
    chunk_hashes, h, size = [], hashlib.sha256(), 0
    with open(filepath, "rb") as f:
        while block := f.read(CHUNK):
            h.update(block); size += len(block)
            if size >= LIST_CHUNK:
                chunk_hashes.append(h.digest()); h = hashlib.sha256(); size = 0
        if size > 0: chunk_hashes.append(h.digest())
    return hashlib.sha256(b"".join(chunk_hashes)).digest()

results_colab = {"size_gb": [], "naive_time": [], "list_time": []}
for size in sizes_mb:
    filepath = f"colab_files/file_{size}MB.bin"
    size_gb = size/1024

    t0 = time.time(); naive_hash(filepath); t1 = time.time()
    naive_t = t1-t0

    t0 = time.time(); list_based_hash(filepath); t1 = time.time()
    list_t = t1-t0

    results_colab["size_gb"].append(size_gb)
    results_colab["naive_time"].append(naive_t)
    results_colab["list_time"].append(list_t)
    print(f"Colab {size}MB: Naive {naive_t:.2f}s | List {list_t:.2f}s")

df_colab = pd.DataFrame(results_colab)
df_colab.to_csv("results_colab.csv", index=False)
df_colab


Creating colab_files/file_1MB.bin ...
Creating colab_files/file_10MB.bin ...
Creating colab_files/file_100MB.bin ...
Creating colab_files/file_500MB.bin ...
Creating colab_files/file_1024MB.bin ...
Creating colab_files/file_2048MB.bin ...
Creating colab_files/file_4096MB.bin ...
Colab 1MB: Naive 0.01s | List 0.00s
Colab 10MB: Naive 0.08s | List 0.04s
Colab 100MB: Naive 0.80s | List 0.37s
Colab 500MB: Naive 3.50s | List 1.79s
Colab 1024MB: Naive 7.66s | List 3.69s
Colab 2048MB: Naive 15.48s | List 7.93s
Colab 4096MB: Naive 30.56s | List 29.65s


Unnamed: 0,size_gb,naive_time,list_time
0,0.000977,0.01238,0.004184
1,0.009766,0.080072,0.035882
2,0.097656,0.798414,0.372331
3,0.488281,3.500555,1.7863
4,1.0,7.659326,3.68531
5,2.0,15.477072,7.934099
6,4.0,30.564488,29.654119
