<a href="https://colab.research.google.com/github/2303A51553/HPC/blob/main/HPC_Lab_02_1553.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://docs.python.org/3/library/profile.html

In [None]:
import cProfile
import re
# Profile one call to re.compile and print the resulting stats table
# (the statement is passed as a string and executed by cProfile).
cProfile.run('re.compile("foo|bar")')

         233 function calls (226 primitive calls) in 0.000 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 __init__.py:226(compile)
        1    0.000    0.000    0.000    0.000 __init__.py:280(_compile)
        1    0.000    0.000    0.000    0.000 _compiler.py:216(_compile_charset)
        1    0.000    0.000    0.000    0.000 _compiler.py:243(_optimize_charset)
      3/1    0.000    0.000    0.000    0.000 _compiler.py:37(_compile)
        2    0.000    0.000    0.000    0.000 _compiler.py:433(_get_iscased)
        1    0.000    0.000    0.000    0.000 _compiler.py:441(_get_literal_prefix)
        1    0.000    0.000    0.000    0.000 _compiler.py:472(_get_charset_prefix)
        1    0.000    0.000    0.000    0.000 _compiler.py:516(_compile_info)
        2    0.000    0.000    0.000    0.000 _compiler.py:575

In [None]:
# section_a_vector_dot.py
import time, random, cProfile, pstats, io, tracemalloc, sys

def gen_data(n, seed=42):
  """Return two reproducible lists of ``n`` random floats each.

  The module-level ``random`` generator is reseeded with ``seed``; the
  first ``n`` draws form the first list and the next ``n`` the second.
  """
  random.seed(seed)
  draw = random.random
  first = [draw() for _ in range(n)]
  second = [draw() for _ in range(n)]
  return first, second

def vector_add(x, y):
  """Return a new list holding the element-wise sums of ``x`` and ``y``.

  Pairs are formed with ``zip``, so the result is as long as the shorter
  input.
  """
  return [left + right for left, right in zip(x, y)]

def dot_product(x, y):
  """Return the dot product of ``x`` and ``y`` as a float.

  Products are accumulated left to right starting from ``0.0``; pairing
  stops at the end of the shorter input.
  """
  products = (left * right for left, right in zip(x, y))
  total = 0.0
  for term in products:
    total += term
  return total

def main():
  """Time, CPU-profile and memory-profile ``dot_product`` and ``vector_add``.

  CPU profiling (cProfile) and memory tracing (tracemalloc) run in two
  separate passes over the same data, so the timed pass is not traced by
  tracemalloc. All results are printed to stdout.
  """
  # Stop tracing left over from a previous (possibly interrupted) run.
  if tracemalloc.is_tracing():
    tracemalloc.stop()

  # Clear any lingering sys.setprofile hook before enabling cProfile.
  sys.setprofile(None)

  N = 2_000_000  # adjust based on your machine
  x, y = gen_data(N)

  # --- CPU Profiling with cProfile ---
  # The context manager disables the profiler even if the workload raises,
  # so a failed run does not leave a profiler active for the next one.
  with cProfile.Profile() as pr:
    t0 = time.perf_counter()
    s = dot_product(x, y)
    t1 = time.perf_counter()

    t2 = time.perf_counter()
    z = vector_add(x, y)
    t3 = time.perf_counter()

  # --- Memory Profiling with tracemalloc ---
  # Rerun the operations with tracing on; only the allocations matter here.
  tracemalloc.start()
  try:
    _ = dot_product(x, y)
    _ = vector_add(x, y)
    current, peak = tracemalloc.get_traced_memory()
  finally:
    tracemalloc.stop()

  # Print results
  print(f"Vector length N={N}")
  print(f"Dot product: {s:.6f} | Time: {t1 - t0:.3f} s")
  print(f"Vector add: len(z)={len(z)} | Time: {t3 - t2:.3f} s")
  print(f"Current/Peak memory: {current/1e6:.2f} MB / {peak/1e6:.2f} MB")

  # Profiler output
  s_buf = io.StringIO()
  ps = pstats.Stats(pr, stream=s_buf).sort_stats('cumtime')
  ps.print_stats(15)
  print("\n--- cProfile (Top 15 by cumulative time) ---")
  print(s_buf.getvalue())


# Run the profiling when the cell/script is executed. This call must sit at
# module level: indented under main() it would never run from the cell and
# would make main() call itself recursively.
if __name__ == "__main__":
  main()


In [None]:
# section_a_vector_dot.py
import time
import random
import cProfile
import pstats
import io
import tracemalloc
import sys


def gen_data(n, seed=42):
    """Build two seeded random vectors of length ``n``.

    Reseeds the module-level ``random`` generator with ``seed`` and draws
    ``2 * n`` floats: the first half becomes ``x``, the second half ``y``.
    """
    random.seed(seed)
    samples = [random.random() for _ in range(2 * n)]
    return samples[:n], samples[n:]


def vector_add(x, y):
    """Add ``x`` and ``y`` element by element and return the sums as a list.

    Iteration stops at the end of the shorter input.
    """
    return [a + b for a, b in zip(x, y)]


def dot_product(x, y):
    """Compute the dot product of ``x`` and ``y``.

    Uses a plain left-to-right running sum that starts at ``0.0``, so the
    result is always a float (``0.0`` for empty input).
    """
    running = 0.0
    for term in (a * b for a, b in zip(x, y)):
        running += term
    return running


def main():
    """Profile ``dot_product`` and ``vector_add`` and print a report.

    Runs a timed, cProfile-instrumented pass first and a separate
    tracemalloc pass afterwards, then prints the timings, traced memory and
    the top cProfile entries to stdout.
    """
    # ---- Ensure clean profiling environment ----
    if tracemalloc.is_tracing():
        tracemalloc.stop()

    sys.setprofile(None)

    N = 2_000_000  # reduce to 500_000 if system is slow
    x, y = gen_data(N)

    # ---- CPU Profiling ----
    # The context manager disables the profiler even when the workload
    # raises, so a failed run cannot leave a profiler active.
    with cProfile.Profile() as pr:
        t0 = time.perf_counter()
        s = dot_product(x, y)
        t1 = time.perf_counter()

        t2 = time.perf_counter()
        z = vector_add(x, y)
        t3 = time.perf_counter()

    # ---- Memory Profiling ----
    # Second pass over the same workload, this time with allocation tracing.
    tracemalloc.start()
    try:
        _ = dot_product(x, y)
        _ = vector_add(x, y)
        current, peak = tracemalloc.get_traced_memory()
    finally:
        tracemalloc.stop()

    # ---- Results ----
    print(f"Vector length N = {N}")
    print(f"Dot product result: {s:.6f}")
    print(f"Dot product time: {t1 - t0:.3f} s")
    print(f"Vector add time: {t3 - t2:.3f} s")
    print(f"Current memory: {current/1e6:.2f} MB")
    print(f"Peak memory: {peak/1e6:.2f} MB")

    # ---- cProfile Output ----
    s_buf = io.StringIO()
    ps = pstats.Stats(pr, stream=s_buf).sort_stats("cumtime")
    ps.print_stats(10)

    print("\n--- cProfile (Top 10 by cumulative time) ---")
    print(s_buf.getvalue())


# ---- Proper entry point ----
if __name__ == "__main__":
    main()


Vector length N = 2000000
Dot product result: 499712.974120
Dot product time: 0.100 s
Vector add time: 0.784 s
Current memory: 65.13 MB
Peak memory: 65.13 MB

--- cProfile (Top 10 by cumulative time) ---
         2000009 function calls (2000008 primitive calls) in 0.884 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      2/1    0.483    0.242    0.599    0.599 /tmp/ipython-input-1096687492.py:18(vector_add)
  2000000    0.301    0.000    0.301    0.000 {method 'append' of 'list' objects}
        1    0.100    0.100    0.100    0.100 /tmp/ipython-input-1096687492.py:25(dot_product)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        4    0.000    0.000    0.000    0.000 {built-in method time.perf_counter}
        1    0.000    0.000    0.000    0.000 {built-in method posix.getppid}





In [None]:

# section_a_vector_dot.py
import time, random, cProfile, pstats, io, tracemalloc

def gen_data(n, seed=42):
    """Return ``(x, y)``: two lists of ``n`` seeded pseudo-random floats.

    ``x`` is drawn in full before ``y`` from the module-level ``random``
    generator after it is reseeded with ``seed``.
    """
    random.seed(seed)
    next_value = random.random
    vectors = []
    for _ in range(2):
        vectors.append([next_value() for _ in range(n)])
    return vectors[0], vectors[1]

def vector_add(x, y):
    """Return the list of pairwise sums of ``x`` and ``y``.

    The inputs are paired with ``zip``; surplus elements of the longer
    input are ignored.
    """
    sums = [first + second for first, second in zip(x, y)]
    return sums

def dot_product(x, y):
    """Return the sum of pairwise products of ``x`` and ``y`` as a float.

    Accumulates sequentially from ``0.0`` in input order.
    """
    acc = 0.0
    pairs = zip(x, y)
    for pair in pairs:
        acc += pair[0] * pair[1]
    return acc

def main():
    """Time and profile ``dot_product`` and ``vector_add`` in a single pass.

    tracemalloc and cProfile are both active while the timings are taken,
    so the reported wall-clock times include their instrumentation
    overhead. Results are printed to stdout.
    """
    N = 2_000_000  # adjust based on your machine
    x, y = gen_data(N)

    tracemalloc.start()
    try:
        # The context manager disables the profiler even if the workload
        # raises, so a failed run cannot leave a profiler active.
        with cProfile.Profile() as pr:
            t0 = time.perf_counter()
            s = dot_product(x, y)
            t1 = time.perf_counter()

            t2 = time.perf_counter()
            z = vector_add(x, y)
            t3 = time.perf_counter()
        current, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing so the next run starts from a clean state.
        tracemalloc.stop()

    # Print results
    print(f"Vector length N={N}")
    print(f"Dot product: {s:.6f} | Time: {t1 - t0:.3f} s")
    print(f"Vector add: len(z)={len(z)} | Time: {t3 - t2:.3f} s")
    print(f"Current/Peak memory: {current/1e6:.2f} MB / {peak/1e6:.2f} MB")

    # Profiler output
    s_buf = io.StringIO()
    ps = pstats.Stats(pr, stream=s_buf).sort_stats('cumtime')
    ps.print_stats(15)
    print("\n--- cProfile (Top 15 by cumulative time) ---")
    print(s_buf.getvalue())

if __name__ == "__main__":
    main()

Vector length N=2000000
Dot product: 499712.974120 | Time: 0.733 s
Vector add: len(z)=2000000 | Time: 5.891 s
Current/Peak memory: 65.13 MB / 65.13 MB

--- cProfile (Top 15 by cumulative time) ---
         2000019 function calls (2000018 primitive calls) in 6.623 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        5    3.354    0.671    5.026    1.005 {built-in method time.sleep}
  2000000    1.965    0.000    1.965    0.000 {method 'append' of 'list' objects}
        1    0.733    0.733    0.733    0.733 /tmp/ipython-input-3909820823.py:16(dot_product)
      2/1    0.572    0.286    0.674    0.674 /tmp/ipython-input-3909820823.py:10(vector_add)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        6    0.000    0.000    0.000    0.000 {built-in method posix.getppid}
        4    0.000    0.000    0.000    0.000 {built-in method time.perf_counter}





In [None]:
# section_a_vector_dot.py
import time, random, cProfile, pstats, io, tracemalloc

def gen_data(n, seed=42):
    """Generate two deterministic random vectors of length ``n``.

    After reseeding the module-level ``random`` generator with ``seed``,
    the first ``n`` draws are returned as ``x`` and the following ``n`` as
    ``y``.
    """
    random.seed(seed)
    rng = random.random
    x = []
    for _ in range(n):
        x.append(rng())
    y = []
    for _ in range(n):
        y.append(rng())
    return x, y

def vector_add(x, y):
    """Element-wise addition of two sequences, returned as a new list.

    Stops at the shorter of the two inputs.
    """
    return [u + v for u, v in zip(x, y)]

def dot_product(x, y):
    """Dot product of ``x`` and ``y`` computed with a sequential float sum.

    Returns ``0.0`` when either input is empty.
    """
    result = 0.0
    for u, v in zip(x, y):
        product = u * v
        result = result + product
    return result

def main():
    """Time and profile ``dot_product`` and ``vector_add`` in a single pass.

    tracemalloc and cProfile are both active while the timings are taken,
    so the reported wall-clock times include their instrumentation
    overhead. Results are printed to stdout.
    """
    N = 2_000_000  # adjust based on your machine
    x, y = gen_data(N)

    tracemalloc.start()
    try:
        # The context manager disables the profiler even if the workload
        # raises, so a failed run cannot leave a profiler active.
        with cProfile.Profile() as pr:
            t0 = time.perf_counter()
            s = dot_product(x, y)
            t1 = time.perf_counter()

            t2 = time.perf_counter()
            z = vector_add(x, y)
            t3 = time.perf_counter()
        current, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing so the next run starts from a clean state.
        tracemalloc.stop()

    # Print results
    print(f"Vector length N={N}")
    print(f"Dot product: {s:.6f} | Time: {t1 - t0:.3f} s")
    print(f"Vector add: len(z)={len(z)} | Time: {t3 - t2:.3f} s")
    print(f"Current/Peak memory: {current/1e6:.2f} MB / {peak/1e6:.2f} MB")

    # Profiler output
    s_buf = io.StringIO()
    ps = pstats.Stats(pr, stream=s_buf).sort_stats('cumtime')
    ps.print_stats(15)
    print("\n--- cProfile (Top 15 by cumulative time) ---")
    print(s_buf.getvalue())

if __name__ == "__main__":
    main()

Vector length N=2000000
Dot product: 499712.974120 | Time: 0.839 s
Vector add: len(z)=2000000 | Time: 5.793 s
Current/Peak memory: 65.13 MB / 65.13 MB

--- cProfile (Top 15 by cumulative time) ---
         2000019 function calls (2000018 primitive calls) in 6.633 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        5    3.304    0.661    5.028    1.006 {built-in method time.sleep}
  2000000    1.990    0.000    1.990    0.000 {method 'append' of 'list' objects}
        1    0.839    0.839    0.839    0.839 /tmp/ipython-input-425613865.py:16(dot_product)
      2/1    0.499    0.249    0.653    0.653 /tmp/ipython-input-425613865.py:10(vector_add)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        6    0.000    0.000    0.000    0.000 {built-in method posix.getppid}
        4    0.000    0.000    0.000    0.000 {built-in method time.perf_counter}





In [None]:

# section_b_matmul.py
import time, random, cProfile, pstats, io, tracemalloc

def gen_matrix(n, seed=123):
    """Return an ``n`` x ``n`` list-of-lists matrix of seeded random floats.

    The module-level ``random`` generator is reseeded with ``seed`` and the
    matrix is filled row by row.
    """
    random.seed(seed)
    draw = random.random
    matrix = []
    for _ in range(n):
        matrix.append([draw() for _ in range(n)])
    return matrix

def matmul_naive(A, B):
    """Multiply two list-of-lists matrices with a plain triple loop.

    Accepts any compatible shapes: ``A`` is (m x k) and ``B`` is (k x p),
    giving an (m x p) result. Square inputs behave exactly as before.

    Args:
        A: Left matrix as a list of rows.
        B: Right matrix as a list of rows.

    Returns:
        A new list of rows of floats holding the product ``A @ B``.

    Raises:
        ValueError: If a row of ``A`` does not have ``len(B)`` entries.
    """
    rows = len(A)
    inner = len(B)
    cols = len(B[0]) if B else 0
    if any(len(a_row) != inner for a_row in A):
        raise ValueError(
            "shape mismatch: every row of A must have len(B) entries"
        )

    C = [[0.0] * cols for _ in range(rows)]
    # i-k-j loop order: the innermost loop walks one row of B and one row
    # of C, and each C[i][j] is accumulated over k in increasing order.
    for i in range(rows):
        a_row = A[i]
        c_row = C[i]
        for k in range(inner):
            aik = a_row[k]
            b_row = B[k]
            for j in range(cols):
                c_row[j] += aik * b_row[j]
    return C

def main():
    """Time and profile ``matmul_naive`` on two seeded random matrices.

    tracemalloc and cProfile are both active during the timed call, so the
    reported time includes their instrumentation overhead. Results are
    printed to stdout.
    """
    n = 150  # raise/lower based on your machine; O(n^3)
    A = gen_matrix(n, seed=1)
    B = gen_matrix(n, seed=2)

    tracemalloc.start()
    try:
        # The context manager disables the profiler even if the workload
        # raises, so a failed run cannot leave a profiler active.
        with cProfile.Profile() as pr:
            t0 = time.perf_counter()
            C = matmul_naive(A, B)
            t1 = time.perf_counter()
        current, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing so the next run starts from a clean state.
        tracemalloc.stop()

    print(f"Matrix size n={n} -> {n}x{n}")
    print(f"Time: {t1 - t0:.3f} s | C[0][0]={C[0][0]:.6f}")
    print(f"Current/Peak memory: {current/1e6:.2f} MB / {peak/1e6:.2f} MB")

    s_buf = io.StringIO()
    ps = pstats.Stats(pr, stream=s_buf).sort_stats('cumtime')
    ps.print_stats(10)
    print("\n--- cProfile (Top 10 by cumulative time) ---")
    print(s_buf.getvalue())

if __name__ == "__main__":
    main()


Matrix size n=150 -> 150x150
Time: 1.993 s | C[0][0]=37.256203
Current/Peak memory: 0.75 MB / 0.75 MB

--- cProfile (Top 10 by cumulative time) ---
         467 function calls (454 primitive calls) in 1.998 seconds

   Ordered by: cumulative time
   List reduced from 95 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    1.005    1.005    1.005    1.005 {built-in method time.sleep}
        1    0.200    0.200    0.200    0.200 /tmp/ipython-input-2637978816.py:8(matmul_naive)
      3/2    0.000    0.000    0.018    0.009 {method 'run' of '_contextvars.Context' objects}
        2    0.000    0.000    0.018    0.009 /usr/local/lib/python3.12/dist-packages/zmq/eventloop/zmqstream.py:574(_handle_events)
        1    0.000    0.000    0.017    0.017 /usr/local/lib/python3.12/dist-packages/tornado/platform/asyncio.py:206(_handle_events)
        2    0.000    0.000    0.017    0.009 /usr/local/lib/python3.12/dist-packages/zmq/even

In [None]:
# section_c_conv2d.py
import time, cProfile, pstats, io, tracemalloc

def gen_grid(h, w):
    """Return an ``h`` x ``w`` grid whose cell (i, j) is ``((i*j) % 255) / 255.0``."""
    grid = []
    for i in range(h):
        row = []
        for j in range(w):
            row.append((i * j) % 255 / 255.0)
        grid.append(row)
    return grid

def conv2d(grid, kernel):
    """Apply ``kernel`` to the interior of ``grid`` and return a new grid.

    The output has the same dimensions as ``grid``. Cells closer to the
    edge than the kernel radius (``len(kernel) // 2`` rows,
    ``len(kernel[0]) // 2`` columns) are left at ``0.0``. Each output cell
    is the sum of neighbourhood values times the matching kernel weights,
    accumulated row by row.
    """
    height = len(grid)
    width = len(grid[0])
    row_radius = len(kernel) // 2
    col_radius = len(kernel[0]) // 2
    row_span = 2 * row_radius + 1
    col_span = 2 * col_radius + 1

    result = [[0.0] * width for _ in range(height)]
    for i in range(row_radius, height - row_radius):
        top = i - row_radius
        for j in range(col_radius, width - col_radius):
            left = j - col_radius
            total = 0.0
            for ki in range(row_span):
                for kj in range(col_span):
                    total += grid[top + ki][left + kj] * kernel[ki][kj]
            result[i][j] = total
    return result

def make_uniform_kernel(size=5):
    """Return a ``size`` x ``size`` kernel whose entries are all ``1 / size**2``.

    Each row is a separate list object.
    """
    weight = 1.0 / (size * size)
    kernel = []
    for _ in range(size):
        kernel.append([weight] * size)
    return kernel

def main():
    """Time and profile ``conv2d`` on a generated grid with a 5x5 uniform kernel.

    tracemalloc and cProfile are both active during the timed call, so the
    reported time includes their instrumentation overhead. Results are
    printed to stdout.
    """
    H, W = 256, 256  # adjust based on machine
    grid = gen_grid(H, W)
    kernel = make_uniform_kernel(5)

    tracemalloc.start()
    try:
        # The context manager disables the profiler even if the workload
        # raises, so a failed run cannot leave a profiler active.
        with cProfile.Profile() as pr:
            t0 = time.perf_counter()
            out = conv2d(grid, kernel)
            t1 = time.perf_counter()
        current, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing so the next run starts from a clean state.
        tracemalloc.stop()

    print(f"Grid: {H}x{W} | Kernel: 5x5 | Time: {t1 - t0:.3f} s")
    # NOTE: the sample index is fixed at [128][128]; keep H and W above 128.
    print(f"Sample out[128][128]={out[128][128]:.6f}")
    print(f"Current/Peak memory: {current/1e6:.2f} MB / {peak/1e6:.2f} MB")

    s_buf = io.StringIO()
    ps = pstats.Stats(pr, stream=s_buf).sort_stats('cumtime')
    ps.print_stats(10)
    print("\n--- cProfile (Top 10 by cumulative time) ---")
    print(s_buf.getvalue())

if __name__ == "__main__":
    main()

Grid: 256x256 | Kernel: 5x5 | Time: 3.810 s
Sample out[128][128]=0.490980
Current/Peak memory: 2.08 MB / 2.08 MB

--- cProfile (Top 10 by cumulative time) ---
         347 function calls (343 primitive calls) in 3.809 seconds

   Ordered by: cumulative time
   List reduced from 68 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    3.798    1.899 /usr/lib/python3.12/asyncio/base_events.py:1922(_run_once)
        2    0.000    0.000    3.798    1.899 /usr/lib/python3.12/selectors.py:451(select)
        2    0.040    0.020    3.798    1.899 {method 'poll' of 'select.epoll' objects}
        3    3.016    1.005    3.016    1.005 {built-in method time.sleep}
        1    0.742    0.742    0.742    0.742 /tmp/ipython-input-3230325479.py:7(conv2d)
       13    0.011    0.001    0.011    0.001 /usr/local/lib/python3.12/dist-packages/zmq/sugar/socket.py:632(send)
        1    0.000    0.000    0.006    0.006 /usr/

In [None]:
# section_d_montecarlo_pi.py
import time, random, cProfile, pstats, io, tracemalloc

def estimate_pi(n_samples, seed=2025):
    """Estimate pi by sampling ``n_samples`` seeded points in the unit square.

    Each point takes two draws (x, then y) from the module-level ``random``
    generator after it is reseeded with ``seed``. The estimate is four
    times the fraction of points with ``x*x + y*y <= 1.0``.
    """
    random.seed(seed)
    draw = random.random
    hits = 0
    for _ in range(n_samples):
        px, py = draw(), draw()
        if not (px * px + py * py > 1.0):
            hits = hits + 1
    return 4.0 * hits / n_samples

def main():
    """Time and profile ``estimate_pi`` and print the results to stdout.

    tracemalloc and cProfile are both active during the timed call, so the
    reported time includes their instrumentation overhead.
    """
    N = 2_000_000  # adjust for your machine

    tracemalloc.start()
    try:
        # The context manager disables the profiler even if the workload
        # raises, so a failed run cannot leave a profiler active.
        with cProfile.Profile() as pr:
            t0 = time.perf_counter()
            pi_est = estimate_pi(N)
            t1 = time.perf_counter()
        current, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing so the next run starts from a clean state.
        tracemalloc.stop()

    print(f"N={N} -> pi ≈ {pi_est:.6f} | Time: {t1 - t0:.3f} s")
    print(f"Current/Peak memory: {current/1e6:.2f} MB / {peak/1e6:.2f} MB")

    s_buf = io.StringIO()
    ps = pstats.Stats(pr, stream=s_buf).sort_stats('cumtime')
    ps.print_stats(10)
    print("\n--- cProfile (Top 10 by cumulative time) ---")
    print(s_buf.getvalue())

if __name__ == "__main__":
    main()

N=2000000 -> pi ≈ 3.142150 | Time: 5.957 s
Current/Peak memory: 0.00 MB / 0.00 MB

--- cProfile (Top 10 by cumulative time) ---
         4000020 function calls (4000019 primitive calls) in 5.957 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        5    4.289    0.858    5.030    1.006 {built-in method time.sleep}
  4000000    0.877    0.000    0.877    0.000 {method 'random' of '_random.Random' objects}
      2/1    0.791    0.395    0.730    0.730 /tmp/ipython-input-3913647630.py:4(estimate_pi)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        1    0.000    0.000    0.000    0.000 /usr/lib/python3.12/random.py:135(seed)
        6    0.000    0.000    0.000    0.000 {built-in method posix.getppid}
        1    0.000    0.000    0.000    0.000 {function Random.seed at 0x7922b1d1e980}
        2    0.000    0.000    0.000    0.000 {built-in method builtins.isinstance}
 

In [None]:
# section_e_pairwise.py
import time, random, math, cProfile, pstats, io, tracemalloc

def gen_points(n, seed=7):
    """Return ``n`` seeded random ``(x, y)`` tuples.

    The module-level ``random`` generator is reseeded with ``seed``; each
    point takes two consecutive draws, x first.
    """
    random.seed(seed)
    draw = random.random
    points = []
    for _ in range(n):
        px = draw()
        py = draw()
        points.append((px, py))
    return points

def pairwise_potential(points, eps=1e-6):
    """Return, for each point, the sum of ``1 / (distance + eps)`` to all others.

    ``points`` is a sequence of ``(x, y)`` pairs. A point is never paired
    with itself; comparison is by index, so duplicate coordinates at
    different indices still interact. The result is a list as long as
    ``points``.
    """
    potentials = []
    for i, (px, py) in enumerate(points):
        total = 0.0
        for j, (qx, qy) in enumerate(points):
            if j != i:
                dx = px - qx
                dy = py - qy
                dist = math.sqrt(dx * dx + dy * dy) + eps
                total += 1.0 / dist
        potentials.append(total)
    return potentials

def main():
    """Time and profile ``pairwise_potential`` on seeded random points.

    tracemalloc and cProfile are both active during the timed call, so the
    reported time includes their instrumentation overhead. Results are
    printed to stdout.
    """
    N = 800  # adjust based on machine; O(N^2) interactions
    pts = gen_points(N)

    tracemalloc.start()
    try:
        # The context manager disables the profiler even if the workload
        # raises, so a failed run cannot leave a profiler active.
        with cProfile.Profile() as pr:
            t0 = time.perf_counter()
            pot = pairwise_potential(pts)
            t1 = time.perf_counter()
        current, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing so the next run starts from a clean state.
        tracemalloc.stop()

    print(f"N={N} -> computed potentials | Time: {t1 - t0:.3f} s")
    print(f"Sample pot[0]={pot[0]:.6f}, pot[N//2]={pot[N//2]:.6f}")
    print(f"Current/Peak memory: {current/1e6:.2f} MB / {peak/1e6:.2f} MB")

    s_buf = io.StringIO()
    ps = pstats.Stats(pr, stream=s_buf).sort_stats('cumtime')
    ps.print_stats(10)
    print("\n--- cProfile (Top 10 by cumulative time) ---")
    print(s_buf.getvalue())

if __name__ == "__main__":
    main()

N=800 -> computed potentials | Time: 2.504 s
Sample pot[0]=2390.396335, pot[N//2]=2337.388513
Current/Peak memory: 0.05 MB / 0.05 MB

--- cProfile (Top 10 by cumulative time) ---
         639598 function calls (639588 primitive calls) in 2.503 seconds

   Ordered by: cumulative time
   List reduced from 103 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    1.705    0.853    2.010    1.005 {built-in method time.sleep}
   639200    0.380    0.000    0.380    0.000 {built-in method math.sqrt}
        1    0.083    0.083    0.098    0.098 /tmp/ipython-input-2231473572.py:8(pairwise_potential)
        1    0.000    0.000    0.006    0.006 /usr/local/lib/python3.12/dist-packages/ipykernel/iostream.py:219(<lambda>)
        1    0.000    0.000    0.006    0.006 /usr/local/lib/python3.12/dist-packages/ipykernel/iostream.py:221(_really_send)
        1    0.000    0.000    0.006    0.006 /usr/local/lib/python3.12/dist-packages/zmq/