<a href="https://colab.research.google.com/github/04pys/cs-systems-labs/blob/main/notebooks/phase01_lowlevel_basics/lab04_prefetch_stride.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
%%bash
cat > main.cpp << 'CPP'

#include <bits/stdc++.h>
using namespace std;

// Current reading of std::chrono::steady_clock, expressed as a nanosecond
// count since that clock's epoch. Only differences between two readings are
// meaningful.
static inline long long now_ns() {
  const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
  const auto as_nanos =
      std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch);
  return as_nanos.count();
}

// Wraps index i into [0, n) by taking the remainder of i divided by n.
// n must be non-zero; the remainder is undefined for n == 0.
static inline size_t idx(size_t i, size_t n) {
  const size_t wrapped = i % n;
  return wrapped;
}

// Times one complete strided pass over `a` and returns the elapsed wall time
// in nanoseconds (steady clock).
//
//   a            data to read; every element is read exactly once per call
//   stride       distance between consecutive reads within one sweep
//   use_prefetch when true, a software prefetch is issued ahead of each read
//   dist         prefetch look-ahead, counted in strided iterations
//
// The sum is accumulated into a volatile so the reads are not optimized away.
long long bench_stride(const vector<int>& a, size_t stride, bool use_prefetch, size_t dist) {
  const size_t count = a.size();
  const size_t lookahead = dist * stride;  // element offset of the prefetch target
  volatile long long acc = 0;

  const long long started = now_ns();

  // One sweep per residue class: starting at `residue` and jumping by `stride`
  // visits exactly the elements whose index is congruent to residue (mod stride).
  for (size_t residue = 0; residue < stride; ++residue) {
    size_t pos = residue;
    while (pos < count) {
      if (use_prefetch) {
        const size_t target = pos + lookahead;
        // rw = 0 (read), locality = 1; skipped once the target runs off the end.
        if (target < count) __builtin_prefetch(&a[target], 0, 1);
      }
      acc = acc + a[pos];
      pos += stride;
    }
  }

  const long long finished = now_ns();
  return finished - started;
}

// Driver: for every stride, prints the baseline pass time followed by the
// pass time for each prefetch distance, all in milliseconds, one line per
// stride.
int main() {
  std::ios::sync_with_stdio(false);
  std::cin.tie(nullptr);

  // 2^26 = 67,108,864 ints (~256MB), chosen to be far larger than the caches.
  const size_t count = 1ull << 26;
  std::vector<int> data(count);
  for (size_t k = 0; k < count; ++k) {
    data[k] = static_cast<int>(k * 1315423911u);
  }

  const size_t stride_list[] = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048};
  const size_t dist_list[] = {1, 2, 3, 4, 6, 8, 11, 15, 16, 18, 22, 24, 32, 64};

  // Warm-up pass; its timing is discarded.
  (void)bench_stride(data, 1, false, 0);

  for (const size_t stride : stride_list) {
    const long long baseline_ns = bench_stride(data, stride, false, 0);
    std::cout << "stride=" << stride << " base_ms=" << baseline_ns / 1e+6;

    for (const size_t dist : dist_list) {
      const long long prefetch_ns = bench_stride(data, stride, true, dist);
      std::cout << " pf(d=" << dist << ")_ms=" << prefetch_ns / 1e+6;
    }
    std::cout << "\n";
  }
  return 0;
}


CPP

In [5]:
%%bash
cat > test.cpp << 'CPP'
#include <bits/stdc++.h>
using namespace std;

#if defined(__x86_64__) || defined(__i386__)
#include <x86intrin.h>
// Reads the x86 time-stamp counter to mark the start of a timed region.
// The asm issues LFENCE immediately followed by RDTSC; the "memory" clobber
// additionally keeps the compiler from moving memory accesses across it.
// Returns the 64-bit counter assembled from EDX (high half) and EAX (low half).
static inline uint64_t rdtsc_begin() {
  unsigned lo, hi;
  asm volatile("lfence\nrdtsc" : "=a"(lo), "=d"(hi) :: "memory");
  return (uint64_t(hi) << 32) | lo;
}
// Reads the x86 time-stamp counter to mark the end of a timed region.
// The asm issues RDTSCP followed by LFENCE. rcx is declared clobbered because
// RDTSCP also writes ECX; the "memory" clobber keeps the compiler from moving
// memory accesses across the statement.
// Returns the 64-bit counter assembled from EDX (high half) and EAX (low half).
static inline uint64_t rdtsc_end() {
  unsigned lo, hi;
  asm volatile("rdtscp\nlfence" : "=a"(lo), "=d"(hi) :: "rcx", "memory");
  return (uint64_t(hi) << 32) | lo;
}
#else
#error "RDTSC not supported on this architecture"
#endif

// Returns the median of v, or 0 when v is empty.
// For an even number of samples the upper of the two middle values is
// returned (element size/2 of the sorted order).
// Side effect: v is partially reordered in place by nth_element.
static uint64_t median_u64(vector<uint64_t>& v) {
  // Without this guard an empty vector would be indexed at v[0].
  if (v.empty()) return 0;
  const auto mid = v.begin() + static_cast<std::ptrdiff_t>(v.size() / 2);
  std::nth_element(v.begin(), mid, v.end());
  return *mid;
}

// Simple cache-thrashing helper: reads one byte out of every 64 across buf
// (64 is presumably the cache-line size) so that data cached by a previous
// measurement is pushed out. The bytes are summed into a volatile so the
// reads cannot be optimized away; buf itself is not modified.
static inline void thrash_cache(vector<uint8_t>& buf) {
  constexpr size_t kStep = 64;
  const size_t len = buf.size();
  volatile uint64_t acc = 0;
  size_t pos = 0;
  while (pos < len) {
    acc = acc + buf[pos];
    pos += kStep;
  }
  (void)acc;
}

// Successor function for the strided visiting order used by the benchmarks:
//   for off = 0..s-1: for i = off; i < n; i += s
// Given the current index i, returns the index visited next in that order.
// After the very last element the order wraps back to index 0, so following
// successors from 0 forms a single cycle over all n indices.
//
//   i  current index (expected to be < n)
//   n  number of elements
//   s  stride; s == 0 has no defined order and yields 0
//
// The result is narrowed to uint32_t, so n is expected to fit in 32 bits.
static inline uint32_t succ_index(size_t i, size_t n, size_t s) {
  if (s == 0) return 0u;  // avoids the division by zero in i % s

  const size_t same_residue_next = i + s;
  if (same_residue_next < n) return static_cast<uint32_t>(same_residue_next);

  // Last element of this residue class: continue with the first element of
  // the next class. When s > n the classes n..s-1 are empty, so the next
  // class must also be a valid index; otherwise wrap to the start.
  const size_t next_residue = (i % s) + 1;
  if (next_residue < s && next_residue < n) return static_cast<uint32_t>(next_residue);
  return 0u;
}

// Overwrites every slot of `next` with the index that follows it in the
// stride-s visiting order (as computed by succ_index), reusing the existing
// storage instead of allocating a new table.
static void build_next_in_place(vector<uint32_t>& next, size_t s) {
  const size_t total = next.size();
  size_t pos = 0;
  for (uint32_t& slot : next) {
    slot = succ_index(pos, total, s);
    ++pos;
  }
}

// Pointer-chase latency measurement.
// Starting at index 0, performs `steps` dependent loads (cur = next[cur]) and
// returns the median, over `trials` runs, of TSC cycles per step.
//
//   next        successor table; every entry must be a valid index into it
//   steps       number of dependent loads per trial, clamped to next.size()
//   trials      number of timed runs
//   thrash_buf  optional buffer swept with thrash_cache before each trial
//
// Returns 0.0 when there is nothing to measure (no steps after clamping, or
// trials <= 0) instead of dividing by zero or taking a median of no samples.
static double measure_pointer_chase_cycles_per_step(
    const vector<uint32_t>& next,
    size_t steps,
    int trials,
    vector<uint8_t>* thrash_buf = nullptr) {

  const size_t n = next.size();
  if (steps > n) steps = n;
  if (steps == 0 || trials <= 0) return 0.0;

  std::vector<uint64_t> cyc;
  cyc.reserve(static_cast<size_t>(trials));

  for (int t = 0; t < trials; ++t) {
    if (thrash_buf) thrash_cache(*thrash_buf);

    // Both are volatile so every step's load and accumulation is emitted.
    volatile uint32_t cur = 0;
    volatile uint64_t sink = 0;

    const uint64_t t0 = rdtsc_begin();
    for (size_t k = 0; k < steps; ++k) {
      cur = next[cur];     // dependent load: the address comes from the previous load
      sink += cur;         // observed value that keeps the chain from being removed
    }
    const uint64_t t1 = rdtsc_end();
    (void)sink;
    cyc.push_back(t1 - t0);
  }

  const uint64_t med = median_u64(cyc);
  return static_cast<double>(med) / static_cast<double>(steps);
}

// Measures the strided summation loop in TSC cycles, intended for a small
// ("hot") array so that cycles/access of the loop itself can be derived.
// The whole strided pass is repeated `reps` times to lengthen the timed
// region; every pass reads each element of `a` exactly once.
//
//   a            data to read
//   stride       distance between consecutive reads within one sweep
//   use_prefetch when true, a software prefetch is issued ahead of each read
//   dist         prefetch look-ahead, counted in strided iterations
//   reps         number of complete passes inside the timed region
//
// Returns the elapsed cycle count for all passes together.
static uint64_t bench_stride_cycles_hot(
    const vector<int>& a,
    size_t stride,
    bool use_prefetch,
    size_t dist,
    size_t reps) {

  const size_t count = a.size();
  const size_t lookahead = dist * stride;  // element offset of the prefetch target
  volatile long long acc = 0;

  const uint64_t tsc_start = rdtsc_begin();

  for (size_t pass = 0; pass < reps; ++pass) {
    for (size_t residue = 0; residue < stride; ++residue) {
      size_t pos = residue;
      while (pos < count) {
        if (use_prefetch) {
          const size_t target = pos + lookahead;
          if (target < count) __builtin_prefetch(&a[target], 0, 1);
        }
        acc = acc + a[pos];
        pos += stride;
      }
    }
  }

  const uint64_t tsc_stop = rdtsc_end();
  (void)acc;
  return tsc_stop - tsc_start;
}

// Runs bench_stride_cycles_hot `trials` times and returns the median cycle
// count divided by the number of element reads per trial (reps * hot.size()),
// i.e. the median cycles per access.
//
//   hot          small array handed to the stride benchmark
//   stride, use_prefetch, dist, reps
//                forwarded unchanged to bench_stride_cycles_hot
//   trials       number of timed runs
//   thrash_buf   optional buffer swept with thrash_cache before each trial
//
// Returns 0.0 when nothing can be measured (trials <= 0, reps == 0, or an
// empty array) instead of taking a median of no samples or dividing by zero.
static double bench_hot_cycles_per_access_median(
    const vector<int>& hot,
    size_t stride,
    bool use_prefetch,
    size_t dist,
    size_t reps,
    int trials,
    vector<uint8_t>* thrash_buf = nullptr) {

  if (trials <= 0 || reps == 0 || hot.empty()) return 0.0;

  std::vector<uint64_t> samples;
  samples.reserve(static_cast<size_t>(trials));
  for (int t = 0; t < trials; ++t) {
    if (thrash_buf) thrash_cache(*thrash_buf);
    samples.push_back(bench_stride_cycles_hot(hot, stride, use_prefetch, dist, reps));
  }

  const uint64_t med = median_u64(samples);
  const double total_access = static_cast<double>(reps) * static_cast<double>(hot.size());
  return static_cast<double>(med) / total_access;
}

// Builds the list of prefetch distances to try around a predicted distance.
// Candidates are pred scaled by 1/2, 3/4, 1, 5/4, 3/2 and 2, plus the nearest
// powers of two below and above max(1, pred). Each value is clamped to
// [1, 4096]; the result is sorted ascending with duplicates removed.
//
//   pred  predicted prefetch distance (0 is treated like the minimum)
//
// Returns between 1 and 8 distinct distances.
static vector<size_t> make_candidates(size_t pred) {
  constexpr size_t kMinDist = 1;
  constexpr size_t kMaxDist = 4096;

  // Once pred/2 already reaches the cap, every candidate clamps to kMaxDist.
  // Returning early also keeps the multiplications and the power-of-two
  // search below from overflowing for very large pred.
  if (pred >= 2 * kMaxDist) return {kMaxDist};

  const size_t base = std::max<size_t>(1, pred);
  size_t floor_pow2 = 1;  // largest power of two <= base
  while ((floor_pow2 << 1) <= base) floor_pow2 <<= 1;
  // Smallest power of two >= base.
  const size_t ceil_pow2 = (floor_pow2 < base) ? (floor_pow2 << 1) : floor_pow2;

  std::vector<size_t> c = {
      pred / 2,
      (pred * 3) / 4,
      pred,
      (pred * 5) / 4,
      (pred * 3) / 2,
      pred * 2,
      floor_pow2,
      ceil_pow2,
  };
  for (size_t& x : c) x = std::clamp(x, kMinDist, kMaxDist);

  std::sort(c.begin(), c.end());
  c.erase(std::unique(c.begin(), c.end()), c.end());
  return c;
}

// Benchmark driver. For each stride it:
//   1) rebuilds the `next` successor table and measures pointer-chase
//      cycles per step (L_lat),
//   2) measures cycles per access of the stride loop on the small `hot`
//      array (T_iter),
//   3) predicts a prefetch distance as ceil(L_lat / T_iter), clamped to
//      [1, 4096], and derives candidate distances from it,
//   4) times the streaming loop over the large array without prefetch and
//      with each candidate distance (median of 7 trials each), and
//   5) prints one summary line with the best distance found.
int main() {
  ios::sync_with_stdio(false);
  cin.tie(nullptr);

  const size_t n_cold = 1ull << 26; // 2^26 uint32_t entries, i.e. a 256MB region
  vector<uint32_t> next(n_cold);

  const size_t n_hot = 1ull << 13;  // 2^13 ints, i.e. about 32KB
  vector<int> hot(n_hot);
  for (size_t i = 0; i < n_hot; ++i) hot[i] = int(i * 2654435761u);

  vector<size_t> strides = {1,2,4,8,16,32,64,128,256,512,1024,2048};

  // The thrash buffer is meant to disturb (part of) the L3 cache.
  // Making it too large would only lengthen the run, so it is kept at 64MB.
  vector<uint8_t> thr(64ull << 20, 1);

  // Chasing the whole table would take too long, so only a prefix of steps is used.
  // Author's rationale: even a prefix walks a large memory range, so the access stays cold.
  // NOTE(review): for small strides the first 4M steps cover only about
  // steps * stride elements from index 0 — confirm this is cold enough.
  const size_t CHASE_STEPS = 1ull << 22; // 4M steps
  const int CHASE_TRIALS = 5;
  const int HOT_TRIALS = 7;

  cout << fixed << setprecision(3);

  // warmup
  thrash_cache(thr);
  (void)bench_hot_cycles_per_access_median(hot, 1, false, 0, 256, 3, &thr);

  for (size_t s : strides) {
    // Build the successor table for this stride and measure pointer-chase latency directly.
    build_next_in_place(next, s);

    // 1) Measure L_lat: cycles per dependent step.
    double L_lat_cyc = measure_pointer_chase_cycles_per_step(next, CHASE_STEPS, CHASE_TRIALS, &thr);

    // 2) Measure T_iter: cycles per access.
    // Average cycles/access of the stride loop on the hot array.
    // reps is raised so the timed region is long enough to reduce noise.
    size_t reps_hot = 2048; // total accesses = hot.size() * reps_hot
    double T_iter_cyc = bench_hot_cycles_per_access_median(hot, s, false, 0, reps_hot, HOT_TRIALS, &thr);

    // 3) Compute dist_pred.
    // Smallest dist satisfying dist * T_iter >= L_lat.
    size_t dist_pred = 1;
    if (T_iter_cyc > 0.0) {
      dist_pred = (size_t)ceil(L_lat_cyc / T_iter_cyc);
      if (dist_pred < 1) dist_pred = 1;
      if (dist_pred > 4096) dist_pred = 4096;
    }

    // Candidate distances are generated from the prediction only.
    vector<size_t> cand = make_candidates(dist_pred);

    // The cold benchmark is the same sum-based streaming loop as before, measured in cycles.
    // `next` holds uint32_t while the hot bench function takes int, so that function
    // cannot be reused directly; instead the `next` storage itself is viewed as int.
    // Only the addresses touched matter here, not the values read.
    const int* cold_view = reinterpret_cast<const int*>(next.data());
    size_t cold_n_int = next.size(); // assumes sizeof(uint32_t) == sizeof(int)

    // Times one full strided pass over cold_view; returns elapsed TSC cycles.
    auto bench_cold_cycles = [&](bool use_pf, size_t dist) -> uint64_t {
      volatile long long sum = 0;
      uint64_t t0 = rdtsc_begin();
      for (size_t off = 0; off < s; ++off) {
        for (size_t i = off; i < cold_n_int; i += s) {
          if (use_pf) {
            size_t ni = i + dist * s;
            if (ni < cold_n_int) __builtin_prefetch(&cold_view[ni], 0, 1);
          }
          sum += cold_view[i];
        }
      }
      uint64_t t1 = rdtsc_end();
      (void)sum;
      return t1 - t0;
    };

    // Compare the no-prefetch baseline against every candidate distance.
    vector<uint64_t> base_trials;
    base_trials.reserve(7);
    for (int t = 0; t < 7; ++t) {
      thrash_cache(thr);
      base_trials.push_back(bench_cold_cycles(false, 0));
    }
    uint64_t base_cyc = median_u64(base_trials);
    double base_cyc_per_access = (double)base_cyc / (double)cold_n_int;

    // best_dist stays at (size_t)-1 ("none") unless some candidate beats the baseline.
    uint64_t best_cyc = base_cyc;
    size_t best_dist = (size_t)-1;
    double best_cyc_per_access = base_cyc_per_access;

    for (size_t d : cand) {
      vector<uint64_t> trials;
      trials.reserve(7);
      for (int t = 0; t < 7; ++t) {
        thrash_cache(thr);
        trials.push_back(bench_cold_cycles(true, d));
      }
      uint64_t med = median_u64(trials);
      double cyc_per_access = (double)med / (double)cold_n_int;
      if (med < best_cyc) {
        best_cyc = med;
        best_dist = d;
        best_cyc_per_access = cyc_per_access;
      }
    }

    // Relative gain of the best candidate over the baseline, in percent.
    // best_cyc never exceeds base_cyc, so the unsigned difference cannot wrap.
    double improve = (double)(base_cyc - best_cyc) / (double)base_cyc * 100.0;

    cout << "stride=" << s
         << " L_lat_cyc=" << L_lat_cyc
         << " T_iter_cyc=" << T_iter_cyc
         << " dist_pred=" << dist_pred
         << " candidates=[";
    for (size_t i = 0; i < cand.size(); ++i) cout << cand[i] << (i+1==cand.size()? "" : ",");
    cout << "]"
         << " base_cyc_per_access=" << base_cyc_per_access
         << " best_dist=";
    if (best_dist == (size_t)-1) cout << "none";
    else cout << best_dist;
    cout << " best_cyc_per_access=" << best_cyc_per_access
         << " improvement=" << improve << "%\n";
  }

  return 0;
}



CPP

In [9]:
!g++ -O2 -std=c++17 main.cpp -o main

In [6]:
!g++ -O2 -std=c++17 test.cpp -o test

In [None]:
!./main

stride=1 base_ms=159.572 pf(d=8)_ms=165.931 pf(d=16)_ms=166.412 pf(d=32)_ms=173.243 pf(d=64)_ms=179.864
stride=2 base_ms=99.1241 pf(d=8)_ms=88.3046 pf(d=16)_ms=90.9427 pf(d=32)_ms=83.6366 pf(d=64)_ms=84.6208
stride=4 base_ms=49.512 pf(d=8)_ms=47.0457 pf(d=16)_ms=48.7618 pf(d=32)_ms=44.2399 pf(d=64)_ms=43.1333
stride=8 base_ms=34.2654 pf(d=8)_ms=31.0601 pf(d=16)_ms=31.5043 pf(d=32)_ms=34.2611 pf(d=64)_ms=27.3765
stride=16 base_ms=31.2899 pf(d=8)_ms=30.0586 pf(d=16)_ms=29.5048 pf(d=32)_ms=27.9705 pf(d=64)_ms=30.6197
stride=32 base_ms=26.4143 pf(d=8)_ms=24.6973 pf(d=16)_ms=25.5165 pf(d=32)_ms=25.6141 pf(d=64)_ms=28.5957
stride=64 base_ms=19.602 pf(d=8)_ms=18.584 pf(d=16)_ms=13.7295 pf(d=32)_ms=14.8101 pf(d=64)_ms=14.3161
stride=128 base_ms=9.10689 pf(d=8)_ms=7.88027 pf(d=16)_ms=7.24489 pf(d=32)_ms=7.41617 pf(d=64)_ms=6.16846
stride=256 base_ms=3.86432 pf(d=8)_ms=3.14948 pf(d=16)_ms=2.87053 pf(d=32)_ms=3.06814 pf(d=64)_ms=3.04295
stride=512 base_ms=2.38526 pf(d=8)_ms=2.87528 pf(d=16)_ms=2.

In [None]:
!./main

stride=1 base_ms=145.727 pf(d=8)_ms=130.487 pf(d=16)_ms=129.666 pf(d=32)_ms=133.638 pf(d=64)_ms=134.621
stride=2 base_ms=153.482 pf(d=8)_ms=137.947 pf(d=16)_ms=136.252 pf(d=32)_ms=133.779 pf(d=64)_ms=133.165
stride=4 base_ms=176.601 pf(d=8)_ms=172.492 pf(d=16)_ms=158.773 pf(d=32)_ms=150.514 pf(d=64)_ms=144.199
stride=8 base_ms=230.179 pf(d=8)_ms=229.564 pf(d=16)_ms=222.029 pf(d=32)_ms=215.593 pf(d=64)_ms=199.784
stride=16 base_ms=427.614 pf(d=8)_ms=425.515 pf(d=16)_ms=412.031 pf(d=32)_ms=394.564 pf(d=64)_ms=406.247
stride=32 base_ms=877.067 pf(d=8)_ms=780.409 pf(d=16)_ms=800.41 pf(d=32)_ms=856.033 pf(d=64)_ms=712.267
stride=64 base_ms=890.534 pf(d=8)_ms=812.861 pf(d=16)_ms=768.21 pf(d=32)_ms=749.671 pf(d=64)_ms=745.485
stride=128 base_ms=751.55 pf(d=8)_ms=677.623 pf(d=16)_ms=687.638 pf(d=32)_ms=779.819 pf(d=64)_ms=793.461
stride=256 base_ms=761.339 pf(d=8)_ms=669.808 pf(d=16)_ms=769.151 pf(d=32)_ms=730.64 pf(d=64)_ms=733.342
stride=512 base_ms=891.399 pf(d=8)_ms=720.975 pf(d=16)_ms=686

In [None]:
!./main

stride=1 base_ms=151.768 pf(d=8)_ms=130.576 pf(d=16)_ms=131.703 pf(d=32)_ms=137.306 pf(d=64)_ms=128.781
stride=2 base_ms=156.014 pf(d=8)_ms=139.828 pf(d=16)_ms=142.738 pf(d=32)_ms=135.345 pf(d=64)_ms=136.201
stride=4 base_ms=178.059 pf(d=8)_ms=156.735 pf(d=16)_ms=156.974 pf(d=32)_ms=151.378 pf(d=64)_ms=155.559
stride=8 base_ms=227.269 pf(d=8)_ms=215.527 pf(d=16)_ms=213.801 pf(d=32)_ms=214.375 pf(d=64)_ms=207.218
stride=16 base_ms=413.581 pf(d=8)_ms=402.56 pf(d=16)_ms=393.6 pf(d=32)_ms=391.219 pf(d=64)_ms=394.657
stride=32 base_ms=710.853 pf(d=8)_ms=698.67 pf(d=16)_ms=720.512 pf(d=32)_ms=741.049 pf(d=64)_ms=725.804
stride=64 base_ms=1089.52 pf(d=8)_ms=968.443 pf(d=16)_ms=825.709 pf(d=32)_ms=801.701 pf(d=64)_ms=791.467
stride=128 base_ms=743.806 pf(d=8)_ms=641.27 pf(d=16)_ms=672.286 pf(d=32)_ms=733.96 pf(d=64)_ms=751.613
stride=256 base_ms=727.369 pf(d=8)_ms=711.539 pf(d=16)_ms=653.52 pf(d=32)_ms=671.346 pf(d=64)_ms=710.749
stride=512 base_ms=920.638 pf(d=8)_ms=793.105 pf(d=16)_ms=790.64

In [None]:
!./main

stride=1 base_ms=154.112 pf(d=8)_ms=168.854 pf(d=16)_ms=186.588 pf(d=32)_ms=167.054 pf(d=64)_ms=176.415
stride=2 base_ms=175.237 pf(d=8)_ms=165.537 pf(d=16)_ms=168.567 pf(d=32)_ms=140.309 pf(d=64)_ms=134.728
stride=4 base_ms=178.883 pf(d=8)_ms=166.621 pf(d=16)_ms=157.727 pf(d=32)_ms=154.253 pf(d=64)_ms=144.005
stride=8 base_ms=244.439 pf(d=8)_ms=237.362 pf(d=16)_ms=234.444 pf(d=32)_ms=238.8 pf(d=64)_ms=218.405
stride=16 base_ms=437.771 pf(d=8)_ms=421.283 pf(d=16)_ms=447.774 pf(d=32)_ms=425.762 pf(d=64)_ms=453.624
stride=32 base_ms=826.883 pf(d=8)_ms=801.607 pf(d=16)_ms=714.234 pf(d=32)_ms=772.808 pf(d=64)_ms=743.542
stride=64 base_ms=965.297 pf(d=8)_ms=986.852 pf(d=16)_ms=951.303 pf(d=32)_ms=1077.65 pf(d=64)_ms=1253.6
stride=128 base_ms=968.022 pf(d=8)_ms=887.343 pf(d=16)_ms=899.71 pf(d=32)_ms=950.22 pf(d=64)_ms=907.748
stride=256 base_ms=857.604 pf(d=8)_ms=870.254 pf(d=16)_ms=951.207 pf(d=32)_ms=967.343 pf(d=64)_ms=915.116
stride=512 base_ms=913.397 pf(d=8)_ms=959.857 pf(d=16)_ms=885.

In [None]:
# dist는 공간 locality가 아닌 시간 locality를 관리할 수 있도록 해준다.
# 이 내용을 보고서에 쓸 것.

In [None]:
!./test

stride=1 base_ms=218.778 base_ns_per_access=3.260 hot_ms=232.460 hot_ns_per_access=3.464 L_eff_est=0.000 dist_pred=1 candidates=[1,2] best_dist=0 best_ms=218.778 best_ns_per_access=3.260 improvement=0.000%
stride=2 base_ms=228.598 base_ns_per_access=3.406 hot_ms=228.881 hot_ns_per_access=3.411 L_eff_est=0.000 dist_pred=1 candidates=[1,2] best_dist=0 best_ms=228.598 best_ns_per_access=3.406 improvement=0.000%
stride=4 base_ms=267.737 base_ns_per_access=3.990 hot_ms=230.726 hot_ns_per_access=3.438 L_eff_est=0.552 dist_pred=1 candidates=[1,2] best_dist=1 best_ms=227.333 best_ns_per_access=3.388 improvement=15.091%
stride=8 base_ms=256.308 base_ns_per_access=3.819 hot_ms=233.279 hot_ns_per_access=3.476 L_eff_est=0.343 dist_pred=1 candidates=[1,2] best_dist=2 best_ms=253.171 best_ns_per_access=3.773 improvement=1.224%
stride=16 base_ms=400.927 base_ns_per_access=5.974 hot_ms=241.039 hot_ns_per_access=3.592 L_eff_est=2.383 dist_pred=1 candidates=[1,2] best_dist=0 best_ms=400.927 best_ns_per_

In [4]:
!./test

stride=1 base_ms=244.982 base_ns_per_access=3.651 hot_ms=243.023 hot_ns_per_access=3.621 L_eff_est=0.029 dist_pred=1 candidates=[1,2] best_dist=2 best_ms=243.454 best_ns_per_access=3.628 improvement=0.624%
stride=2 base_ms=228.512 base_ns_per_access=3.405 hot_ms=236.566 hot_ns_per_access=3.525 L_eff_est=0.000 dist_pred=1 candidates=[1,2] best_dist=0 best_ms=228.512 best_ns_per_access=3.405 improvement=0.000%
stride=4 base_ms=232.873 base_ns_per_access=3.470 hot_ms=239.737 hot_ns_per_access=3.572 L_eff_est=0.000 dist_pred=1 candidates=[1,2] best_dist=1 best_ms=231.251 best_ns_per_access=3.446 improvement=0.696%
stride=8 base_ms=306.488 base_ns_per_access=4.567 hot_ms=240.164 hot_ns_per_access=3.579 L_eff_est=0.988 dist_pred=1 candidates=[1,2] best_dist=1 best_ms=255.152 best_ns_per_access=3.802 improvement=16.750%
stride=16 base_ms=414.036 base_ns_per_access=6.170 hot_ms=244.353 hot_ns_per_access=3.641 L_eff_est=2.528 dist_pred=1 candidates=[1,2] best_dist=1 best_ms=401.246 best_ns_per_

In [7]:
!./test

stride=1 L_lat_cyc=13.511 T_iter_cyc=7.068 dist_pred=2 candidates=[1,2,3,4] base_cyc_per_access=7.982 best_dist=1 best_cyc_per_access=7.487 improvement=6.208%
stride=2 L_lat_cyc=13.623 T_iter_cyc=7.678 dist_pred=2 candidates=[1,2,3,4] base_cyc_per_access=7.550 best_dist=4 best_cyc_per_access=7.187 improvement=4.809%
stride=4 L_lat_cyc=14.182 T_iter_cyc=7.523 dist_pred=2 candidates=[1,2,3,4] base_cyc_per_access=8.431 best_dist=3 best_cyc_per_access=7.246 improvement=14.049%
stride=8 L_lat_cyc=15.438 T_iter_cyc=7.587 dist_pred=3 candidates=[1,2,3,4,6] base_cyc_per_access=8.336 best_dist=6 best_cyc_per_access=7.786 improvement=6.596%
stride=16 L_lat_cyc=24.420 T_iter_cyc=7.752 dist_pred=4 candidates=[2,3,4,5,6,8] base_cyc_per_access=13.013 best_dist=4 best_cyc_per_access=12.723 improvement=2.225%
stride=32 L_lat_cyc=56.223 T_iter_cyc=8.164 dist_pred=7 candidates=[3,4,5,7,8,10,14] base_cyc_per_access=22.779 best_dist=none best_cyc_per_access=22.779 improvement=0.000%
stride=64 L_lat_cyc=12

In [10]:
!./main

stride=1 base_ms=234.71 pf(d=1)_ms=246.161 pf(d=2)_ms=264.619 pf(d=3)_ms=249.542 pf(d=4)_ms=250.561 pf(d=6)_ms=261.304 pf(d=8)_ms=255.109 pf(d=11)_ms=248.351 pf(d=15)_ms=246.773 pf(d=16)_ms=247.093 pf(d=18)_ms=255.372 pf(d=22)_ms=258.442 pf(d=24)_ms=253.75 pf(d=32)_ms=254.162 pf(d=64)_ms=269.263
stride=2 base_ms=234.954 pf(d=1)_ms=228.115 pf(d=2)_ms=227.907 pf(d=3)_ms=234.826 pf(d=4)_ms=228.537 pf(d=6)_ms=229.844 pf(d=8)_ms=226.449 pf(d=11)_ms=227.207 pf(d=15)_ms=226.548 pf(d=16)_ms=220.007 pf(d=18)_ms=219.356 pf(d=22)_ms=218.205 pf(d=24)_ms=227.689 pf(d=32)_ms=232.744 pf(d=64)_ms=231.79
stride=4 base_ms=234.121 pf(d=1)_ms=229.958 pf(d=2)_ms=232.267 pf(d=3)_ms=226.924 pf(d=4)_ms=225.065 pf(d=6)_ms=226.892 pf(d=8)_ms=227.638 pf(d=11)_ms=230.968 pf(d=15)_ms=227.39 pf(d=16)_ms=226.254 pf(d=18)_ms=229.8 pf(d=22)_ms=231.171 pf(d=24)_ms=226.145 pf(d=32)_ms=225.248 pf(d=64)_ms=225.825
stride=8 base_ms=255.898 pf(d=1)_ms=261.773 pf(d=2)_ms=251.783 pf(d=3)_ms=260.101 pf(d=4)_ms=260.173 pf(d=6)_

In [11]:
!./main

stride=1 base_ms=237.765 pf(d=1)_ms=270.022 pf(d=2)_ms=255.575 pf(d=3)_ms=255.438 pf(d=4)_ms=256.18 pf(d=6)_ms=249.789 pf(d=8)_ms=250.065 pf(d=11)_ms=269.177 pf(d=15)_ms=250.234 pf(d=16)_ms=245.459 pf(d=18)_ms=255.07 pf(d=22)_ms=267.191 pf(d=24)_ms=261.789 pf(d=32)_ms=251.204 pf(d=64)_ms=246.697
stride=2 base_ms=236.869 pf(d=1)_ms=230.706 pf(d=2)_ms=227.284 pf(d=3)_ms=227.825 pf(d=4)_ms=228.248 pf(d=6)_ms=236.618 pf(d=8)_ms=228.101 pf(d=11)_ms=226.137 pf(d=15)_ms=220.049 pf(d=16)_ms=223.762 pf(d=18)_ms=226.12 pf(d=22)_ms=220.32 pf(d=24)_ms=232.55 pf(d=32)_ms=230.336 pf(d=64)_ms=239.048
stride=4 base_ms=235.271 pf(d=1)_ms=227.971 pf(d=2)_ms=228.932 pf(d=3)_ms=234.199 pf(d=4)_ms=231.168 pf(d=6)_ms=226.106 pf(d=8)_ms=226.604 pf(d=11)_ms=226.137 pf(d=15)_ms=237.838 pf(d=16)_ms=226.527 pf(d=18)_ms=226.332 pf(d=22)_ms=225.726 pf(d=24)_ms=232.795 pf(d=32)_ms=230.591 pf(d=64)_ms=223.382
stride=8 base_ms=257.653 pf(d=1)_ms=261.366 pf(d=2)_ms=264.639 pf(d=3)_ms=262.038 pf(d=4)_ms=262.137 pf(d=6)

In [12]:
!./main

stride=1 base_ms=250.278 pf(d=1)_ms=258.267 pf(d=2)_ms=261.74 pf(d=3)_ms=254.944 pf(d=4)_ms=256.003 pf(d=6)_ms=264.055 pf(d=8)_ms=259.202 pf(d=11)_ms=242.686 pf(d=15)_ms=243.572 pf(d=16)_ms=242.01 pf(d=18)_ms=242.287 pf(d=22)_ms=241.808 pf(d=24)_ms=243.621 pf(d=32)_ms=222.828 pf(d=64)_ms=245.141
stride=2 base_ms=236.403 pf(d=1)_ms=230.927 pf(d=2)_ms=228.701 pf(d=3)_ms=229.122 pf(d=4)_ms=233.873 pf(d=6)_ms=231.692 pf(d=8)_ms=230.392 pf(d=11)_ms=226.428 pf(d=15)_ms=224.486 pf(d=16)_ms=223.959 pf(d=18)_ms=223.356 pf(d=22)_ms=219.354 pf(d=24)_ms=224.568 pf(d=32)_ms=235.123 pf(d=64)_ms=233.533
stride=4 base_ms=237.269 pf(d=1)_ms=228.493 pf(d=2)_ms=230.433 pf(d=3)_ms=232.966 pf(d=4)_ms=225.761 pf(d=6)_ms=226.366 pf(d=8)_ms=226.212 pf(d=11)_ms=231.946 pf(d=15)_ms=231.403 pf(d=16)_ms=228.203 pf(d=18)_ms=227.248 pf(d=22)_ms=227.588 pf(d=24)_ms=235.948 pf(d=32)_ms=225.232 pf(d=64)_ms=224.253
stride=8 base_ms=260.668 pf(d=1)_ms=271.342 pf(d=2)_ms=258.897 pf(d=3)_ms=265.092 pf(d=4)_ms=310.171 pf(d