In [1]:
import torch
import os
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scienceplots
import imageio.v2 as imageio
import glob

from matplotlib import animation
from matplotlib.animation import PillowWriter

# Apply the 'science' + 'notebook' matplotlib styles to every figure created
# afterwards (the 'science' style presumably comes from the scienceplots import).
plt.style.use(['science', 'notebook'])

# Run tensor work on the GPU when PyTorch reports a usable CUDA device,
# otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

  return torch._C._cuda_getDeviceCount() > 0


## TWO STREAM INSTABILITY USING TORCH

In [None]:
# --- Primary simulation inputs ---------------------------------------------
L  = 100    # Domain of the solution 0 <= x <= L  (in Debye lengths)
N  = 25000  # Number of electrons
J  = 1000   # Number of grid-points
vb = 5      # Beam velocity

dt = 0.1    # time step  (in inverse plasma frequencies)
t_max = 150  # such that 0 <= t <= t_max

# Check that the input parameters make sense *before* deriving anything from
# them: L and dt are used as divisors below, so a zero value must be reported
# as an invalid parameter rather than surfacing as a ZeroDivisionError.
if N < 1 or J < 2 or L <= 0 or vb <= 0 or dt <= 0 or t_max <= 0:
    raise ValueError("Error - invalid input parameters")

# --- Derived quantities ----------------------------------------------------
n0 = N / L                   # ion number density
dx = L / J                   # grid spacing
timesteps = int(t_max / dt)  # number of time steps covering [0, t_max]

# The run must contain at least 10 time steps (same threshold as the original
# check `int(t_max / dt) / 10 < 1`).
if timesteps < 10:
    raise ValueError("Error - invalid input parameters")
   

| Goal                     | NumPy                                | PyTorch                                |
|--------------------------|---------------------------------------|----------------------------------------|
| Uniform sample [a, b)    | `np.random.uniform(a, b, size)`       | `(b - a) * torch.rand(size) + a`       |
| Normal sample (mean, σ)  | `np.random.normal(mean, std, size)`   | `torch.normal(mean, std, size)`        |
| Random ints [a, b)       | `np.random.randint(a, b, size)`       | `torch.randint(a, b, size)`            |


The following code takes a long time to run, even though it is a direct PyTorch port of the NumPy implementation.

```Python

# Initial positions
r0 = torch.rand(N, device=device) * L

# Initial velocities with rejection sampling
def sample_velocity(v_b, tails_factor):
    """Draw N beam velocities one sample at a time by rejection sampling.

    This is the deliberately naive reference version: every loop iteration
    proposes a single candidate and accepts or rejects it.

    Reads the module-level names ``N`` (number of samples to produce) and
    ``device`` (where the tensors are allocated).

    Args:
        v_b: Beam velocity; the target curve has bumps centred at +v_b and -v_b.
        tails_factor: Candidates are drawn uniformly from
            [-tails_factor * v_b, tails_factor * v_b).

    Returns:
        1-D tensor of length N holding the accepted velocities.
    """
    # Value of the target curve at v = +/- v_b, used as the height of the
    # uniform envelope.
    # NOTE(review): this is the curve's value at +/- v_b, not a proven maximum;
    # for small v_b the curve near v = 0 can be higher -- confirm for v_b < ~1.
    f_max = 0.5 * (1.0 + np.exp(-2 * v_b**2))  # ok to use np.exp since v_b is a scalar
    vmin, vmax = -tails_factor * v_b, tails_factor * v_b

    velocities = torch.empty(N, device=device)
    i = 0  # number of accepted samples written so far
    while i < N:
        # Propose one candidate uniformly in [vmin, vmax).
        v_ = (vmax - vmin) * torch.rand(1, device=device) + vmin  # torch version
        # Unnormalised target: average of two unit-width Gaussian bumps at +/- v_b.
        beam_shape = 0.5 * (torch.exp(-0.5 * (v_ - v_b)**2) + torch.exp(-0.5 * (v_ + v_b)**2))
        # Uniform height under the envelope; keep the candidate if it falls below the curve.
        gamma = torch.rand(1, device=device) * f_max
        # Converting this 1-element tensor to a Python bool happens every iteration,
        # which is a large part of why this version is slow.
        if gamma <= beam_shape:
            velocities[i] = v_
            i += 1
    return velocities

v0 = sample_velocity(vb, 4)
```

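This is because NumPy is highly optimized for scalar and small-batch CPU operations, and its core is implemented in C, whereas every one-element PyTorch operation pays a fixed dispatch cost (and, on a GPU, a kernel-launch cost). So for modest sizes like N = 25,000, rejection sampling in NumPy can outperform PyTorch if you're not using vectorized operations. Thus, for a one-sample-at-a-time loop it is simpler to implement the sampler in NumPy and then convert the result to a tensor.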

### USING BATCHES

However, the original rejection sampling does one sample at a time — **very inefficient**, especially on a GPU where batch operations are massively faster.

By generating many random samples in batches, we:

1. Reduce the number of iterations in the while loop.

2. Allow operations like `torch.exp`, `torch.rand`, and masking to be applied in parallel over large arrays.

3. **Use the GPU (or SIMD instructions on CPU) as they were designed to be used** — on big chunks of data.

So batch_size controls how many samples we generate at once, hoping that many of them will be accepted in a single round.

**WHICH BATCH SIZE?** This is a heuristic value. For a real-world application or a more serious scenario, it is recommended to write a benchmark script that tests different batch sizes and to choose the optimal value. For now, it will be set to `batch_size = 100000`: big enough to use the GPU efficiently, small enough to avoid running out of memory.


| Scenario                   | NumPy (loop)    | Torch (vectorized)            |
| -------------------------- | --------------- | ----------------------------- |
| Small N, CPU-only          | Probably faster | Slightly slower               |
| Large N (e.g., 100k+), CPU | About equal     | Faster with batching          |
| Large N, **GPU** available | ❌ CPU-bound     | ✅ Hugely faster with batching |




In [None]:
# Initial positions: one uniform draw in [0, L) per electron, created directly
# on the selected device.
r0 = L * torch.rand(N, device=device)

def sample_velocity_vectorized(v_b, tails_factor, N, batch_size=100000):
    """Sample N velocities from a two-beam distribution by batched rejection sampling.

    The (unnormalised) target curve is
    ``0.5 * (exp(-(v - v_b)**2 / 2) + exp(-(v + v_b)**2 / 2))``, i.e. two
    unit-width Gaussian bumps centred at +v_b and -v_b. Candidates are proposed
    uniformly in ``[-tails_factor * v_b, tails_factor * v_b)`` and accepted
    when a uniform height under the envelope falls below the curve.

    Tensors are allocated on the module-level ``device``.

    Args:
        v_b: Beam velocity (scalar).
        tails_factor: Half-width of the proposal interval in units of v_b.
        N: Number of velocities to return.
        batch_size: Number of candidates proposed per iteration.

    Returns:
        1-D tensor of length N containing the accepted velocities.

    Raises:
        ValueError: If ``N`` is negative or ``batch_size`` is smaller than 1
            (an empty batch can never accept anything, so the loop would
            never terminate).
    """
    if N < 0:
        raise ValueError("N must be non-negative")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Envelope height. For v >= 0 the bump at +v_b is at most 1 and the bump
    # at -v_b is at most exp(-v_b**2 / 2) (mirror argument for v <= 0), so this
    # bounds the curve for every v_b. The curve's value at +/- v_b,
    # 0.5 * (1 + exp(-2 * v_b**2)), is NOT a valid bound when the beams
    # overlap (v_b below about 1) and would bias the sample.
    f_max = 0.5 * (1.0 + np.exp(-0.5 * v_b**2))  # np.exp is fine: v_b is a scalar
    vmin, vmax = -tails_factor * v_b, tails_factor * v_b

    velocities = torch.empty(N, device=device)
    filled = 0  # number of output slots already written

    while filled < N:
        # Propose a whole batch of candidates and heights at once.
        v_ = (vmax - vmin) * torch.rand(batch_size, device=device) + vmin
        beam_shape = 0.5 * (torch.exp(-0.5 * (v_ - v_b)**2) + torch.exp(-0.5 * (v_ + v_b)**2))
        gamma = torch.rand(batch_size, device=device) * f_max
        accepted = v_[gamma <= beam_shape]

        # Copy only as many accepted values as there are free slots left.
        num_to_fill = min(accepted.shape[0], N - filled)
        velocities[filled:filled + num_to_fill] = accepted[:num_to_fill]
        filled += num_to_fill

    return velocities
# Draw the initial velocities. The sampler's third parameter N has no default,
# so the particle count must be passed explicitly (tails_factor = 4).
v0 = sample_velocity_vectorized(vb, 4, N)

# Tag each particle by the beam it is closer to: True when nearer to +vb,
# False when nearer to -vb.
velocity_tags = abs(v0 - vb) < abs(v0 + vb)  # False = left going