# Analyse Random Number Generation

* Check for duplicates
* Check number distributions
    

In [25]:
import numpy as np
import os
import subprocess


def get_random_numbers(settings: str, type):
    """Run the external ``./test_random`` program and return the numbers it prints.

    ``settings`` is written to a file named ``settings`` in the current working
    directory (presumably read by ``./test_random`` -- confirm against that
    program), the program is executed, and the last whitespace-separated token
    of every non-blank line of its standard output is converted with ``type``.
    The ``settings`` file is removed afterwards, even when the program fails.

    The returned array is sized from the output actually produced, so this
    function does not rely on a notebook-level ``n_random`` variable existing
    or matching the number of lines printed.

    :param settings: Text written verbatim to the ``settings`` file.
    :param type: Callable used to convert each token (e.g. ``float`` or ``int``);
        also used as the dtype of the returned array. The name shadows the
        builtin but is kept so existing callers keep working.
    :return: 1D numpy array with one entry per non-blank line of output.
    :raises subprocess.CalledProcessError: if ``./test_random`` exits with a
        non-zero status.
    """
    with open('settings', mode='w') as fid:
        fid.write(settings)

    try:
        result = subprocess.run(['./test_random'], capture_output=True, check=True, encoding="utf-8")
    finally:
        # Always clean up the temporary input file, including when the run fails
        os.remove('settings')

    # Only the last column of each line holds the random value; blank lines
    # (such as the trailing one) carry no data and are skipped.
    values = [type(line.split()[-1]) for line in result.stdout.splitlines() if line.strip()]
    return np.array(values, dtype=type)
        


In [26]:
""" Real64 draws: clearly fine. Floating-point precision allows for small differences
between two reals, so exact duplicates are not expected.
"""
# Real64 tests: sample n_random reals over progressively wider ranges
n_random = 10000

for r_max in (10., 100., 1000.):
    # Settings text for the generator; get_random_numbers writes it to disk and removes it again
    settings = f"""real64
    {n_random}
    1 {r_max}
    """

    random_reals = get_random_numbers(settings, float)

    # Report how many of the returned values are distinct
    print(f'Range [1, {r_max}]. Number of unique values {np.unique(random_reals).shape} / {n_random}')


Range [1, 10.0]. Number of unique values (10000,) / 10000
Range [1, 100.0]. Number of unique values (10000,) / 10000
Range [1, 1000.0]. Number of unique values (10000,) / 10000


In [27]:
""" Int32 draws: even with a normal distribution of pseudo-random numbers and
i_max - i_min >> the total number of random numbers asked for (by an order of magnitude),
~ 10% of the returned values are still duplicates.

NOTE(review): "normal" above probably means "uniform" -- confirm against the generator.
NOTE(review): some duplicates are inherent to drawing independently from a finite integer
range. However, the recorded counts for the small ranges are exactly half the range
(5/10, 50/100, 500/1000, 1000/2000), which suggests only half of the integers in [1, i_max]
are ever produced -- worth checking the integer scaling in the generator.
"""
# Integer tests: sample n_random integers over progressively wider ranges
n_random = 10000

i_max_values = [10, 100, 1000, 2000, 5000, 10000, 100000]

for i_max in i_max_values:
    # Settings text for the generator; get_random_numbers writes it to disk and removes it again
    settings = f"""int32
    {n_random}
    1 {i_max}
    """

    random_integers = get_random_numbers(settings, int)

    # Report how many of the returned values are distinct
    print(f'Range [1, {i_max}]. Number of unique values {np.unique(random_integers).shape} / {n_random}')


Range [1, 10]. Number of unique values (5,) / 10000
Range [1, 100]. Number of unique values (50,) / 10000
Range [1, 1000]. Number of unique values (500,) / 10000
Range [1, 2000]. Number of unique values (1000,) / 10000
Range [1, 5000]. Number of unique values (2460,) / 10000
Range [1, 10000]. Number of unique values (4328,) / 10000
Range [1, 100000]. Number of unique values (9042,) / 10000


## Discussion: Sampling Integers from a finite range

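The simplest solution for the application I want (still prototyping) is to not scale the integers,
and instead reject any random value that falls outside the range (> i_max or < i_min). If s_max >> i_max
(taking s_max to be the largest unscaled value the generator can return), this could be extremely
inefficient, because almost every draw would be rejected.

The next-best approach is reservoir sampling.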



In [None]:
# TODO:
# - Call my reservoir sampling for up to 1 million points, and time it.
# - Check for uniqueness.
# - Plot the distributions.
# - Repeat ~5 times using the random seed generated by Fortran.
