# GULLS

## Planet Files

In [3]:
import os
import math
import numpy as np
import joblib
import pandas as pd
import fcntl
import time

job_keys = ['A', 'B', 'C']
rl = np.array([100000, 10000, 1000000]) # walkers
nr = np.array([100, 1000, 10]) # steps
print(rl*nr)
gulls_data_dir = "./Data/GULLS"
time_taken = {}

[10000000 10000000 10000000]


### Speeds Test Rsults for 10,000,000 Planets

file arrangements:
+ A: 100,000 entries in 100 files  = 10,000,000 planets
+ B: 10,000 entries in 1000 files  = 10,000,000 planets
+ C: 1,000,000 entries in 10 files = 10,000,000 planets


In [11]:
# this needs to be run after the cells that follow

# size of each run's output files (joblistA = joblistB = joblistC, etc.) manually entered
joblib_times['size'] = '320 MB'
pandas_times['size'] = '780 MB'  
npsavetxt_times['size'] = '1 GB'  
npy_times['size'] = '320 MB'

poolandnpsave_times = {
    'A': 5.45,
    'B': np.nan,
    'C': np.nan,
    'size': '1 GB'
}
time_taken['poolandnpsave'] = poolandnpsave_times

time_df = pd.DataFrame(time_taken)
print(time_df)


        joblib     pandas  npsavetxt       npy poolandnpsave
A     0.522638  22.600857   17.75738  0.507046          5.45
B     0.633548  23.629492   18.31468  0.787341           NaN
C     0.566044  22.729151  17.804965  0.603358           NaN
size    320 MB     780 MB       1 GB    320 MB          1 GB


The time taken appears mostly dependent on the save method chosen and scales by the total number of lines, not by how the files are broken up. The human readable outputs are about 40 times slower to produce than the machine readable outputs. The human readable files are about 3 times the size. How do the pearl scripts perform, by comparison? \

I'm going to try parallelising the nf loop.

In [1]:
!python uniform_draw_planet_arrays.py 100000 100

  inc_array = 180 * np.where(rnd < 0.5, np.arccos(2 * rnd), -np.arccos(2 - 2 * rnd)) / pi
  inc_array = 180 * np.where(rnd < 0.5, np.arccos(2 * rnd), -np.arccos(2 - 2 * rnd)) / pi
  inc_array = 180 * np.where(rnd < 0.5, np.arccos(2 * rnd), -np.arccos(2 - 2 * rnd)) / pi
  inc_array = 180 * np.where(rnd < 0.5, np.arccos(2 * rnd), -np.arccos(2 - 2 * rnd)) / pi
  inc_array = 180 * np.where(rnd < 0.5, np.arccos(2 * rnd), -np.arccos(2 - 2 * rnd)) / pi
  inc_array = 180 * np.where(rnd < 0.5, np.arccos(2 * rnd), -np.arccos(2 - 2 * rnd)) / pi
  inc_array = 180 * np.where(rnd < 0.5, np.arccos(2 * rnd), -np.arccos(2 - 2 * rnd)) / pi
  inc_array = 180 * np.where(rnd < 0.5, np.arccos(2 * rnd), -np.arccos(2 - 2 * rnd)) / pi
Execution time: 5.45 seconds


## Speed Test Code Cells

In [14]:
# joblib

# Constants
mmin = math.log10(0.1)
mmax = math.log10(100)
rundes = "kgriz_uf_ffp"
amin = math.log10(0.3)
amax = amin + 2
pi = math.pi


def get_unique_indexes(master_list_file, n):
    '''
    This function exists so that multiplt versions of this script can be run at once without 
    overlapping file indexing. It reserves n indexes in the master list file and returns them.
    '''
    with open(master_list_file, 'a+') as f:
        f.seek(0)
        fcntl.flock(f, fcntl.LOCK_EX)
        lines = f.readlines()
        if lines:
            start_index = int(lines[-1].strip()) + 1
        else:
            start_index = 0
        end_index = start_index + n
        for index in range(start_index, end_index):
            f.write(f"{index}\n")
        fcntl.flock(f, fcntl.LOCK_UN)
    return list(range(start_index, end_index))

# Main logic
def main(nf, nl):
    dir_name = f"{gulls_data_dir}/planets/{rundes}"
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    base = f"{dir_name}/{rundes}.planets"
    master_list_file = f"{base}.master.lists"

    # Ensure the master list file exists
    if not os.path.exists(master_list_file):
        open(master_list_file, 'w').close()

    # Reserve the next nf indexes
    reserved_indexes = get_unique_indexes(master_list_file, nf)

    for index in reserved_indexes:
        pfile = f"{base}.{index}.joblib"
        if os.path.exists(pfile):
            print(f"File {pfile} already exists. Skipping.")
        else:
            combined_array = np.empty((nl, 4))
            rnd = np.random.rand(nl)
            
            combined_array[:, 0] = 3.00374072e-6 * 10.0 ** (mmin + np.random.rand(nl) * (mmax - mmin))
            combined_array[:, 1] = 10.0 ** (amin + (amax - amin) * np.random.rand(nl))
            combined_array[:, 2] = 180.0 * np.where(rnd < 0.5, 
                                         np.arccos(2.0 * rnd), 
                                         -np.arccos(2.0 - 2.0 * rnd)
                                         ) / pi
            combined_array[:, 3] = 360.0 * np.random.rand(nl)
            
            # Saving data
            joblib.dump(combined_array, pfile)

joblib_times = {}
for i, key in enumerate(job_keys):
    start_time = time.time()
    nl = rl[i]  
    nf = nr[i]
    main(nf, nl) 
    end_time = time.time()
    joblib_times[key] = end_time - start_time
    print(f"Time taken: {end_time - start_time} seconds")
    print(f"files: {nf}")
    print(f"lines: {nl}")
    print(f"planets: {nf*nl}")

time_taken['joblib'] = joblib_times

  np.arccos(2.0 * rnd),
  -np.arccos(2.0 - 2.0 * rnd)


Time taken: 0.5487310886383057 seconds
files: 100
lines: 100000
planets: 10000000
Time taken: 0.9095323085784912 seconds
files: 1000
lines: 10000
planets: 10000000
Time taken: 0.648413896560669 seconds
files: 10
lines: 1000000
planets: 10000000


In [6]:
# pandas

# Constants
mmin = math.log10(0.1)
mmax = math.log10(100)
rundes = "kgriz_uf_ffp"
amin = math.log10(0.3)
amax = amin + 2
pi = math.pi

def get_unique_indexes(master_list_file, n):
    '''
    This function exists so that multiplt versions of this script can be run at once without 
    overlapping file indexing. It reserves n indexes in the master list file and returns them.
    '''
    with open(master_list_file, 'a+') as f:
        f.seek(0)
        fcntl.flock(f, fcntl.LOCK_EX)
        lines = f.readlines()
        if lines:
            start_index = int(lines[-1].strip()) + 1
        else:
            start_index = 0
        end_index = start_index + n
        for index in range(start_index, end_index):
            f.write(f"{index}\n")
        fcntl.flock(f, fcntl.LOCK_UN)
    return list(range(start_index, end_index))

# Main logic
def main(nf, nl):
    dir_name = f"{gulls_data_dir}/planets/{rundes}"
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    base = f"{dir_name}/{rundes}.planets"
    master_list_file = f"{base}.master.lists"

    # Ensure the master list file exists
    if not os.path.exists(master_list_file):
        open(master_list_file, 'w').close()

    # Reserve the next nf indexes
    reserved_indexes = get_unique_indexes(master_list_file, nf)

    for index in reserved_indexes:
        pfile = f"{base}.{index}.csv"
        if os.path.exists(pfile):
            print(f"File {pfile} already exists. Skipping.")
        else:
            # Generate arrays of size nl using NumPy
            a_array = 10 ** (amin + (amax - amin) * np.random.rand(nl))
            mass_array = 3.00374072e-6 * 10 ** (mmin + np.random.rand(nl) * (mmax - mmin))
            rnd = np.random.rand(nl)
            inc_array = 180 * np.where(rnd < 0.5, np.arccos(2 * rnd), -np.arccos(2 - 2 * rnd)) / pi
            p_array = 360.0 * np.random.rand(nl)
            
            # Combine arrays into a single array
            combined_array = np.empty((nl, 4))
            combined_array[:, 0] = mass_array
            combined_array[:, 1] = a_array
            combined_array[:, 2] = inc_array
            combined_array[:, 3] = p_array
            
            # Convert to pandas DataFrame
            df = pd.DataFrame(combined_array, columns=['mass', 'a', 'inc', 'p'])
            
            # Save the DataFrame as a CSV file
            df.to_csv(pfile, index=False)

pandas_times = {}
for i, key in enumerate(job_keys):
    start_time = time.time()
    nl = rl[i]  
    nf = nr[i]
    main(nf, nl) 
    end_time = time.time()
    pandas_times[key] = end_time - start_time
    print(f"Time taken: {end_time - start_time} seconds")
    print(f"files: {nf}")
    print(f"lines: {nl}")
    print(f"planets: {nf*nl}")

time_taken['pandas'] = pandas_times


  inc_array = 180 * np.where(rnd < 0.5, np.arccos(2 * rnd), -np.arccos(2 - 2 * rnd)) / pi


Time taken: 22.60085678100586 seconds
files: 100
lines: 100000
planets: 10000000
Time taken: 23.629492044448853 seconds
files: 1000
lines: 10000
planets: 10000000
Time taken: 22.729151010513306 seconds
files: 10
lines: 1000000
planets: 10000000


In [7]:
# numpy.savetext

# Constants
mmin = math.log10(0.1)
mmax = math.log10(100)
rundes = "kgriz_uf_ffp"
amin = math.log10(0.3)
amax = amin + 2
pi = math.pi

def get_unique_indexes(master_list_file, n):
    '''
    This function exists so that multiplt versions of this script can be run at once without 
    overlapping file indexing. It reserves n indexes in the master list file and returns them.
    '''
    with open(master_list_file, 'a+') as f:
        f.seek(0)
        fcntl.flock(f, fcntl.LOCK_EX)
        lines = f.readlines()
        if lines:
            start_index = int(lines[-1].strip()) + 1
        else:
            start_index = 0
        end_index = start_index + n
        for index in range(start_index, end_index):
            f.write(f"{index}\n")
        fcntl.flock(f, fcntl.LOCK_UN)
    return list(range(start_index, end_index))

# Main logic
def main(nf, nl):
    dir_name = f"{gulls_data_dir}/planets/{rundes}"
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    base = f"{dir_name}/{rundes}.planets"
    master_list_file = f"{base}.master.lists"

    # Ensure the master list file exists
    if not os.path.exists(master_list_file):
        open(master_list_file, 'w').close()

    # Reserve the next nf indexes
    reserved_indexes = get_unique_indexes(master_list_file, nf)

    for index in reserved_indexes:
        pfile = f"{base}.{index}.csv"
        if os.path.exists(pfile):
            print(f"File {pfile} already exists. Skipping.")
        else:
            # Generate arrays of size nl using NumPy
            a_array = 10 ** (amin + (amax - amin) * np.random.rand(nl))
            mass_array = 3.00374072e-6 * 10 ** (mmin + np.random.rand(nl) * (mmax - mmin))
            rnd = np.random.rand(nl)
            inc_array = 180 * np.where(rnd < 0.5, np.arccos(2 * rnd), -np.arccos(2 - 2 * rnd)) / pi
            p_array = 360.0 * np.random.rand(nl)
            
            # Combine arrays into a single array
            combined_array = np.empty((nl, 4))
            combined_array[:, 0] = mass_array
            combined_array[:, 1] = a_array
            combined_array[:, 2] = inc_array
            combined_array[:, 3] = p_array
            
            # Save the combined array as a CSV file
            np.savetxt(pfile, combined_array, delimiter=',', header='mass,a,inc,p', comments='')

npsavetxt_times = {}
for i, key in enumerate(job_keys):
    start_time = time.time()
    nl = rl[i]  
    nf = nr[i]
    main(nf, nl) 
    end_time = time.time()
    npsavetxt_times[key] = end_time - start_time
    print(f"Time taken: {end_time - start_time} seconds")
    print(f"files: {nf}")
    print(f"lines: {nl}")
    print(f"planets: {nf*nl}")

time_taken['npsavetxt'] = npsavetxt_times

  inc_array = 180 * np.where(rnd < 0.5, np.arccos(2 * rnd), -np.arccos(2 - 2 * rnd)) / pi


Time taken: 17.75737977027893 seconds
files: 100
lines: 100000
planets: 10000000
Time taken: 18.314679861068726 seconds
files: 1000
lines: 10000
planets: 10000000
Time taken: 17.804965257644653 seconds
files: 10
lines: 1000000
planets: 10000000


In [8]:
# numpy

# Constants
mmin = math.log10(0.1)
mmax = math.log10(100)
rundes = "kgriz_uf_ffp"
amin = math.log10(0.3)
amax = amin + 2
pi = math.pi

def get_unique_indexes(master_list_file, n):
    '''
    This function exists so that multiplt versions of this script can be run at once without 
    overlapping file indexing. It reserves n indexes in the master list file and returns them.
    '''
    with open(master_list_file, 'a+') as f:
        f.seek(0)
        fcntl.flock(f, fcntl.LOCK_EX)
        lines = f.readlines()
        if lines:
            start_index = int(lines[-1].strip()) + 1
        else:
            start_index = 0
        end_index = start_index + n
        for index in range(start_index, end_index):
            f.write(f"{index}\n")
        fcntl.flock(f, fcntl.LOCK_UN)
    return list(range(start_index, end_index))

# Main logic
def main(nf, nl):
    dir_name = f"{gulls_data_dir}/planets/{rundes}"
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    base = f"{dir_name}/{rundes}.planets"
    master_list_file = f"{base}.master.lists"

    # Ensure the master list file exists
    if not os.path.exists(master_list_file):
        open(master_list_file, 'w').close()

    # Reserve the next nf indexes
    reserved_indexes = get_unique_indexes(master_list_file, nf)

    for index in reserved_indexes:
        pfile = f"{base}.{index}.npy"
        if os.path.exists(pfile):
            print(f"File {pfile} already exists. Skipping.")
        else:
            # Generate arrays of size nl using NumPy
            a_array = 10 ** (amin + (amax - amin) * np.random.rand(nl))
            mass_array = 3.00374072e-6 * 10 ** (mmin + np.random.rand(nl) * (mmax - mmin))
            rnd = np.random.rand(nl)
            inc_array = 180 * np.where(rnd < 0.5, np.arccos(2 * rnd), -np.arccos(2 - 2 * rnd)) / pi
            p_array = 360.0 * np.random.rand(nl)
            
            # Combine arrays into a single array
            combined_array = np.empty((nl, 4))
            combined_array[:, 0] = mass_array
            combined_array[:, 1] = a_array
            combined_array[:, 2] = inc_array
            combined_array[:, 3] = p_array
            
            # Save the combined array as a CSV file
            np.save(pfile, combined_array)

npy_times = {}
for i, key in enumerate(job_keys):
    start_time = time.time()
    nl = rl[i]  
    nf = nr[i]
    main(nf, nl) 
    end_time = time.time()
    npy_times[key] = end_time - start_time
    print(f"Time taken: {end_time - start_time} seconds")
    print(f"files: {nf}")
    print(f"lines: {nl}")
    print(f"planets: {nf*nl}")

time_taken['npy'] = npy_times

  inc_array = 180 * np.where(rnd < 0.5, np.arccos(2 * rnd), -np.arccos(2 - 2 * rnd)) / pi


Time taken: 0.5070459842681885 seconds
files: 100
lines: 100000
planets: 10000000
Time taken: 0.7873411178588867 seconds
files: 1000
lines: 10000
planets: 10000000
Time taken: 0.6033580303192139 seconds
files: 10
lines: 1000000
planets: 10000000


In [None]:
# pearl

import subprocess

start_time = time.time()

# Define the path to your Perl script
perl_script_path = 'Assets/scriptA.pl'  # I don't have this script working

# Run the Perl script
result = subprocess.run(['perl', perl_script_path], capture_output=True, text=True, esult = subprocess.run(['perl', perl_script_path], stdout=stdout_log, stderr=stderr_log)

end_time = time.time()