# Test speed of loading different filetypes

In [1]:
import os
from timeit import default_timer

import numpy as np
import h5py

import plotly.express as px

In [2]:
# Root of the datasets and the three n=64 matrix folders that get benchmarked.
data_path = os.path.normpath("../../data")
folders = [
    os.path.join(data_path, "gen_plasma_n64", f"mat_{idx}")
    for idx in range(3)
]

# Number of samples per folder: the CSV count divided by four, since the
# loading cells read four CSV files (f, S, U, V) for every sample index.
lens = [
    sum(name.endswith(".csv") for name in os.listdir(folder_path)) // 4
    for folder_path in folders
]

print(folders, lens)

['..\\..\\data\\gen_plasma_n64\\mat_0', '..\\..\\data\\gen_plasma_n64\\mat_1', '..\\..\\data\\gen_plasma_n64\\mat_2'] [8145, 8145, 5685]


## CSV format

### Loading one matrix

In [3]:
# Mean wall-clock time (seconds) to parse one `f` matrix from CSV,
# collected as one entry per folder.
times_csv = []

for folder, n_samples in zip(folders, lens):
    elapsed = 0

    for sample_idx in range(n_samples):
        csv_path = os.path.join(folder, f"f_{sample_idx}.csv")

        # Only the parse itself is timed; building the path is excluded.
        t0 = default_timer()
        tstload = np.genfromtxt(csv_path, delimiter=",", dtype=float)
        elapsed += default_timer() - t0

    # Average over the samples of this folder and show the running list.
    times_csv.append(elapsed / n_samples)
    print(times_csv)

[0.009987015310303148]
[0.009987015310303148, 0.010965053861171945]
[0.009987015310303148, 0.010965053861171945, 0.011608856218048308]


## Convert to HDF5

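Only run the conversion once: it writes one `.hdf5` file per sample (datasets `f`, `S`, `U`, `V`) next to the source CSV files. The same cell also times loading all four CSV matrices.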

### Loading all 4 matrices

In [4]:
# Mean wall-clock time (seconds) to parse all four matrices (f, S, U, V) of
# one sample from CSV, collected as one entry per folder.
times_allm_csv = []

for i, folder in enumerate(folders):
    time = 0

    for f in range(lens[i]):
        paths = [os.path.join(folder, f"{s}_{f}.csv") for s in ("f", "S", "U", "V")]

        # Only the four CSV parses are timed.
        start_time = default_timer()
        m_f = np.genfromtxt(paths[0], delimiter=",", dtype=float)
        m_S = np.genfromtxt(paths[1], delimiter=",", dtype=float)
        m_U = np.genfromtxt(paths[2], delimiter=",", dtype=float)
        m_V = np.genfromtxt(paths[3], delimiter=",", dtype=float)
        time += default_timer() - start_time

        # One-off CSV -> HDF5 conversion, done outside the timed region.
        # The file is written only when it does not exist yet, so re-running
        # the cell never rewrites existing data and a fresh checkout still
        # produces the `.hdf5` files read by the HDF5 benchmark cells.
        # Delete a file by hand to force it to be regenerated.
        hdf5_path = os.path.join(folder, f"{f}.hdf5")
        if not os.path.exists(hdf5_path):
            with h5py.File(hdf5_path, "w") as file:
                for name, matrix in zip(("f", "S", "U", "V"), (m_f, m_S, m_U, m_V)):
                    file.create_dataset(name, data=matrix)

    times_allm_csv.append(time / lens[i])
    print(f"Done {folder}, times: {times_allm_csv}")


Done ..\..\data\gen_plasma_n64\mat_0, times: [0.032473367710285944]
Done ..\..\data\gen_plasma_n64\mat_1, times: [0.032473367710285944, 0.030567587562879587]
Done ..\..\data\gen_plasma_n64\mat_2, times: [0.032473367710285944, 0.030567587562879587, 0.030007194669973744]


### Convert bigger to HDF5

Run once.

In [None]:
# Convert the `f` matrices of the n=256 dataset from CSV to one HDF5 file each.
folder_n256 = os.path.join(data_path, "gen_plasma_n256", "mat_1")

# Count only the `f_*.csv` files: the loop below reads `f_{i}.csv`, so any
# other CSV in the folder must not inflate the range and cause a
# missing-file error past the last sample.
len_n256 = sum(
    1
    for name in os.listdir(folder_n256)
    if name.startswith("f_") and name.endswith(".csv")
)

print(folder_n256, len_n256)

# Create the output folder if needed; no error when it already exists.
out_n256 = os.path.join(data_path, "gen_plasma_n256", "mat_hdf5")
os.makedirs(out_n256, exist_ok=True)

for i in range(len_n256):
    path = os.path.join(folder_n256, f"f_{i}.csv")

    m_f = np.genfromtxt(path, delimiter=",", dtype=float)

    # Mode 'w' creates the file, replacing any previous one of the same name.
    with h5py.File(os.path.join(out_n256, f'{i}.hdf5'), 'w') as file:
        file.create_dataset("f", data=m_f)

    # Lightweight progress report every 499 samples.
    if i % 499 == 0:
        print(f"{i}/{len_n256}")

..\..\data\gen_plasma_n256\mat_1 32591
0/32591
499/32591
998/32591
1497/32591
1996/32591
2495/32591
2994/32591
3493/32591
3992/32591
4491/32591
4990/32591
5489/32591
5988/32591
6487/32591
6986/32591
7485/32591
7984/32591
8483/32591
8982/32591
9481/32591
9980/32591
10479/32591
10978/32591
11477/32591


## HDF5 Format

### Loading all 4 matrices

In [5]:
# Mean wall-clock time (seconds) to read all four datasets of one sample from
# its HDF5 file, collected as one entry per folder.
times_allm_hdf5 = []
keys = ("f", "S", "U", "V")

for folder, n_samples in zip(folders, lens):
    elapsed = 0

    for sample_idx in range(n_samples):
        hdf5_path = os.path.join(folder, f"{sample_idx}.hdf5")

        # Opening and closing the file is part of the measured time.
        t0 = default_timer()
        with h5py.File(hdf5_path, 'r') as h5:
            # Indexing with `[()]` actually retrieves the data and stores it
            # in memory, instead of keeping a lazy dataset handle.
            tstloads = [h5[name][()] for name in keys]
        elapsed += default_timer() - t0

    times_allm_hdf5.append(elapsed / n_samples)
    print(f"Done {folder}, times: {times_allm_hdf5}")

Done ..\..\data\gen_plasma_n64\mat_0, times: [0.0026504991034412]
Done ..\..\data\gen_plasma_n64\mat_1, times: [0.0026504991034412, 0.002526046420850444]
Done ..\..\data\gen_plasma_n64\mat_2, times: [0.0026504991034412, 0.002526046420850444, 0.0027727729989788694]


In [18]:
# Inspect what `[()]` indexing on an h5py dataset returned in the loading loop.
type(tstloads[0])

numpy.ndarray

### Loading one matrix

In [15]:
# Mean wall-clock time (seconds) to read only the `f` dataset of one sample
# from its HDF5 file, collected as one entry per folder.
times_f_hdf5 = []
keys = ("f",)

for folder_idx, folder in enumerate(folders):
    sample_count = lens[folder_idx]
    total_seconds = 0

    for sample_no in range(sample_count):
        sample_file = os.path.join(folder, f"{sample_no}.hdf5")

        # The timed region covers opening the file, the read, and closing it.
        tic = default_timer()
        with h5py.File(sample_file, 'r') as handle:
            # The `[()]` syntax actually retrieves and stores the data.
            tstloads = [handle[key][()] for key in keys]
        total_seconds += default_timer() - tic

    times_f_hdf5.append(total_seconds / sample_count)
    print(f"Done {folder}, times: {times_f_hdf5}")

Done ..\..\data\gen_plasma_n64\mat_0, times: [0.00019759734808593105]
Done ..\..\data\gen_plasma_n64\mat_1, times: [0.00019759734808593105, 0.0001917277594274762]
Done ..\..\data\gen_plasma_n64\mat_2, times: [0.00019759734808593105, 0.0001917277594274762, 0.00019187614769759206]


## MAT Format

Assumes that the `.mat` files were generated by MATLAB. The original plasma generation code cannot be used for this; the CSVs must actually be loaded and re-saved as `.mat` files, because the data needs to be exactly the same.

## Compare All

In [16]:
# Mean per-sample load times in seconds, one value per n=64 folder
# (mat_0, mat_1, mat_2), copied from the benchmark outputs above so the
# comparison does not require re-running the slow loading cells.
all_timings = {
    "f_csv": [0.009987015310303148, 0.010965053861171945, 0.011608856218048308],
    "f_hdf5": [0.00019759734808593105, 0.0001917277594274762, 0.00019187614769759206],
    "allm_csv": [0.032473367710285944, 0.030567587562879587, 0.030007194669973744],
    "allm_hdf5": [0.0026504991034412, 0.002526046420850444, 0.0027727729989788694],
}

all_timings

{'f_csv': [0.009987015310303148, 0.010965053861171945, 0.011608856218048308],
 'f_hdf5': [0.00019759734808593105,
  0.0001917277594274762,
  0.00019187614769759206],
 'allm_csv': [0.032473367710285944,
  0.030567587562879587,
  0.030007194669973744],
 'allm_hdf5': [0.0026504991034412,
  0.002526046420850444,
  0.0027727729989788694]}