In [1]:
import mmap
import os
import csv

import numpy as np
import pandas as pd
from tqdm import tqdm

import data

# https://blog.nelsonliu.me/2016/07/30/progress-bars-for-python-file-reading-with-tqdm/
# This is used for the progress bar so we can keep track of the progress
def get_num_lines(file_path):
    """Count the lines in a file using a memory map.

    Used to give the tqdm progress bar a ``total`` before the file is
    actually parsed.

    Args:
        file_path: Path of the file to scan.

    Returns:
        Number of lines in the file. A final line without a trailing
        newline is still counted. An empty file yields 0.
    """
    # Read-only binary mode: counting lines must not require write permission.
    with open(file_path, "rb") as fp:
        # mmap cannot map a zero-length file (it raises ValueError), so
        # answer directly for that case.
        if os.fstat(fp.fileno()).st_size == 0:
            return 0
        # Context managers guarantee the map and the handle are released
        # even if reading fails part-way through.
        with mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as buf:
            lines = 0
            while buf.readline():
                lines += 1
            return lines

In [None]:
input_path = '/home/jperez/data/sled/255.csv'
output_dir = '/home/jperez/data/sled255/'
columns_to_save = data.COLUMNS_RAW

# Overall dataset statistics
row_counts = []
header_counts = []
plane_timesteps = []


def _parse_timestep(label):
    """Extract the integer timestep from a 'Plane ...' label.

    First tries the integer part of the last 8 characters (split on '.'),
    then falls back to the last 4 characters. Returns None when neither
    candidate is an integer.
    """
    for candidate in (label[-8:].split('.')[0], label[-4:]):
        try:
            return int(candidate)
        except ValueError:
            # Only a failed int() conversion is expected here; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            continue
    return None


def _save_timestep(timestep, header, rows):
    """Record statistics for one collected block and write it as `<timestep>.npy`."""
    # Save some dataset statistics to analyze later
    header_counts.append(len(header))
    row_counts.append(len(rows))

    # Create dataframe from the data; header names are stripped of whitespace
    df = pd.DataFrame(data=rows, columns=header, dtype=np.float64)
    df = df.rename(columns=lambda col: col.strip())

    # Save CSV if you want
    # df.to_csv(os.path.join(output_dir, f'{timestep}.csv'), index=False, columns=columns_to_save)

    # Saving to NP file format is faster
    np.save(os.path.join(output_dir, f'{timestep}'), df[columns_to_save].to_numpy().astype(np.float64))


# np.save does not create missing directories
os.makedirs(output_dir, exist_ok=True)

# newline='' is what the csv module asks for when it is handed a file object
with open(input_path, 'r', newline='') as file:
    reader = csv.reader(file)

    # Using the bar as a context manager closes it even if parsing fails
    with tqdm(desc='Splitting big CSV file into individual timesteps', postfix='Timestep 0', total=get_num_lines(input_path)) as p_bar:
        data_header = []
        data_rows = []

        prev_timestep = 0

        for row in reader:
            # Update progress bar
            p_bar.update(1)
            # Skip empty rows
            if len(row) == 0:
                continue
            # Look for the start of a new file
            elif row[0] == '[Name]':
                # Check if we have data to save
                if len(data_rows) > 0:
                    _save_timestep(prev_timestep, data_header, data_rows)

                    # Empty data lists for next CSV file
                    data_header = []
                    data_rows = []
            # Get the data header for this file
            elif row[0] == '[Data]':
                data_header = next(reader)
            # Check what timestep we are on
            elif row[0][:5] == 'Plane':
                current_timestep = _parse_timestep(row[0])

                if current_timestep is None:
                    # NOTE(review): prev_timestep is left unchanged here, so the
                    # block that follows is saved under the previous timestep's
                    # name and overwrites that file - confirm this is intended.
                    print(f'Could not process timestep in row={row}')
                else:
                    # `is None` check (not truthiness) so timestep 0 is not
                    # mistaken for a parse failure.
                    # Compare against previous timestep to see if we jumped more than 1
                    if current_timestep - prev_timestep > 1:
                        print(f'Jumped from timestep {prev_timestep} to timestep {current_timestep}')

                    # Update variables and progress bar
                    prev_timestep = current_timestep
                    plane_timesteps.append(current_timestep)
                    p_bar.set_postfix_str(f'Timestep {current_timestep}', refresh=False)
            # Otherwise just collect data rows
            else:
                data_rows.append(row)

        # The final block has no following '[Name]' marker to trigger a save,
        # so write it out once the input is exhausted.
        if len(data_rows) > 0:
            _save_timestep(prev_timestep, data_header, data_rows)

In [150]:
# Debug inspection: displays the value last bound to the loop variable `row`
# by the CSV-splitting loop.
row

['Plane 1 in Case FFF']

In [4]:
# Sanity report: every split file should share one row count and one header
# length, so each np.unique() result should contain a single value.
for message, counts in (
    ('Check that all files have the same number of rows', row_counts),
    ('Check that all files have the same header length', header_counts),
):
    print(message, np.unique(counts))

# First and last timestep encountered while parsing
print(f'From {plane_timesteps[0]} to {plane_timesteps[-1]}')

Check that all files have the same number of rows [14184]
Check that all files have the same header length [22]
From 19 to 760
