In [12]:
import mmap
import os
import csv

import numpy as np
import pandas as pd
from tqdm import tqdm

import data
np.set_printoptions(suppress=True, linewidth=np.inf) 

# https://blog.nelsonliu.me/2016/07/30/progress-bars-for-python-file-reading-with-tqdm/
# This is used for the progress bar so we can keep track of the progress
def get_num_lines(file_path):
    fp = open(file_path, "r+")
    buf = mmap.mmap(fp.fileno(), 0)
    lines = 0
    while buf.readline():
        lines += 1
    fp.close()
    return lines

In [4]:
input_path = '/home/jperez/data/sled/300.csv'
output_dir = '/home/jperez/data/sled300/'
columns_to_save = data.COLUMNS_RAW

# Overall dataset statistics
row_counts = []
header_counts = []
plane_timesteps = []

with open(input_path, 'r') as file:
    reader = csv.reader(file)
    p_bar = tqdm(desc='Splitting big CSV file into individual timesteps', postfix='Timestep 0', total=get_num_lines(input_path))

    data_header = []
    data_rows = []

    prev_timestep = 0

    for row in reader:
        # Update progress bar
        p_bar.update(1)
        # Skip empty rows
        if len(row) == 0:
            continue
        # Look for the start of a new file
        elif row[0] == '[Name]':
            # Check if we have data to save
            if len(data_rows) > 0:
                # Save some dataset statistics to analyze later
                header_counts.append(len(data_header))
                row_counts.append(len(data_rows))

                # Create dataframe from the data and save it
                df = pd.DataFrame(data=data_rows, columns=data_header, dtype=np.float64)
                df = df.rename(columns=lambda col: col.strip())
                df['Timestep'] = current_timestep

                # Save CSV if you want
                # df.to_csv(os.path.join(output_dir, f'{prev_timestep}.csv'), index=False, columns=columns_to_save)
                
                # Saving to NP file format is faster
                np.save(os.path.join(output_dir, f'{prev_timestep}'), df[columns_to_save].to_numpy().astype(np.float64))

                # Empty data lists for next CSV file
                data_header = []
                data_rows = []
        # Get the data header for this file
        elif row[0] == '[Data]':
            data_header = next(reader)
        # Check what timestep we are on
        elif row[0][:5] == 'Plane':
            # This will get 00001.250 and split it
            ts_split = row[0][-8:].split('.')
            # From the split, get the timestep as an integer
            try: 
                current_timestep = int(ts_split[0])
            except:
                try:
                    current_timestep = int(row[0][-4:])
                except:
                    print(f'Could not process timestep in row={row}')
                    current_timestep = None

            if current_timestep:
                # Compare against previous timestep to see if we jumped more than 1
                if current_timestep - prev_timestep > 1:
                    print(f'Jumped from timestep {prev_timestep} to timestep {current_timestep}')

                # Update variables and progress bar
                prev_timestep = current_timestep
                plane_timesteps.append(current_timestep)
                p_bar.postfix = f'Timestep {current_timestep}'
        # Otherwise just collect data rows
        else:
            data_rows.append(row)

Splitting big CSV file into individual timesteps: 100%|█████████▉| 10542427/10543170 [04:50<00:00, 36280.24it/s, Timestep 760] 
  exec(code_obj, self.user_global_ns, self.user_ns)


Could not process timestep in row=['Plane 1 in Case data.300 0001.1']
Could not process timestep in row=['Plane 1 in Case data.300 0002.1']




Could not process timestep in row=['Plane 1 in Case data.300 0003.1']




Could not process timestep in row=['Plane 1 in Case data.300 0004.1']
Could not process timestep in row=['Plane 1 in Case data.300 0005.1']




Could not process timestep in row=['Plane 1 in Case data.300 0006.1']
Could not process timestep in row=['Plane 1 in Case data.300 0007.1']




Could not process timestep in row=['Plane 1 in Case data.300 0008.1']
Could not process timestep in row=['Plane 1 in Case data.300 0009.1']




Could not process timestep in row=['Plane 1 in Case data.300 0010.1']
Could not process timestep in row=['Plane 1 in Case data.300 0011.1']




Could not process timestep in row=['Plane 1 in Case data.300 0012.1']
Could not process timestep in row=['Plane 1 in Case data.300 0013.1']




Could not process timestep in row=['Plane 1 in Case data.300 0014.1']
Could not process timestep in row=['Plane 1 in Case data.300 0015.1']




Could not process timestep in row=['Plane 1 in Case data.300 0016.1']




Could not process timestep in row=['Plane 1 in Case data.300 0017.1']
Could not process timestep in row=['Plane 1 in Case data.300 0018.1']




Could not process timestep in row=['Plane 1 in Case data.300 0019.1']
Could not process timestep in row=['Plane 1 in Case data.300 0020.1']




Could not process timestep in row=['Plane 1 in Case data.300 0021.1']
Could not process timestep in row=['Plane 1 in Case data.300 0022.1']




Could not process timestep in row=['Plane 1 in Case data.300 0023.1']
Could not process timestep in row=['Plane 1 in Case data.300 0024.1']




Could not process timestep in row=['Plane 1 in Case data.300 0025.1']
Could not process timestep in row=['Plane 1 in Case data.300 0026.1']




Could not process timestep in row=['Plane 1 in Case data.300 0027.1']
Could not process timestep in row=['Plane 1 in Case data.300 0028.1']




Could not process timestep in row=['Plane 1 in Case data.300 0029.1']




Could not process timestep in row=['Plane 1 in Case data.300 0030.1']
Could not process timestep in row=['Plane 1 in Case data.300 0031.1']




Could not process timestep in row=['Plane 1 in Case data.300 0032.1']
Could not process timestep in row=['Plane 1 in Case data.300 0033.1']




Could not process timestep in row=['Plane 1 in Case data.300 0034.1']




Could not process timestep in row=['Plane 1 in Case data.300 0035.1']




Could not process timestep in row=['Plane 1 in Case data.300 0036.1']




Could not process timestep in row=['Plane 1 in Case data.300 0037.1']




Could not process timestep in row=['Plane 1 in Case data.300 0038.1']
Could not process timestep in row=['Plane 1 in Case data.300 0039.1']




Could not process timestep in row=['Plane 1 in Case data.300 0040.1']
Could not process timestep in row=['Plane 1 in Case data.300 0041.1']




Could not process timestep in row=['Plane 1 in Case data.300 0042.1']




Could not process timestep in row=['Plane 1 in Case data.300 0043.1']




Could not process timestep in row=['Plane 1 in Case data.300 0044.1']




Could not process timestep in row=['Plane 1 in Case data.300 0045.1']
Could not process timestep in row=['Plane 1 in Case data.300 0046.1']




Could not process timestep in row=['Plane 1 in Case data.300 0047.1']
Could not process timestep in row=['Plane 1 in Case data.300 0048.1']




Could not process timestep in row=['Plane 1 in Case data.300 0049.1']




Could not process timestep in row=['Plane 1 in Case data.300 0050.1']




Could not process timestep in row=['Plane 1 in Case data.300 0051.1']
Could not process timestep in row=['Plane 1 in Case data.300 0052.1']




Could not process timestep in row=['Plane 1 in Case data.300 0053.1']
Could not process timestep in row=['Plane 1 in Case data.300 0054.1']


KeyboardInterrupt: 



In [150]:
row

['Plane 1 in Case FFF']

In [3]:
print('Check that all files have the same number of rows', np.unique(row_counts))
print('Check that all files have the same header length', np.unique(header_counts))
print(f'From {plane_timesteps[0]} to {plane_timesteps[-1]}')

Check that all files have the same number of rows [14184]
Check that all files have the same header length [22]
From 19 to 760


In [17]:
X, Y = data.read_np('/home/jperez/data/sled250/201.npy', ['X', 'Y', 'T'], ['Vu', 'Vv'], scaler=None)

print(X.shape, Y.shape)
print(X[:5])
Y[:5]

(14184, 3) (14184, 2)
[[  0.           0.25       201.        ]
 [  0.           0.27500001 201.        ]
 [ -0.025        0.27500001 201.        ]
 [ -0.025        0.25       201.        ]
 [  0.           0.22499999 201.        ]]


array([[249.889465  ,   0.26969039],
       [249.883118  ,   0.42404363],
       [250.        ,   0.        ],
       [250.        ,   0.        ],
       [249.901443  ,   0.13346604]])

In [16]:
X, Y = data.read_np('/home/jperez/data/sled255/20.npy', ['X', 'Y', 'T'], ['Vu', 'Vv'], scaler=None)

print(X.shape, Y.shape)
print(X[:5])
Y[:5]

(14184, 3) (14184, 2)
[[ 0.          0.          0.25      ]
 [ 0.          0.          0.27500001]
 [-0.025      -0.025       0.27500001]
 [-0.025      -0.025       0.25      ]
 [ 0.          0.          0.22499999]]


array([[    20.   , 105350.641],
       [    20.   , 105225.344],
       [    20.   , 105178.828],
       [    20.   , 105291.148],
       [    20.   , 105422.141]])