In [1]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from tqdm import tqdm


In [2]:

def process_file_to_parquet(file_path, batch_size):
    """Stream a "TBP:"-sectioned text file into batched Parquet files.

    The file is read line by line. A line starting with ``"TBP:"`` selects
    the current column (the whole stripped line is used as the column name);
    every other non-blank line is parsed as a float and appended to that
    column. As soon as the current column holds ``batch_size`` values, all
    columns collected so far are handed to ``write_batch_to_parquet`` and
    then emptied, keeping their names so later batches carry the same
    columns. Whatever is left at end of file is written as a final batch.

    Args:
        file_path: Path of the text file to read.
        batch_size: Number of values the current column may accumulate
            before a batch is flushed. Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if a value line
            cannot be parsed as a float (the message names the file and the
            1-based line number).
    """
    if batch_size < 1:
        # A non-positive size would flush on every line (or on no data at all).
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    data = {}  # column name -> values collected for the batch in progress
    current_column = None
    batch_number = 0

    with open(file_path, 'r') as file:
        progress = tqdm(file, desc="Processing file")
        for line_number, raw_line in enumerate(progress, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines carry no value; float('') would abort the run.
                continue

            if line.startswith("TBP:"):
                # Header line: switch columns, creating the column on first sight.
                current_column = line
                data.setdefault(current_column, [])
                continue

            if current_column is None:
                # Values that precede the first header have no column; ignore them.
                continue

            try:
                value = float(line)
            except ValueError as exc:
                raise ValueError(
                    f"{file_path}: line {line_number}: expected a number, got {line!r}"
                ) from exc

            column_values = data[current_column]
            column_values.append(value)

            # Only an append can grow a column, so the threshold is checked here.
            if len(column_values) >= batch_size:
                write_batch_to_parquet(data, batch_number)
                batch_number += 1
                # Start the next batch with the same columns, all empty.
                data = {key: [] for key in data}

    # Flush the trailing partial batch, if any column still holds values.
    if any(data.values()):
        write_batch_to_parquet(data, batch_number)

def write_batch_to_parquet(batch, batch_number):
    """Write one batch of columns to ``output_batch_<batch_number>.parquet``.

    Columns shorter than the longest one are padded with ``None`` so that
    all columns have the same number of rows. Padding is applied to a copy;
    the lists inside ``batch`` are left untouched. The file is written
    relative to the current working directory and a confirmation line is
    printed.

    Args:
        batch: Mapping of column name to a list of values.
        batch_number: Index used to build the output file name.

    Returns:
        None. An empty ``batch`` (no columns) is skipped without writing.
    """
    if not batch:
        # No columns at all: nothing to write, and max() below would fail.
        return

    row_count = max(len(values) for values in batch.values())

    # Build padded copies instead of extending the caller's lists in place.
    # NOTE(review): a column that is entirely None presumably gets whatever
    # type Arrow infers for all-null data - confirm if batch files must share
    # one schema.
    padded = {
        column: list(values) + [None] * (row_count - len(values))
        for column, values in batch.items()
    }

    table = pa.Table.from_pydict(padded)
    output_path = f'output_batch_{batch_number}.parquet'
    pq.write_table(table, output_path)
    print(f"Batch {batch_number} saved to {output_path}")


In [3]:
# Run configuration

# Values a column may accumulate before a batch is flushed;
# lower this if memory is tight.
batch_size = 10_000

# Input file to convert
file_path = '../data/20210518_142318.lag'


In [4]:
# Convert the configured input file, one Parquet file per batch
process_file_to_parquet(file_path=file_path, batch_size=batch_size)

# Reached only if the conversion above finished without raising
print("All batches processed and saved.")

Processing file: 4722473it [00:03, 880761.92it/s] 

Batch 0 saved to output_batch_0.parquet


Processing file: 8076164it [00:05, 1219130.53it/s]

Batch 1 saved to output_batch_1.parquet


Processing file: 14979331it [00:09, 610587.49it/s] 

Batch 2 saved to output_batch_2.parquet


Processing file: 20578938it [00:13, 620510.94it/s] 

Batch 3 saved to output_batch_3.parquet


Processing file: 33757258it [00:20, 2062241.09it/s]

Batch 4 saved to output_batch_4.parquet


Processing file: 43417574it [00:26, 2078251.41it/s]

Batch 5 saved to output_batch_5.parquet


Processing file: 48015734it [00:33, 299919.58it/s] 

Batch 6 saved to output_batch_6.parquet


Processing file: 52404032it [00:35, 1876363.22it/s]

Batch 7 saved to output_batch_7.parquet


Processing file: 58473472it [00:40, 2069463.52it/s]

Batch 8 saved to output_batch_8.parquet


Processing file: 63735529it [00:45, 2028959.06it/s]

Batch 9 saved to output_batch_9.parquet


Processing file: 72907638it [00:52, 1829046.94it/s]

Batch 10 saved to output_batch_10.parquet


Processing file: 84235459it [01:01, 2064960.30it/s]

Batch 11 saved to output_batch_11.parquet


Processing file: 101819673it [01:18, 281715.27it/s] 

Batch 12 saved to output_batch_12.parquet


Processing file: 105636854it [01:20, 2070959.61it/s]

Batch 13 saved to output_batch_13.parquet


Processing file: 114345493it [01:26, 2927058.94it/s]

Batch 14 saved to output_batch_14.parquet


Processing file: 120469955it [01:31, 2852448.70it/s]

Batch 15 saved to output_batch_15.parquet


Processing file: 132972358it [01:39, 2848899.84it/s]

Batch 16 saved to output_batch_16.parquet


Processing file: 140600650it [01:46, 2910795.47it/s]

Batch 17 saved to output_batch_17.parquet


Processing file: 143856001it [01:51, 2179426.37it/s]

Batch 18 saved to output_batch_18.parquet


Processing file: 151473241it [01:57, 2884140.34it/s]

Batch 19 saved to output_batch_19.parquet


Processing file: 168204150it [02:07, 2912574.49it/s]

Batch 20 saved to output_batch_20.parquet


Processing file: 178469237it [02:15, 2892396.77it/s]

Batch 21 saved to output_batch_21.parquet


Processing file: 192495306it [02:25, 1981451.52it/s]

Batch 22 saved to output_batch_22.parquet


Processing file: 194147209it [02:32, 722948.96it/s] 

Batch 23 saved to output_batch_23.parquet


Processing file: 199122109it [02:38, 2785716.17it/s]

Batch 24 saved to output_batch_24.parquet


Processing file: 213045167it [02:48, 2952743.58it/s]

Batch 25 saved to output_batch_25.parquet


Processing file: 215716271it [02:55, 1461014.92it/s]

Batch 26 saved to output_batch_26.parquet


Processing file: 222316110it [03:03, 2606127.03it/s]

Batch 27 saved to output_batch_27.parquet


Processing file: 225508510it [03:09, 2049373.64it/s]

Batch 28 saved to output_batch_28.parquet


Processing file: 236273578it [03:19, 2786019.95it/s]

Batch 29 saved to output_batch_29.parquet


Processing file: 246803562it [03:29, 2844295.25it/s]

Batch 30 saved to output_batch_30.parquet


Processing file: 252638265it [03:38, 2763799.20it/s]

Batch 31 saved to output_batch_31.parquet


Processing file: 262949359it [03:48, 2753502.31it/s]

Batch 32 saved to output_batch_32.parquet


Processing file: 270886010it [03:58, 2909938.64it/s]

Batch 33 saved to output_batch_33.parquet


Processing file: 279420248it [04:08, 2978337.83it/s]

Batch 34 saved to output_batch_34.parquet


Processing file: 288558816it [04:19, 2924508.79it/s]

Batch 35 saved to output_batch_35.parquet


Processing file: 300905481it [04:31, 2879130.12it/s]

Batch 36 saved to output_batch_36.parquet


Processing file: 312724205it [04:43, 2877159.97it/s]

Batch 37 saved to output_batch_37.parquet


Processing file: 315713253it [04:52, 1519483.71it/s]

Batch 38 saved to output_batch_38.parquet


Processing file: 324023069it [05:03, 2922407.38it/s]

Batch 39 saved to output_batch_39.parquet


Processing file: 337469876it [05:16, 2957599.68it/s]

Batch 40 saved to output_batch_40.parquet


Processing file: 347296013it [05:28, 2820325.81it/s]

Batch 41 saved to output_batch_41.parquet


Processing file: 354017456it [05:41, 2629852.94it/s]

Batch 42 saved to output_batch_42.parquet


Processing file: 362382405it [05:53, 2913291.23it/s]

Batch 43 saved to output_batch_43.parquet


Processing file: 368891972it [06:05, 2519434.68it/s]

Batch 44 saved to output_batch_44.parquet


Processing file: 375750969it [06:16, 2896615.03it/s]

Batch 45 saved to output_batch_45.parquet


Processing file: 383091380it [06:30, 2807881.64it/s]

Batch 46 saved to output_batch_46.parquet


Processing file: 391358816it [06:42, 2861242.68it/s]

Batch 47 saved to output_batch_47.parquet


Processing file: 401950951it [06:56, 2811471.96it/s]

Batch 48 saved to output_batch_48.parquet


Processing file: 411941109it [07:11, 2928553.77it/s]

Batch 49 saved to output_batch_49.parquet


Processing file: 423095473it [07:25, 2676358.44it/s]

Batch 50 saved to output_batch_50.parquet


Processing file: 437698430it [07:41, 2904748.98it/s]

Batch 51 saved to output_batch_51.parquet


Processing file: 439449662it [07:55, 390581.79it/s] 

Batch 52 saved to output_batch_52.parquet


Processing file: 443896203it [08:07, 911026.46it/s] 


Batch 53 saved to output_batch_53.parquet
All batches processed and saved.
