In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import time
import os

In [2]:
# Input CSV for the benchmark; point csv_file at a different path to use another dataset.
csv_file = 'all_stocks_5yr.csv'
df = pd.read_csv(filepath_or_buffer=csv_file)
print("CSV data loaded successfully.")

CSV data loaded successfully.


## Convert and Save as Parquet (with Compression)

In [3]:
# Persist the loaded frame as a Snappy-compressed Parquet file, leaving the index out.
parquet_file = 'your_dataset_snappy.parquet'
df.to_parquet(path=parquet_file, index=False, compression='snappy')
print("The data has been successfully saved to a Parquet file with Snappy compression.")

The data has been successfully saved to a Parquet file with Snappy compression.


In [4]:
def simulate_duplicate(df, rows_to_add):
    """Return a new DataFrame with exactly 'rows_to_add' extra rows appended.

    The extra rows are copies taken from the top of ``df``. When more rows are
    requested than ``df`` contains, the frame is repeated from its first row
    again until the requested count is reached (a plain ``df.head(n)`` would
    silently stop at ``len(df)`` rows).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is not modified.
    rows_to_add : int
        Number of rows to append; must be >= 0.

    Returns
    -------
    pandas.DataFrame
        The extended frame with a fresh 0..n-1 index. An empty ``df`` yields an
        empty result because there are no rows to duplicate.

    Raises
    ------
    ValueError
        If ``rows_to_add`` is negative (``head`` would otherwise interpret it
        as "all rows except the last |n|").
    """
    if rows_to_add < 0:
        raise ValueError(f"rows_to_add must be non-negative, got {rows_to_add}")

    base_len = len(df)
    pieces = [df]
    if base_len > 0:
        # Whole repetitions of the frame first, then a partial head for the rest.
        whole_copies, leftover = divmod(rows_to_add, base_len)
        pieces.extend([df] * whole_copies)
        if leftover:
            pieces.append(df.head(leftover))
    return pd.concat(pieces, ignore_index=True)

In [5]:
# Accumulator for one result record per benchmarked scale.
results = []
# Multipliers applied to the base dataset, and the matching display labels.
scales = [1, 10, 100]
scale_labels = [f"{multiplier}x" for multiplier in scales]

In [6]:
def simulate_scale(df, factor):
    """Build a frame made of 'factor' back-to-back copies of df, reindexed from 0."""
    copies = [df for _ in range(factor)]
    return pd.concat(copies, ignore_index=True)

# Benchmark CSV vs. Snappy-compressed Parquet at every configured scale:
# write time, read time and on-disk size, one record appended to `results` per scale.
# NOTE: the loop rebinds the notebook-level `csv_file` and `parquet_file` names.
for factor, label in zip(scales, scale_labels):
    # Scale 1 reuses the loaded frame as-is; larger scales build a concatenated copy.
    if factor == 1:
        df_scaled = df
    else:
        df_scaled = simulate_scale(df, factor)

    # Generate temporary filenames for the scaled CSV and Parquet files
    csv_file = f'scaled_dataset_{label}.csv'
    parquet_file = f'scaled_dataset_{label}_snappy.parquet'

    try:
        # perf_counter() is used for all timings: it is monotonic and
        # high-resolution, so clock adjustments cannot distort a measurement.

        # --- Measure CSV Write Performance ---
        start_time = time.perf_counter()
        df_scaled.to_csv(csv_file, index=False)
        csv_write_duration = time.perf_counter() - start_time

        # --- Measure CSV Read Performance ---
        start_time = time.perf_counter()
        csv_loaded = pd.read_csv(csv_file)
        csv_read_duration = time.perf_counter() - start_time
        # Free the loaded copy outside the timed region so it neither lingers in
        # memory nor gets released in the middle of a later measurement.
        del csv_loaded

        # --- Measure Parquet Write Performance (using Snappy compression) ---
        start_time = time.perf_counter()
        df_scaled.to_parquet(parquet_file, compression='snappy', index=False)
        parquet_write_duration = time.perf_counter() - start_time

        # --- Measure Parquet Read Performance ---
        start_time = time.perf_counter()
        parquet_loaded = pd.read_parquet(parquet_file)
        parquet_read_duration = time.perf_counter() - start_time
        del parquet_loaded

        # --- Calculate File Sizes (in MB) ---
        csv_file_size = os.path.getsize(csv_file) / (1024 * 1024)
        parquet_file_size = os.path.getsize(parquet_file) / (1024 * 1024)

        # Record the benchmark results in a dictionary
        results.append({
            "Scale": label,
            "CSV Write Duration (s)": round(csv_write_duration, 2),
            "CSV Read Duration (s)": round(csv_read_duration, 2),
            "Parquet Write Duration (s)": round(parquet_write_duration, 2),
            "Parquet Read Duration (s)": round(parquet_read_duration, 2),
            "CSV File Size (MB)": round(csv_file_size, 2),
            "Parquet File Size (MB)": round(parquet_file_size, 2)
        })
    finally:
        # Clean up this scale's temporary files before moving on. Doing it per
        # iteration (instead of once after the loop) removes the files of every
        # scale, not just the last one, and also runs when a step above fails.
        for temp_path in (csv_file, parquet_file):
            if os.path.exists(temp_path):
                os.remove(temp_path)

In [7]:
# Convert the results list to a DataFrame and print the summary.
results_df = pd.DataFrame(results)
print("\nBenchmark Results Summary:")
print(results_df)


Benchmark Results Summary:
  Scale  CSV Write Duration (s)  CSV Read Duration (s)  \
0    1x                    1.99                   0.32   
1   10x                   27.53                   4.20   
2  100x                  285.12                  60.58   

   Parquet Write Duration (s)  Parquet Read Duration (s)  CSV File Size (MB)  \
0                        0.28                       0.18               28.80   
1                        3.22                       0.94              288.01   
2                       44.56                      26.72             2880.05   

   Parquet File Size (MB)  
0                   10.15  
1                   95.35  
2                  951.67  


## Part A Summary:
> Faster Read/Write: Parquet outperforms CSV in both read and write times at every scale (roughly 6–9× faster writes and 2–4× faster reads), and the absolute time saved grows as the dataset size increases.

> Smaller File Size: Parquet files are Snappy-compressed and take up roughly 33–35% of the space of the equivalent CSV files at every scale.

> Better Scaling: Parquet's efficiency becomes more evident at larger scales (10× and 100×), with faster operations and reduced storage requirements.

> Recommendation: For large-scale datasets, Parquet is the better option for analytical workloads due to its speed and compression.