In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import time
import os

In [2]:
# Path of the input CSV; change this to point at a different dataset.
csv_file = 'all_stocks_5yr.csv'
# Read the entire file into an in-memory DataFrame.
df = pd.read_csv(filepath_or_buffer=csv_file)
print('CSV data loaded successfully.')

CSV data loaded successfully.


## Convert and Save as Parquet (with Compression)

In [3]:
# Write the loaded DataFrame to a Snappy-compressed Parquet file.
# index=False keeps the DataFrame index out of the written file.
parquet_file = 'your_dataset_snappy.parquet'
df.to_parquet(parquet_file, compression='snappy', index=False)
print("Data successfully written to Parquet with Snappy compression.")

Data successfully written to Parquet with Snappy compression.


In [4]:
def simulate_scale(df, factor):
    """Return a new DataFrame consisting of `factor` stacked copies of `df`.

    The copies are concatenated row-wise and the resulting index is
    renumbered from 0 (the original index values are discarded).
    """
    copies = [df for _ in range(factor)]
    return pd.concat(copies, ignore_index=True)

In [5]:
# Multipliers applied to the base dataset, one benchmark run per entry.
scales = [1, 10, 100]
# Display tag for each multiplier ("1x", "10x", "100x"), in the same order as `scales`.
scale_labels = [f"{multiplier}x" for multiplier in scales]
# Accumulates one dict of measurements per scale.
results = list()

In [6]:
# Benchmark CSV vs. Snappy-compressed Parquet at every scale factor.
# For each scale: time a full write and a full read in both formats, record the
# on-disk sizes, append one summary row to `results`, then delete the
# temporary files created for that scale.
for factor, label in zip(scales, scale_labels):
    # The 1x case reuses the loaded frame as-is; larger scales duplicate it.
    df_scaled = df if factor == 1 else simulate_scale(df, factor)

    # Temporary filenames for this scale's CSV and Parquet files.
    csv_filename = f'scaled_dataset_{label}.csv'
    parquet_filename = f'scaled_dataset_{label}_snappy.parquet'

    try:
        # time.perf_counter() is used for all timings: it is the monotonic,
        # high-resolution clock intended for measuring short durations,
        # whereas time.time() follows the (adjustable) system wall clock.

        # --- CSV Write Benchmark ---
        start = time.perf_counter()
        df_scaled.to_csv(csv_filename, index=False)
        csv_write_time = time.perf_counter() - start

        # --- CSV Read Benchmark ---
        start = time.perf_counter()
        csv_loaded = pd.read_csv(csv_filename)
        csv_read_time = time.perf_counter() - start
        # Release the read-back frame outside the timed region so freeing it
        # does not leak into the next measurement.
        del csv_loaded

        # --- Parquet Write Benchmark (using Snappy compression) ---
        start = time.perf_counter()
        df_scaled.to_parquet(parquet_filename, compression='snappy', index=False)
        parquet_write_time = time.perf_counter() - start

        # --- Parquet Read Benchmark ---
        start = time.perf_counter()
        parquet_loaded = pd.read_parquet(parquet_filename)
        parquet_read_time = time.perf_counter() - start
        del parquet_loaded

        # --- Measure File Sizes (in MB) ---
        csv_size = os.path.getsize(csv_filename) / (1024 * 1024)
        parquet_size = os.path.getsize(parquet_filename) / (1024 * 1024)

        # Store the results in a dictionary.
        results.append({
            "Scale": label,
            "CSV Write Time (s)": round(csv_write_time, 2),
            "CSV Read Time (s)": round(csv_read_time, 2),
            "Parquet Write Time (s)": round(parquet_write_time, 2),
            "Parquet Read Time (s)": round(parquet_read_time, 2),
            "CSV Size (MB)": round(csv_size, 2),
            "Parquet Size (MB)": round(parquet_size, 2)
        })
    finally:
        # Clean up inside the loop so every scale's temporary files are removed
        # (not just the last scale's), even if a step above raised.
        for temp_path in (csv_filename, parquet_filename):
            if os.path.exists(temp_path):
                os.remove(temp_path)

In [8]:
# Assemble the per-scale measurement dicts into a single table (one row per scale).
results_df = pd.DataFrame(data=results)
print("\nSummary of Benchmarks:")
# Bare last expression so the notebook renders the table.
results_df


Summary of Benchmarks:


Unnamed: 0,Scale,CSV Write Time (s),CSV Read Time (s),Parquet Write Time (s),Parquet Read Time (s),CSV Size (MB),Parquet Size (MB)
0,1x,3.34,0.55,0.53,0.28,28.8,10.15
1,10x,33.98,5.29,4.38,1.35,288.01,95.35
2,100x,333.29,55.64,44.04,15.01,2880.05,951.71


## Part A Summary:
> Faster Read/Write: Parquet significantly outperforms CSV in both read and write times, especially as dataset size increases.

> Smaller File Size: Parquet files are compressed and take up roughly one third (≈33–35%) of the space of the equivalent CSV files at all scales.

> Better Scaling: Parquet's efficiency becomes more evident at larger scales (10× and 100×), with faster operations and reduced storage requirements.

> Recommendation: For large-scale datasets, Parquet is the better option for analytical workloads due to its speed and compression.