In [1]:
import pandas as pd
import os
import time

# Path of the input CSV, resolved relative to the current working directory.
csv_file = "all_stocks_5yr.csv"

# Load the whole file into memory once; every benchmark scale below is derived
# from this DataFrame.
data = pd.read_csv(csv_file)

In [3]:
def scale_dataset(df, factor):
    """
    Expands the dataset by duplicating its contents the specified number of times.

    Args:
        df: DataFrame whose rows are repeated.
        factor: Number of copies of ``df`` to stack; must be at least 1.

    Returns:
        A new DataFrame holding ``factor`` back-to-back copies of ``df`` with
        a fresh 0..N-1 index (the original index is discarded).

    Raises:
        ValueError: If ``factor`` is less than 1.
    """
    if factor < 1:
        # Without this guard pd.concat would fail on an empty list with a
        # message that does not mention the offending argument.
        raise ValueError(f"factor must be at least 1, got {factor!r}")
    # pd.concat returns a new frame built from its inputs, so repeating the
    # same reference is enough; copying df once per repetition would only
    # raise peak memory without changing the result.
    return pd.concat([df] * factor, ignore_index=True)

# Scaling factors and corresponding labels (the labels are used in the output
# file names and in the "Scale" column of the results table).
scales = [1, 10, 100]
scale_labels = ["1x", "10x", "100x"]
benchmark_results = []

# Conduct benchmark tests: for every scale, time one CSV write/read and one
# Parquet write/read, then record the size of the files that were produced.
#
# Elapsed time is measured with time.perf_counter(), the clock intended for
# interval timing: it is monotonic and uses the highest available resolution,
# whereas time.time() follows the wall clock and can jump if the system time
# is adjusted during a long run.
for factor, label in zip(scales, scale_labels):
    # The 1x case reuses the already-loaded frame; larger scales duplicate it.
    expanded_data = data if factor == 1 else scale_dataset(data, factor)

    # Define file paths (written to the current working directory)
    csv_path = f"expanded_data_{label}.csv"
    parquet_path = f"expanded_data_{label}.parquet"

    # Benchmark CSV write operation
    start = time.perf_counter()
    expanded_data.to_csv(csv_path, index=False)
    csv_write_time = time.perf_counter() - start

    # Benchmark CSV read operation (the loaded frame is discarded; only the
    # elapsed time matters)
    start = time.perf_counter()
    pd.read_csv(csv_path)
    csv_read_time = time.perf_counter() - start

    # Benchmark Parquet write operation
    start = time.perf_counter()
    expanded_data.to_parquet(parquet_path, engine="pyarrow", compression="snappy", index=False)
    parquet_write_time = time.perf_counter() - start

    # Benchmark Parquet read operation (result discarded, as above)
    start = time.perf_counter()
    pd.read_parquet(parquet_path, engine="pyarrow")
    parquet_read_time = time.perf_counter() - start

    # Get file sizes in MB (1 MB = 1024 * 1024 bytes here)
    csv_size = os.path.getsize(csv_path) / (1024 * 1024)
    parquet_size = os.path.getsize(parquet_path) / (1024 * 1024)

    # Store benchmark results, rounded to two decimals for display
    benchmark_results.append({
        "Scale": label,
        "CSV Write Time (s)": round(csv_write_time, 2),
        "CSV Read Time (s)": round(csv_read_time, 2),
        "Parquet Write Time (s)": round(parquet_write_time, 2),
        "Parquet Read Time (s)": round(parquet_read_time, 2),
        "CSV File Size (MB)": round(csv_size, 2),
        "Parquet File Size (MB)": round(parquet_size, 2)
    })

# Display results in DataFrame format (one row per scale)
benchmark_df = pd.DataFrame(benchmark_results)
print(benchmark_df)

  Scale  CSV Write Time (s)  CSV Read Time (s)  Parquet Write Time (s)  \
0    1x                2.22               0.39                    0.30   
1   10x               21.43               2.82                    2.52   
2  100x              225.40              38.81                   27.54   

   Parquet Read Time (s)  CSV File Size (MB)  Parquet File Size (MB)  
0                   0.11               28.80                   10.15  
1                   0.63              288.01                   95.41  
2                   7.02             2880.05                  951.91  


# Data Format Performance Comparison

This table compares the performance of CSV and Parquet formats based on various metrics (Read Time, Write Time, and File Size) at different scaling levels (1x, 10x, and 100x).

| Scaling | CSV Write Time (s) | CSV Read Time (s) | Parquet Write Time (s) | Parquet Read Time (s) | CSV File Size (MB) | Parquet File Size (MB) |
|---------|--------------------|-------------------|------------------------|----------------------|--------------------|-----------------------|
| 1x      | 2.22               | 0.39              | 0.30                   | 0.11                 | 28.80              | 10.15                 |
| 10x     | 21.43              | 2.82              | 2.52                   | 0.63                 | 288.01             | 95.41                 |
| 100x    | 225.40             | 38.81             | 27.54                  | 7.02                 | 2880.05            | 951.91                |

## Observations:

### Write Time:
- **CSV** write time increases significantly with scaling:
  - From 2.22s at 1x to 225.40s at 100x.
- **Parquet** write time also increases roughly in proportion to the data size, but stays about 7–9 times lower than CSV at every scale:
  - From 0.30s at 1x to 27.54s at 100x.

### Read Time:
- **CSV** read time grows as well:
  - From 0.39s at 1x to 38.81s at 100x.
- **Parquet** read time increases more slowly (about 64x, versus about 100x for CSV):
  - From 0.11s at 1x to 7.02s at 100x.

### File Size:
- **CSV** file sizes increase substantially:
  - From 28.80MB at 1x to 2880.05MB at 100x.
- **Parquet** file sizes also grow roughly linearly, but stay at about one third of the corresponding CSV size:
  - From 10.15MB at 1x to 951.91MB at 100x.

### Conclusion:
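- **Parquet** is more efficient than **CSV** in terms of **file size**, **read time**, and **write time** at every scale tested, and the absolute gap widens as the dataset grows.
- **CSV** files are roughly three times larger than their Parquet equivalents, and CSV **read/write times** become significantly slower as the dataset scales (about 225s vs. 28s to write and 39s vs. 7s to read at 100x).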
