In [1]:
# dataset_to_parquet.ipynb

import pandas as pd
from pathlib import Path

# Conversion jobs as (input CSV, output Parquet) filename pairs.
# The two tuples are index-aligned: the n-th CSV is written to the n-th
# Parquet name. Bare filenames are used, so they resolve relative to the
# current working directory.
files = list(
    zip(
        ("appearances.csv", "players.csv"),
        ("appearances.parquet", "players.parquet"),
    )
)

def convert_csv_to_parquet(csv_file, parquet_file, chunksize=250_000):
    """
    Convert a CSV file to Parquet format.

    The CSV is parsed in chunks of ``chunksize`` rows, the chunks are
    concatenated into one DataFrame, and that frame is written with
    ``DataFrame.to_parquet`` (without the index). Because the chunks are
    concatenated before writing, the complete dataset must still fit in
    memory.

    If ``csv_file`` does not exist, a warning is printed and nothing is
    written.

    Parameters
    ----------
    csv_file : str or Path
        Input CSV filename.
    parquet_file : str or Path
        Output Parquet filename.
    chunksize : int, optional
        Number of rows per chunk when parsing the CSV.

    Returns
    -------
    None
    """
    csv_path = Path(csv_file)
    parquet_path = Path(parquet_file)

    if not csv_path.exists():
        print(f"⚠️ File not found: {csv_path}")
        return

    print(f"Converting {csv_path.name} → {parquet_path.name}")

    # Use the chunked reader as a context manager so the underlying file
    # handle is closed even when parsing fails part-way through. Passing the
    # reader straight to concat avoids keeping a second reference to every
    # chunk alive while the Parquet file is being written.
    with pd.read_csv(csv_path, low_memory=False, chunksize=chunksize) as reader:
        df = pd.concat(reader, ignore_index=True)

    # Save as Parquet
    df.to_parquet(parquet_path, index=False)
    print(f"✓ Done. Saved {parquet_path} (rows: {len(df):,})")

# Convert each configured (CSV, Parquet) pair, in list order.
for csv_file, parquet_file in files:
    convert_csv_to_parquet(csv_file=csv_file, parquet_file=parquet_file)

Converting appearances.csv → appearances.parquet
✓ Done. Saved appearances.parquet (rows: 1,706,806)
Converting players.csv → players.parquet
✓ Done. Saved players.parquet (rows: 32,601)
