In [3]:
# dataset_to_parquet.ipynb

import pandas as pd
from pathlib import Path

def convert_csv_to_parquet(csv_file, parquet_file, chunksize=250_000, engine="pyarrow", compression="snappy"):
    """
    Read a CSV file chunk by chunk and write its contents to one Parquet file.

    Both paths are used exactly as given. If the input file does not exist,
    a warning is printed and nothing is written.

    Parameters
    ----------
    csv_file : str or Path
        Path of the CSV file to read.
    parquet_file : str or Path
        Path of the Parquet file to write.
    chunksize : int, optional
        Number of rows per chunk handed to ``pandas.read_csv``.
        Default 250,000.
    engine : str, optional
        Parquet engine forwarded to ``DataFrame.to_parquet``.
        Default "pyarrow".
    compression : str or None, optional
        Compression setting forwarded to ``DataFrame.to_parquet``.
        Default "snappy".

    Returns
    -------
    None

    Notes
    -----
    Every chunk is kept and all chunks are concatenated into a single
    DataFrame before writing, so the complete dataset is held in memory
    at that point. The DataFrame index is not written to the output.
    """
    source = Path(csv_file)
    target = Path(parquet_file)

    # Nothing to convert: report it and leave without touching the output.
    if not source.exists():
        print(f"⚠️ File not found: {source}")
        return

    print(f"Converting {source.name} → {target.name}  (chunksize={chunksize:,}, engine={engine}, compression={compression})")

    # Pull every chunk from the reader, then stitch them into one frame
    # with a fresh 0..n-1 row index.
    reader = pd.read_csv(source, low_memory=False, chunksize=chunksize)
    frames = list(reader)
    table = pd.concat(frames, ignore_index=True)

    # Write the combined frame out as Parquet.
    table.to_parquet(target, index=False, engine=engine, compression=compression)
    row_count = len(table)
    print(f"✓ Done. Saved {target} (rows: {row_count:,})")

# Run conversions: each dataset's CSV is converted to a Parquet file
# with the same base name.
dataset_names = [
    "player_performances",
    "player_injuries",
    "player_national_performances",
    "player_teammates_played_with",
    "player_profiles",
]

# (input CSV, output Parquet) filename pairs, in processing order.
targets = [(f"{stem}.csv", f"{stem}.parquet") for stem in dataset_names]

for csv_file, parquet_file in targets:
    convert_csv_to_parquet(csv_file, parquet_file)

Converting player_performances.csv → player_performances.parquet  (chunksize=250,000, engine=pyarrow, compression=snappy)
✓ Done. Saved player_performances.parquet (rows: 760,125)
Converting player_injuries.csv → player_injuries.parquet  (chunksize=250,000, engine=pyarrow, compression=snappy)
✓ Done. Saved player_injuries.parquet (rows: 77,871)
Converting player_national_performances.csv → player_national_performances.parquet  (chunksize=250,000, engine=pyarrow, compression=snappy)
✓ Done. Saved player_national_performances.parquet (rows: 62,370)
Converting player_teammates_played_with.csv → player_teammates_played_with.parquet  (chunksize=250,000, engine=pyarrow, compression=snappy)
✓ Done. Saved player_teammates_played_with.parquet (rows: 681,279)
Converting player_profiles.csv → player_profiles.parquet  (chunksize=250,000, engine=pyarrow, compression=snappy)
✓ Done. Saved player_profiles.parquet (rows: 40,738)
