In [1]:
# First, make sure Polars is installed in the kernel's environment:
# %pip install polars

import os, glob, time, threading
import numpy as np
import polars as pl
from concurrent.futures import ThreadPoolExecutor, as_completed

def _load_file_polars(fp_lbl):
    """
    Read the 'magnitude' column of one CSV file with Polars (which uses its
    own thread pool) and return it as a numpy float32 array.

    Args:
      fp_lbl (tuple): (file_path, label) pair. The label is not interpreted
        here; it is passed through so the caller can match result to file.

    Returns:
      tuple: (magnitude, label), where magnitude is a numpy float32 array
        holding the file's 'magnitude' column.
    """
    fp, lbl = fp_lbl
    df = pl.read_csv(fp, columns=["magnitude"])
    # Polars infers the column dtype from the file contents, so the raw
    # to_numpy() result is not guaranteed to be float32. Cast explicitly so
    # the returned array actually matches the documented contract.
    magnitude = np.asarray(df["magnitude"].to_numpy(), dtype=np.float32)
    return magnitude, lbl

def load_magnitude_data(base_dir, num_workers=None):
    """
    Notebook-compatible, fully parallel loader with 2-second status prints.

    Every '*.csv' file under base_dir/fake and base_dir/real is loaded with
    _load_file_polars on a thread pool. Rows of the result follow the sorted
    file order (all 'fake' files, then all 'real' files), so the output is
    reproducible between runs regardless of which worker finishes first.

    Args:
      base_dir (str): root dir containing 'fake/' and 'real/' subfolders.
      num_workers (int, optional): max threads; defaults to all CPU cores.

    Returns:
      X (np.ndarray): stacked [n_samples, …] float32
      y (np.ndarray): [n_samples] int64 labels (0=fake,1=real)

    Raises:
      ValueError: if no CSV files are found, or (from np.vstack) if the
        per-file arrays do not all have the same length.
    """
    cpu_cores = os.cpu_count() or 1
    workers = num_workers or cpu_cores

    # 1) discover files (sorted: glob order is filesystem-dependent)
    file_list = [
        (fp, lbl)
        for lbl, cls in enumerate(["fake", "real"])
        for fp in sorted(glob.glob(os.path.join(base_dir, cls, "*.csv")))
    ]
    total = len(file_list)
    print(f"[SETUP] Found {total} CSV files; using {workers} threads")
    if total == 0:
        # Fail early with an actionable message instead of letting
        # np.vstack([]) raise an unrelated-looking error later.
        raise ValueError(
            f"No CSV files found under {base_dir!r} "
            "(expected 'fake/' and 'real/' subfolders containing *.csv)"
        )

    # 2) progress printer
    count = 0
    stop_evt = threading.Event()

    def _printer():
        # Event.wait doubles as an interruptible sleep: it returns True as
        # soon as stop_evt is set, so no stale line is printed after the
        # load has finished and shutdown is immediate.
        while not stop_evt.wait(2):
            print(f"[PROGRESS] Loaded {count}/{total} files")

    printer = threading.Thread(target=_printer, daemon=True)
    printer.start()

    # 3) parallel load; results are slotted by submission index so the
    #    output order does not depend on completion order
    X_parts = [None] * total
    y_parts = [None] * total
    t0 = time.time()
    try:
        with ThreadPoolExecutor(max_workers=workers) as exe:
            futures = {
                exe.submit(_load_file_polars, item): idx
                for idx, item in enumerate(file_list)
            }
            try:
                for future in as_completed(futures):
                    arr, lbl = future.result()
                    idx = futures[future]
                    # Enforce the documented float32 dtype per file, which
                    # avoids a second full-size copy after stacking.
                    X_parts[idx] = np.asarray(arr, dtype=np.float32)
                    y_parts[idx] = lbl
                    count += 1
            except BaseException:
                # Drop work that has not started yet so the executor's
                # shutdown does not load every remaining file before the
                # error is reported.
                for pending in futures:
                    pending.cancel()
                raise
    finally:
        # Always stop the printer, even on failure or interrupt; otherwise
        # the daemon thread keeps printing in the notebook kernel forever.
        stop_evt.set()
        printer.join()

    # 4) wrap up
    elapsed = time.time() - t0
    X = np.vstack(X_parts)
    y = np.array(y_parts, dtype=np.int64)
    print(f"[DONE] {elapsed:.1f}s → X.shape={X.shape}, y.shape={y.shape}")
    return X, y

# Example usage in a Jupyter cell: load the training split.
# The directory passed as base_dir must contain 'fake/' and 'real/' subfolders
# of CSV files; num_workers=None lets the loader fall back to one thread per
# CPU core.
X_train, y_train = load_magnitude_data("./for-norm/for-norm/training", num_workers=None)


[SETUP] Found 53868 CSV files; using 20 threads
[PROGRESS] Loaded 2724/53868 files
[PROGRESS] Loaded 8507/53868 files
[PROGRESS] Loaded 14428/53868 files
[PROGRESS] Loaded 19858/53868 files
[PROGRESS] Loaded 24846/53868 files
[PROGRESS] Loaded 30111/53868 files
[PROGRESS] Loaded 32009/53868 files
[PROGRESS] Loaded 32316/53868 files
[PROGRESS] Loaded 32590/53868 files
[PROGRESS] Loaded 32896/53868 files
[PROGRESS] Loaded 33219/53868 files
[PROGRESS] Loaded 33509/53868 files
[PROGRESS] Loaded 33791/53868 files
[PROGRESS] Loaded 34116/53868 files
[PROGRESS] Loaded 34451/53868 files
[PROGRESS] Loaded 34762/53868 files
[PROGRESS] Loaded 35090/53868 files
[PROGRESS] Loaded 35387/53868 files
[PROGRESS] Loaded 35739/53868 files
[PROGRESS] Loaded 36085/53868 files
[PROGRESS] Loaded 36475/53868 files
[PROGRESS] Loaded 36854/53868 files
[PROGRESS] Loaded 37257/53868 files
[PROGRESS] Loaded 37626/53868 files
[PROGRESS] Loaded 38048/53868 files
[PROGRESS] Loaded 38449/53868 files
[PROGRESS] Loaded 

In [2]:
pip install polars


Defaulting to user installation because normal site-packages is not writeable
Collecting polars
  Downloading polars-1.29.0-cp39-abi3-win_amd64.whl.metadata (15 kB)
Downloading polars-1.29.0-cp39-abi3-win_amd64.whl (35.0 MB)
   ---------------------------------------- 0.0/35.0 MB ? eta -:--:--
    --------------------------------------- 0.5/35.0 MB 2.8 MB/s eta 0:00:13
   - -------------------------------------- 1.0/35.0 MB 2.4 MB/s eta 0:00:15
   - -------------------------------------- 1.6/35.0 MB 2.8 MB/s eta 0:00:12
   -- ------------------------------------- 2.6/35.0 MB 3.1 MB/s eta 0:00:11
   --- ------------------------------------ 3.4/35.0 MB 3.4 MB/s eta 0:00:10
   ----- ---------------------------------- 4.5/35.0 MB 3.7 MB/s eta 0:00:09
   ----- ---------------------------------- 5.2/35.0 MB 3.8 MB/s eta 0:00:08
   ------ --------------------------------- 6.0/35.0 MB 3.6 MB/s eta 0:00:08
   ------- -------------------------------- 6.3/35.0 MB 3.5 MB/s eta 0:00:09
   -------- 

ERROR: THESE PACKAGES DO NOT MATCH THE HASHES FROM THE REQUIREMENTS FILE. If you have updated the package versions, please update the hashes. Otherwise, examine the package contents carefully; someone may have tampered with them.
    unknown package:
        Expected sha256 f5aac4656e58b1e12f9481950981ef68b5b0e53dd4903bd72472efd2d09a74c8
             Got        cac9cc94c538117ec0062d325b2cd245e1e85770c5784f1126aa927cd014fc60

