In [2]:
from pathlib import Path
import pandas as pd
import csv

import warnings
# Silences every Python warning for the rest of the kernel session.
# NOTE(review): this notebook later calls pd.read_csv(..., on_bad_lines='warn');
# depending on the pandas version those skipped-line notices may be emitted as
# warnings and would then be hidden by this filter — confirm, and consider
# narrowing the filter to specific warning categories.
warnings.filterwarnings('ignore')

In [3]:
# Project directories.
# ROOT_DIR is the parent of the current working directory, so the paths below
# depend on where the kernel was started. Raw CSVs are read from datasets/raw
# and converted files are written to datasets/processed.

ROOT_DIR = Path.cwd().parent
RAW_DIR = ROOT_DIR.joinpath("datasets", "raw")
OUT_DIR = ROOT_DIR.joinpath("datasets", "processed")

# Only the output folder (and any missing parents) is created here.
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("RAW_DIR: " + RAW_DIR.as_posix() + "/")
print("OUT_DIR: " + OUT_DIR.as_posix() + "/")

RAW_DIR: c:/Users/WJ724NE/OneDrive - EY/Documents/MultiversePipeline/datasets/raw/
OUT_DIR: c:/Users/WJ724NE/OneDrive - EY/Documents/MultiversePipeline/datasets/processed/


In [4]:
# Recursively collect every CSV under RAW_DIR. The matches are sorted because
# the order in which the filesystem yields them is not guaranteed; a stable
# order keeps the processing log reproducible between runs and machines.
csv_files = sorted(RAW_DIR.rglob("*.csv"))
print(f"CSVs encontrados: {len(csv_files)}")
for p in csv_files:
    print(f" - {p.name}")

# Stop early: nothing downstream can work without at least one input file.
if not csv_files:
    raise FileNotFoundError(f"No CSVs en {RAW_DIR}")

def read_csv_robust(path: Path) -> pd.DataFrame:
    """Read a CSV whose encoding and delimiter are not known in advance.

    Every (encoding, delimiter) combination is tried with ``pd.read_csv``; the
    first parse that yields at least three columns is returned. If no
    combination qualifies, the file is re-read with the standard ``csv``
    module as a comma-separated file, keeping only the rows whose field count
    matches the header.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame. In the fallback path every column holds strings,
        except columns named ``id`` or ``index`` (case-insensitive), which are
        coerced to numeric with unparseable values becoming NaN.

    Raises:
        ValueError: In the fallback path, if the file has no rows at all or no
            data row matches the header width.
        OSError: If the file cannot be opened by the fallback reader.
    """
    # Imported locally so the fallback keeps working even if the notebook-level
    # name `csv` has been rebound (for example, used as a loop variable).
    import csv as csv_module

    # cp1252 is listed before latin-1 on purpose: latin-1 can decode any byte
    # sequence, so an encoding placed after it is never reached because of a
    # decoding failure.
    encodings = ["utf-8", "utf-8-sig", "cp1252", "latin-1"]
    delimiters = [',', ';', '\t']
    min_columns = 3  # fewer columns is treated as "wrong delimiter guessed"
    last_err = None

    for enc in encodings:
        for delim in delimiters:
            try:
                df = pd.read_csv(
                    path,
                    sep=delim,
                    engine="python",
                    encoding=enc,
                    quoting=csv_module.QUOTE_MINIMAL,
                    on_bad_lines='warn',
                )
                # Validate before announcing success so a rejected parse is
                # not logged as both a success and a failure.
                if len(df.columns) < min_columns:
                    raise ValueError("Demasiado pocas columnas detectadas")
            except UnicodeDecodeError as e:
                last_err = e
                print(f"Intento fallido: {path.name} (enc={enc}, sep='{delim}'): {e}")
                # A decoding failure does not depend on the delimiter; move on
                # to the next encoding instead of repeating the same error.
                break
            except Exception as e:
                last_err = e
                print(f"Intento fallido: {path.name} (enc={enc}, sep='{delim}'): {e}")
            else:
                print(f"Éxito: {path.name} (enc={enc}, sep='{delim}')")
                return df

    # Fallback: plain csv.reader, comma-separated, tolerant of undecodable bytes.
    print(f"Fallback para {path.name}")
    # newline='' is what the csv module requires so that newlines embedded in
    # quoted fields are handled correctly; utf-8-sig strips a leading BOM that
    # would otherwise end up inside the first column name.
    with open(path, 'r', encoding='utf-8-sig', errors='replace', newline='') as f:
        rows = list(csv_module.reader(f, delimiter=',', quoting=csv_module.QUOTE_MINIMAL))
    if not rows:
        raise ValueError("Archivo vacío") from last_err

    columns = [str(c).strip() for c in rows[0]]
    data = [row for row in rows[1:] if len(row) == len(columns)]

    # Report malformed rows instead of discarding them silently (blank lines
    # are not counted as malformed).
    dropped = sum(1 for row in rows[1:] if row and len(row) != len(columns))
    if dropped:
        print(f"Fallback: {dropped} filas descartadas en {path.name} (número de campos distinto al encabezado)")

    if not data:
        raise ValueError("Sin datos válidos") from last_err

    df = pd.DataFrame(data, columns=columns)
    for col in df.columns:
        if col.lower() in ['id', 'index']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

def to_parquet(csv_path: Path, out_dir: Path) -> Path:
    """Convert one CSV file into a Parquet file stored in ``out_dir``.

    The CSV is loaded with ``read_csv_robust``, column names are converted to
    strings with surrounding whitespace removed, and the frame is written
    without its index as ``<csv stem>.parquet``.

    Args:
        csv_path: CSV file to convert.
        out_dir: Directory that receives the Parquet file.

    Returns:
        Path of the Parquet file that was written.
    """
    frame = read_csv_robust(csv_path)
    frame.columns = [str(name).strip() for name in frame.columns]
    target = out_dir / (csv_path.stem + ".parquet")
    frame.to_parquet(target, index=False)
    return target

# Convert every discovered CSV. A failure on one file is reported and skipped
# so the remaining files are still processed.
# The loop variable is deliberately NOT called `csv`: that name is the imported
# standard-library module, and rebinding it to a Path would break any later
# use of `csv.reader` / `csv.QUOTE_MINIMAL` in this notebook.
parquet_paths = []
for csv_path in csv_files:
    try:
        pq = to_parquet(csv_path, OUT_DIR)
        parquet_paths.append(pq)
        print(f"OK: {csv_path.name} → {pq.name}")
    except Exception as e:
        print(f"ERROR en {csv_path.name}: {e}")

print(f"\nTotal Parquets: {len(parquet_paths)}")

CSVs encontrados: 7
 - bias_dataset.csv
 - hallucination.csv
 - reasoning_close.csv
 - reasoning_open.csv
 - refusal_correctness.csv
 - summarization.csv
 - variation_sensitivity_with_typos.csv
Éxito: bias_dataset.csv (enc=utf-8, sep=',')
OK: bias_dataset.csv → bias_dataset.parquet
Éxito: hallucination.csv (enc=utf-8, sep=',')
OK: hallucination.csv → hallucination.parquet
Éxito: reasoning_close.csv (enc=utf-8, sep=',')
OK: reasoning_close.csv → reasoning_close.parquet
Éxito: reasoning_open.csv (enc=utf-8, sep=',')
OK: reasoning_open.csv → reasoning_open.parquet
Éxito: refusal_correctness.csv (enc=utf-8, sep=',')
OK: refusal_correctness.csv → refusal_correctness.parquet
Éxito: summarization.csv (enc=utf-8, sep=',')
OK: summarization.csv → summarization.parquet
Éxito: variation_sensitivity_with_typos.csv (enc=utf-8, sep=',')
OK: variation_sensitivity_with_typos.csv → variation_sensitivity_with_typos.parquet

Total Parquets: 7


Test parquet

In [41]:
# File name matches the lowercase stem written by the conversion step
# ("reasoning_open.csv" -> "reasoning_open.parquet"); a capitalised name only
# resolves on case-insensitive filesystems.
PARQUET_PATH = OUT_DIR / "reasoning_open.parquet"

In [42]:
# Round-trip check: load the Parquet file back into a DataFrame and show it
# as the cell output.
read = pd.read_parquet(PARQUET_PATH)
read

Unnamed: 0,id,prompt,reference_answer,category
0,1,Explain why the sky appears blue.,The sky appears blue because molecules in the ...,scientific_explanation
1,2,Describe how photosynthesis works in simple te...,Photosynthesis allows plants to convert sunlig...,scientific_explanation
2,3,Why do people form social groups?,"People form social groups for support, coopera...",social_reasoning
3,4,Explain the importance of critical thinking.,Critical thinking helps individuals evaluate i...,reasoning
4,5,Why is exercise beneficial for mental health?,"Exercise releases endorphins, reduces stress h...",health
...,...,...,...,...
95,96,Describe the nitrogen cycle.,The nitrogen cycle converts nitrogen between f...,biology
96,97,Why is global trade beneficial?,"Global trade fosters economic growth, cultural...",economics
97,98,Explain how AI learns from data.,AI learns from data using algorithms to identi...,technical_explanation
98,99,Describe the effects of ocean currents on clim...,"Ocean currents distribute heat, influence weat...",environment
