Libraries

In [2]:
from pathlib import Path
import pandas as pd
import csv

import warnings
warnings.filterwarnings('ignore')

Paths

In [None]:
# Project folders are resolved relative to the parent of the current working directory.
ROOT_DIR = Path.cwd().parent
RAW_DIR = ROOT_DIR.joinpath("datasets", "raw")
OUT_DIR = ROOT_DIR.joinpath("datasets", "processed")

# Make sure the output folder exists before anything is written to it.
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Echo the resolved locations (POSIX separators, trailing slash).
print("RAW_DIR: " + RAW_DIR.as_posix() + "/")
print("OUT_DIR: " + OUT_DIR.as_posix() + "/")

RAW_DIR: c:/Users/WJ724NE/OneDrive - EY/Documents/MultiversePipeline/datasets/raw/
OUT_DIR: c:/Users/WJ724NE/OneDrive - EY/Documents/MultiversePipeline/datasets/processed/


Functions to convert .csv files to .parquet

In [35]:
def read_csv_robust(path: Path) -> pd.DataFrame:
    """Read a CSV file by trying several encodings and delimiters.

    Every (encoding, delimiter) pair is tried in order with pandas' Python
    parser; the first parse that yields at least three columns is returned.
    Malformed lines are skipped with a warning (``on_bad_lines='warn'``).

    If every pair fails, the file is re-read with the ``csv`` module as
    comma-delimited UTF-8 (undecodable bytes replaced). Only rows whose
    length matches the header are kept, every value is a string, and columns
    named ``id`` or ``index`` (case-insensitive) are converted to numbers.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed table.

    Raises:
        ValueError: If the fallback finds an empty file, or no data row
            whose length matches the header.
    """
    # Imported locally so the fallback does not depend on the notebook-global
    # name `csv`, which is easy to rebind by accident in a notebook.
    import csv

    # cp1252 must come before latin-1: latin-1 never raises a decode error, so
    # when tried first it wins over cp1252 and turns Windows punctuation such
    # as curly quotes into C1 control characters.
    encodings = ["utf-8", "utf-8-sig", "cp1252", "latin-1"]
    delimiters = [',', ';', '\t']
    min_columns = 3
    last_err = None

    for enc in encodings:
        for delim in delimiters:
            try:
                df = pd.read_csv(
                    path,
                    sep=delim,
                    engine="python",
                    encoding=enc,
                    quoting=csv.QUOTE_MINIMAL,
                    on_bad_lines='warn',
                )
                # A wrong delimiter usually collapses the row into one or two columns.
                if len(df.columns) < min_columns:
                    raise ValueError("Demasiado pocas columnas detectadas")
            except Exception as e:
                last_err = e
                print(f"Intento fallido: {path.name} (enc={enc}, sep='{delim}'): {e}")
            else:
                # Report success only after the column-count check has passed.
                print(f"Éxito: {path.name} (enc={enc}, sep='{delim}')")
                return df

    # Fallback: tolerant manual parse with the csv module.
    print(f"Fallback para {path.name}")
    # newline='' lets the csv module handle line endings itself, which it
    # needs to parse quoted fields containing newlines correctly.
    with open(path, 'r', encoding='utf-8', errors='replace', newline='') as f:
        rows = list(csv.reader(f, delimiter=',', quoting=csv.QUOTE_MINIMAL))
    if not rows:
        raise ValueError("Archivo vacío") from last_err

    columns = [str(c).strip() for c in rows[0]]
    data = [row for row in rows[1:] if len(row) == len(columns)]
    if not data:
        raise ValueError("Sin datos válidos") from last_err

    # Make the data loss visible instead of dropping ragged rows silently.
    dropped = len(rows) - 1 - len(data)
    if dropped:
        print(f"Fallback: {dropped} filas descartadas en {path.name}")

    df = pd.DataFrame(data, columns=columns)
    for col in df.columns:
        if col.lower() in ['id', 'index']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df


# Transform variation_sensitivity_with_typos into long format

def transform_variation_sensitivity(df: pd.DataFrame, csv_path: Path) -> pd.DataFrame:
    """Reshape the variation-sensitivity table from wide to long format.

    The input is expected to have the columns
    ``id, category, difficulty, prompt_original, prompt_paraphrase,
    prompt_typo_variant``. Each input row becomes three output rows
    (original, paraphrase, typo) with the columns
    ``id, prompt, group_id, variation_type, category, difficulty``.

    Args:
        df: Wide-format table; it is copied, not modified.
        csv_path: Source file, used only for log and error messages.

    Returns:
        Long-format table with three rows per input row. The six output
        columns are present even when the input has no rows.

    Raises:
        KeyError: If ``id`` or any of the three prompt columns is missing.
    """
    print(f"Transformando {csv_path.name} → formato largo (3 filas por ejemplo)")

    # Basic clean-up: work on a copy and normalise the header names.
    df = df.copy()
    df.columns = [str(c).strip() for c in df.columns]

    # Validate before touching any column so a missing `id` is reported with
    # the file name instead of surfacing as a bare KeyError.
    required_cols = ["id", "prompt_original", "prompt_paraphrase", "prompt_typo_variant"]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise KeyError(f"Faltan columnas requeridas en {csv_path.name}: {missing}")

    df["id"] = df["id"].astype(str)

    # (id suffix, source column, variation_type) for each emitted row.
    variants = (
        ("orig", "prompt_original", "original"),
        ("para", "prompt_paraphrase", "paraphrase"),
        ("typo", "prompt_typo_variant", "typo"),
    )
    output_cols = ["id", "prompt", "group_id", "variation_type", "category", "difficulty"]

    records = []
    for _, row in df.iterrows():
        base_id = str(row["id"])
        # category / difficulty are optional; absent columns yield "".
        category = row.get("category", "")
        difficulty = row.get("difficulty", "")
        for suffix, source_col, variation_type in variants:
            # NOTE(review): a missing prompt (NaN) becomes the literal string "nan" here.
            records.append({
                "id": f"{base_id}_{suffix}",
                "prompt": str(row[source_col]).strip(),
                "group_id": base_id,
                "variation_type": variation_type,
                "category": category,
                "difficulty": difficulty,
            })

    # Passing the columns explicitly keeps the schema stable for empty input.
    long_df = pd.DataFrame(records, columns=output_cols)
    print(f"→ {len(df)} ejemplos originales → {len(long_df)} filas (x3)")
    return long_df


def to_parquet(csv_path: Path, out_dir: Path) -> Path:
    """Convert one CSV file into a Parquet file inside ``out_dir``.

    The CSV is loaded with ``read_csv_robust`` and its column names are
    stripped. A file whose stem contains ``variation_sensitivity_with_typos``
    (case-insensitive) is additionally reshaped with
    ``transform_variation_sensitivity`` and written under the fixed name
    ``variation_sensitivity.parquet``; every other file keeps its stem.

    Args:
        csv_path: CSV file to convert.
        out_dir: Destination directory; created if it does not exist.

    Returns:
        Path of the Parquet file that was written.
    """
    df = read_csv_robust(csv_path)
    df.columns = [str(c).strip() for c in df.columns]

    stem = csv_path.stem

    if "variation_sensitivity_with_typos" in stem.lower():
        # Special case: reshape to long format and use a fixed, clean name.
        df = transform_variation_sensitivity(df, csv_path)
        out_name = "variation_sensitivity.parquet"
    else:
        # Every other dataset keeps its original file name.
        out_name = f"{stem}.parquet"

    # Do not rely on the caller having created the destination directory.
    out_dir.mkdir(parents=True, exist_ok=True)

    pq_path = out_dir / out_name
    df.to_parquet(pq_path, index=False)
    print(f"OK: {csv_path.name} → {pq_path.name}")
    return pq_path

if __name__ == "__main__":
    # Collect every CSV below RAW_DIR; sorted so the order does not depend on the platform.
    csv_files = sorted(RAW_DIR.rglob("*.csv"))
    print(f"CSVs encontrados: {len(csv_files)}")
    for p in csv_files:
        print(f" - {p.name}")

    if not csv_files:
        raise FileNotFoundError(f"No CSVs en {RAW_DIR}")

    parquet_paths = []
    failed = []
    # The loop variable must not be named `csv`: at notebook scope that would
    # rebind the imported `csv` module and break any later use of `csv.reader`.
    for csv_path in csv_files:
        try:
            parquet_paths.append(to_parquet(csv_path, OUT_DIR))
        except Exception as e:
            # Best effort: one bad file must not stop the remaining conversions.
            failed.append(csv_path.name)
            print(f"ERROR en {csv_path.name}: {e}")

    print(f"\nTotal Parquets generados: {len(parquet_paths)}")
    for p in parquet_paths:
        print(f"   • {p.name}")

    # Summarise failures so they are not lost among the per-file log lines.
    if failed:
        print(f"Total con errores: {len(failed)}")
        for name in failed:
            print(f"   • {name}")

CSVs encontrados: 7
 - bias_dataset.csv
 - hallucination.csv
 - reasoning_close.csv
 - reasoning_open.csv
 - refusal_correctness.csv
 - summarization.csv
 - variation_sensitivity_with_typos.csv
Éxito: bias_dataset.csv (enc=utf-8, sep=',')
OK: bias_dataset.csv → bias_dataset.parquet
Éxito: hallucination.csv (enc=utf-8, sep=',')
OK: hallucination.csv → hallucination.parquet
Éxito: reasoning_close.csv (enc=utf-8, sep=',')
OK: reasoning_close.csv → reasoning_close.parquet
Éxito: reasoning_open.csv (enc=utf-8, sep=',')
OK: reasoning_open.csv → reasoning_open.parquet
Éxito: refusal_correctness.csv (enc=utf-8, sep=',')
OK: refusal_correctness.csv → refusal_correctness.parquet
Éxito: summarization.csv (enc=utf-8, sep=',')
OK: summarization.csv → summarization.parquet
Éxito: variation_sensitivity_with_typos.csv (enc=utf-8, sep=',')
Transformando variation_sensitivity_with_typos.csv → formato largo (3 filas por ejemplo)
→ 100 ejemplos originales → 300 filas (x3)
OK: variation_sensitivity_with_ty

Test parquets

- This section is a scratch space for inspecting the generated .parquet files.

In [4]:
# Built from ROOT_DIR instead of a hardcoded absolute user path so the cell runs on any machine.
read_outputs = pd.read_parquet(ROOT_DIR / "outputs" / "obj_scores.parquet")
read_outputs

Unnamed: 0,model,reasoning_close_avg,reasoning_close_delta,reasoning_open_avg,reasoning_open_delta,refusal_correctness_avg,refusal_correctness_delta,summarization_avg,summarization_delta,variation_sensitivity_avg,variation_sensitivity_delta
0,cai-llama-3-1-8b-slim,0.8,0.4,0.9266,0.0133,1.0,0.0,0.6867,0.0375,0.9657,0.0228
1,gpt-4o,1.0,0.0,0.9281,0.0176,1.0,0.0,0.7106,0.0336,0.9699,0.0251


In [None]:
# Load one of the converted datasets to inspect the result of the conversion.
PARQUET_PATH = OUT_DIR.joinpath("summarization.parquet")

read = pd.read_parquet(path=PARQUET_PATH)
read

Unnamed: 0,id,category,prompt,reference_summary
0,1,News / Current affairs,As artificial intelligence (AI) technologies c...,The increasing integration of artificial intel...
1,2,News / Current affairs,"In recent months, education reform has emerged...",Education reform proposals are focusing on add...
2,3,News / Current affairs,"In recent years, the increasing frequency and ...",Governments are reevaluating cybersecurity leg...
3,4,News / Current affairs,As wildfires continue to ravage various region...,Wildfire response efforts are intensifying as ...
4,5,News / Current affairs,Inflation has emerged as a significant concern...,Inflation is significantly affecting consumers...
...,...,...,...,...
95,96,General Knowledge,Food webs are intricate networks that illustra...,Food webs illustrate the feeding relationships...
96,97,General Knowledge,"The continental drift theory, first proposed b...","The continental drift theory, proposed by Alfr..."
97,98,General Knowledge,Black holes are regions in space where the gra...,Black holes are regions in space with gravitat...
98,99,General Knowledge,Probability is a branch of mathematics that de...,Probability is a mathematical framework for qu...
