In [1]:
from pathlib import Path
import pandas as pd
import csv

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Project directories, resolved relative to the parent of the working directory

ROOT_DIR = Path.cwd().parent
RAW_DIR = ROOT_DIR.joinpath("datasets", "raw")
OUT_DIR = ROOT_DIR.joinpath("datasets", "processed")

# Create the output folder (and any missing parents); no error if it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"RAW_DIR: {RAW_DIR.as_posix()}/")
print(f"OUT_DIR: {OUT_DIR.as_posix()}/")

RAW_DIR: c:/Users/WJ724NE/OneDrive - EY/Documents/MultiversePipeline/datasets/raw/
OUT_DIR: c:/Users/WJ724NE/OneDrive - EY/Documents/MultiversePipeline/datasets/processed/


In [4]:
# Discover the raw CSV files (recursively). rglob() yields entries in
# filesystem order, so sort them to keep the processing order and the log
# below reproducible across machines and runs.
csv_files = sorted(RAW_DIR.rglob("*.csv"))
print(f"CSVs encontrados: {len(csv_files)}")
for p in csv_files:
    print(f" - {p.name}")

# Fail fast: nothing downstream can work without at least one input file.
if not csv_files:
    raise FileNotFoundError(f"No CSVs en {RAW_DIR}")

def read_csv_robust(path: Path, min_columns: int = 3) -> pd.DataFrame:
    """Read a CSV whose encoding and delimiter are not known in advance.

    Every (encoding, delimiter) combination is tried with ``pd.read_csv``; the
    first attempt that parses and yields at least ``min_columns`` columns is
    returned. If every attempt fails, the file is parsed with ``csv.reader``
    as a comma-separated file and all values are kept as strings, except
    columns named ``id``/``index`` (case-insensitive), which are converted to
    numbers.

    Args:
        path: Location of the CSV file.
        min_columns: Minimum number of columns an attempt must produce to be
            accepted. Fewer columns is treated as a wrong delimiter guess.

    Returns:
        The parsed DataFrame.

    Raises:
        ValueError: In the fallback path, when the file has no rows at all or
            no data row matches the header length. The last pandas error is
            attached as ``__cause__``.
        OSError: If the file cannot be opened in the fallback path.

    NOTE(review): pandas skips malformed lines with a warning
    (``on_bad_lines='warn'``); a global ``warnings.filterwarnings('ignore')``
    may hide that warning — confirm whether silent row loss is acceptable.
    """
    # Imported locally so the fallback does not depend on the module-level
    # name `csv`, which notebook cells can easily rebind.
    import csv as _csv

    # cp1252 must come before latin-1: latin-1 can decode any byte sequence,
    # so anything listed after it would never be reached for decoding reasons.
    encodings = ["utf-8", "utf-8-sig", "cp1252", "latin-1"]
    delimiters = [',', ';', '\t']
    last_err = None

    for enc in encodings:
        for delim in delimiters:
            try:
                df = pd.read_csv(path, sep=delim, engine="python", encoding=enc,
                                 quoting=_csv.QUOTE_MINIMAL, on_bad_lines='warn')
                if len(df.columns) < min_columns:
                    raise ValueError("Demasiado pocas columnas detectadas")
            except Exception as e:
                last_err = e
                print(f"Intento fallido: {path.name} (enc={enc}, sep='{delim}'): {e}")
            else:
                # Report success only after the column check has passed.
                print(f"Éxito: {path.name} (enc={enc}, sep='{delim}')")
                return df

    # Fallback: plain csv.reader with minimal quoting.
    print(f"Fallback para {path.name}")
    # utf-8-sig drops a leading BOM (and reads BOM-less UTF-8 unchanged);
    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields are preserved.
    with open(path, 'r', encoding='utf-8-sig', errors='replace', newline='') as f:
        reader = _csv.reader(f, delimiter=',', quoting=_csv.QUOTE_MINIMAL)
        # csv.reader yields [] for blank lines; they carry no data.
        rows = [row for row in reader if row]
    if not rows:
        raise ValueError("Archivo vacío") from last_err

    columns = [str(c).strip() for c in rows[0]]
    data = [row for row in rows[1:] if len(row) == len(columns)]

    # Make dropped rows visible instead of discarding them silently.
    dropped = len(rows) - 1 - len(data)
    if dropped:
        print(f"Aviso: {dropped} filas descartadas en {path.name} (número de columnas distinto al encabezado)")
    if not data:
        raise ValueError("Sin datos válidos") from last_err

    df = pd.DataFrame(data, columns=columns)
    for col in df.columns:
        if col.lower() in ['id', 'index']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

# Convert one CSV to Parquet (reading is delegated to read_csv_robust)
def to_parquet(csv_path: Path, out_dir: Path) -> Path:
    """Read ``csv_path`` and write it as ``<stem>.parquet`` inside ``out_dir``.

    Column labels are converted to strings and stripped of surrounding
    whitespace before writing. The DataFrame index is not stored.

    Returns:
        The path of the Parquet file that was written.
    """
    frame = read_csv_robust(csv_path)
    frame.columns = [str(label).strip() for label in frame.columns]

    target = out_dir.joinpath(f"{csv_path.stem}.parquet")
    frame.to_parquet(target, index=False)
    return target

# Process every discovered CSV
parquet_paths = []
# The loop variable must not be named `csv`: at notebook top level that would
# rebind the imported `csv` module for the rest of the session.
for csv_path in csv_files:
    try:
        pq = to_parquet(csv_path, OUT_DIR)
        parquet_paths.append(pq)
        print(f"OK: {csv_path.name} → {pq.name}")
    except Exception as e:
        # Best effort: report the failure and continue with the remaining files.
        print(f"ERROR en {csv_path.name}: {e}")

print(f"\nTotal Parquets: {len(parquet_paths)}")

CSVs encontrados: 7
 - Bias_Dataset_.csv
 - Hallucination.csv
 - Reasoning_Close.csv
 - Reasoning_Open.csv
 - Refusal_Correctness_.csv
 - Summarization.csv
 - Variation_sensitivity_with_typos.csv
Éxito: Bias_Dataset_.csv (enc=utf-8, sep=',')
OK: Bias_Dataset_.csv → Bias_Dataset_.parquet
Éxito: Hallucination.csv (enc=utf-8, sep=',')
OK: Hallucination.csv → Hallucination.parquet
Éxito: Reasoning_Close.csv (enc=utf-8, sep=',')
OK: Reasoning_Close.csv → Reasoning_Close.parquet
Éxito: Reasoning_Open.csv (enc=utf-8, sep=',')
OK: Reasoning_Open.csv → Reasoning_Open.parquet
Éxito: Refusal_Correctness_.csv (enc=utf-8, sep=',')
OK: Refusal_Correctness_.csv → Refusal_Correctness_.parquet
Éxito: Summarization.csv (enc=utf-8, sep=',')
OK: Summarization.csv → Summarization.parquet
Éxito: Variation_sensitivity_with_typos.csv (enc=utf-8, sep=',')
OK: Variation_sensitivity_with_typos.csv → Variation_sensitivity_with_typos.parquet

Total Parquets: 7


Auditoría de métricas [Después se eliminará]

In [1]:
import os

# Opt out of DeepEval telemetry. Set these before `deepeval` is imported so
# they are already in place when the library starts.
# DEEPEVAL_TELEMETRY_OPT_OUT is the opt-out switch described in DeepEval's
# documentation; the original variable is kept in case anything else reads it.
os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "YES"
os.environ["DEEPEVAL_DISABLE_ANALYTICS"] = "true"

In [None]:
import os
import sys
import types
import requests as _requests


# Opt out of DeepEval telemetry through environment variables.
# DEEPEVAL_TELEMETRY_OPT_OUT is the switch described in DeepEval's
# documentation; the other two settings are kept from the original cell.
os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "YES"
os.environ["DEEPEVAL_HOME"] = "/does/not/exist"
os.environ["DEEPEVAL_DISABLE_ANALYTICS"] = "true"


# Register a stub module under the name "deepeval.analytics" so that any
# import of that name resolves to no-op functions.
# NOTE(review): confirm that DeepEval actually imports a module with this
# name; if it does not, this stub has no effect.
fake_analytics = types.ModuleType("deepeval.analytics")

def no_op(*args, **kwargs):
    """Accept any arguments and do nothing."""
    return None

fake_analytics.upload = no_op
fake_analytics.log = no_op
fake_analytics.posthog = no_op

sys.modules["deepeval.analytics"] = fake_analytics

# Re-running this cell must not wrap the wrapper again: recover the real
# requests.post from a previously installed wrapper when there is one.
_original_post = getattr(_requests.post, "_original_post", _requests.post)

def block_posthog(url, *args, **kwargs):
    """Short-circuit POSTs whose URL mentions "posthog"; forward all others.

    Blocked calls receive an empty ``Response`` with status 200 instead of
    ``None``, so callers that inspect the result do not fail with an
    ``AttributeError``.
    """
    # str() keeps the check safe for non-str URL values such as bytes.
    if "posthog" in str(url):
        blocked = _requests.Response()
        blocked.status_code = 200
        return blocked
    return _original_post(url, *args, **kwargs)

block_posthog._original_post = _original_post
_requests.post = block_posthog

# NOTE(review): the saved outputs further down still show "error uploading"
# messages for us.i.posthog.com, so patching requests.post alone does not
# intercept every upload path.
print("DeepEval telemetry disabled.")

DeepEval telemetry disabled.


In [6]:
from deepeval.test_case import LLMTestCase
from deepeval.metrics import ExactMatchMetric

In [None]:
import unicodedata
import string

def compute_exact_match(predictions, references, use_deepeval=True):
    """Score prediction/reference pairs with a normalized exact match.

    Both texts are normalized before comparison (see ``normalize`` below), so
    differences in case, accents, punctuation, line breaks and repeated
    whitespace do not count as mismatches.

    Args:
        predictions: Iterable of model outputs (any value; ``None`` is
            treated as an empty string, other values go through ``str()``).
        references: Iterable of expected outputs, same length as
            ``predictions``.
        use_deepeval: When true, each pair is scored with DeepEval's
            ``ExactMatchMetric``; otherwise a plain string comparison yields
            1.0 for equal normalized texts and 0.0 otherwise.

    Returns:
        A list with one score per pair, in input order.

    Raises:
        ValueError: If the two inputs have different lengths.

    NOTE(review): a later cell in this notebook defines
    ``compute_exact_match`` again without ``use_deepeval``; whichever cell
    runs last is the one in effect.
    """

    def normalize(s: str) -> str:
        """Reduce a value to lowercase, unaccented, punctuation-free text."""
        if s is None:
            return ""

        # 1. Coerce to str (the input may be numeric)
        s = str(s)

        # 2. Decompose Unicode so accents become separate combining marks
        s = unicodedata.normalize("NFKD", s)

        # 3. Lowercase
        s = s.lower()

        # 4. Drop the combining marks (accents/diacritics)
        s = "".join(c for c in s if not unicodedata.combining(c))

        # 5. Turn line breaks into spaces
        s = s.replace("\n", " ").replace("\r", " ")

        # 6. Remove punctuation. string.punctuation is ASCII-only, so the
        #    Unicode "P*" categories are checked as well; that covers
        #    characters such as "¿", "¡", curly quotes and long dashes.
        s = "".join(
            c for c in s
            if c not in string.punctuation
            and not unicodedata.category(c).startswith("P")
        )

        # 7. Collapse runs of whitespace
        s = " ".join(s.split())

        return s

    # zip() would silently truncate to the shorter input and return fewer
    # scores than expected, so mismatched lengths are rejected up front.
    predictions = list(predictions)
    references = list(references)
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length "
            f"({len(predictions)} != {len(references)})"
        )

    # DeepEval path
    if use_deepeval:
        scores = []

        for pred, ref in zip(predictions, references):
            metric = ExactMatchMetric(threshold=1.0)

            pred_norm = normalize(pred)
            ref_norm = normalize(ref)

            tc = LLMTestCase(
                input="",
                actual_output=pred_norm,
                expected_output=ref_norm
            )

            metric.measure(tc)
            scores.append(metric.score)

        return scores

    # Plain-Python path (no DeepEval involved)
    return [
        1.0 if normalize(p) == normalize(r) else 0.0
        for p, r in zip(predictions, references)
    ]


In [None]:
import unicodedata
import string
from deepeval.metrics import ExactMatchMetric
from deepeval.test_case import LLMTestCase

def compute_exact_match(predictions, references):
    """Exact Match scored by DeepEval on normalized text.

    Each prediction/reference pair is normalized and then passed to
    DeepEval's ``ExactMatchMetric``. Normalization removes differences in:

    - upper/lower case
    - accents
    - punctuation (ASCII and other Unicode punctuation)
    - line breaks
    - repeated whitespace
    - inconsistent Unicode forms

    Args:
        predictions: Iterable of model outputs (``None`` is treated as an
            empty string; other values go through ``str()``).
        references: Iterable of expected outputs, same length as
            ``predictions``.

    Returns:
        A list with the metric's score for each pair, in input order.

    Raises:
        ValueError: If the two inputs have different lengths.
    """

    def normalize(s: str) -> str:
        """Reduce a value to lowercase, unaccented, punctuation-free text."""
        if s is None:
            return ""

        s = str(s)

        # Decompose Unicode so accents become separate combining marks
        # (e.g. "ñ" becomes "n" plus a combining tilde).
        s = unicodedata.normalize("NFKD", s)

        # Lowercase
        s = s.lower()

        # Drop the combining marks (accents/diacritics)
        s = "".join(c for c in s if not unicodedata.combining(c))

        # Turn line breaks into spaces
        s = s.replace("\n", " ").replace("\r", " ")

        # Remove punctuation. string.punctuation is ASCII-only, so the
        # Unicode "P*" categories are checked as well; that covers characters
        # such as "¿", "¡", curly quotes and long dashes.
        s = "".join(
            c for c in s
            if c not in string.punctuation
            and not unicodedata.category(c).startswith("P")
        )

        # Collapse runs of whitespace
        s = " ".join(s.split())

        return s

    # zip() would silently truncate to the shorter input and return fewer
    # scores than expected, so mismatched lengths are rejected up front.
    predictions = list(predictions)
    references = list(references)
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length "
            f"({len(predictions)} != {len(references)})"
        )

    scores = []

    for pred, ref in zip(predictions, references):

        pred_norm = normalize(pred)
        ref_norm = normalize(ref)

        metric = ExactMatchMetric(threshold=1.0)

        tc = LLMTestCase(
            input="",
            actual_output=pred_norm,
            expected_output=ref_norm
        )

        metric.measure(tc)
        scores.append(metric.score)

    return scores

In [None]:
# The sentences end in different words ("verde" vs "roja"); the prediction also contains a double space.
compute_exact_match(["La pelota es  de color verde"], ["La pelota es de color roja"])

[0.0]

error uploading: HTTPSConnectionPool(host='us.i.posthog.com', port=443): Max retries exceeded with url: /batch/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1032)')))
error uploading: HTTPSConnectionPool(host='us.i.posthog.com', port=443): Max retries exceeded with url: /batch/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1032)')))


In [19]:
# The inputs differ only by a trailing "!".
compute_exact_match(["Hola mundo!"], ["Hola mundo"])

[1.0]

In [20]:
# Lowercase vs. uppercase: the inputs differ only in the case of the first letter.

compute_exact_match(["hola mundo"], ["Hola mundo"])

[1.0]

error uploading: HTTPSConnectionPool(host='us.i.posthog.com', port=443): Max retries exceeded with url: /batch/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1032)')))


In [4]:
# Read the scores file from the project's outputs folder.
# The path is built from ROOT_DIR instead of a hard-coded absolute path, so
# the cell works on other machines and no user-specific directory is embedded.

read = pd.read_parquet(ROOT_DIR / "outputs" / "obj_scores.parquet")
read.head(20)

Unnamed: 0,model,reasoning_open_avg,reasoning_open_delta
0,o4-mini,0.407954,0.158733


Test parquet

In [41]:
# The name must match the stem of the source CSV ("Reasoning_Open.csv")
# exactly; a lowercase "open" only resolves on case-insensitive filesystems.
PARQUET_PATH = OUT_DIR / "Reasoning_Open.parquet"

In [42]:
# Load the Parquet file at PARQUET_PATH and display it to inspect the converted data.
read = pd.read_parquet(PARQUET_PATH)
read

Unnamed: 0,id,prompt,reference_answer,category
0,1,Explain why the sky appears blue.,The sky appears blue because molecules in the ...,scientific_explanation
1,2,Describe how photosynthesis works in simple te...,Photosynthesis allows plants to convert sunlig...,scientific_explanation
2,3,Why do people form social groups?,"People form social groups for support, coopera...",social_reasoning
3,4,Explain the importance of critical thinking.,Critical thinking helps individuals evaluate i...,reasoning
4,5,Why is exercise beneficial for mental health?,"Exercise releases endorphins, reduces stress h...",health
...,...,...,...,...
95,96,Describe the nitrogen cycle.,The nitrogen cycle converts nitrogen between f...,biology
96,97,Why is global trade beneficial?,"Global trade fosters economic growth, cultural...",economics
97,98,Explain how AI learns from data.,AI learns from data using algorithms to identi...,technical_explanation
98,99,Describe the effects of ocean currents on clim...,"Ocean currents distribute heat, influence weat...",environment
