In [3]:
# Importaciones necesarias
from datasets import load_dataset, get_dataset_config_names
import pandas as pd
import os
from subprocess import Popen, PIPE, STDOUT

# Haystack imports (needed for Phase 2).
# Try the Haystack 2.x integration module first, then fall back to the
# Haystack 1.x module path. If neither import succeeds, a warning is printed
# and ElasticsearchDocumentStore is bound to None so the name still exists.
# NOTE(review): the non-ASCII characters in the warning string below look
# mis-encoded (UTF-8 decoded with another codec?) — confirm against the
# original notebook before changing the literal.
try:
    # Haystack 2.x - requires haystack-integrations
    from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
    print("Usando Haystack 2.x")
except ImportError:
    try:
        # Haystack 1.x (fallback)
        from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
        print("Usando Haystack 1.x")
    except ImportError:
        print("‚ö†Ô∏è  ElasticsearchDocumentStore no disponible. Continuando sin √©l...")
        ElasticsearchDocumentStore = None

print("--- FASE 1: Exploraci√≥n y Preparaci√≥n de Datos (SQuAD) ---")

# 1.1 Load the SQuAD dataset (Stanford Question Answering Dataset).
# "rajpurkar/squad" is tried first; if that raises, "rajpurkar/squad_v2" is
# tried as an alternative. dataset_version records which one was loaded.
print("Cargando dataset SQuAD desde Hugging Face...")
try:
    # Load SQuAD v1.1 - the standard version
    squad = load_dataset("rajpurkar/squad")
    print("‚úÖ SQuAD v1.1 cargado exitosamente")
    dataset_version = "SQuAD v1.1"
except Exception as e:
    print(f"Error cargando SQuAD v1.1: {e}")
    try:
        # Try SQuAD v2.0 as an alternative
        squad = load_dataset("rajpurkar/squad_v2")
        print("‚úÖ SQuAD v2.0 cargado exitosamente")
        dataset_version = "SQuAD v2.0"
    except Exception as e2:
        # Both attempts failed: report the second error and re-raise it so
        # the cell stops here instead of continuing without a dataset.
        print(f"Error cargando ambas versiones: {e2}")
        raise

# 1.2 Dataset information: which version was loaded and which splits it has
print(f"\nDataset: {dataset_version}")
print(f"Splits disponibles: {list(squad.keys())}")

# 1.3 Convert every split to a pandas DataFrame, keyed by split name.
# (Original note: SQuAD already has a flatter, simpler structure than SubjQA.)
dfs = {split_name: split_data.to_pandas() for split_name, split_data in squad.items()}

df_train = dfs["train"]
df_validation = dfs["validation"]

# 1.4 Report the size of each split and the combined total.
train_size = len(df_train)
validation_size = len(df_validation)
print(f"\nN√∫mero de ejemplos en train: {train_size}")
print(f"N√∫mero de ejemplos en validation: {validation_size}")
print(f"N√∫mero total de ejemplos: {train_size + validation_size}")

# 1.5 Explore the dataset structure: list the columns of the training frame.
print(f"\nColumnas disponibles: {list(df_train.columns)}")

# 1.6 Key fields for extractive QA, shown for the first training row.
# (Original note: SQuAD's structure differs slightly from SubjQA's.)
print("\n--- Estructura de SQuAD ---")
sample_row = df_train.iloc[0]
# One print call with a newline separator emits the same five lines as five
# separate print calls; the context is cut to its first 200 characters.
print(
    f"ID: {sample_row['id']}",
    f"T√≠tulo: {sample_row['title']}",
    f"Context: {sample_row['context'][:200]}...",
    f"Pregunta: {sample_row['question']}",
    f"Respuestas: {sample_row['answers']}",
    sep="\n",
)

# 1.7 Answer processing (SQuAD can have multiple possible answers)
# Extract the first answer and its position
def extract_first_answer(answers_dict):
    """Return the first answer and its start offset from an answers record.

    Args:
        answers_dict: Mapping with a ``'text'`` sequence of answer strings
            and an ``'answer_start'`` sequence of offsets, aligned by
            position.

    Returns:
        A dict with keys ``'answer_text'`` and ``'answer_start'`` taken from
        position 0 of the two sequences. When ``'text'`` is empty the result
        is ``{'answer_text': '', 'answer_start': -1}`` and ``'answer_start'``
        is not read at all.
    """
    texts = answers_dict['text']
    # Use len() rather than truthiness: the sample row printed in this
    # notebook shows these values as NumPy arrays, whose truth value is
    # ambiguous when they hold more than one element.
    if len(texts) == 0:
        return {'answer_text': '', 'answer_start': -1}

    starts = answers_dict['answer_start']
    return {'answer_text': texts[0], 'answer_start': starts[0]}

# Add simplified 'answers.text' / 'answers.answer_start' columns holding the
# first answer of every row, for compatibility with the original code.
# extract_first_answer is evaluated once per row and both columns are derived
# from that single result, instead of re-running it separately per column.
for split_df in (df_train, df_validation):
    first_answers = split_df['answers'].apply(extract_first_answer)
    split_df['answers.text'] = first_answers.apply(lambda first: first['answer_text'])
    split_df['answers.answer_start'] = first_answers.apply(lambda first: first['answer_start'])

# Keep the dfs dictionary pointing at the frames that carry the new columns.
dfs['train'] = df_train
dfs['validation'] = df_validation

# 1.8 Key columns, kept compatible with the original analysis.
qa_cols = [
    "title",
    "question",
    "answers.text",
    "answers.answer_start",
    "context",
]

# 1.9 Show one training example restricted to the key columns
# (an extractive-QA example). The fixed seed makes the pick repeatable.
print("\n--- Ejemplo de Estructura de QA Extractiva (SQuAD) ---")
sample_df = df_train[qa_cols].sample(n=1, random_state=42)
pd.set_option('display.max_colwidth', 150)
print(sample_df.to_string())

# 1.10 Check the start index: slicing the context at that offset, for the
# length of the answer, should reproduce the answer text.
# The single sampled row is the one being checked.
start_idx = sample_df["answers.answer_start"].iloc[0]
if not (start_idx >= 0):
    # A negative start index marks a row without a valid answer.
    print("\n‚ö†Ô∏è  Ejemplo sin respuesta v√°lida (posible en SQuAD v2.0)")
else:
    expected_answer = sample_df["answers.text"].iloc[0]
    end_idx = start_idx + len(expected_answer)
    respuesta_verificada = sample_df["context"].iloc[0][start_idx:end_idx]

    print(f"\nVerificaci√≥n: El √≠ndice de inicio es {start_idx}.")
    print(f"El texto extra√≠do usando ese √≠ndice es: '{respuesta_verificada}'")
    print(f"Respuesta esperada: '{expected_answer}'")
    print(f"¬øCoinciden? {respuesta_verificada == expected_answer}")

# 1.11 Additional statistics over the training split.
# All figures are computed first and reported afterwards.
unique_contexts = df_train['context'].nunique()
unique_titles = df_train['title'].nunique()
mean_context_len = df_train['context'].str.len().mean()
mean_question_len = df_train['question'].str.len().mean()
mean_answer_len = df_train['answers.text'].str.len().mean()

# Rows whose first-answer text is the empty string count as unanswered
# (original note: common in SQuAD v2.0).
unanswered_mask = df_train['answers.text'] == ''
no_answer_count = len(df_train[unanswered_mask])

print("\n--- ESTAD√çSTICAS DEL DATASET ---")
print(f"N√∫mero de contextos √∫nicos: {unique_contexts}")
print(f"N√∫mero de t√≠tulos √∫nicos: {unique_titles}")
print(f"Longitud promedio del contexto: {mean_context_len:.1f} caracteres")
print(f"Longitud promedio de la pregunta: {mean_question_len:.1f} caracteres")
print(f"Longitud promedio de la respuesta: {mean_answer_len:.1f} caracteres")
print(f"Preguntas sin respuesta: {no_answer_count}")

# Closing summary of the cell: fixed status lines plus the loaded dataset
# version, followed by a short list of uses for the dataset.
# NOTE(review): every line here is printed unconditionally; in particular the
# "verified" line does not depend on the outcome of the start-index check
# performed earlier in the cell.
print("\n--- RESUMEN ---")
print(f"‚úÖ Dataset {dataset_version} cargado exitosamente")
print("‚úÖ Estructura de QA extractiva verificada")
print("‚úÖ Datos listos para procesamiento con Haystack")
print("\nüéØ El dataset SQuAD es ideal para:")
print("- Entrenar modelos de Question Answering extractivo")
print("- Evaluar sistemas de QA")
print("- Benchmarking de rendimiento en comprensi√≥n de lectura")

⚠️  ElasticsearchDocumentStore no disponible. Continuando sin él...
--- FASE 1: Exploración y Preparación de Datos (SQuAD) ---
Cargando dataset SQuAD desde Hugging Face...
✅ SQuAD v1.1 cargado exitosamente

Dataset: SQuAD v1.1
Splits disponibles: ['train', 'validation']

Número de ejemplos en train: 87599
Número de ejemplos en validation: 10570
Número total de ejemplos: 98169

Columnas disponibles: ['id', 'title', 'context', 'question', 'answers']

--- Estructura de SQuAD ---
ID: 5733be284776f41900661182
Título: University_of_Notre_Dame
Context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper sta...
Pregunta: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Respuestas: {'text': array(['Saint Bernadette Soubirous'], dtype=object), 'answer_start': array([515], dtype=int32)}

--- Ejemplo de Estructura de QA

In [5]:
# Preview the first rows (DataFrame.head() default) of the training and
# validation DataFrames, each under its own heading.
# NOTE(review): display() is not imported in the visible code — presumably
# the IPython/Jupyter built-in; confirm before running outside a notebook.
print("Muestra del DataFrame de entrenamiento (df_train):")
display(df_train.head())

print("\nMuestra del DataFrame de validaci√≥n (df_validation):")
display(df_validation.head())

Muestra del DataFrame de entrenamiento (df_train):


Unnamed: 0,id,title,context,question,answers,answers.text,answers.answer_start
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in fro...",To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?,"{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}",Saint Bernadette Soubirous,515
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in fro...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answer_start': [188]}",a copper statue of Christ,188
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in fro...",The Basilica of the Sacred heart at Notre Dame is beside to which structure?,"{'text': ['the Main Building'], 'answer_start': [279]}",the Main Building,279
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in fro...",What is the Grotto at Notre Dame?,"{'text': ['a Marian place of prayer and reflection'], 'answer_start': [381]}",a Marian place of prayer and reflection,381
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in fro...",What sits on top of the Main Building at Notre Dame?,"{'text': ['a golden statue of the Virgin Mary'], 'answer_start': [92]}",a golden statue of the Virgin Mary,92



Muestra del DataFrame de validación (df_validation):


Unnamed: 0,id,title,context,question,answers,answers.text,answers.answer_start
0,56be4db0acb8001400a502ec,Super_Bowl_50,Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Foot...,Which NFL team represented the AFC at Super Bowl 50?,"{'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'], 'answer_start': [177, 177, 177]}",Denver Broncos,177
1,56be4db0acb8001400a502ed,Super_Bowl_50,Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Foot...,Which NFL team represented the NFC at Super Bowl 50?,"{'text': ['Carolina Panthers', 'Carolina Panthers', 'Carolina Panthers'], 'answer_start': [249, 249, 249]}",Carolina Panthers,249
2,56be4db0acb8001400a502ee,Super_Bowl_50,Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Foot...,Where did Super Bowl 50 take place?,"{'text': ['Santa Clara, California', 'Levi's Stadium', 'Levi's Stadium in the San Francisco Bay Area at Santa Clara, California.'], 'answer_start'...","Santa Clara, California",403
3,56be4db0acb8001400a502ef,Super_Bowl_50,Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Foot...,Which NFL team won Super Bowl 50?,"{'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'], 'answer_start': [177, 177, 177]}",Denver Broncos,177
4,56be4db0acb8001400a502f0,Super_Bowl_50,Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Foot...,What color was used to emphasize the 50th anniversary of the Super Bowl?,"{'text': ['gold', 'gold', 'gold'], 'answer_start': [488, 488, 521]}",gold,488
