In [None]:
from datasets import load_dataset

In [None]:
# Load gemini_ds.json with the JSON builder and keep its "train" split.
eval_dataset = load_dataset(
    path="json",
    data_files="gemini_ds.json",
)["train"]

In [None]:
from langchain_community.llms import Ollama
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_huggingface import HuggingFaceEmbeddings
from ragas.evaluation import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from datasets import Dataset

# Judge model served through Ollama; another pulled model (mistral, llama2, ...)
# can be substituted here.
ollama_llm = Ollama(model="llama3.1:8b", timeout=60)

# Wrap the LangChain LLM and the HuggingFace embedder with the ragas wrapper classes.
llm = LangchainLLMWrapper(ollama_llm)
embed_model = LangchainEmbeddingsWrapper(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
)

# Attach the same LLM and embedding model to every imported ragas metric object.
for ragas_metric in (faithfulness, answer_relevancy, context_precision, context_recall):
    ragas_metric.llm = llm
    ragas_metric.embeddings = embed_model

In [None]:
# Evaluate the dataset loaded from gemini_ds.json. It is expected to be already
# cleaned and to carry 'question', 'contexts', 'answer' and 'ground_truth'.
results = evaluate(
    dataset=eval_dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
    ],
    batch_size=1,
)

In [None]:
# Load groq_ds.json with the JSON builder and keep its "train" split.
groq_dataset = load_dataset(
    path="json",
    data_files="groq_ds.json",
)["train"]

In [None]:
# Evaluate the dataset loaded from groq_ds.json with a small batch size. The
# dataset is expected to be already cleaned and to carry 'question', 'contexts',
# 'answer' and 'ground_truth'.
results2 = evaluate(
    dataset=groq_dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
    ],
    batch_size=1,
)

print(results2)

In [None]:
# Load qwen_ds.json with the JSON builder and keep its "train" split.
qwen_ds = load_dataset(
    path="json",
    data_files="qwen_ds.json",
)["train"]

In [None]:
# Evaluate the dataset loaded from qwen_ds.json on the same three metrics.
results3 = evaluate(
    dataset=qwen_ds,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
    ],
    batch_size=1,
)

In [None]:
# Show the evaluation result computed for qwen_ds.
print(results3)

#### RESULTADOS PRIMERA COMBINACIÓN


- chunk_size=512, chunk_overlap=150
- temperatura de los modelos = 0.1
- weights=[0.5, 0.5]  -> FIJOS 

In [32]:
# First-configuration summaries. Each line is labelled with the dataset it was
# computed from (results3 <- qwen_ds, results <- gemini_ds.json,
# results2 <- groq_ds.json) so the scores cannot be attributed to the wrong
# model when they are copied into a table. The original print order is kept.
print("qwen:", results3)
print("gemini:", results)
print("groq:", results2)


{'faithfulness': 0.5758, 'answer_relevancy': 0.4667, 'context_precision': 0.5831}
{'faithfulness': 0.5005, 'answer_relevancy': 0.4123, 'context_precision': 0.4489}
{'faithfulness': 0.5976, 'answer_relevancy': 0.3528, 'context_precision': 0.4911}


In [None]:
# Persist the per-sample scores of the first configuration, one CSV per dataset.
results_df = results.to_pandas()
results_df.to_csv(path_or_buf="gemini1.csv", index=False)

results2_df = results2.to_pandas()
results2_df.to_csv(path_or_buf="groq1.csv", index=False)

results3_df = results3.to_pandas()
results3_df.to_csv(path_or_buf="qwen1.csv", index=False)

### PRUEBA SEGUNDA CONFIGURACIÓN

In [None]:
# Load gemini_ds_2.json with the JSON builder and keep its "train" split.
gemini_ds_2 = load_dataset(
    path="json",
    data_files="gemini_ds_2.json",
)["train"]

In [None]:
# Evaluate the second-configuration dataset loaded from gemini_ds_2.json.
gemini2_results = evaluate(
    dataset=gemini_ds_2,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
    ],
    batch_size=1,
)

In [None]:
# Load groq_ds2.json with the JSON builder and keep its "train" split.
groq_ds2 = load_dataset(
    path="json",
    data_files="groq_ds2.json",
)["train"]

In [None]:
# Evaluate the second-configuration dataset loaded from groq_ds2.json.
groq2_result = evaluate(
    dataset=groq_ds2,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
    ],
    batch_size=1,
)

In [None]:
# Load qwen_ds2.json with the JSON builder and keep its "train" split.
qwen_ds2 = load_dataset(
    path="json",
    data_files="qwen_ds2.json",
)["train"]

In [None]:
# Evaluate the second-configuration dataset loaded from qwen_ds2.json.
qwen2_result = evaluate(
    dataset=qwen_ds2,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
    ],
    batch_size=1,
)


In [28]:
# Second-configuration summaries. Each line is labelled with the dataset it was
# computed from so the scores cannot be attributed to the wrong model when they
# are copied into a table. The original print order is kept.
print("groq:", groq2_result)
print("gemini:", gemini2_results)
print("qwen:", qwen2_result)

{'faithfulness': 0.6255, 'answer_relevancy': 0.2404, 'context_precision': 0.4920}
{'faithfulness': 0.5923, 'answer_relevancy': 0.4211, 'context_precision': 0.4805}
{'faithfulness': 0.5783, 'answer_relevancy': 0.4121, 'context_precision': 0.4941}


In [36]:
# Persist the per-sample scores of the second configuration, one CSV per dataset.
gemini2_df = gemini2_results.to_pandas()
gemini2_df.to_csv(path_or_buf="gemini2.csv", index=False)

groq2_df = groq2_result.to_pandas()
groq2_df.to_csv(path_or_buf="groq2.csv", index=False)

qwen2_df = qwen2_result.to_pandas()
qwen2_df.to_csv(path_or_buf="qwen2.csv", index=False)

In [None]:
# Load groq_advanced_ds.json with the JSON builder and keep its "train" split.
groq_avanzado = load_dataset(
    path="json",
    data_files="groq_advanced_ds.json",
)["train"]

In [None]:
# Load the dataset under its own name instead of evaluating `groq_avanzado` in
# place. The previous version overwrote the dataset variable with the evaluation
# result, so re-running this cell passed a result object to evaluate().
# The `_ds` suffix matches gemini_avanzado_ds / qwen_avanzado_ds.
groq_avanzado_ds = load_dataset("json", data_files="groq_advanced_ds.json")["train"]

# `groq_avanzado` keeps holding the evaluation result, as later cells expect.
groq_avanzado = evaluate(
    groq_avanzado_ds,
    metrics=[faithfulness, answer_relevancy, context_precision],
    batch_size=1
)

In [None]:
# Load gemini_advanced_ds.json with the JSON builder and keep its "train" split.
gemini_avanzado_ds = load_dataset(
    path="json",
    data_files="gemini_advanced_ds.json",
)["train"]

In [None]:
# Evaluate the dataset loaded from gemini_advanced_ds.json.
gemini_avanzado = evaluate(
    dataset=gemini_avanzado_ds,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
    ],
    batch_size=1,
)

In [None]:
# Load qwen_advanced_ds.json with the JSON builder and keep its "train" split.
qwen_avanzado_ds = load_dataset(
    path="json",
    data_files="qwen_advanced_ds.json",
)["train"]

In [None]:
# Evaluate the dataset loaded from qwen_advanced_ds.json.
qwen_avanzado = evaluate(
    dataset=qwen_avanzado_ds,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
    ],
    batch_size=1,
)

In [27]:
# Summaries for the "avanzado" runs. Each line is labelled with its model so the
# scores cannot be attributed to the wrong model when they are copied into a
# table. The original print order is kept.
print("groq:", groq_avanzado)
print("gemini:", gemini_avanzado)
print("qwen:", qwen_avanzado)

{'faithfulness': 0.6417, 'answer_relevancy': 0.5475, 'context_precision': 0.5904}
{'faithfulness': 0.6557, 'answer_relevancy': 0.4179, 'context_precision': 0.6650}
{'faithfulness': 0.7249, 'answer_relevancy': 0.5650, 'context_precision': 0.6690}


In [37]:
# Persist the per-sample scores of the "avanzado" runs, one CSV per dataset.
geminiavanzado_df = gemini_avanzado.to_pandas()
geminiavanzado_df.to_csv(path_or_buf="gemini_avanzado.csv", index=False)

groqavanzado_df = groq_avanzado.to_pandas()
groqavanzado_df.to_csv(path_or_buf="groq_avanzado.csv", index=False)

qwenavanzado_df = qwen_avanzado.to_pandas()
qwenavanzado_df.to_csv(path_or_buf="qwen_avanzado.csv", index=False)

#### Presentación de resultados

## Métricas RAGAS – Comparativa entre Configuraciones

## Primera Arquitectura

#### Configuración 1: `chunk_size=512`, `overlap=150`, `temp=0.1`

> Nota: las filas se asignan según la variable evaluada en cada celda (`gemini_ds*` → Gemini, `groq_ds*` → LLaMA 3.3 70B, `qwen_ds*` → Qwen), no según el orden en que se imprimieron los resultados (Config 1: qwen, gemini, groq; Config 2: groq, gemini, qwen).

| Modelo | Faithfulness  | Answer Relevancy  | Context Precision  |
|--------|:--------------:|:------------------:|:-------------------:|
| **Gemini 2.0 Flash** | 0.5005 | 0.4123 | 0.4489 |
| **LLaMA 3.3 70B** | 0.5976 | 0.3528 | 0.4911 |
| **Qwen 3 32B** | 0.5758 | 0.4667 | 0.5831 |

#### Configuración 2: `chunk_size=800`, `overlap=250`, `temp=0.5`

⭐ = mejora respecto a la Configuración 1 para el mismo modelo.

| Modelo | Faithfulness  | Answer Relevancy  | Context Precision  |
|--------|:--------------:|:------------------:|:-------------------:|
| **Gemini 2.0 Flash** | **0.5923** ⭐ | **0.4211** ⭐ | **0.4805** ⭐ |
| **LLaMA 3.3 70B** | **0.6255** ⭐ | 0.2404 | **0.4920** ⭐ |
| **Qwen 3 32B** | **0.5783** ⭐ | 0.4121 | 0.4941 |

---

### Mejores Rendimientos por Métrica:

- **Faithfulness**: LLaMA 3.3 70B (Config 2) - 0.6255
- **Answer Relevancy**: Qwen 3 32B (Config 1) - 0.4667  
- **Context Precision**: Qwen 3 32B (Config 1) - 0.5831

## Métricas RAGAS – Segunda Arquitectura

| Modelo | Faithfulness  | Answer Relevancy  | Context Precision  |
|--------|:--------------:|:------------------:|:-------------------:|
| **Gemini 2.0 Flash** | 0.6557 | 0.4179 | 0.6650 |
| **LLaMA 3.3 70B (Groq)** | 0.6417 | 0.5475 | 0.5904 |
| **Qwen 3 32B** | **0.7249** ⭐ | **0.5650** ⭐ | **0.6690** ⭐ |

---

### Mejores Rendimientos por Métrica:

- **Faithfulness**: Qwen 3 32B - 0.7249
- **Answer Relevancy**: Qwen 3 32B - 0.5650  
- **Context Precision**: Qwen 3 32B - 0.6690

En la segunda arquitectura, Qwen 3 32B lidera en las tres métricas evaluadas (faithfulness, answer relevancy y context precision).