# Import libraries and dataset

In [None]:
import pandas as pd
from datasets import Dataset
from dawid_skene_model import list2array, DawidSkeneModel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from langchain.schema import HumanMessage, SystemMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from tqdm.notebook import tqdm
import numpy as np
from collections import Counter
import os
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI
from huggingface_hub import login
from langchain.chat_models import ChatOpenAI
import os

In [None]:
# Authenticate with the Hugging Face Hub using the token stored in the
# HUGGINGFACE_TOKEN environment variable (os.getenv returns None when unset).
login(token=os.getenv('HUGGINGFACE_TOKEN'))

# Biomedical dataset

In [None]:
from datasets import load_dataset

# Load the "question-answer-passages" configuration of the
# "rag-datasets/rag-mini-bioasq" dataset.
ds = load_dataset("rag-datasets/rag-mini-bioasq", "question-answer-passages")

In [None]:
# Notebook cell output: show the loaded dataset object.
ds

In [None]:
# Build a DataFrame from the "test" split of the dataset.
df = pd.DataFrame(ds["test"])
# Keep only the 'question' and 'answer' columns. Take an explicit copy so that
# columns added to `qa` afterwards are written to an independent frame rather
# than to a slice of `df` (this avoids pandas' SettingWithCopyWarning).
qa = df[['question', 'answer']].copy()
display(qa)

# Create the large reference model

In [None]:
from huggingface_hub import InferenceClient
import json

# Hugging Face access token, read from the environment.
token_pro = os.getenv('HUGGINGFACE_TOKEN')
# Hub repository id of the model used as the reference judge.
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Inference client bound to the reference model.
llm_client = InferenceClient(
    model=repo_id,
    timeout=120,  # presumably seconds — confirm against the huggingface_hub docs
    token=token_pro
)

def call_llm(inference_client: InferenceClient, prompt: str):
    """Send ``prompt`` to the text-generation endpoint and return the completion.

    Args:
        inference_client: Client used to issue the request.
        prompt: Full prompt text sent as the ``inputs`` field.

    Returns:
        The ``generated_text`` of the first result, stripped of surrounding
        whitespace. Only the newly generated text is requested, not the prompt.
    """
    response = inference_client.post(
        model=repo_id,  # Explicitly specify the model
        json={
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": 1028,
                # Ask for the completion only. The text-generation task can
                # echo the prompt in front of the generated text, which would
                # leak the prompt's own wording into whatever the caller
                # parses out of the response.
                "return_full_text": False,
            },
            "task": "text-generation",
        },
    )
    # The endpoint answers with a JSON list; the first item carries the text.
    return json.loads(response.decode())[0]["generated_text"].strip()

In [None]:
import re
import pandas as pd
from tqdm import tqdm

# Register tqdm with pandas so that `progress_apply` is available on DataFrames.
tqdm.pandas()
    
def evaluate_pair(model, row):
    """Judge whether a question-answer pair is useful for a biomedical chatbot test set.

    Args:
        model: Either an ``InferenceClient`` (queried through ``call_llm`` with
            a reviewer prompt) or any object exposing
            ``generate(question, answer)``.
        row: Mapping-like record providing ``'question'`` and ``'answer'``.

    Returns:
        For an ``InferenceClient``: ``"Yes"`` if the first standalone yes/no
        word in the reply is "yes" (case-insensitive), otherwise ``"No"``
        (also when the reply contains neither word). For any other model:
        whatever ``model.generate`` returns.
    """
    # Local judges do not use the prompt, so there is no need to build it.
    if not isinstance(model, InferenceClient):
        return model.generate(row['question'], row['answer'])

    prompt = f"""You are a reviewer evaluating question-answer pairs for a biomedical FAQ.
        The question-answer pairs must meet the following criteria to be considered useful for evaluating a chatbot designed for biomedical customer support:

        1. **Relevance**: They should address topics relevant to biomedical research, healthcare, or medical information.
        2. **Logicality and Usefulness**: They should be logical and provide clear, practical information for users seeking biomedical knowledge.
        3. **Correctness and Clarity**: They should give correct and accurate information, fully clarify the question, and be understandable by a human reader.

        Evaluate the following question-answer pair and decide if it is useful for testing a biomedical support chatbot.  
        Respond exclusively with 'Yes' or 'No'.

        **Question:** {row['question']}  
        **Answer:** {row['answer']}  

        Respond exclusively with 'Yes' or 'No'."""

    answer = call_llm(model, prompt).strip()

    # Some text-generation backends echo the prompt in front of the completion.
    # The prompt itself contains the word 'Yes', so it must be removed before
    # looking for the verdict or every pair would be judged "Yes".
    if answer.startswith(prompt):
        answer = answer[len(prompt):]

    # Take the first standalone yes/no word instead of a raw substring test,
    # so that "yes", "YES." and "No, ... yes ..." are all read correctly.
    verdict = re.search(r"\b(yes|no)\b", answer, flags=re.IGNORECASE)
    if verdict is not None and verdict.group(1).lower() == "yes":
        return "Yes"
    return "No"

In [None]:
# ** Evaluation with Mixtral **
# Judge every question-answer pair with the Mixtral client, showing a progress bar;
# the per-row result of evaluate_pair is stored in the "Mixtral" column.
qa["Mixtral"] = qa.progress_apply(lambda row: evaluate_pair(llm_client, row), axis=1)


In [None]:
# Show how many rows received each distinct Mixtral verdict.
display(qa['Mixtral'].value_counts())

In [None]:
# ** Agreement between Mixtral and the ground truth **
# A "Yes" verdict counts as agreeing with the ground truth (1), anything else
# as disagreeing (0). The flag is computed element-wise so that each row gets
# its own value; assigning a scalar to the column would broadcast a single
# value to every row.
qa["Agreement Mixtral-Groundtruth"] = (qa["Mixtral"] == "Yes").astype(int)

display(qa)

In [None]:
# Percentage of Mixtral correct judgment:
# mean of the 0/1 agreement column, scaled to a percentage.
mixtral_vs_groundtruth = qa["Agreement Mixtral-Groundtruth"].mean() * 100
print("Correct judgment Mixtral:", mixtral_vs_groundtruth)

In [None]:
# Save the dataframe (including the Mixtral columns added so far) to a CSV file,
# without the index column.
qa.to_csv("faq_evaluation_results_with_Mixtral_on_open_data_biomedical.csv", index=False, encoding="utf-8")

# Create the ensemble models

In [None]:
# Load the csv file written after the Mixtral evaluation, so the ensemble
# section can start from the saved results.
qa = pd.read_csv("faq_evaluation_results_with_Mixtral_on_open_data_biomedical.csv")
display(qa)

In [None]:
import torch
import pandas as pd
from tqdm import tqdm
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ** Ensemble Model Configuration **
# Maps the name of each ensemble member (also used as its result column name)
# to the Hugging Face Hub model id it is loaded from.
# NOTE(review): these look like base checkpoints rather than checkpoints
# fine-tuned for classification; if so, the classification head created by
# AutoModelForSequenceClassification would be freshly initialised — confirm.
MODEL_NAMES = {
    "DistilBERT": "distilbert/distilbert-base-uncased", 
    "MiniSBERT": "sentence-transformers/all-MiniLM-L6-v2",
    "MiniGptBased": "ComCom/gpt2-small", 
    "T5": "google-t5/t5-small"
}

class HuggingFaceModel:
    """Yes/No judge backed by a Hugging Face sequence-classification model.

    Attributes are ``tokenizer`` and ``model``, both loaded from the same
    model id. ``generate`` maps the predicted class index to ``"Yes"``
    (class 1) or ``"No"`` (any other class).
    """

    def __init__(self, model_name):
        """Load the tokenizer and classification model for ``model_name``.

        Args:
            model_name: Model id or path accepted by ``from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

        # Add a padding token if the tokenizer does not define one.
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            # Grow the embedding matrix to cover the newly added token.
            self.model.resize_token_embeddings(len(self.tokenizer))

        # Keep the model config in sync with the tokenizer. Decoder-style
        # classifiers (e.g. GPT-2) rely on config.pad_token_id to locate the
        # last real token of the sequence; without it they classify from the
        # final position even when that position holds padding.
        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

    def generate(self, question, answer):
        """Classify one question-answer pair.

        Args:
            question: Question text.
            answer: Answer text.

        Returns:
            ``"Yes"`` if the highest-scoring class index is 1, else ``"No"``.
        """
        # A single sequence needs no padding, so the input is only truncated
        # to at most 512 tokens instead of being padded up to that length.
        inputs = self.tokenizer(
            f"Question: {question} Answer: {answer}",
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = self.model(**inputs)
        predicted_class = torch.argmax(outputs.logits, dim=1).item()
        return "Yes" if predicted_class == 1 else "No"

# ** Initialisation of the ensemble models **
# One HuggingFaceModel per MODEL_NAMES entry, keyed by the same name.
ensemble_models = {name: HuggingFaceModel(model) for name, model in MODEL_NAMES.items()}

In [None]:
# Save the dataframe to the ensemble-results CSV file.
# NOTE(review): this cell appears before the cell that fills in the ensemble
# columns, so when the notebook is run top to bottom the file written here
# does not contain them yet — consider running it after the evaluation.
qa.to_csv("faq_evaluation_results_with_ensemble_models_on_open_data_biomedical.csv", index=False, encoding="utf-8")

In [None]:
# ** Enable progress tracking **
tqdm.pandas()

# ** Evaluation with ensemble models **
# Adds one column per ensemble member, named after its key in ensemble_models,
# holding the result of evaluate_pair for every row.
for model_name, model in ensemble_models.items():
    qa[model_name] = qa.progress_apply(lambda row: evaluate_pair(model, row), axis=1)

In [None]:
# ** Majority Voting Function **
def majority_voting(row):
    """Return the answer given most often by the ensemble members for one row.

    The votes are read from ``row`` using the keys of ``MODEL_NAMES``, in that
    order. ``Counter.most_common`` lists equal counts in first-encountered
    order, so on a tie the answer that appears first among the votes wins.
    """
    tally = Counter(row[name] for name in MODEL_NAMES)
    winner, _count = tally.most_common(1)[0]
    return winner

# ** Compute Majority Voting **
# Row-wise application: one majority answer per question-answer pair.
qa["Majority Voting"] = qa.apply(majority_voting, axis=1)

In [None]:
# Show the dataframe with the "Majority Voting" column added.
display(qa)

In [None]:
# ** Compare the majority vote with the Mixtral verdict **
# Row-wise boolean: True where both columns hold the same answer.
qa["Agreement Majority-Mixtral"] = qa["Majority Voting"] == qa["Mixtral"]

# ** Compute Agreement Percentage **
majority_vs_mixtral = qa["Agreement Majority-Mixtral"].mean() * 100
print("Correct judgment Majority Voting vs Mixtral:", majority_vs_mixtral)

In [None]:
# Show the dataframe with the "Agreement Majority-Mixtral" column added.
display(qa)

In [None]:
# ** Encoding of the answers for the consensus analysis **
def encode_answers(df):
    """Return a copy of ``df`` with "Yes" replaced by 1 and "No" by 0.

    Values other than these two strings are left as they are; ``df`` itself
    is not modified.
    """
    label_codes = {"Yes": 1, "No": 0}
    return df.replace(to_replace=label_codes)

# Encode the columns selected by position (from the third column up to, but
# not including, the last one), then coerce every value to a number; values
# that are still non-numeric after encoding become NaN (errors="coerce").
encoded_df = encode_answers(qa.iloc[:, 2:-1]).apply(pd.to_numeric, errors="coerce")
display(encoded_df)

In [None]:
# ** Distribution of the answers for each ensemble model (in percent) **
print("\nDistribuzione delle risposte per ciascun modello:")
for model in MODEL_NAMES.keys():
    print(f"{model}:")
    print(qa[model].value_counts(normalize=True) * 100, "\n")

# ** Distribution of the Mixtral answers (in percent) **
print("\nDistribuzione delle risposte di Mixtral:")
print(qa["Mixtral"].value_counts(normalize=True) * 100)

In [None]:
# Define the columns holding the ensemble models' answers
# (the same names as the keys of MODEL_NAMES).
model_columns = ["DistilBERT", "MiniSBERT", "MiniGptBased", "T5"]

# HELPER FUNCTIONS
# Convert the DataFrame into a nested list (dataset_list)
def dataframe_to_dataset_list(df, model_columns):
    """Turn per-model answers into a nested list of numeric labels.

    Args:
        df: DataFrame with one row per task and one column per model.
        model_columns: Names of the columns to read, in output order.

    Returns:
        One list per row of ``df``; each contains, for every column in
        ``model_columns``, the single-element list ``[0]`` when the answer is
        "No" and ``[1]`` for any other value.
    """
    return [
        [[0] if record[column] == "No" else [1] for column in model_columns]
        for _, record in df.iterrows()
    ]

# Convert the DataFrame into dataset_list
dataset_list = dataframe_to_dataset_list(qa, model_columns)

# Convert to a NumPy tensor
class_num = 2  # Only two classes: Yes (1) and No (0)
dataset_tensor = list2array(class_num, dataset_list)

# Initialise and run the Dawid & Skene model
model = DawidSkeneModel(class_num=2, max_iter=40, tolerance=1e-5)
marginal_predict, error_rates, worker_reliability, predict_label = model.run(dataset_tensor)

# Convert the final predictions to "Yes" or "No"
# NOTE(review): assumes each entry of predict_label is indexable by class and
# that index 1 holds the score of class 1 ("Yes") — confirm against
# dawid_skene_model.
final_answers = ["Yes" if p[1] > 0.5 else "No" for p in predict_label]

# Add the results to the DataFrame
qa["Dawid & Skene Multi-Class"] = final_answers

In [None]:
# Agreement between the Dawid & Skene answer and the Mixtral verdict
# (row-wise boolean: True where both columns hold the same answer).
qa["Agreement D&S-Mixtral"] = qa["Dawid & Skene Multi-Class"] == qa["Mixtral"]

# Compute the agreement percentage
# (despite its name, ds_multi_vs_gemini is computed from the Mixtral column).
ds_multi_vs_gemini = qa["Agreement D&S-Mixtral"].mean() * 100

# Summary table: one row per aggregation method. The values are the
# agreement percentages with the Mixtral column computed above.
summary_df = pd.DataFrame({
    "Metodo": ["Majority Voting", "Dawid & Skene Multi-Class", ],
    "Concordanza con Groundtruth (%)": [majority_vs_mixtral, ds_multi_vs_gemini]
})

display(summary_df)
import seaborn as sns
import matplotlib.pyplot as plt

# Convert the answers to numeric values for the heatmap ("Yes" -> 1, else 0)
heatmap_df = qa[[
    "Majority Voting", 
    "Dawid & Skene Multi-Class"
]].map(lambda x: 1 if x == "Yes" else 0)

# One heatmap row per question, one column per aggregation method.
plt.figure(figsize=(10, 6))
sns.heatmap(heatmap_df, annot=True, fmt="d", cmap="coolwarm", cbar=True)
plt.title("Confronto tra Majority Voting e Dawid & Skene")
plt.xlabel("Metodo di Ensemble o Modello")
plt.ylabel("Domande")
plt.show()