# Questions to test

In [None]:
# Sample helpdesk queries (Italian) used to exercise the query-rewriting chains below.
# In-scope question about the business software
question = "Quando mi conviene gestire un articolo a PSO rispetto a pianificazione?"
# Question unrelated to the software domain
question_out_of_scope = "Quando è morto Giulio Cesare?"
# One in-scope question followed by one unrelated question
multiple_questions = "Quando mi conviene gestire un articolo a PSO rispetto a pianificazione? Chi è Giulio Cesare?"
# Two questions in a single input
multiple_valid_questions = "Cosa significa che una fattura è in mancata consegna? Il cliente ha ricevuto la fattura?"
# Ticket-style text: a short subject followed by the description
q_client = "Addebito bollo su nota credito. Su nota credito non mette più addebito bollo: precedente nota credito si."
# Same ticket text without the leading subject sentence
q_client_without_object = "Su nota credito non mette più addebito bollo: precedente nota credito si."
# A rewritten, well-formed version of the ticket above
q_rewritten = "Perché la nota di credito non sta aggiungendo più il bollo e come risolvere questo problema?"

# Llama3.2-instruct

In [None]:
import pprint
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

# Italian prompt asking for five clearer rephrasings of {question}, returned as a bare list.
template = """Riformula la frase fornita in 5 modi diversi, mantenendo il senso della frase e rendendola più chiara. 
In output mostra solo la lista delle domande riformulate, senza altro testo o commenti.

Domanda originale: {question}"""
prompt = ChatPromptTemplate.from_template(template)
# Llama 3.2 3B instruct (fp16) accessed through Ollama, with temperature set to 0
model = OllamaLLM(model="llama3.2:3b-instruct-fp16", temperature=0)

# NOTE: despite its name, `response` is the runnable chain (prompt piped into the model);
# call response.invoke({"question": ...}) to obtain the actual output.
response = prompt | model

# ChatGPT-4o

In [None]:
import os
# Enable LangSmith tracing and point it at the hosted endpoint.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# The API keys must already be present in the environment. Writing
# os.environ[k] = os.getenv(k) changes nothing when the key is set and raises
# an opaque "str expected, not NoneType" TypeError when it is not, so check
# explicitly and fail with a readable message instead.
_missing_api_keys = [
    key for key in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY') if os.getenv(key) is None
]
if _missing_api_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_api_keys)
    )

In [None]:
# Prompt for the GPT-4o chain; built from the same template as the Llama chain.
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# Chain: prompt -> GPT-4o -> plain string -> list of rewrites (one per line).
# Lines are stripped and blank ones dropped so that empty strings are never
# handed to the evaluation metrics as if they were rewrites.
generate_queries_gpt = (
    prompt_perspectives
    | ChatOpenAI(temperature=0, model="gpt-4o")
    | StrOutputParser()
    | (lambda x: [line.strip() for line in x.splitlines() if line.strip()])
)

In [None]:
# Show the original question, then each chain's rewrites under its own banner.
pprint.pprint(question)
for _banner, _chain in (
    ("Llama3.2-instruct question rewriting with basic prompt", response),
    ("GPT-4o question rewriting with basic prompt", generate_queries_gpt),
):
    print("\n")
    pprint.pprint(_banner)
    pprint.pprint(_chain.invoke({"question": question}))

In [None]:
# Show the client ticket text, then each chain's rewrites under its own banner.
pprint.pprint(q_client)
for _banner, _chain in (
    ("Llama3.2-instruct question rewriting with basic prompt", response),
    ("GPT-4o question rewriting with basic prompt", generate_queries_gpt),
):
    print("\n")
    pprint.pprint(_banner)
    pprint.pprint(_chain.invoke({"question": q_client}))

# Evaluation metrics to compare the models for query re-writing

In [None]:
import textstat

def clarity_score(text: str) -> float:
    """Map the Gulpease readability index of ``text`` onto a 1-5 clarity score.

    The index is obtained from ``textstat.gulpease_index``. Bands:
    >= 60 -> 5, >= 40 -> 4, >= 20 -> 3, >= 10 -> 2, anything lower -> 1.
    """
    gulpease = textstat.gulpease_index(text)
    # (lower bound, score) pairs, checked from the highest band down
    for lower_bound, band_score in ((60, 5), (40, 4), (20, 3), (10, 2)):
        if gulpease >= lower_bound:
            return band_score
    return 1


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from typing import List, Dict


def variety_score(rewrites: List[str]) -> float:
    """Score lexical variety among a list of rewrites on a 1-5 scale.

    Each rewrite is turned into a bag-of-words count vector; variety is
    ``1 - mean pairwise cosine similarity`` over all distinct pairs.

    Args:
        rewrites: The rewritten queries to compare with each other.

    Returns:
        5 for variety >= 0.8, 4 for >= 0.6, 3 for >= 0.4, 2 for >= 0.2,
        otherwise 1. With fewer than two rewrites there is no pair to
        compare, so the lowest score (1) is returned.
    """
    n_rewrites = len(rewrites)
    if n_rewrites < 2:
        # No pairs exist; the average below would divide by zero.
        return 1

    count_matrix = CountVectorizer(analyzer='word').fit_transform(rewrites)
    cos_sim_matrix = cosine_similarity(count_matrix.toarray())

    # Average over off-diagonal entries only. The diagonal is removed via the
    # trace rather than by assuming it is all ones: a rewrite with no tokens
    # has a zero vector and therefore a self-similarity of 0, not 1.
    off_diagonal_sum = np.sum(cos_sim_matrix) - np.trace(cos_sim_matrix)
    avg_similarity = off_diagonal_sum / (n_rewrites * (n_rewrites - 1))
    variety = 1 - avg_similarity  # Higher value indicates more variety

    # Scale to 1-5
    if variety >= 0.8:
        return 5
    elif variety >= 0.6:
        return 4
    elif variety >= 0.4:
        return 3
    elif variety >= 0.2:
        return 2
    else:
        return 1


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def relevance_score(original: str, rewrite: str) -> float:
    """Score how much of the original's vocabulary a rewrite keeps (1-5).

    A TF-IDF vectorizer is fitted on ``original`` alone, so only terms that
    occur in the original contribute; the rewrite is projected onto that
    vocabulary and compared with the original via cosine similarity.

    Bands: >= 0.75 -> 5, >= 0.5 -> 4, >= 0.25 -> 3, >= 0.1 -> 2, else 1.
    """
    tfidf = TfidfVectorizer()
    original_vec = tfidf.fit_transform([original])
    rewrite_vec = tfidf.transform([rewrite])
    similarity = cosine_similarity(original_vec, rewrite_vec).item()

    # (lower bound, score) pairs, checked from the highest band down
    for lower_bound, band_score in ((0.75, 5), (0.5, 4), (0.25, 3), (0.1, 2)):
        if similarity >= lower_bound:
            return band_score
    return 1


In [None]:
import language_tool_python

# LanguageTool checker configured for Italian ('it-IT')
tool = language_tool_python.LanguageTool('it-IT')

def fluency_score(text: str) -> float:
    """Score fluency on a 1-5 scale from the number of LanguageTool matches.

    Bands: 0 matches -> 5, 1 -> 4, 2-3 -> 3, 4-5 -> 2, more than 5 -> 1.
    """
    issue_count = len(tool.check(text))
    # (maximum number of matches, score) pairs, checked from the best band down
    for max_issues, band_score in ((0, 5), (1, 4), (3, 3), (5, 2)):
        if issue_count <= max_issues:
            return band_score
    return 1


In [None]:
from sentence_transformers import SentenceTransformer, util

# Encoder used to embed the original query and its rewrites.
# It is bound to its own name instead of `model`, so that it no longer
# overwrites the Ollama LLM assigned to `model` earlier in the notebook
# (re-running cells out of order would otherwise mix the two objects up).
# NOTE(review): 'dbmdz/bert-base-italian-xxl-cased' looks like a general
# Italian BERT checkpoint rather than one trained for sentence similarity;
# confirm that the similarity thresholds below are meaningful for it.
sentence_encoder = SentenceTransformer('dbmdz/bert-base-italian-xxl-cased')

def concept_retention_score(original: str, rewrite: str) -> float:
    """Score semantic closeness of a rewrite to the original on a 1-5 scale.

    Both texts are embedded with ``sentence_encoder`` and compared with
    cosine similarity.

    Args:
        original: The original query.
        rewrite: One rewritten query.

    Returns:
        5 for similarity >= 0.9, 4 for >= 0.75, 3 for >= 0.6, 2 for >= 0.45,
        otherwise 1.
    """
    # Generate embeddings
    original_embedding = sentence_encoder.encode(original, convert_to_tensor=True)
    rewrite_embedding = sentence_encoder.encode(rewrite, convert_to_tensor=True)

    # Compute cosine similarity
    similarity = util.pytorch_cos_sim(original_embedding, rewrite_embedding).item()
    # Translate similarity to a scale of 1-5
    if similarity >= 0.9:
        return 5
    elif similarity >= 0.75:
        return 4
    elif similarity >= 0.6:
        return 3
    elif similarity >= 0.45:
        return 2
    else:
        return 1


In [None]:
def evaluate_rewrites(original: str, rewrites: List[str]) -> List[Dict[str, float]]:
    """
    Evaluate multiple rewrites of a given original text.

    Args:
    - original (str): The original query.
    - rewrites (List[str]): A list of rewritten queries.

    Returns:
    - List[Dict[str, float]]: One dict per rewrite with the keys 'clarity',
      'concept_retention', 'variety', 'fluency' and 'relevance'. 'variety'
      describes the whole set of rewrites, so it is the same in every dict.
    """
    if not rewrites:
        return []

    # Variety depends only on the full list, so compute it once instead of
    # re-vectorizing every rewrite on each loop iteration.
    shared_variety = variety_score(rewrites)

    scores = []
    for rewrite in rewrites:
        scores.append({
            'clarity': clarity_score(rewrite),
            'concept_retention': concept_retention_score(original, rewrite),
            'variety': shared_variety,
            'fluency': fluency_score(rewrite),
            'relevance': relevance_score(original, rewrite),
        })

    return scores

In [None]:
def aggregate_scores(scores: List[Dict[str, float]]) -> List[float]:
    """
    Collapse each per-rewrite score dict into its arithmetic mean.

    Args:
    - scores (List[Dict[str, float]]): One dict of metric values per rewrite.

    Returns:
    - List[float]: The mean of each dict's values, in input order.
    """
    return [sum(metrics.values()) / len(metrics) for metrics in scores]


In [None]:
original_query = q_client

# The Llama chain returns a single newline-separated string, while the GPT-4o
# chain already returns a list of lines. Normalize both to a flat list of
# non-empty rewrites so every entry in "outputs" is one candidate rewrite.
_llama_rewrites = [
    line.strip()
    for line in response.invoke({"question": original_query}).splitlines()
    if line.strip()
]
_gpt_rewrites = [
    line.strip()
    for line in generate_queries_gpt.invoke({"question": original_query})
    if line.strip()
]

# Rewrites produced by each model, together with the prompt language used
model_rewrites = {
    "Llama 3.2-Instruct": {
        "prompt_language": "Italian",
        "outputs": _llama_rewrites,
    },
    "GPT-4o": {
        "prompt_language": "Italian",
        "outputs": _gpt_rewrites,
    }
}

results = {}

for model_name, rewrites in model_rewrites.items():
    # Score the list of rewrites, not the surrounding dict: iterating the dict
    # itself would evaluate its keys ("prompt_language", "outputs") as text.
    scores = evaluate_rewrites(original_query, rewrites["outputs"])
    aggregated_scores = aggregate_scores(scores)
    results[model_name] = aggregated_scores

# Now `results` maps each model name to the per-rewrite average scores
pprint.pprint(results)

# Alternatives

In [None]:
import language_tool_python
from sentence_transformers import SentenceTransformer, util

# Initialize tools and models for Italian
# NOTE(review): a LanguageTool instance is also created earlier in the notebook as `tool`
# (with 'it-IT'); consider reusing it instead of creating a second checker.
grammar_tool = language_tool_python.LanguageTool('it')  # Specify Italian language
similarity_model = SentenceTransformer('stsb-xlm-r-multilingual')  # A multilingual model for sentence similarity

# Metric 1: Grammatical Correctness (Italian)
def evaluate_grammar_italian(text):
    """Return a grammar score in [0, 1] based on LanguageTool matches per word.

    The score is ``1 - matches / words``, floored at 0. Text containing no
    words (empty or whitespace only) scores 0.0: there is nothing to grade,
    and the ratio would otherwise divide by zero.
    """
    word_count = len(text.split())
    if word_count == 0:
        return 0.0

    error_count = len(grammar_tool.check(text))
    return max(1 - (error_count / word_count), 0)  # Score between 0 and 1

# Metric 2: Clarity (Italian)
def evaluate_clarity_italian(text):
    """Return a clarity score in [0, 1] derived from a hand-computed Gulpease index.

    Gulpease = 89 - 10 * letters / words + 300 * sentences / words.
    The index is then mapped linearly so that 20 -> 0 and 90 -> 1, and
    clamped to [0, 1]. Text with no words scores 0.
    """
    def gulpease_index(text):
        words = len(text.split())
        if words == 0:
            return 0  # `words` is the only divisor; avoid division by zero
        letters = sum(1 for c in text if c.isalpha())
        # Sentences are approximated by counting terminal punctuation marks.
        # Non-empty text without any terminator (typical for short queries)
        # still counts as one sentence instead of being scored as 0.
        sentences = max(text.count('.') + text.count('!') + text.count('?'), 1)
        return 89 - (10 * letters / words) + (300 * sentences / words)

    gulpease_score = gulpease_index(text)
    normalized = min(max((gulpease_score - 20) / 70, 0), 1)  # Normalize to 0-1 range
    return normalized

# Metric 3: Conciseness (Italian)
def evaluate_conciseness(text, max_word_count=50):
    """Return a conciseness score in [0, 1] based on the word count of ``text``.

    Text with at most ``max_word_count`` words scores 1. Beyond the limit the
    score falls linearly with the excess and reaches 0 at twice the limit.
    (The previous formula, ``1 - word_count / max_word_count``, is always
    negative past the limit, so after clamping the score was a 0/1 step.)

    Args:
        text: The text to score; words are whitespace-separated tokens.
        max_word_count: Word budget that still earns the full score.
    """
    word_count = len(text.split())
    if word_count <= max_word_count:
        return 1
    excess_ratio = (word_count - max_word_count) / max_word_count
    return max(1 - excess_ratio, 0)

# Metric 4: Relevance (Italian)
def evaluate_relevance(response, question):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded in a single batch with ``similarity_model``.
    NOTE(review): cosine similarity is not restricted to non-negative values
    in general; confirm before treating the result as a 0-1 score.
    """
    encoded = similarity_model.encode([response, question], convert_to_tensor=True)
    response_vec = encoded[0]
    question_vec = encoded[1]
    pairwise = util.pytorch_cos_sim(response_vec, question_vec)
    # The result is indexed as a 2-D structure; take its single entry as a Python float
    return float(pairwise[0][0])


In [None]:
from typing import List, Dict

def evaluate_rewrites(original: str, rewrites: List[str]) -> List[Dict[str, float]]:
    """
    Score every rewrite with the alternative metric set.

    Args:
    - original (str): The original query.
    - rewrites (List[str]): A list of rewritten queries.

    Returns:
    - List[Dict[str, float]]: One dict per rewrite, in input order, with the
      keys 'grammar', 'clarity', 'conciseness' and 'relevance'.
    """
    def _score_one(candidate):
        # Each metric looks at the candidate alone, except relevance,
        # which compares it with the original query.
        return {
            'grammar': evaluate_grammar_italian(candidate),
            'clarity': evaluate_clarity_italian(candidate),
            'conciseness': evaluate_conciseness(candidate),
            'relevance': evaluate_relevance(candidate, original),
        }

    return [_score_one(candidate) for candidate in rewrites]


In [None]:
def aggregate_scores(scores: List[Dict[str, float]]) -> List[float]:
    """
    Average the metric values of each score dictionary.

    Args:
    - scores (List[Dict[str, float]]): A list of score dictionaries.

    Returns:
    - List[float]: The mean of every dictionary's values, in input order.
    """
    def _mean_of(metrics):
        values = list(metrics.values())
        return sum(values) / len(values)

    return list(map(_mean_of, scores))


In [None]:
original_query = q_client

# The Llama chain returns a single newline-separated string, while the GPT-4o
# chain already returns a list of lines. Normalize both to a flat list of
# non-empty rewrites so every entry in "outputs" is one candidate rewrite.
_llama_rewrites = [
    line.strip()
    for line in response.invoke({"question": original_query}).splitlines()
    if line.strip()
]
_gpt_rewrites = [
    line.strip()
    for line in generate_queries_gpt.invoke({"question": original_query})
    if line.strip()
]

# Rewrites produced by each model, together with the prompt language used
model_rewrites = {
    "Llama 3.2-Instruct": {
        "prompt_language": "Italian",
        "outputs": _llama_rewrites,
    },
    "GPT-4o": {
        "prompt_language": "Italian",
        "outputs": _gpt_rewrites,
    }
}

results = {}

for model_name, rewrites in model_rewrites.items():
    # Score the list of rewrites, not the surrounding dict: iterating the dict
    # itself would evaluate its keys ("prompt_language", "outputs") as text.
    scores = evaluate_rewrites(original_query, rewrites["outputs"])
    aggregated_scores = aggregate_scores(scores)
    results[model_name] = aggregated_scores

# Now `results` maps each model name to the per-rewrite average scores
pprint.pprint(results)

# Build a synthetic dataset for evaluation

In [None]:
import pandas as pd
filename_all_data_dict = "./Files/final_dataset.csv"

# Read the CSV with explicit column names and no header parsing, then discard
# row 0 (presumably the file's own header line; verify against the CSV).
data_df = pd.read_csv(
    filename_all_data_dict,
    header=None,
    names=['file', 'text'],
).drop(index=0)
data_df

In [None]:
from langchain_community.document_loaders import DataFrameLoader

# Build a loader that uses the "text" column as page content, then load the
# documents. The "file" value is read back from each document's metadata further below.
loader = DataFrameLoader(data_df, page_content_column="text")
docs_data = loader.load()
# Inspect the first loaded document
docs_data[0]

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split
# Possible improvements: tune chunk_size and chunk_overlap to improve results, and try different splitters
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)

In [None]:
# Split every loaded document on its own and flatten the chunks into one list,
# keeping document order.
docs_processed = [
    chunk
    for doc in docs_data
    for chunk in text_splitter.split_documents([doc])
]

docs_processed[0:6]

In [None]:
from huggingface_hub import InferenceClient
import json

# Hugging Face credentials read from the environment.
# NOTE(review): `access_token` is not referenced anywhere else in the visible code;
# only `token_pro` is passed to the client below.
access_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
token_pro = os.getenv('HUGGINGFACE_TOKEN')
# Model used to generate and critique the synthetic QA couples
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Inference client bound to the model above
llm_client = InferenceClient(
    model=repo_id,
    timeout=120,
    token=token_pro
)


def call_llm(inference_client: InferenceClient, prompt: str):
    """Send ``prompt`` to the client as a text-generation request and return the text.

    The request allows up to 1024 new tokens. The raw response is decoded,
    parsed as JSON, and the ``generated_text`` field of its first element
    is returned.
    """
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": 1024},
        "task": "text-generation",
    }
    raw_response = inference_client.post(json=payload)
    parsed = json.loads(raw_response.decode())
    first_generation = parsed[0]
    return first_generation["generated_text"]

In [None]:
# Italian prompt asking the model to write one helpdesk-style question and its answer
# from {context}. The reply is expected to follow the "Domanda: ... / Risposta: ..."
# layout, which is what the generation loop later splits on.
QA_generation_prompt = """
Il tuo compito è scrivere una domanda e una risposta data un contesto.
La tua domanda deve essere rispondibile con un'informazione specifica dal contesto. Se nel contesto ci sono errori grammaticali o morfologici correggili nell'output fornito.
La tua domanda deve essere formulata nello stesso stile delle domande che gli utenti potrebbero porre ad un helpdesk, che si occupa di assistenza clienti per un software aziendale.
Questo significa che la tua domanda NON deve menzionare frasi come "secondo il passaggio" o "nel contesto". 
La tua domanda può menzionare frasi come "Ho un errore" o "Come posso sistemare il problema".

Domanda e risposta devono essere generate in italiano.

Fornisci la tua risposta come segue:

Output:::
Domanda: (la tua domanda)
Risposta: (la tua risposta alla domanda)

Ora ecco il contesto.

Contesto: {context}\n
Output:::"""


In [None]:
import random
# Smoke test: draw one random chunk, show it, then show the QA couple generated for it.
document = random.sample(docs_processed, 1)
pprint.pprint(document)
for sampled_document in document:
    _qa_prompt = QA_generation_prompt.format(context=sampled_document.page_content)
    pprint.pprint(call_llm(llm_client, _qa_prompt))

In [None]:
import random
from tqdm.auto import tqdm

N_GENERATIONS = 100

# random.sample raises ValueError when asked for more items than exist,
# so never request more chunks than are available.
_n_samples = min(N_GENERATIONS, len(docs_processed))

print(f"Generating {_n_samples} QA couples...")

outputs = []
for sampled_context in tqdm(random.sample(docs_processed, _n_samples)):
    # Generate QA couple. The request is the step that can actually fail, so
    # it is the one guarded; a failed request skips this chunk, not the run.
    try:
        output_QA_couple = call_llm(llm_client, QA_generation_prompt.format(context=sampled_context.page_content))
    except Exception as exc:
        print(f"Skipping chunk: generation failed ({exc!r})")
        continue

    # Take the text after the LAST "Domanda: " marker, then split it at the
    # following "Risposta: " marker.
    # The locals are deliberately not named `question`/`answer`, so the
    # notebook-level `question` sample defined at the top is not overwritten.
    _, question_marker, qa_text = output_QA_couple.rpartition("Domanda: ")
    generated_question, answer_marker, generated_answer = qa_text.partition("Risposta: ")
    generated_question = generated_question.strip()
    generated_answer = generated_answer.strip()

    # Skip generations that do not follow the requested layout instead of
    # storing the whole raw output as a "question".
    if not (question_marker and answer_marker and generated_question and generated_answer):
        continue

    outputs.append(
        {
            "question": generated_question,
            "answer": generated_answer,
            "context": sampled_context.page_content,
            "source_doc": sampled_context.metadata.get("file"),
        }
    )

In [None]:
import os
import csv
import re
import pandas as pd

def get_next_run_filename(base_name='generated_QA_couples', directory='.', extension='csv'):
    """Return the path for the next numbered run file in ``directory``.

    Existing files named exactly ``<base_name>_<number>.<extension>`` are
    inspected; the result uses the highest number found plus one, or 1 when
    no such file exists. Nothing is created on disk.

    Args:
        base_name: File name prefix before the run number.
        directory: Directory to scan and to build the returned path in.
        extension: File extension without the leading dot.

    Returns:
        ``os.path.join(directory, "<base_name>_<next>.<extension>")``.
    """
    # Escape the user-supplied parts so regex metacharacters in them are taken
    # literally, and require a full-name match so unrelated files that merely
    # contain the pattern are ignored.
    run_pattern = re.compile(rf'{re.escape(base_name)}_(\d+)\.{re.escape(extension)}')

    run_numbers = [
        int(found.group(1))
        for found in map(run_pattern.fullmatch, os.listdir(directory))
        if found
    ]
    next_run_number = max(run_numbers, default=0) + 1

    # Construct the new file name
    return os.path.join(directory, f"{base_name}_{next_run_number}.{extension}")

def save_outputs_to_csv(outputs, base_name='generated_QA_couples'):
    """Write ``outputs`` to a new, run-numbered CSV file and print its path.

    The target path comes from ``get_next_run_filename``. The file is
    ';'-delimited, UTF-8 encoded, and starts with a header row for the
    columns question, answer, context and source_doc.
    """
    target_path = get_next_run_filename(base_name=base_name)
    columns = ["question", "answer", "context", "source_doc"]

    with open(target_path, mode='w', newline='', encoding='utf-8') as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=columns, delimiter=';')
        csv_writer.writeheader()
        for row in outputs:
            csv_writer.writerow(row)

    print(f"Data successfully saved to {target_path}")

# Persist the QA couples generated above
save_outputs_to_csv(outputs)

## Setup a critique agent

In [None]:
# Critique prompt 1 - groundedness: how well {question} can be answered unambiguously
# from {context}. The reply must contain "Valutazione totale: <1-5>".
question_groundedness_critique_prompt = """
Sarà fornito un contesto e una domanda.
Il tuo compito è di fornire una valutazione per indicare quanto bene si possa rispondere in modo univoco alla domanda data con il contesto fornito.
Dai la tua risposta su una scala da 1 a 5, dove 1 significa che la domanda non è affatto rispondibile con il contesto, 
e 5 significa che la domanda è chiaramente e univocamente rispondibile con il contesto.

Fornisci la tua risposta esattamente nel seguente formato:

Output:::
Valutazione totale: (il tuo punteggio, come numero tra 1 e 5)
Output:::

Ora ecco la domanda e il contesto.

Domanda: {question}
Contesto: {context}

Output:::
"""

# Critique prompt 2 - relevance: how useful {question} is for users asking the help desk
# about the management software. Same "Valutazione totale" reply format.
question_relevance_critique_prompt = """
Ti sarà fornita una domanda.
Il tuo compito è di fornire una "valutazione totale" che rappresenti quanto utile possa essere questa domanda per gli utenti che chiedono assistenza all'help desk riguardo a specifiche funzionalità
del software gestionale e la relativa documentazione.
Dai la tua risposta su una scala da 1 a 5, dove 1 significa che la domanda non è per nulla utile, e 5 significa che la domanda è estremamente utile.

Fornisci la tua risposta esattamente nel seguente formato:

Output:::
Valutazione totale: (il tuo punteggio, come numero tra 1 e 5)
Output:::

Ora ecco la domanda.

Domanda: {question}

Output:::
"""

# Critique prompt 3 - standalone: how understandable {question} is without extra context.
# Same "Valutazione totale" reply format.
question_standalone_critique_prompt = """
Ti sarà fornita una domanda.
Il tuo compito è di fornire una "valutazione totale" che rappresenti quanto questa domanda sia indipendente dal contesto.
Dai la tua risposta su una scala da 1 a 5, dove 1 significa che la domanda dipende da informazioni aggiuntive per essere compresa, e 5 significa che la domanda ha senso da sola.
Ad esempio, se la domanda si riferisce a un contesto particolare, come "nel contesto" o "nel documento", la valutazione deve essere 1.
Le domande possono contenere termini tecnici o acronimi e ricevere comunque una valutazione di 5: deve semplicemente essere chiaro per un operatore con accesso alla documentazione di cosa tratta la domanda.

Fornisci la tua risposta esattamente nel seguente formato:

Output:::
Valutazione totale: (il tuo punteggio, come numero tra 1 e 5)
Output:::

Ora ecco la domanda.

Domanda: {question}

Output:::
"""

In [None]:
import time
print("Generating critique for each QA couple...")

for output in tqdm(outputs):
    # Pause for one second before each request
    time.sleep(1)
    _groundedness_prompt = question_groundedness_critique_prompt.format(
        context=output["context"], question=output["question"]
    )
    evaluations = {"groundedness": call_llm(llm_client, _groundedness_prompt)}

    # Placeholder; replaced by the parsed score in the loop below
    output["groundedness_score"] = None

    for criterion, evaluation in evaluations.items():
        # Look for the integer that follows "Valutazione totale:" in the reply
        match = re.search(r"Valutazione totale:\s*(\d+)", evaluation)

        # A reply without a parsable score is stored as 0
        if match:
            score = int(match.group(1))
        else:
            score = 0

        output[f"{criterion}_score"] = score

In [None]:
import time
print("Generating critique for each QA couple...")

for output in tqdm(outputs):
    # Pause for one second before each request
    time.sleep(1)
    _relevance_prompt = question_relevance_critique_prompt.format(question=output["question"])
    evaluations = {"relevance": call_llm(llm_client, _relevance_prompt)}

    # Placeholder; replaced by the parsed score in the loop below
    output["relevance_score"] = None

    for criterion, evaluation in evaluations.items():
        # Look for the integer that follows "Valutazione totale:" in the reply
        match = re.search(r"Valutazione totale:\s*(\d+)", evaluation)

        # A reply without a parsable score is stored as 0
        if match:
            score = int(match.group(1))
        else:
            score = 0

        output[f"{criterion}_score"] = score

In [None]:
import time
print("Generating critique for each QA couple...")

for output in tqdm(outputs):
    # Pause for one second before each request
    time.sleep(1)
    evaluations = {
        "standalone": call_llm(
            llm_client,
            # Use the standalone prompt here: the relevance prompt was being
            # sent again, so "standalone_score" merely duplicated relevance.
            question_standalone_critique_prompt.format(question=output["question"]),
        )
    }

    # Placeholder; replaced by the parsed score in the loop below
    output.update({
        "standalone_score": None,
    })

    for criterion, evaluation in evaluations.items():

        # Use regex to find the score following "Valutazione totale:"
        match = re.search(r"Valutazione totale:\s*(\d+)", evaluation)

        # A reply without a parsable score is stored as 0
        score = int(match.group(1)) if match else 0

        output.update(
            {
                f"{criterion}_score": score
            }
        )

In [None]:
import pandas as pd
import datasets

pd.set_option("display.max_colwidth", None)

# One row per generated QA couple, including the three critique scores
generated_questions = pd.DataFrame.from_dict(outputs)

# Mean of the three critique scores
_score_total = (
    generated_questions["groundedness_score"]
    + generated_questions["relevance_score"]
    + generated_questions["standalone_score"]
)
generated_questions["average_score"] = _score_total / 3

# Keep only the rows whose average score is strictly greater than 4
_keep_mask = generated_questions["average_score"] > 4
generated_questions = generated_questions.loc[_keep_mask]

print("============================================")
print("Final evaluation dataset:")
_shown_columns = [
    "question",
    "answer",
    "groundedness_score",
    "relevance_score",
    "standalone_score",
    "average_score",
]
display(generated_questions[_shown_columns])

# Build the dataset from the filtered rows, without the pandas index
new_eval_dataset = datasets.Dataset.from_pandas(generated_questions, split="train", preserve_index=False)

In [None]:
# Show the dataset built from the filtered QA couples
display(new_eval_dataset)

## Append new data in the future

In [None]:
from datasets import Dataset
import pandas as pd

# Load the previously saved dataset, if there is one. On the very first run
# the "eval_dataset" directory does not exist yet, so start from the new
# rows alone instead of failing.
new_elem = pd.DataFrame(new_eval_dataset)
try:
    existing_dataset = Dataset.load_from_disk("eval_dataset")
except FileNotFoundError:
    existing_dataset = None
    old_elem = pd.DataFrame()
    _frames = [new_elem]
else:
    old_elem = pd.DataFrame(existing_dataset)
    _frames = [old_elem, new_elem]

# Concatenate the old and new DataFrames and drop exact duplicate rows.
# The index is rebuilt afterwards because dropping rows leaves gaps in it.
combined_df = pd.concat(_frames, ignore_index=True)
combined_df = combined_df.drop_duplicates().reset_index(drop=True)

# Convert the combined DataFrame back to a Dataset. preserve_index=False keeps
# the pandas index from being stored as an extra column, which would otherwise
# be saved and break duplicate detection on the next append.
combined_dataset = Dataset.from_pandas(combined_df, preserve_index=False)
combined_dataset

In [None]:
# Persist the combined dataset to the "eval_dataset" directory (the same path it is loaded from above)
combined_dataset.save_to_disk("eval_dataset")