# Questions to test

In [None]:
# Fixture questions (in Italian) that are fed to the rewriting chains below.
# Single question.
question = "Quando mi conviene gestire un articolo a PSO rispetto a pianificazione?"
# Question unrelated to the software documentation (used as the out-of-scope case).
question_out_of_scope = "Quando è morto Giulio Cesare?"
# The single question above followed by an unrelated one.
multiple_questions = "Quando mi conviene gestire un articolo a PSO rispetto a pianificazione? Chi è Giulio Cesare?"
# Two questions in the same input, both meant to be treated as valid.
multiple_valid_questions = "Cosa significa che una fattura è in mancata consegna? Il cliente ha ricevuto la fattura?"
# Terse client ticket: a subject sentence followed by the description.
q_client = "Addebito bollo su nota credito. Su nota credito non mette più addebito bollo: precedente nota credito si."
# Same ticket without the leading subject sentence.
q_client_without_object = "Su nota credito non mette più addebito bollo: precedente nota credito si."
# Well-formed reformulation of the same ticket.
q_rewritten = "Perché la nota di credito non sta aggiungendo più il bollo e come risolvere questo problema?"

# Question rewriting: Rewrite -> Retrieve -> Read

## Zero-shot - compare English and Italian prompting

In [None]:
from langchain_ollama import ChatOllama
import pprint

# llama3.2 chat model shared by every llama chain in this notebook; temperature is pinned to 0.
model_llama = ChatOllama(model="llama3.2", temperature=0)

In [None]:
from langchain.prompts import ChatPromptTemplate

# English instructions for the query-rewriting prompt. The only placeholder is {question}.
# Instruction 5 previously read "about the not questions"; the missing word "relevant" is restored.
template = """
You are an AI language model assistant. Your task is to generate five different versions, in Italian, of the given user question to retrieve relevant documents from a vector database. 
The context of our application is related to Enterprise Resource Planning (ERP) software's technical manuals (specifically Panthera software) or, more generally, topics related to computer science, including system configuration,
module functionality, troubleshooting, and implementation guidelines.
Your goal is to generate multiple perspectives on the question to help the user overcome limitations of distance-based similarity search while focusing strictly on the context of ERP software documentation
or relevant computer science topics.
In cases where the user provides multiple questions, only respond to the relevant ones related to ERP documentation or computer science, generating five different versions of the relevant question. 
Provide these alternative questions separated by newlines.

Before generating alternatives, ensure the user's question is related to ERP technical documentation or relevant computer science topics. 
If any of the questions are out of scope or irrelevant to ERP manuals or computer science topics, disregard them entirely. 
You don't need to ignore all the questions, but only the ones that are out of scope.

Use the ERP context only as information, but do not mention it in the rewritten questions.
Provide the created alternative questions separated by newlines, and structure the output to contain only the rewritten questions in a bullet list.
Output only the bullet list of the rewritten questions, without any specification about the out of scope parts of the question.

**Instructions:**
1. Generate rewritten questions that maintain the original meaning, exploring different formulations.
2. Ignore questions not relevant to ERP manuals or computer science topics.
3. If there are not questions relevant to the context just specify that you cannot answer to out of scope demands.
4. If there is any relevant question, provide alternative questions in a bullet-pointed list with new lines separating them, rewriting only the questions relevant to the application.
5. The output should contain only the rewritten questions, without explanations or comments. Do not add any comment about the not relevant questions.

Original question: {question}
"""

# Chat prompt built from the English instructions held in `template`.
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser

# English zero-shot rewriting chain: prompt -> llama3.2 -> plain text -> one list item per "\n".
generate_queries = prompt_perspectives | model_llama | StrOutputParser() | (lambda text: text.split("\n"))

In [None]:
# Italian counterpart of the zero-shot rewriting instructions. The only placeholder is {question}.
template_italian = """Sei un modello linguistico AI che svolge il ruolo di assistente clienti per il software Panthera. 
Il tuo compito è generare cinque versioni diverse della domanda fornita dall'utente per recuperare documenti rilevanti da un database vettoriale. 
Il contesto riguarda manuali tecnici di software per la gestione aziendale. Non fornire la riscrittura della domanda se non è relativa al contesto.

**Istruzioni:**
1. Genera domande riscritte che mantengano il significato originale, esplorando diverse formulazioni e angolazioni. 
2. Ignora le domande che non sono pertinenti ai manuali di software gestionale o agli argomenti di informatica.
3. Fornisci le domande alternative in un elenco puntato separato da nuove righe.
4. L'output deve contenere solo le domande riscritte, senza spiegazioni o commenti.

Svolgi la task solo per le domande rilevanti al contesto del software. In questo caso, cerca di migliorare la formulazione originale 
esplorando diverse angolazioni che aiutino a comprendere meglio il problema o la richiesta, rendendo più chiare e leggibili le domande per un utente generico.

Domanda originale: {question} """

# Chat prompt built from the Italian instructions held in `template_italian`.
prompt_perspectives_ita = ChatPromptTemplate.from_template(template_italian)

from langchain_core.output_parsers import StrOutputParser

# Italian zero-shot rewriting chain: prompt -> llama3.2 -> plain text -> one list item per "\n".
generate_queries_ita = prompt_perspectives_ita | model_llama | StrOutputParser() | (lambda text: text.split("\n"))


### Compare English and Italian prompting

In [None]:
# Zero-shot comparison on `question`: English-prompt chain first, then Italian-prompt chain.
# The results are kept in the notebook globals rewritten_question / rewritten_question_ita.
pprint.pprint(question)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": question})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": question})
pprint.pprint(rewritten_question_ita)

In [None]:
# Zero-shot comparison on `question_out_of_scope`: English-prompt chain, then Italian-prompt chain.
pprint.pprint(question_out_of_scope)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": question_out_of_scope})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": question_out_of_scope})
pprint.pprint(rewritten_question_ita)

In [None]:
# Zero-shot comparison on `multiple_questions`: English-prompt chain, then Italian-prompt chain.
pprint.pprint(multiple_questions)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": multiple_questions})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": multiple_questions})
pprint.pprint(rewritten_question_ita)

In [None]:
# Zero-shot comparison on `multiple_valid_questions`: English-prompt chain, then Italian-prompt chain.
pprint.pprint(multiple_valid_questions)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": multiple_valid_questions})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": multiple_valid_questions})
pprint.pprint(rewritten_question_ita)

In [None]:
# Zero-shot comparison on `q_client`: English-prompt chain, then Italian-prompt chain.
pprint.pprint(q_client)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": q_client})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": q_client})
pprint.pprint(rewritten_question_ita)

In [None]:
# Zero-shot comparison on `q_client_without_object`: English-prompt chain, then Italian-prompt chain.
pprint.pprint(q_client_without_object)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": q_client_without_object})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": q_client_without_object})
pprint.pprint(rewritten_question_ita)

In [None]:
# Zero-shot comparison on `q_rewritten`: English-prompt chain, then Italian-prompt chain.
pprint.pprint(q_rewritten)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": q_rewritten})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": q_rewritten})
pprint.pprint(rewritten_question_ita)

## Few shot

In [None]:
# Italian few-shot rewriting instructions: the same task as the zero-shot prompt, plus one worked
# example of a relevant ticket and one example of an irrelevant question. Placeholder: {question}.
template_italian_few_shot = """Sei un modello linguistico AI che svolge il ruolo di assistente clienti per il software Panthera. 
Il tuo compito è generare cinque versioni diverse delle domande fornite dall'utente per recuperare documenti rilevanti da un database vettoriale. 
Il contesto riguarda manuali tecnici di software per la gestione aziendale. Non fornire la riscrittura della domanda se non è relativa al contesto.

**Istruzioni:**
1. Valuta attentamente la domanda fornita. Se contiene più interrogativi, identifica quelli pertinenti ai manuali di software gestionale o agli argomenti di informatica e riformula solo quelli.
2. Se ci sono domande pertinenti, genera cinque versioni riscritte per ciascuna di esse, mantenendo il significato originale ed esplorando diverse formulazioni e angolazioni.
3. Se non ci sono domande pertinenti nella richiesta, non fornire alcuna risposta e indica che non è possibile fornire una risposta.
4. L'output deve contenere solo le domande riscritte, senza spiegazioni o commenti.

**Esempio di riformulazione di una domanda dell'utente, relativa al contesto del software Panthera:**
<Domanda>: <Addebito bollo su nota credito. Su nota credito non mette più addebito bollo: precedente nota credito si.>
<Risposta>:
< * Perché l’addebito del bollo non viene più applicato sulla nota di credito, mentre su una precedente nota di credito era stato inserito?
  * Come mai sulla nuova nota di credito manca l’addebito del bollo, che invece era presente su una precedente?
  * Qual è il motivo per cui il bollo non viene più addebitato sulla nota di credito, a differenza di quanto accaduto prima?
  * Per quale ragione il bollo non è stato addebitato sulla nota di credito corrente, quando su una nota precedente era presente?
  * Perché l’applicazione del bollo su una nota di credito attuale non avviene, mentre su una nota di credito passata era presente? >

**Esempio di domanda non rilevante al contesto, a cui il sistema non deve rispondere:**
<Domanda>: <Quale è la migliore ricetta per fare la carbonara?>
<Risposta>: <Non posso fornire una risposta alla tua domanda in quanto non è relativa al contesto del software Panthera. Posso aiutarti con qualcos'altro?>

Domanda originale: {question}
"""

# Chat prompt built from the Italian few-shot instructions.
prompt_perspectives_ita_few_shot = ChatPromptTemplate.from_template(template_italian_few_shot)

from langchain_core.output_parsers import StrOutputParser

# Italian few-shot rewriting chain: prompt -> llama3.2 -> plain text -> one list item per "\n".
generate_queries_ita_few_shot = (
    prompt_perspectives_ita_few_shot | model_llama | StrOutputParser() | (lambda text: text.split("\n"))
)


In [None]:
# Three-way comparison on `question`: both zero-shot chains are re-run, followed by the
# Italian few-shot chain.
pprint.pprint(question)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": question})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": question})
pprint.pprint(rewritten_question_ita)
print("\n")
pprint.pprint("Few-shot results")
pprint.pprint("Italian prompting with some examples:")
rewritten_question_ita_few_shot = generate_queries_ita_few_shot.invoke({"question": question})
pprint.pprint(rewritten_question_ita_few_shot)

In [None]:
# Three-way comparison on `multiple_valid_questions`: zero-shot English, zero-shot Italian,
# then Italian few-shot.
pprint.pprint(multiple_valid_questions)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": multiple_valid_questions})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": multiple_valid_questions})
pprint.pprint(rewritten_question_ita)
print("\n")
pprint.pprint("Few-shot results")
pprint.pprint("Italian prompting with some examples:")
rewritten_question_ita_few_shot = generate_queries_ita_few_shot.invoke({"question": multiple_valid_questions})
pprint.pprint(rewritten_question_ita_few_shot)

In [None]:
# Three-way comparison on `question_out_of_scope`: zero-shot English, zero-shot Italian,
# then Italian few-shot.
pprint.pprint(question_out_of_scope)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": question_out_of_scope})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": question_out_of_scope})
pprint.pprint(rewritten_question_ita)
print("\n")
pprint.pprint("Few-shot results")
pprint.pprint("Italian prompting with some examples:")
rewritten_question_ita_few_shot = generate_queries_ita_few_shot.invoke({"question": question_out_of_scope})
pprint.pprint(rewritten_question_ita_few_shot)

In [None]:
# Three-way comparison on `multiple_questions`: zero-shot English, zero-shot Italian,
# then Italian few-shot.
pprint.pprint(multiple_questions)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": multiple_questions})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": multiple_questions})
pprint.pprint(rewritten_question_ita)
print("\n")
pprint.pprint("Few-shot results")
pprint.pprint("Italian prompting with some examples:")
rewritten_question_ita_few_shot = generate_queries_ita_few_shot.invoke({"question": multiple_questions})
pprint.pprint(rewritten_question_ita_few_shot)

# Compare llama3.2 rewriting results with GPT 

In [None]:
import os
import warnings

# LangSmith tracing and OpenAI credentials.
# The API keys must already be exported in the environment that started the notebook.
# Re-assigning os.environ[k] = os.getenv(k) was a no-op when the key existed and raised
# TypeError (os.environ only accepts str) when it did not, so the keys are only checked here.
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

if os.getenv('LANGCHAIN_API_KEY'):
    os.environ['LANGCHAIN_TRACING_V2'] = 'true'
else:
    # Without a key, tracing is left as it is rather than switched on.
    warnings.warn("LANGCHAIN_API_KEY is not set: LangSmith tracing has not been enabled.")

if not os.getenv('OPENAI_API_KEY'):
    warnings.warn("OPENAI_API_KEY is not set: the GPT-based chains below will fail to authenticate.")

In [None]:
from langchain.prompts import ChatPromptTemplate

# Second definition of the English rewriting instructions (placeholder: {question}).
# NOTE(review): this rebinds `template` and `prompt_perspectives`. `generate_queries` was composed
# earlier from the previous prompt object, and `generate_queries_gpt` below is composed from
# `prompt_perspectives_ita`, so no chain visible in this notebook uses this definition. Confirm
# whether the GPT chain was meant to use it.
template = """
You are an AI language model assistant. Your task is to generate five different versions, in Italian, of the given user question to retrieve relevant documents from a vector database. 
The context of our application is related to Enterprise Resource Planning (ERP) software's technical manuals (specifically Panthera software) or, more generally, topics related to computer science, including system configuration,
module functionality, troubleshooting, and implementation guidelines.
Your goal is to generate multiple perspectives on the question to help the user overcome limitations of distance-based similarity search while focusing strictly on the context of ERP software documentation
or relevant computer science topics.
In cases where the user provides multiple questions, only respond to the relevant ones related to ERP documentation or computer science, generating five different versions of the relevant question. 
Provide these alternative questions separated by newlines.

Before generating alternatives, ensure the user's question is related to ERP technical documentation or relevant computer science topics. 
If any of the questions are out of scope or irrelevant to ERP manuals or computer science topics, disregard them entirely. 
You don't need to ignore all the questions, but only the ones that are out of scope.

Use the ERP context only as information, but do not mention it in the rewritten questions.
Provide the created alternative questions separated by newlines, and structure the output to contain only the rewritten questions in a bullet list.
Output only the bullet list of the rewritten questions, without any specification about the out of scope parts of the question.

**Instructions:**
1. Generate rewritten questions that maintain the original meaning, exploring different formulations.
2. Ignore questions not relevant to ERP manuals or computer science topics.
3. If there are not questions relevant to the context just specify that you cannot answer to out of scope demands.
4. If there is any relevant question, provide alternative questions in a bullet-pointed list with new lines separating them, rewriting only the questions relevant to the application.
5. The output should contain only the rewritten questions, without explanations or comments. Do not add any comment about the not relevant questions.

Original question: {question}
"""

prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# GPT-4o rewriting chain. It reuses the Italian zero-shot prompt object, so it differs from
# generate_queries_ita only in the model.
generate_queries_gpt = (
    prompt_perspectives_ita | ChatOpenAI(temperature=0, model="gpt-4o")
    | StrOutputParser() | (lambda text: text.split("\n"))
)

In [None]:
# `q_client`: llama3.2 with the English and Italian prompts, then GPT-4o with the Italian prompt.
pprint.pprint(q_client)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": q_client})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": q_client})
pprint.pprint(rewritten_question_ita)
print("\n")
pprint.pprint("Italian prompting + GPT")
# Fixed: this section called generate_queries (llama, English prompt), which only repeated the
# first result. The GPT chain is the one this label describes.
pprint.pprint(generate_queries_gpt.invoke({"question": q_client}))

In [None]:
# `question`: llama3.2 with the English and Italian prompts, then GPT-4o with the Italian prompt.
pprint.pprint(question)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": question})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": question})
pprint.pprint(rewritten_question_ita)
print("\n")
pprint.pprint("Italian prompting + GPT:")
# Fixed: this section called generate_queries (llama, English prompt), which only repeated the
# first result. The GPT chain is the one this label describes.
pprint.pprint(generate_queries_gpt.invoke({"question": question}))

In [None]:
# `question_out_of_scope`: llama3.2 with the English and Italian prompts, then GPT-4o with the
# Italian prompt.
pprint.pprint(question_out_of_scope)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": question_out_of_scope})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": question_out_of_scope})
pprint.pprint(rewritten_question_ita)
print("\n")
pprint.pprint("Italian prompting + GPT:")
# Fixed: this section called generate_queries (llama, English prompt), which only repeated the
# first result. The GPT chain is the one this label describes.
pprint.pprint(generate_queries_gpt.invoke({"question": question_out_of_scope}))

In [None]:
# `multiple_questions`: llama3.2 with the English and Italian prompts, then GPT-4o with the
# Italian prompt.
pprint.pprint(multiple_questions)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": multiple_questions})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": multiple_questions})
pprint.pprint(rewritten_question_ita)
print("\n")
pprint.pprint("Italian prompting + GPT:")
# Fixed: this section called generate_queries (llama, English prompt), which only repeated the
# first result. The GPT chain is the one this label describes.
pprint.pprint(generate_queries_gpt.invoke({"question": multiple_questions}))

In [None]:
# `multiple_valid_questions`: llama3.2 with the English and Italian prompts, then GPT-4o with the
# Italian prompt.
pprint.pprint(multiple_valid_questions)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": multiple_valid_questions})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": multiple_valid_questions})
pprint.pprint(rewritten_question_ita)
print("\n")
pprint.pprint("Italian prompting + GPT:")
# Fixed: this section called generate_queries (llama, English prompt), which only repeated the
# first result. The GPT chain is the one this label describes.
pprint.pprint(generate_queries_gpt.invoke({"question": multiple_valid_questions}))

In [None]:
# `q_client_without_object`: llama3.2 with the English and Italian prompts, then GPT-4o with the
# Italian prompt. The last two lines also show the GPT rewrite of the full ticket (`q_client`).
pprint.pprint(q_client_without_object)
print("")
pprint.pprint("Zero-shot results")
pprint.pprint("English prompting:")
rewritten_question = generate_queries.invoke({"question": q_client_without_object})
pprint.pprint(rewritten_question)
print("\n")
pprint.pprint("Italian prompting:")
rewritten_question_ita = generate_queries_ita.invoke({"question": q_client_without_object})
pprint.pprint(rewritten_question_ita)
print("\n")
pprint.pprint("Italian prompting + GPT:")
# Fixed: both GPT sections called generate_queries (llama, English prompt) instead of the GPT chain.
pprint.pprint(generate_queries_gpt.invoke({"question": q_client_without_object}))
pprint.pprint("Italian prompting + GPT:")
pprint.pprint(generate_queries_gpt.invoke({"question": q_client}))

# Reranking of the rewritten questions

After reranking the rewritten questions, we can either:
1. Weight the retrieved documents by the reranking score of the question that retrieved them.
2. Retrieve documents only for the best reformulations of the question, i.e. those whose score reaches a suitably chosen threshold.

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi

def compute_cosine_similarity(embedding, embeddings):
    """Compute the cosine similarity between one vector and a set of vectors.

    Args:
        embedding: 1-D vector (array or sequence of numbers).
        embeddings: 2-D collection with one vector per row. A single 1-D vector is
            treated as a one-row collection.

    Returns:
        1-D numpy array with one similarity per row of ``embeddings``. A row (or a
        query vector) with zero norm gets similarity 0.0 instead of nan/inf.
    """
    embedding = np.asarray(embedding)
    embeddings = np.asarray(embeddings)
    if embeddings.size == 0:
        # Nothing to compare against.
        return np.zeros(0)
    embeddings = np.atleast_2d(embeddings)

    dot_products = embeddings @ embedding
    denominators = np.linalg.norm(embedding) * np.linalg.norm(embeddings, axis=1)

    # Divide only where the denominator is non-zero; the remaining entries keep the 0.0 fill.
    similarities = np.zeros(
        dot_products.shape, dtype=np.result_type(dot_products.dtype, np.float32)
    )
    np.divide(dot_products, denominators, out=similarities, where=denominators > 0)
    return similarities


def compute_bm25_score(original, rewrites):
    """Compute BM25 scores of the original question against each rewritten question.

    The corpus is the original question plus all rewrites, and the query is the
    tokenized original question.

    Args:
        original: the original question text.
        rewrites: iterable of rewritten question texts.

    Returns:
        Numpy array with one BM25 score per rewrite, in input order. The score of the
        original question against itself is dropped.
    """
    import re  # local import so the cell does not depend on a file-level import

    rewrites = list(rewrites)
    if not rewrites:
        return np.array([])

    def tokenize(text):
        # Lower-cased word tokens: "Bollo?" and "bollo" now count as the same term, and
        # consecutive spaces no longer produce empty tokens as split(" ") did.
        return re.findall(r"\w+", text.lower())

    tokenized_documents = [tokenize(doc) for doc in [original, *rewrites]]
    if not any(tokenized_documents):
        # No tokens anywhere, so there is nothing to score.
        return np.zeros(len(rewrites))

    bm25 = BM25Okapi(tokenized_documents)

    # Score every document against the original question, then drop the original's own entry.
    scores = bm25.get_scores(tokenized_documents[0])
    return scores[1:]

from functools import lru_cache


@lru_cache(maxsize=1)
def _load_rerank_model():
    """Load the BAAI/bge-m3 sentence encoder once and reuse it on later calls."""
    return SentenceTransformer("BAAI/bge-m3")


def rerank_questions(original_question, alpha=1.0, threshold=0.0):
    """Rerank the GPT rewrites of a question by similarity to the original.

    The rewrites come from ``generate_queries_gpt``. Each one is scored with
    ``alpha * cosine_similarity + (1 - alpha) * bm25_score`` against the original question.

    Args:
        original_question: the question to rewrite and rerank.
        alpha: weight of the cosine similarity. The default of 1.0 ignores BM25.
        threshold: minimum combined score a rewrite needs in order to be kept.

    Returns:
        A list starting with the original question, followed by the kept rewrites in
        descending score order.

    NOTE(review): the two scores are combined without normalisation. Cosine similarity is
    bounded while BM25 is not, so ``alpha`` values below 1 may need rescaled BM25 scores.
    """
    model = _load_rerank_model()

    original_embedding = model.encode(original_question)
    raw_rewrites = generate_queries_gpt.invoke({"question": original_question})
    # The chain splits on "\n", so blank lines arrive as empty strings. Drop them so they are
    # not embedded, ranked and returned as if they were questions.
    rewritten_questions = [line.strip() for line in raw_rewrites if line.strip()]
    pprint.pprint(original_question)

    if not rewritten_questions:
        return [original_question]

    # Stack the per-question embeddings into a single 2-D array.
    rewritten_embeddings = np.vstack(model.encode(rewritten_questions))

    cosine_similarities = compute_cosine_similarity(original_embedding, rewritten_embeddings)
    bm25_scores = compute_bm25_score(original_question, rewritten_questions)
    weighted_scores = alpha * cosine_similarities + (1 - alpha) * bm25_scores

    # Indices of the rewrites from best to worst.
    ranked_indices = np.argsort(weighted_scores)[::-1]

    questions = [original_question]
    for index in ranked_indices:
        score = weighted_scores[index]
        if score >= threshold:
            print(rewritten_questions[index], score)
            questions.append(rewritten_questions[index])

    return questions

# Rerank the rewrites of the subject-less client ticket, using the default alpha and threshold
filtered_questions = rerank_questions(q_client_without_object)


# Retrieve the documents based on query preprocessing and ranking

In [None]:
from FlagEmbedding import BGEM3FlagModel

# BGE-M3 embedding model, created with use_fp16=True; used by the FAISS adapter class below.
model_fp16 = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

In [None]:
class M3EmbeddingFP16:
    """Callable adapter that exposes the dense vectors of ``model_fp16`` to the vector store."""

    def embed_documents(self, texts):
        """Encode ``texts`` with ``model_fp16`` and return the ``'dense_vecs'`` entry of the result."""
        encoded = model_fp16.encode(texts)
        return encoded['dense_vecs']

    def __call__(self, texts):
        """Delegate to ``embed_documents`` so the instance can be used as a plain function."""
        return self.embed_documents(texts)

In [None]:
from langchain_community.vectorstores import FAISS
# Load the FAISS index stored in the local folder "local_model_index".
# NOTE(security): allow_dangerous_deserialization=True opts in to unsafe (pickle-based, per the
# LangChain docs) deserialization. Only load an index that was produced locally and is trusted.
vectorstore_fp16 = FAISS.load_local("local_model_index", M3EmbeddingFP16(), allow_dangerous_deserialization=True)
# Retriever over that index; search_kwargs asks for k=10 results per query.
retriever = vectorstore_fp16.as_retriever(search_kwargs={"k": 10})

In [None]:
# Sanity check: show the `ntotal` attribute of the loaded FAISS index.
vectorstore_fp16.index.ntotal

In [None]:
# Retrieve documents for the raw (not rewritten) subject-less client ticket and print them.
retrieved_documents = retriever.invoke(q_client_without_object)
pprint.pprint(retrieved_documents)

TODO: add a check that immediately discards out-of-scope questions. Without it, the retriever still returns documents for them, which makes no sense.

# Generative part of the RAG system - generate a response

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Prompt
# Italian answer-generation instructions. Placeholders: {context} (the retrieved documents) and
# {question} (the user question).
template_RAG_generation = """
Comportati come un assistente che risponde alle domande del cliente.
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.
Quando spieghi che cosa è o cosa significa un certo elemento richiesto, non parlarne come se fosse un problema.

In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto.

Domanda relativa al software Panthera: {question}
"""

prompt = ChatPromptTemplate.from_template(template_RAG_generation)
# NOTE(review): bare expression in the middle of the cell. It is not the cell's last statement,
# so it presumably displays nothing. Confirm whether it can be dropped.
prompt

# Post-processing
def format_docs(splits):
    """Join the ``page_content`` of every retrieved split, separated by a blank line."""
    contents = [split.page_content for split in splits]
    return "\n\n".join(contents)

# Chain
# The incoming question goes to the retriever, whose hits are flattened by format_docs into
# {context}. It is also passed through unchanged into {question}.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt | model_llama | StrOutputParser()
)

# Question
# Run the RAG chain end to end on the subject-less client ticket and print the answer.
response_text = rag_chain.invoke(q_client_without_object)
pprint.pprint(response_text)

In [None]:
# Ad-hoc RAG query. The question text is runtime input and is kept in Italian.
domanda = "Come posso impostare una stampante predefinita per il mio utente?"
text = rag_chain.invoke(domanda)
pprint.pprint(text)

In [None]:
# Ad-hoc RAG query. The question text is runtime input and is kept in Italian.
domanda = "Come inserisco una nuova attività fatturabile per il mio utente?"
text = rag_chain.invoke(domanda)
pprint.pprint(text)

In [None]:
# Ad-hoc RAG query. `domanda` is reused by the retrieval and reranking cells that follow.
domanda = "Cosa c'è nella pagina timesheet risorsa?"
text = rag_chain.invoke(domanda)
pprint.pprint(text)

In [None]:
# Inspect what the retriever alone returns for the current value of `domanda`.
retrieved_documents = retriever.invoke(domanda)
pprint.pprint(retrieved_documents)

In [None]:
# Show the first retrieved document.
display(retrieved_documents[0])

In [None]:
# Rewrite and rerank the current value of `domanda`, using the default alpha and threshold.
rerank_questions(domanda)