In [None]:
import pandas as pd
import pprint
import transformers

In [None]:
filename_all_data_dict = "./Files/final_dataset.csv"

# The file's own header row is read as an ordinary data row (header=None) under the
# explicit column names, and is then discarded by dropping index label 0.
data_df = pd.read_csv(
    filename_all_data_dict,
    header=None,
    names=["file", "text"],
).drop(index=0)
data_df

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import ollama

# Load the multilingual (uncased) BERT tokenizer and encoder from the same checkpoint.
# Both are bound to notebook-level names (`tokenizer`, `model`) that later cells read.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertModel.from_pretrained('bert-base-multilingual-uncased')

In [None]:
# Try the pipeline on a single document.
# Collect every value of the 'text' column and keep the first one as the sample.
text_list = data_df['text'].tolist()
text = text_list[0]

# Tokenize the sample and run it through the encoder without tracking gradients
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    outputs = model(**inputs)

# Get the embeddings from the last hidden state
embeddings = outputs.last_hidden_state
# Average over dim=1 (the sequence axis) to get a single vector for the entire input
mean_embeddings = torch.mean(embeddings, dim=1)

# Use Ollama's LLaMA 3.2 model.
# NOTE: the active prompt embeds the raw text; `mean_embeddings` is not sent to the LLM.
# The embedding-based prompt below is kept only as a commented-out experiment.
#prompt = f'Il testo da riassumere è: --{mean_embeddings.numpy().tolist()}--'
prompt = f"""Fornisci un riassunto dettagliato della documentazione fornita. 
    Documentazione:
    {text}"""

# temperature=0 is passed through Ollama's generation options
response = ollama.generate(model="llama3.2", prompt=prompt, options={"temperature": 0})

# Print the response generated by LLaMA
print("Risposta:", response['response'])

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

# Same summarisation request as the previous cell, expressed as a LangChain `prompt | llm` pipeline.
# NOTE(review): "Documentazione::" contains a doubled colon — probably a typo in the prompt text.
template = """Fornisci un riassunto dettagliato della documentazione fornita. 
    Documentazione:: {question}"""

prompt = ChatPromptTemplate.from_template(template)

# NOTE: this rebinds the notebook-level `model` (until now the BERT encoder) to the Ollama LLM.
model = OllamaLLM(model="llama3.2")

# `chain` fills the template and sends it to the LLM; the document text goes in the `question` slot.
chain = prompt | model

chain.invoke({"question": text})

# Simple RAG

In [None]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [None]:
from langchain_community.document_loaders import DataFrameLoader

# Load the DataFrame into LangChain documents, using the 'text' column as page content.
loader = DataFrameLoader(data_df, page_content_column="text")
docs_data = loader.load()
# Show the first loaded document as a sanity check
docs_data[0]

In [None]:
# Split the documents into overlapping chunks.
# Possible improvement: tune chunk_size and chunk_overlap, and try different splitters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
splits = text_splitter.split_documents(docs_data)
# Show the first six chunks and the total number of chunks produced
pprint.pprint(splits[0:6])
pprint.pprint(len(splits))

In [None]:
from sentence_transformers import SentenceTransformer

# NOTE: rebinds the notebook-level `model` to the BGE-M3 sentence-transformer;
# code defined below that reads `model` (e.g. BertEmbedding) will use this object.
model = SentenceTransformer("BAAI/bge-m3")

In [None]:
from FlagEmbedding import BGEM3FlagModel

# Same BGE-M3 checkpoint loaded through FlagEmbedding with use_fp16=True; read by M3EmbeddingFP16.
model_fp16 = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

In [None]:
class M3EmbeddingFP16:
    """Adapter exposing the FP16 BGE-M3 model through an embeddings-style interface.

    Reads the notebook-level ``model_fp16`` and uses only the ``'dense_vecs'``
    entry of its ``encode`` result.
    """

    def embed_documents(self, texts):
        """Return the dense vectors produced by ``model_fp16.encode`` for ``texts``."""
        return model_fp16.encode(texts)['dense_vecs']

    def embed_query(self, text):
        """Return the dense vector of a single query string.

        Provided so the object offers both methods of the LangChain embeddings
        interface (``embed_documents`` / ``embed_query``) instead of relying
        solely on being callable.
        """
        return self.embed_documents([text])[0]

    def __call__(self, texts):
        """Let the instance be used as a plain embedding function."""
        return self.embed_documents(texts)

In [None]:
# Function to generate embeddings using the BERT model
def bert_embed(texts):
    """Embed ``texts`` with BERT and return one mean-pooled vector per text.

    Uses the notebook-level ``tokenizer`` and ``model``.
    NOTE(review): ``model`` must be the BERT encoder when this runs; other
    cells rebind that name, so confirm the kernel state before calling.

    Args:
        texts: a string or a list of strings to embed.

    Returns:
        A NumPy array with one row per input text.
    """
    # BERT checkpoints accept at most 512 positions, so truncate there;
    # a larger max_length would produce sequences the model cannot process.
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Generate embeddings using the BERT model, without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # Token embeddings from the last hidden state, shape [batch_size, sequence_length, hidden_size]
    embeddings = outputs.last_hidden_state

    # Mean-pool over the sequence. padding=True adds pad tokens to shorter
    # texts, so weight by the attention mask to keep padding out of the average.
    mask = inputs["attention_mask"].unsqueeze(-1).to(embeddings.dtype)
    mean_embeddings = (embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    return mean_embeddings.cpu().numpy()  # Convert to numpy array for the vector store

class BertEmbedding:
    """Embedding adapter that delegates to the notebook-level ``model.encode``.

    NOTE(review): despite the name, this needs ``model`` to expose ``encode``;
    in this notebook ``model`` is rebound to ``SentenceTransformer("BAAI/bge-m3")``
    before the class is used, so these are presumably BGE-M3 embeddings — confirm.
    """

    def embed_documents(self, texts):
        """Return ``model.encode(texts)`` for a list of texts (or a single string)."""
        return model.encode(texts)

    def embed_query(self, text):
        """Return the embedding of a single query string.

        Provided so the object offers both methods of the LangChain embeddings
        interface (``embed_documents`` / ``embed_query``) instead of relying
        solely on being callable.
        """
        return self.embed_documents([text])[0]

    def __call__(self, texts):
        """Let the instance be used as a plain embedding function."""
        return self.embed_documents(texts)

In [None]:
from langchain_community.vectorstores import FAISS
#vectorstore = FAISS.from_documents(splits, BertEmbedding())
#vectorstore.save_local("local_model_index")

In [None]:
# Reload the FAISS index stored under "local_model_index"; BertEmbedding is the embedding function.
# NOTE: allow_dangerous_deserialization=True opts in to deserializing the stored files —
# only load an index produced by this notebook or another trusted source.
vectorstore = FAISS.load_local("local_model_index", BertEmbedding(), allow_dangerous_deserialization=True)
vectorstore.index.ntotal

In [None]:
# Report how many vectors the FAISS index holds (`ntotal`) and their dimensionality (`d`)
faiss_index = vectorstore.index
num_vectors = faiss_index.ntotal
dimension = faiss_index.d
print(f"Number of Vectors: {num_vectors}")
print(f"Dimension of Vectors:{dimension }")

In [None]:
# Dense retriever over the FAISS index, configured with k=4 results per query
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

In [None]:
# Reference question reused by later cells; inspect what the dense retriever returns for it
question = "Quando mi conviene gestire un articolo a PSO rispetto a pianificazione?"
retrieved_documents = retriever.invoke(question)

pprint.pprint(retrieved_documents)

In [None]:
from langchain_ollama import ChatOllama

# Chat model used by every RAG chain below; temperature is set to 0
model_llama = ChatOllama(
    model="llama3.2", 
    temperature=0
)

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt: Italian instructions for a customer-support assistant.
# {context} receives the retrieved documents, {question} the customer's question.
template = """
Comportati come un assistente che risponde alle domande del cliente.
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.
Quando spieghi che cosa è o cosa significa un certo elemento richiesto, non parlarne come se fosse un problema.

In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto.

Domanda relativa al software Panthera: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
# NOTE: a bare expression that is not the last statement of the cell is not displayed
prompt

# Post-processing
def format_docs(splits):
    """Concatenate the ``page_content`` of every document in ``splits``, separated by a blank line."""
    page_texts = [chunk.page_content for chunk in splits]
    return "\n\n".join(page_texts)

# Chain: the input question is sent to the retriever (whose documents are joined by
# format_docs into {context}) and passed through unchanged as {question}.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model_llama
    | StrOutputParser()
)

# Ask the reference question and print the parsed answer
response_text = rag_chain.invoke(question)
pprint.pprint(response_text)

In [None]:
from langchain_community.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate

# Prompt: same text as the template defined in the previous RAG cell.
# {context} receives the retrieved documents, {question} the customer's question.
template = """
Comportati come un assistente che risponde alle domande del cliente.
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.
Quando spieghi che cosa è o cosa significa un certo elemento richiesto, non parlarne come se fosse un problema.

In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto.

Domanda relativa al software Panthera: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
# NOTE: a bare expression that is not the last statement of the cell is not displayed
prompt

# Load the same persisted index ("local_model_index"), this time with the FP16 adapter as embedding function.
# NOTE: allow_dangerous_deserialization=True opts in to deserializing the stored files — trusted indexes only.
vectorstore_fp16 = FAISS.load_local("local_model_index", M3EmbeddingFP16(), allow_dangerous_deserialization=True)
# NOTE: bare expression in the middle of a cell; its value is not displayed
vectorstore_fp16.index.ntotal
retriever_fp16 = vectorstore_fp16.as_retriever(search_kwargs={"k": 4})

# Post-processing
def format_docs(splits):
    """Concatenate the ``page_content`` of every document in ``splits``, separated by a blank line."""
    page_texts = [chunk.page_content for chunk in splits]
    return "\n\n".join(page_texts)

# Chain: same shape as before, but retrieval goes through `retriever_fp16`.
# NOTE: this rebinds the notebook-level `rag_chain`; the cells below use this FP16 variant.
rag_chain = (
    {"context": retriever_fp16 | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model_llama
    | StrOutputParser()
)

# Question
question = "Quando mi conviene gestire un articolo a PSO rispetto a pianificazione?"
response_text = rag_chain.invoke(question)
pprint.pprint(response_text)

In [None]:
# Probe the chain with a question the notebook labels as out of scope
question_out_of_scope = "Quando è morto Giulio Cesare?"
pprint.pprint(rag_chain.invoke(question_out_of_scope))

In [None]:
# Two questions in one input: the reference question followed by the Giulio Cesare one
multiple_questions = "Quando mi conviene gestire un articolo a PSO rispetto a pianificazione? Chi è Giulio Cesare?"
pprint.pprint(rag_chain.invoke(multiple_questions))

In [None]:
# Two questions in one input, both labelled valid by the variable name; reused by later cells
multiple_valid_questions = "Cosa significa che una fattura è in mancata consegna? Il cliente ha ricevuto la fattura?"
pprint.pprint(rag_chain.invoke(multiple_valid_questions))

In [None]:
# Inspect the chunks retrieved for the two-part question.
# NOTE(review): this queries `retriever`, while `rag_chain` at this point is built on
# `retriever_fp16` — confirm that inspecting the non-FP16 retriever is intended.
retrieved_documents = retriever.invoke(multiple_valid_questions)

pprint.pprint(retrieved_documents)

In [None]:
# Question about a specific release ("fast update 5.0.03"); reused by later cells
fastupdate_question = "Che novità ci sono relative al workflow nel fast update 5.0.03?"

pprint.pprint(rag_chain.invoke(fastupdate_question))

In [None]:
# Show the chunks the dense retriever returns for the fast-update question.
# NOTE(review): uses `retriever`, not the `retriever_fp16` that backs `rag_chain` — confirm intended.
retriever.invoke(fastupdate_question)

In [None]:
# `q_client` and `q_rewritten` are two phrasings of the same request (per the variable names,
# the client's wording and a rewritten version); only the rewritten one is sent to the chain here.
q_client = "Addebito bollo su nota credito. Su nota credito non mette più addebito bollo: precedente nota credito si."
q_rewritten = "Perché la nota di credito non sta aggiungendo più il bollo e come risolvere questo problema?"

pprint.pprint(rag_chain.invoke(q_rewritten))

# BM25

In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [None]:
# Initialize the BM25 retriever over the same chunks (`splits`) produced by the text splitter
bm25_retriever = BM25Retriever.from_documents(splits)
bm25_retriever.k =  4  # Retrieve the top 4 results

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt for the BM25 chain.
# NOTE(review): an earlier comment here said part of the prompt had been removed, but this
# text is identical to the template used in the previous RAG cells — confirm which version is intended.
template = """
Comportati come un assistente che risponde alle domande del cliente.
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.
Quando spieghi che cosa è o cosa significa un certo elemento richiesto, non parlarne come se fosse un problema.

In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto.

Domanda relativa al software Panthera: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
# NOTE: a bare expression that is not the last statement of the cell is not displayed
prompt

# Post-processing
def format_docs(splits):
    """Concatenate the ``page_content`` of every document in ``splits``, separated by a blank line."""
    page_texts = [chunk.page_content for chunk in splits]
    return "\n\n".join(page_texts)

# Chain: same shape as the dense chain, with BM25 retrieval supplying {context}
rag_chain_bm25 = (
    {"context": bm25_retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model_llama
    | StrOutputParser()
)

# Ask the reference question and print the parsed answer
response_text = rag_chain_bm25.invoke(question)
pprint.pprint(response_text)

In [None]:
# Print the two-part question, then the BM25 chain's answer to it
pprint.pprint(multiple_valid_questions)
response_text = rag_chain_bm25.invoke(multiple_valid_questions)
pprint.pprint(response_text)

# Ensemble Retriever - BM25 + Basic RAG

In [None]:
# Initialize the ensemble retriever: BM25 and the dense FAISS retriever,
# weighted 0.4 and 0.6 respectively.
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, retriever], weights=[0.4, 0.6])

# Retrieve the documents relevant to the reference question.
# `invoke` is the retriever entry point used everywhere else in this notebook;
# `get_relevant_documents` is deprecated in current LangChain releases.
pprint.pprint(question)
docs = ensemble_retriever.invoke(question)
pprint.pprint(docs)

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt for the ensemble chain.
# NOTE(review): an earlier comment here said part of the prompt had been removed, but this
# text is identical to the template used in the previous RAG cells — confirm which version is intended.
template = """
Comportati come un assistente che risponde alle domande del cliente.
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.
Quando spieghi che cosa è o cosa significa un certo elemento richiesto, non parlarne come se fosse un problema.

In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto.

Domanda relativa al software Panthera: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
# NOTE: a bare expression that is not the last statement of the cell is not displayed
prompt

# Post-processing
def format_docs(splits):
    """Concatenate the ``page_content`` of every document in ``splits``, separated by a blank line."""
    page_texts = [chunk.page_content for chunk in splits]
    return "\n\n".join(page_texts)

# Chain: same shape as the previous chains, with the ensemble retriever supplying {context}
rag_chain_ensemble = (
    {"context": ensemble_retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model_llama
    | StrOutputParser()
)

# Ask the reference question and print the parsed answer
response_text = rag_chain_ensemble.invoke(question)
pprint.pprint(response_text)

In [None]:
# Print the two-part question, then the ensemble chain's answer to it
pprint.pprint(multiple_valid_questions)
response_text = rag_chain_ensemble.invoke(multiple_valid_questions)
pprint.pprint(response_text)

In [None]:
# Rewritten phrasing of the stamp-duty question, answered by the ensemble chain
q_rewritten = "Perché la nota di credito non sta aggiungendo più il bollo e come risolvere questo problema?"
pprint.pprint(rag_chain_ensemble.invoke(q_rewritten))

In [None]:
# Same request in the `q_client` phrasing, answered by the ensemble chain for comparison
q_client = "Addebito bollo su nota credito. Su nota credito non mette più addebito bollo: precedente nota credito si."
pprint.pprint(rag_chain_ensemble.invoke(q_client))

In [None]:
# Probe the ensemble chain with the question the notebook labels as out of scope
question_out_of_scope = "Quando è morto Giulio Cesare?"
pprint.pprint(rag_chain_ensemble.invoke(question_out_of_scope))

In [None]:
# Two questions in one input (reference question + Giulio Cesare), sent to the ensemble chain
multiple_questions = "Quando mi conviene gestire un articolo a PSO rispetto a pianificazione? Chi è Giulio Cesare?"
pprint.pprint(rag_chain_ensemble.invoke(multiple_questions))

In [None]:
# Release-specific question ("fast update 5.0.03"), sent to the ensemble chain
fastupdate_question = "Che novità ci sono relative al workflow nel fast update 5.0.03?"
pprint.pprint(rag_chain_ensemble.invoke(fastupdate_question))

# Automated system evaluation

## Embedding-Based Similarity

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained SentenceTransformer model used by evaluate_embedding_similarity.
# NOTE: this rebinds the notebook-level `model`; an earlier cell already loads the same
# "BAAI/bge-m3" checkpoint into that name.
model = SentenceTransformer("BAAI/bge-m3")

def evaluate_embedding_similarity(query, retrieved_docs, generated_answer):
    """Score an answer by cosine similarity against the query and the retrieved documents.

    Embeddings come from the notebook-level ``model.encode``.

    Args:
        query: the user question (string).
        retrieved_docs: iterable of document texts the answer was generated from.
        generated_answer: the generated answer (string).

    Returns:
        ``(query_answer_similarity, avg_doc_answer_similarity)`` as floats.
        The second value is NaN when ``retrieved_docs`` is empty.
    """
    # Embed query and answer as (1, embedding_size) row vectors
    query_embedding = np.asarray(model.encode(query)).reshape(1, -1)
    answer_embedding = np.asarray(model.encode(generated_answer)).reshape(1, -1)

    # Cosine similarity between query and generated answer
    query_answer_similarity = float(cosine_similarity(query_embedding, answer_embedding)[0][0])

    # With no documents there is nothing to average; report NaN explicitly
    # instead of letting np.mean([]) emit a runtime warning.
    retrieved_docs = list(retrieved_docs)
    if not retrieved_docs:
        return query_answer_similarity, float("nan")

    # Encode all documents in one batch, then compare the answer against every row at once
    doc_embeddings = np.asarray(model.encode(retrieved_docs)).reshape(len(retrieved_docs), -1)
    doc_answer_similarities = cosine_similarity(answer_embedding, doc_embeddings)[0]

    # Average similarity between generated answer and retrieved documents
    avg_doc_answer_similarity = float(np.mean(doc_answer_similarities))

    return query_answer_similarity, avg_doc_answer_similarity

# Example usage: retrieve with the ensemble retriever, answer with the ensemble chain, then score.
# `invoke` is the retriever entry point used everywhere else in this notebook;
# `get_relevant_documents` is deprecated in current LangChain releases.
docs = ensemble_retriever.invoke(question)
retrieved_docs = [context.page_content for context in docs]
generated_answer = rag_chain_ensemble.invoke(question)

# Run the evaluation
query_answer_similarity, avg_doc_answer_similarity = evaluate_embedding_similarity(question, retrieved_docs, generated_answer)

print(f"Query-Answer Similarity: {query_answer_similarity}")
print(f"Average Answer-Documents Similarity: {avg_doc_answer_similarity}")


**Query-Answer Similarity: 0.7175188660621643**

Strength: The generated answer is likely relevant and on-topic, addressing the core aspect of the query.

Potential Weakness: Some minor discrepancies or nuances in the query may not be fully captured in the answer. For example, the answer might be general or missing specific details from the query, which could be why the similarity is not closer to 1.

**Average Answer-Documents Similarity: 0.5526096224784851**

Strength: The generated answer seems to make use of the retrieved information, but not in a very strong or comprehensive way.

Potential Weakness: The answer may either:
- Use information that is not fully represented in the retrieved documents (potentially introducing hallucinations or unsupported facts).
- Be loosely based on the retrieved documents, but not drawing directly or strongly from the key information within them.

**Possibilities to improve the results:**

Query-Answer Similarity:

- Fine-tune the model to generate answers that more precisely match the intent and specifics of the query.
- Ensure that the answer directly addresses the main points or information asked in the query.

Answer-Documents Similarity:

- Improve the document retrieval system to fetch more relevant or diverse documents that better support the generated answer.
- Ensure that the model generates answers more closely based on the retrieved content, reducing the chances of hallucinations or answers based on information not found in the documents.

## Self consistency

In [None]:
from difflib import SequenceMatcher

def calculate_similarity(a, b):
    """Return the ``difflib.SequenceMatcher`` ratio (0.0-1.0) between strings ``a`` and ``b``.

    This is a character-level (lexical) similarity, not a semantic one.
    """
    return SequenceMatcher(None, a, b).ratio()


def measure_self_consistency(query, num_trials=10, threshold=0.9, generate=None):
    """Measure how often repeated generations for the same query agree with each other.

    The query is answered ``num_trials`` times; every unordered pair of answers
    is compared with ``calculate_similarity`` and counted as consistent when the
    ratio is strictly greater than ``threshold``.

    Args:
        query: the question to ask repeatedly.
        num_trials: number of generations; must be at least 2 so that at least
            one pair exists.
        threshold: similarity ratio above which two answers count as consistent.
        generate: optional callable ``query -> answer``. Defaults to
            ``rag_chain_ensemble.invoke`` (notebook-level chain).

    Returns:
        Fraction of answer pairs that are consistent, in [0, 1].

    Raises:
        ValueError: if ``num_trials`` is smaller than 2 (there would be no pair
            to compare, which previously ended in a ZeroDivisionError after the
            generations had already been paid for).
    """
    from itertools import combinations

    if num_trials < 2:
        raise ValueError("num_trials must be at least 2 to compare answers pairwise")
    if generate is None:
        generate = rag_chain_ensemble.invoke

    # Run the retrieval and generation steps multiple times
    generated_answers = [generate(query) for _ in range(num_trials)]

    # Compare every unordered pair of answers
    pair_scores = [
        calculate_similarity(first, second)
        for first, second in combinations(generated_answers, 2)
    ]
    consistent_pairs = sum(score > threshold for score in pair_scores)

    # Consistency score: share of pairs whose lexical similarity exceeds the threshold
    return consistent_pairs / len(pair_scores)

# Self-consistency of the ensemble chain on the two-part question, with the function's default settings
measure_self_consistency(multiple_valid_questions)

# Advanced RAG

## Query rewriting

In [None]:
from langchain.prompts import ChatPromptTemplate

# English variant of the query-rewriting prompt. It used to be assigned to `template`
# and was immediately overwritten by the Italian prompt below, so it never took effect;
# it is kept under its own name for reference and comparison.
template_en = """
You are an AI language model assistant. Your task is to generate five different versions, in Italian, of the given user question to retrieve relevant documents from a vector database. 
The context of our application is related to Enterprise Resource Planning (ERP) software's technical manuals (specifically Panthera software) or, more generally, topics related to computer science, including system configuration,
module functionality, troubleshooting, and implementation guidelines.
Your goal is to generate multiple perspectives on the question to help the user overcome limitations of distance-based similarity search while focusing strictly on the context of ERP software documentation
or relevant computer science topics.
In cases where the user provides multiple questions, only respond to the relevant ones related to ERP documentation or computer science. Provide these alternative questions separated by newlines.
Before generating alternatives, ensure the user's question is related to ERP technical documentation or relevant computer science topics. 
If any of the questions are out of scope or irrelevant to ERP manuals or computer science topics, disregard them entirely. 
You don't need to ignore all the questions, but only the ones that are out of scope.

Use the ERP context only as information, but do not mention it in the rewritten questions.
Provide the created alternative questions separated by newlines, and structure the output to contain only the rewritten questions in a bullet list.
Output only the bullet list of the rewritten questions.

Original question: {question}
"""

# Active query-rewriting prompt (Italian): asks for five reformulations of {question}
# as a bullet list with no extra commentary.
template = """Sei un assistente modello linguistico AI. 
Il tuo compito è generare cinque versioni diverse della domanda fornita dall'utente per recuperare documenti rilevanti da un database vettoriale. 
Il contesto riguarda manuali tecnici di software di Enterprise Resource Planning (ERP).

**Istruzioni:**
1. Genera domande riscritte che mantengano il significato originale, esplorando diverse formulazioni e angolazioni.
2. Ignora le domande che non sono pertinenti ai manuali ERP o agli argomenti di informatica.
3. Fornisci le domande alternative in un elenco puntato separato da nuove righe.
4. L'output deve contenere solo le domande riscritte, senza spiegazioni o commenti.

Domanda originale: {question}
"""

prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser


def parse_rewritten_queries(llm_output):
    """Turn the LLM's bullet list into a clean list of query strings.

    The prompt asks for a bullet list, so a plain ``split("\\n")`` yields blank
    entries and lines that still carry their bullet marker; both would be sent
    to the retriever as queries. Blank lines are dropped and leading bullet
    characters ("-", "*", "•") are stripped.
    """
    queries = []
    for line in llm_output.split("\n"):
        cleaned = line.strip().lstrip("-*•").strip()
        if cleaned:
            queries.append(cleaned)
    return queries


# Rewrite chain: prompt -> LLM -> plain string -> list of rewritten questions
generate_queries = (
    prompt_perspectives
    | model_llama
    | StrOutputParser()
    | parse_rewritten_queries
)

question = "Quando mi conviene gestire un articolo a PSO rispetto a pianificazione?"
pprint.pprint(question)
rewritten_question = generate_queries.invoke({"question": question})
pprint.pprint(rewritten_question)

In [None]:
# Show the rewrites produced for the two-part question
pprint.pprint(generate_queries.invoke({"question": multiple_valid_questions}))

In [None]:
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# RAG prompt for the query-rewriting pipeline (adds a request to avoid repetitions)
template = """
Comportati come un assistente che risponde alle domande del cliente.
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.
Quando spieghi che cosa è o cosa significa un certo elemento richiesto, non parlarne come se fosse un problema.

In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto. Evita troppe ripetizioni nella risposta fornita.

Domanda relativa al software Panthera: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


def get_unique_union(documents):
    """Flatten the per-query result lists and drop duplicate chunks.

    Duplicates are detected by ``page_content``; first-seen order is kept.
    """
    unique = {}
    for docs_for_query in documents:
        for doc in docs_for_query:
            unique.setdefault(doc.page_content, doc)
    return list(unique.values())


# Multi-query retrieval: rewrite the question, retrieve for every rewrite, merge the hits.
# Defined here because the cells below call `retrieval_chain`.
retrieval_chain = generate_queries | retriever.map() | get_unique_union


def _context_or_retrieve(inputs):
    """Use the caller-supplied "context" when present, otherwise retrieve it.

    Keeps ``invoke({"context": ..., "question": ...})`` working while letting
    ``invoke({"question": ...})`` run instead of failing on the missing key.
    """
    if "context" in inputs:
        return inputs["context"]
    retrieved = retrieval_chain.invoke({"question": inputs["question"]})
    return "\n\n".join(doc.page_content for doc in retrieved)


rewriting_rag_chain = (
    {"context": _context_or_retrieve,
     "question": itemgetter("question")}
    | prompt
    | model_llama
    | StrOutputParser()
)

# The context previously came from `chain`, which is the document-summarisation
# chain defined at the top of the notebook, not a retriever; let the chain retrieve instead.
pprint.pprint(rewriting_rag_chain.invoke({"question": question}))


In [None]:
# Out-of-scope probe for the rewriting chain.
# NOTE(review): only "question" is passed; `rewriting_rag_chain` also needs a "context"
# value, so confirm the chain can obtain it when the key is absent.
pprint.pprint(rewriting_rag_chain.invoke({"question":question_out_of_scope}))

In [None]:
# Mixed in-scope / out-of-scope input for the rewriting chain.
# NOTE(review): only "question" is passed; `rewriting_rag_chain` also needs a "context"
# value, so confirm the chain can obtain it when the key is absent.
pprint.pprint(rewriting_rag_chain.invoke({"question":multiple_questions}))

In [None]:
# Two-part question for the rewriting chain.
# NOTE(review): only "question" is passed; `rewriting_rag_chain` also needs a "context"
# value, so confirm the chain can obtain it when the key is absent.
pprint.pprint(rewriting_rag_chain.invoke({"question":multiple_valid_questions}))

In [None]:
# Same two-part question, with the context supplied explicitly.
# NOTE(review): relies on a `retrieval_chain` object already existing in the kernel
# session — confirm it is defined before this cell runs.
pprint.pprint(rewriting_rag_chain.invoke({"context": retrieval_chain.invoke(multiple_valid_questions), "question": multiple_valid_questions}))

In [None]:
# Stamp-duty question (rewritten phrasing) for the rewriting chain.
# NOTE(review): only "question" is passed; `rewriting_rag_chain` also needs a "context"
# value, so confirm the chain can obtain it when the key is absent.
pprint.pprint(rewriting_rag_chain.invoke({"question":q_rewritten}))

## Reranking

In [None]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60, top_n=3):
    """Fuse several ranked document lists with Reciprocal Rank Fusion (RRF).

    Each document receives ``1 / (rank + k)`` for every list it appears in,
    where ``rank`` is its 0-based position in that list; the contributions are
    summed per document.

    Args:
        results: one ranked list of documents per query.
        k: smoothing constant of the RRF formula; larger values flatten the
            difference between high and low ranks.
        top_n: number of fused results to return (default 3, the previously
            hard-coded cut-off). ``None`` returns every document.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by descending score.
    """
    # Fused score per unique document; the serialised form is the dictionary key
    # so that the same document found by different queries is merged.
    fused_scores = {}

    for docs in results:
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

    # Highest fused score first
    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    if top_n is not None:
        ranked = ranked[:top_n]

    # Deserialise only the documents that are actually returned
    return [(loads(doc_str), score) for doc_str, score in ranked]

# Retrieve k=20 candidates per rewritten query, then fuse the per-query rankings with RRF
reranking_retriever = vectorstore.as_retriever(search_kwargs={"k": 20})
retrieval_chain_rag_fusion = generate_queries | reranking_retriever.map() | reciprocal_rank_fusion
# NOTE: rebinds the notebook-level `docs` to the fused results for the reference question
docs = retrieval_chain_rag_fusion.invoke({"question": question})
docs

In [None]:
# NOTE: `template` is whatever the kernel currently holds under that name — at this
# point in a top-to-bottom run, the RAG template of the query-rewriting section.
prompt = ChatPromptTemplate.from_template(template)


def format_fused_docs(fused_results):
    """Join the page contents of ``(document, score)`` pairs with blank lines.

    The fusion step returns tuples; passing them to the prompt unformatted would
    insert their Python repr (scores and metadata included) as the context.
    """
    return "\n\n".join(doc.page_content for doc, _score in fused_results)


rerank_rag_chain = (
    {"context": retrieval_chain_rag_fusion | format_fused_docs,
     "question": itemgetter("question")}
    | prompt
    | model_llama
    | StrOutputParser()
)

pprint.pprint(question)
pprint.pprint(rerank_rag_chain.invoke({"question":question}))

In [None]:
# Out-of-scope probe for the reranking chain: print the question, then the answer
pprint.pprint(question_out_of_scope)
pprint.pprint(rerank_rag_chain.invoke({"question": question_out_of_scope}))

In [None]:
# Mixed in-scope / out-of-scope input for the reranking chain: print the question, then the answer
pprint.pprint(multiple_questions)
pprint.pprint(rerank_rag_chain.invoke({"question": multiple_questions}))

In [None]:
# Two-part question answered by the reranking chain
pprint.pprint(rerank_rag_chain.invoke({"question":multiple_valid_questions}))