In [None]:
import pandas as pd
import pprint
import transformers

In [None]:
# Location of the consolidated dataset (one document per CSV row).
filename_all_data_dict = "./Files/final_dataset.csv"

# Read every CSV line as data under explicit column names, then discard the
# row labelled 0 (presumably the file's own header line — TODO confirm).
# NOTE: the remaining index labels therefore start at 1, not 0.
data_df = pd.read_csv(
    filename_all_data_dict,
    header=None,
    names=['file', 'text'],
).drop(index=0)
data_df

In [None]:
# Collect the 'text' column into a plain Python list
text_list = data_df['text'].to_list()

# Show the first document
pprint.pprint(text_list[0])

In [None]:
from transformers import GPT2Tokenizer

# Load the tokenizer
# Use the function below to check token counts for whichever tokenizer we decide to use
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def count_tokens_in_documents(documents, token_limit=8192, tokenizer_obj=None):
    """
    Count the tokens of every document and flag those above a token limit.

    Args:
        documents (iterable of str): Documents (strings) to process.
        token_limit (int): Maximum allowed tokens per document. Defaults to 8192.
        tokenizer_obj: Optional tokenizer to use instead of the module-level
            ``tokenizer``. It must expose ``tokenize(text)`` or, failing that,
            ``encode(text)``; ``tokenize`` is preferred when both exist.

    Returns:
        tuple: ``(token_counts, count)`` where ``token_counts`` is a list with
        one dict per document (keys ``'document_index'`` (1-based),
        ``'token_count'`` and ``'message'``) and ``count`` is the number of
        documents whose token count exceeds ``token_limit``.
    """
    active_tokenizer = tokenizer if tokenizer_obj is None else tokenizer_obj
    # Support both tokenizer flavours: objects with `tokenize` and objects
    # that only offer `encode`.
    split_into_tokens = getattr(active_tokenizer, "tokenize", None) or active_tokenizer.encode

    token_counts = []
    count = 0

    # Document numbers in the messages are 1-based.
    for document_number, doc in enumerate(documents, start=1):
        num_tokens = len(split_into_tokens(doc))

        if num_tokens > token_limit:
            message = f"Document {document_number} has {num_tokens} tokens, exceeding the limit of {token_limit} tokens."
            count += 1
        else:
            message = f"Document {document_number} has {num_tokens} tokens, within the limit."

        token_counts.append({
            'document_index': document_number,
            'token_count': num_tokens,
            'message': message
        })

    return token_counts, count

# Run the token check over the 'text' column of the dataset
token_info, count = count_tokens_in_documents(data_df['text'])

# Summary line first, then one message per document
print(f"On a total of {len(data_df)} documents, we have {count} documents, that exceed the maximum token limit.")
for info in token_info:
    print(info['message'])


In [None]:
from openai import OpenAI
import os
import tiktoken

# Fail early with a readable message when the API key is missing. The former
# `os.environ[k] = os.getenv(k)` round-trip did nothing when the key was set
# and raised an opaque TypeError (environment values must be str) when it was not.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this notebook.")

client = OpenAI()
# NOTE: this rebinds the module-level `tokenizer`, which the token-count cell
# above bound to the GPT-2 tokenizer.
#tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")

def chunk_text(text, max_tokens=8000):
    """
    Split ``text`` into consecutive pieces of at most ``max_tokens`` tokens.

    The text is encoded with the module-level ``tokenizer``, the token list is
    cut into windows of ``max_tokens`` tokens and every window is decoded back
    into a string.

    Args:
        text (str): Text to split.
        max_tokens (int): Maximum number of tokens per chunk; must be positive.

    Returns:
        list of str: The chunks in their original order. Empty when ``text``
        encodes to no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive. A negative value used
            to return an empty list, silently dropping the whole text.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    tokens = tokenizer.encode(text)

    # One decoded string per window of `max_tokens` tokens.
    return [
        tokenizer.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]

def get_embedding_for_long_text(text, model="text-embedding-ada-002", max_tokens=8000):
    """
    Embed a text of arbitrary length by averaging the embeddings of its chunks.

    The text is split with ``chunk_text``; each chunk is embedded with one
    request through the module-level ``client``; the chunk vectors are then
    averaged component by component (unweighted mean).

    Args:
        text (str): Text to embed.
        model (str): Embedding model name passed to the API.
        max_tokens (int): Maximum tokens per chunk, forwarded to ``chunk_text``.

    Returns:
        list of float: The averaged embedding.

    Raises:
        ValueError: If ``text`` produces no chunks. Previously this returned an
            empty list, which only surfaced later as a shape error when the
            embeddings were stacked into a matrix.
    """
    chunks = chunk_text(text, max_tokens)
    if not chunks:
        raise ValueError("Cannot embed an empty text: it produced no token chunks.")

    embeddings = [
        client.embeddings.create(input=[chunk], model=model).data[0].embedding
        for chunk in chunks
    ]

    # zip(*embeddings) walks the vectors dimension by dimension.
    return [sum(component) / len(component) for component in zip(*embeddings)]

# Embed every document and store the vector next to its text
data_df['embedding'] = data_df['text'].apply(get_embedding_for_long_text)
data_df

In [None]:
def generate_text_with_embeddings(user_prompt, embedding_context, max_tokens=1024):
    """
    Ask the chat model to answer ``user_prompt`` given ``embedding_context``.

    Both values are interpolated into a single user message that is sent to
    "gpt-4" (temperature 0) through the module-level ``client``.

    Args:
        user_prompt: The user's question.
        embedding_context: Context placed in front of the question; it is
            formatted into the prompt with an f-string.
        max_tokens (int): Completion token limit forwarded to the API.

    Returns:
        The content of the first returned choice, or ``None`` when the request
        raised (the error is printed instead of propagated).
    """
    prompt = f"Contesto: {embedding_context}\n\n Prompt dell'user: {user_prompt}\n\n Risposta:"
    request = {
        "model": "gpt-4",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
        "max_tokens": max_tokens,
    }

    try:
        completion = client.chat.completions.create(**request)
        answer = completion.choices[0].message.content
    except Exception as e:
        # Best-effort by design: report the failure and hand back None.
        print(f"Error generating text: {str(e)}")
        return None

    return answer

In [None]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """
    Cosine similarity between two vectors.

    Args:
        vec1: First vector (any array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        float: ``dot(vec1, vec2) / (|vec1| * |vec2|)``. When either vector has
        zero norm the similarity is undefined; ``0.0`` is returned instead of
        dividing by zero and producing NaN.
    """
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0

    return np.dot(a, b) / denominator

def find_most_similar_embedding(user_input, embeddings_df, model="text-embedding-ada-002"):
    """
    Return the row of ``embeddings_df`` whose embedding is closest to the query.

    The query is embedded with ``get_embedding_for_long_text`` and compared to
    every value of the ``'embedding'`` column with ``cosine_similarity``.

    Args:
        user_input (str): Query text.
        embeddings_df (pandas.DataFrame): Frame with an ``'embedding'`` column.
        model (str): Embedding model used for the query.

    Returns:
        tuple: ``(row, similarity)`` — the best-matching row as a Series and
        its similarity score. On ties the first best row wins.

    Raises:
        ValueError: If ``embeddings_df`` has no rows.
    """
    if len(embeddings_df) == 0:
        raise ValueError("embeddings_df is empty: there is nothing to compare against.")

    user_embedding = get_embedding_for_long_text(user_input, model=model)

    similarities = [
        cosine_similarity(user_embedding, embedding)
        for embedding in embeddings_df['embedding']
    ]

    # Work with POSITIONS, not index labels: the frame's labels need not start
    # at 0 or be contiguous, so feeding a label to `.iloc` selects the wrong
    # row (or raises IndexError for the last one).
    best_position = max(range(len(similarities)), key=similarities.__getitem__)
    return embeddings_df.iloc[best_position], similarities[best_position]

user_input = "Cosa è un caricamento di massa per panthera? E come si può effettuare?"
most_similar_embedding, similarity_score = find_most_similar_embedding(user_input, data_df)

# Give the model the document TEXT as context. Formatting the whole row (a
# pandas Series that also carries the embedding vector) into the prompt sends
# the Series repr — which pandas may abbreviate — instead of the document.
# Only the first chunk is kept so that context + question + the 1024-token
# answer budget stay below the 8192-token limit this notebook assumes for gpt-4.
context_chunks = chunk_text(most_similar_embedding['text'], max_tokens=6000)
context_text = context_chunks[0] if context_chunks else ""

generated_text = generate_text_with_embeddings(user_input, context_text)
print("Given the input: ", user_input)
print("Generated Text:", generated_text)

In [None]:
import numpy as np
from sklearn.cluster import KMeans

# Stack the per-document embedding lists into a single 2-D array
matrix = np.vstack(data_df['embedding'].to_numpy())
n_clusters = 5

# Fixed random_state so the cluster assignment is repeatable
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42).fit(matrix)
data_df['cluster'] = kmeans.labels_

In [None]:
# Inspect the frame, which now also holds the 'cluster' column
data_df

In [None]:
from sklearn.manifold import TSNE
import matplotlib
import matplotlib.pyplot as plt

# Project the embedding matrix to two dimensions
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
vis_dims2 = tsne.fit_transform(matrix)

# First / second coordinate of every projected point
x = [point[0] for point in vis_dims2]
y = [point[1] for point in vis_dims2]

# One colour per cluster label (0..4)
for category, color in enumerate(["purple", "green", "red", "blue", 'gray']):
    in_cluster = data_df.cluster == category
    xs = np.array(x)[in_cluster]
    ys = np.array(y)[in_cluster]
    plt.scatter(xs, ys, color=color, alpha=0.3)

    # Mark the mean position of the cluster with an "x"
    avg_x = xs.mean()
    avg_y = ys.mean()
    plt.scatter(avg_x, avg_y, marker="x", color=color, s=100)
plt.title("Clusters identified visualized in language 2d using t-SNE")


In [None]:
from openai import OpenAI
import pandas as pd

# Initialize the OpenAI client
client = OpenAI()

# Sampling / truncation settings
rev_per_cluster = 5
max_tokens_per_review = 1024  # NOTE: despite the name, a CHARACTER limit (applied by string slicing)
max_completion_tokens = 512   # room reserved for the model's answer
context_window = 8192         # token budget shared by prompt and answer

# Count real tokens for the budget check instead of whitespace-separated words.
prompt_encoding = tiktoken.encoding_for_model("gpt-4")

for i in range(n_clusters):
    print(f"Cluster {i} Theme:", end=" ")

    cluster_rows = data_df[data_df.cluster == i]
    if cluster_rows.empty:
        print("(no documents in this cluster)")
        print("-" * 100)
        continue

    # A cluster may hold fewer than `rev_per_cluster` documents; asking
    # `sample` for more rows than exist raises ValueError.
    sample_size = min(rev_per_cluster, len(cluster_rows))
    sample_cluster_rows = cluster_rows.sample(sample_size, random_state=42)

    # Substring clean-up needs the `.str` accessor: `Series.replace` only
    # substitutes values that match the WHOLE cell, so it left the text untouched.
    reviews = (
        sample_cluster_rows['text']
        .str.replace("Title: ", "", regex=False)
        .str.replace("\n\nContent: ", ":  ", regex=False)
    )
    truncated_reviews = [review[:max_tokens_per_review] for review in reviews]

    reviews_text = "\n".join(truncated_reviews)

    messages = [
        {"role": "user", "content": f'Cosa hanno in comune i seguenti documenti?\n\nDocumenti Panthera:\n"""\n{reviews_text}\n"""\n\nTema:'}
    ]

    # The prompt already contains the documents, so count it once.
    num_tokens = len(prompt_encoding.encode(messages[0]['content'], disallowed_special=()))

    if num_tokens + max_completion_tokens > context_window:
        print(f"Warning: Message exceeds maximum token length: {num_tokens} tokens. Reduce review count or length.")
        continue

    response = client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        temperature=0,
        max_tokens=max_completion_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    print(response.choices[0].message.content.replace("\n", ""))

    # List the sampled documents: file name plus the first 100 characters.
    for file_name, text_preview in zip(sample_cluster_rows['file'], sample_cluster_rows['text'].str[:100]):
        print(file_name, ':', text_preview)

    print("-" * 100)

# Cross validation on cluster number and application on best clustering

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

matrix = np.vstack(data_df.embedding.values)

# Define the range of cluster counts to try
cluster_range = range(5, 16)  # Adjust this range based on your dataset
silhouette_scores = []

# Dedicated loop names keep the sweep from overwriting the module-level
# `n_clusters` / `kmeans`, which other cells read and which must stay in sync
# with the labels stored in data_df['cluster'].
for candidate_n_clusters in cluster_range:
    # Initialize and fit KMeans
    candidate_kmeans = KMeans(n_clusters=candidate_n_clusters, init='k-means++', random_state=42)
    candidate_kmeans.fit(matrix)

    # Compute silhouette score
    score = silhouette_score(matrix, candidate_kmeans.labels_)
    silhouette_scores.append(score)
    print(f'For n_clusters = {candidate_n_clusters}, silhouette score = {score:.4f}')

# Plot the silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.title('Silhouette Score vs. Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(cluster_range)
plt.grid()
plt.show()

# Determine the best n_clusters based on the highest silhouette score
best_n_clusters = cluster_range[np.argmax(silhouette_scores)]
print(f'Best number of clusters: {best_n_clusters}')

In [None]:
import numpy as np
from sklearn.cluster import KMeans

# Stack the per-document embedding lists into a single 2-D array
matrix = np.vstack(data_df['embedding'].to_numpy())
# NOTE(review): 7 is hard-coded; `best_n_clusters` computed by the silhouette
# cell is not used here — confirm this is intentional.
n_clusters = 7

# Re-cluster and overwrite the 'cluster' column with the new labels
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42).fit(matrix)
data_df['cluster'] = kmeans.labels_

In [None]:
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt

# Assuming 'matrix' and 'data_df' are already defined
# Perform t-SNE
tsne = TSNE(n_components= 2, perplexity=15, random_state=42, init="pca", learning_rate=200)
vis_dims2 = tsne.fit_transform(matrix)

# Extract the x and y coordinates from the t-SNE result
x = vis_dims2[:, 0]
y = vis_dims2[:, 1]

# Determine unique clusters in the data
unique_clusters = np.unique(data_df.cluster)
n_clusters = len(unique_clusters)  # Get the number of unique clusters

# `plt.cm.get_cmap` was deprecated and later removed from Matplotlib;
# `plt.get_cmap` takes the same (name, lut) arguments.
colors = plt.get_cmap("tab10", n_clusters)  # You can change 'tab10' to other colormaps

# Plot each cluster
for category in unique_clusters:
    in_cluster = (data_df.cluster == category).to_numpy()
    xs = x[in_cluster]
    ys = y[in_cluster]

    plt.scatter(xs, ys, color=colors(category), alpha=0.3)

    # Compute and plot the average position for each cluster
    avg_x = xs.mean()
    avg_y = ys.mean()
    plt.scatter(avg_x, avg_y, marker="x", color=colors(category), s=100)

plt.title("Clusters identified visualized in 2D using t-SNE")
plt.xlabel("t-SNE component 1")
plt.ylabel("t-SNE component 2")
plt.grid()
plt.show()


In [None]:
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # Import 3D plotting toolkit

# Assuming 'matrix' and 'data_df' are already defined
# Perform t-SNE
tsne = TSNE(n_components=3, perplexity=15, random_state=42, init="pca", learning_rate=200)
vis_dims3 = tsne.fit_transform(matrix)  # Get 3D embeddings

# Extract the x, y, and z coordinates from the t-SNE result
x = vis_dims3[:, 0]
y = vis_dims3[:, 1]
z = vis_dims3[:, 2]

# Determine unique clusters in the data
unique_clusters = np.unique(data_df.cluster)
n_clusters = len(unique_clusters)  # Get the number of unique clusters

# Create a color map for clusters.
# `plt.cm.get_cmap` was deprecated and later removed from Matplotlib;
# `plt.get_cmap` takes the same (name, lut) arguments.
colors = plt.get_cmap("tab10", n_clusters)  # You can change 'tab10' to other colormaps

# Create a 3D plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Plot each cluster
for category in unique_clusters:
    in_cluster = (data_df.cluster == category).to_numpy()
    xs = x[in_cluster]
    ys = y[in_cluster]
    zs = z[in_cluster]

    ax.scatter(xs, ys, zs, color=colors(category), alpha=0.5, label=f'Cluster {category}')

    # Compute and plot the average position for each cluster
    avg_x = xs.mean()
    avg_y = ys.mean()
    avg_z = zs.mean()
    ax.scatter(avg_x, avg_y, avg_z, marker="x", color=colors(category), s=100)

# Set titles and labels
ax.set_title("Clusters identified visualized in 3D using t-SNE")
ax.set_xlabel("t-SNE Component 1")
ax.set_ylabel("t-SNE Component 2")
ax.set_zlabel("t-SNE Component 3")
ax.legend()  # Show legend for clusters
plt.show()


In [None]:
from openai import OpenAI
import pandas as pd

# Initialize the OpenAI client
client = OpenAI()

# Sampling / truncation settings
rev_per_cluster = 5
max_tokens_per_review = 1024  # NOTE: despite the name, a CHARACTER limit (applied by string slicing)
max_completion_tokens = 512   # room reserved for the model's answer
context_window = 8192         # token budget shared by prompt and answer

# Count real tokens for the budget check instead of whitespace-separated words.
prompt_encoding = tiktoken.encoding_for_model("gpt-4")

for i in range(n_clusters):
    print(f"Cluster {i} Theme:", end=" ")

    cluster_rows = data_df[data_df.cluster == i]
    if cluster_rows.empty:
        print("(no documents in this cluster)")
        print("-" * 100)
        continue

    # A cluster may hold fewer than `rev_per_cluster` documents; asking
    # `sample` for more rows than exist raises ValueError.
    sample_size = min(rev_per_cluster, len(cluster_rows))
    sample_cluster_rows = cluster_rows.sample(sample_size, random_state=42)

    # Substring clean-up needs the `.str` accessor: `Series.replace` only
    # substitutes values that match the WHOLE cell, so it left the text untouched.
    reviews = (
        sample_cluster_rows['text']
        .str.replace("Title: ", "", regex=False)
        .str.replace("\n\nContent: ", ":  ", regex=False)
    )
    truncated_reviews = [review[:max_tokens_per_review] for review in reviews]

    reviews_text = "\n".join(truncated_reviews)

    messages = [
        {"role": "user", "content": f'Cosa hanno in comune i seguenti documenti?\n\nDocumenti Panthera:\n"""\n{reviews_text}\n"""\n\nTema:'}
    ]

    # The prompt already contains the documents, so count it once.
    num_tokens = len(prompt_encoding.encode(messages[0]['content'], disallowed_special=()))

    if num_tokens + max_completion_tokens > context_window:
        print(f"Warning: Message exceeds maximum token length: {num_tokens} tokens. Reduce review count or length.")
        continue

    response = client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        temperature=0,
        max_tokens=max_completion_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    print(response.choices[0].message.content.replace("\n", ""))

    # List the sampled documents: file name plus the first 100 characters.
    for file_name, text_preview in zip(sample_cluster_rows['file'], sample_cluster_rows['text'].str[:100]):
        print(file_name, ':', text_preview)

    print("-" * 100)

In [None]:
# Settings
rev_per_cluster = 5  # Reviews to sample per cluster
max_tokens_per_review = 1024  # NOTE: despite the name, a CHARACTER limit (applied by string slicing)
max_tokens_per_prompt = 8192  # GPT-4 max token limit
max_completion_tokens = 512   # room reserved for the model's answer
separator = "\n" + "-" * 100 + "\n\n"

# Count real tokens for the budget check instead of whitespace-separated words.
prompt_encoding = tiktoken.encoding_for_model("gpt-4")

# Report pieces are collected in a list and joined once at the end.
output_parts = []

# Iterate over clusters
for i in range(n_clusters):
    output_parts.append(f"### Cluster {i} Theme:\n")

    all_cluster_rows = data_df[data_df.cluster == i]
    if all_cluster_rows.empty:
        output_parts.append("(no documents in this cluster)\n" + separator)
        continue

    # A cluster may hold fewer than `rev_per_cluster` documents; asking
    # `sample` for more rows than exist raises ValueError.
    sample_size = min(rev_per_cluster, len(all_cluster_rows))

    # Substring clean-up needs the `.str` accessor: `Series.replace` only
    # substitutes values that match the WHOLE cell, so it left the text untouched.
    sampled_reviews = (
        all_cluster_rows['text']
        .sample(sample_size, random_state=42)
        .str.replace("Title: ", "", regex=False)
        .str.replace("\n\nContent: ", ":  ", regex=False)
    )

    # Truncate every sampled document to the per-document character limit
    truncated_reviews = [review[:max_tokens_per_review] for review in sampled_reviews]
    reviews_text = "\n".join(truncated_reviews)

    # Prepare the GPT prompt
    messages = [
        {
            "role": "user",
            "content": f'Cosa hanno in comune i seguenti documenti?\n\nDocumenti Panthera:\n"""\n{reviews_text}\n"""\n\nTema:'
        }
    ]

    # The prompt already contains the documents, so count it once.
    num_tokens = len(prompt_encoding.encode(messages[0]['content'], disallowed_special=()))

    if num_tokens + max_completion_tokens > max_tokens_per_prompt:
        print(f"Warning: Message exceeds maximum token length: {num_tokens} tokens. Reduce review count or length.")
        # Close the section so the report does not end up with a dangling header.
        output_parts.append("(theme skipped: prompt too long)\n" + separator)
        continue

    # Get theme description using GPT-4
    response = client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        temperature=0,
        max_tokens=max_completion_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    theme_description = response.choices[0].message.content.strip().replace("\n", " ")
    output_parts.append(theme_description + "\n\n")

    output_parts.append("#### Document Titles (All Documents in Cluster):\n")
    output_parts.extend(f"- {title}\n" for title in all_cluster_rows.file.values)

    output_parts.append(separator)

output_text = "".join(output_parts)

# Write the output text to a file. The encoding is explicit because the report
# contains Italian text and the platform default encoding is not always UTF-8.
with open('clustered_documents_summary.txt', 'w', encoding='utf-8') as f:
    f.write(output_text)

print("Document created: 'clustered_documents_summary.txt'")

# Trials

In [None]:
from datasets import Dataset
# Wrap the 'text' column of the frame in a `datasets.Dataset` and display it
dataset = Dataset.from_pandas(data_df[['text']])
dataset

In [None]:
from transformers import MBartForConditionalGeneration, MBartTokenizer, MBart50TokenizerFast

# Load MBart model and tokenizer
model_name = 'facebook/mbart-large-50-many-to-many-mmt'
#"NousResearch/Llama-3.2-1B"

# mBART-50 checkpoints come with their own tokenizer class; `MBartTokenizer`
# targets the original 25-language mBART and formats language codes differently.
# NOTE: this rebinds the module-level `tokenizer` that `chunk_text` reads —
# re-run the tiktoken cell before embedding more text.
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Plain list of document texts, in the row order of data_df.
# NOTE(review): `docs` is rebound to other objects by later cells
# (e.g. `loader.load()`), so this list is only valid until then.
docs = data_df['text'].tolist()

from sentence_transformers import SentenceTransformer
import faiss

# Load Sentence Transformer model
retriever_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Generate embeddings for all the documents (the texts from data_df['text'])
doc_embeddings = retriever_model.encode(docs)

# Build a FAISS index for fast similarity search
index = faiss.IndexFlatL2(doc_embeddings.shape[1])  # Use L2 distance
index.add(doc_embeddings)  # Add the document embeddings to the index

In [None]:
def retrieve_docs(query, top_k=5):
    """
    Return the texts and file names of the documents closest to ``query``.

    The query is encoded with the module-level ``retriever_model`` and looked
    up in the module-level FAISS ``index``; the resulting positions are
    resolved against ``data_df``.

    Args:
        query (str): Search query.
        top_k (int): Maximum number of documents to return.

    Returns:
        tuple: ``(retrieved_texts, retrieved_files)`` — two lists of equal
        length, ordered as returned by the index search. They can be shorter
        than ``top_k`` when the index holds fewer documents.
    """
    # Encode the query using the same model as the indexed documents
    query_embedding = retriever_model.encode([query])

    _distances, indices = index.search(query_embedding, top_k)

    # FAISS fills missing results with -1; used as a Python index that would
    # silently select the LAST document, so such slots are dropped.
    hit_positions = [int(position) for position in indices[0] if position >= 0]

    # Resolve positions against data_df rather than the global `docs` list,
    # which later cells rebind to unrelated objects.
    hits = data_df.iloc[hit_positions]
    return hits['text'].tolist(), hits['file'].tolist()

# Example usage: retrieve the documents closest to a sample question and
# print their file names and contents.
query = "Come funziona il fast update in Panthera?"
retrieved_texts, retrieved_files = retrieve_docs(query)
pprint.pprint(f"Documenti rilevanti: {retrieved_files}")
pprint.pprint(f"Contenuto estratto: {retrieved_texts}")

In [None]:
def generate_answer(query, retrieved_texts):
    """
    Generate a response from the query and the retrieved documents.

    Uses the module-level ``tokenizer`` and ``model`` with Italian ("it_IT")
    as both source and forced target language.

    Args:
        query (str): The user question.
        retrieved_texts (list of str): Texts returned by the retriever. All of
            them are concatenated after the query; the tokenizer then
            truncates the combined input to 512 tokens.

    Returns:
        str: The decoded first generated sequence, without special tokens.
    """
    # Combine the user query and every retrieved document
    input_text = query + " " + " ".join(retrieved_texts)

    # Tokenize (truncated to 512 tokens)
    tokenizer.src_lang = "it_IT"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Beam search. The former top_k / top_p / temperature arguments only
    # apply when sampling is enabled, which it never was, so they were
    # ignored and have been removed.
    output = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id["it_IT"],
        max_length=100,
        num_beams=2,
        length_penalty=2.0,
        no_repeat_ngram_size=3,
    )

    # Decode and return the generated response
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage: answer the sample query from the documents retrieved above
answer = generate_answer(query, retrieved_texts)
pprint.pprint(f"Risposta: {answer}")


# RAG on a single cluster

In [None]:
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# Both API keys must already be present in the environment. The former
# `os.environ[k] = os.getenv(k)` round-trip did nothing when a key was set and
# raised an opaque TypeError (environment values must be str) when it was not.
for required_key in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY'):
    if not os.getenv(required_key):
        raise RuntimeError(f"{required_key} is not set; export it before running this notebook.")

In [None]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [None]:
from langchain_community.document_loaders import DataFrameLoader

# Rows of cluster 0, reduced to the 'text' and 'file' columns
cluster = data_df.loc[data_df.cluster == 0, ['text', 'file']].copy()

# 'text' becomes the page content of each loaded document
loader = DataFrameLoader(cluster, page_content_column="text")
docs = loader.load()

In [None]:
# Split the loaded documents into overlapping chunks
# (chunk_size / chunk_overlap are presumably measured in characters — TODO confirm)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=600)
splits = text_splitter.split_documents(docs)
pprint.pprint(splits[0])

In [None]:
# Embed the chunks with OpenAI embeddings and store them in a Chroma vector store
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

In [None]:
# Index: expose the vector store as a retriever configured with k=4
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

In [None]:
question = "Come funziona la gestione delle finestre modali in Chrome?"
# Retrievers are Runnables (this notebook already pipes `retriever` into
# chains), so `invoke` replaces the deprecated `get_relevant_documents`.
docs = retriever.invoke(question)
docs

In [None]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Prompt: answer only from the supplied documents ({context}) and fall back to
# a "contact the helpdesk" message when the answer is unknown.
template = """
Comportati come un assistente che risponde alle domande del cliente. 
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se non sai rispondere, di soltanto che non detieni queste informationi e che è necessario contattare l'assistenza del helpdesk.

Domanda relativa al software Panthera: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt

In [None]:
# LLM used by the chain below (temperature 0)
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Post-processing
def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Chain: the input question is sent to the retriever (whose documents are
# flattened by `format_docs`) and passed through unchanged as {question};
# the filled prompt goes to the LLM and the reply is parsed to a string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Question
response_text = rag_chain.invoke(question)
pprint.pprint(response_text)

In [None]:
# Check how the chain reacts to a question that mixes in an unrelated topic
question_out_of_scope = "Come posso trovare informazioni su Giulio Cesare dalle finestre modali di Chrome?"
pprint.pprint(rag_chain.invoke(question_out_of_scope))

# Advanced RAG pipeline

## Query translation - Multi query - rewriting of the query in 5 different ways

In [None]:
from langchain.prompts import ChatPromptTemplate

# Multi-query prompt: asks the model for five Italian rewrites of the user
# question, restricted to ERP (Panthera) documentation / computer science.
template = """
You are an AI language model assistant. Your task is to generate five different versions, in Italian, of the given user question to retrieve relevant documents from a vector database. 
The context of our application is related to Enterprise Resource Planning (ERP) software's technical manuals (specifically Panthera software) or, more generally, topics related to computer science, including system configuration,
module functionality, troubleshooting, and implementation guidelines.
Your goal is to generate multiple perspectives on the question to help the user overcome limitations of distance-based similarity search while focusing strictly on the context of ERP software documentation
or relevant computer science topics.
In cases where the user provides multiple questions, only respond to the relevant ones related to ERP documentation or computer science. Provide these alternative questions separated by newlines.
Before generating alternatives, ensure the user's question is related to ERP technical documentation or relevant computer science topics. 
If any of the questions are out of scope or irrelevant to ERP manuals or computer science topics, disregard them entirely. 
You don't need to ignore all the questions, but only the ones that are out of scope.
Provide the created alternative questions separated by newlines, and structure the output to contain only the rewritten questions in a bullet list.

Original question: {question}
"""

prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

def _clean_generated_queries(raw_output):
    """Split the model output into non-empty queries, without bullet markers."""
    queries = []
    for line in raw_output.split("\n"):
        # The prompt asks for a bullet list: strip the marker and drop blank
        # lines so neither reaches the retriever as (part of) a query.
        candidate = line.strip().lstrip("-*•").strip()
        if candidate:
            queries.append(candidate)
    return queries


# Prompt -> LLM -> plain string -> list of rewritten questions
generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | _clean_generated_queries
)

In [None]:
# Show the rewritten versions of the current question
generate_queries.invoke({"question": question})

In [None]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """
    Merge several lists of retrieved documents into one list without duplicates.

    Documents are compared through their serialized form (``dumps``). The
    first occurrence of every document is kept and the encounter order is
    preserved, so the result is deterministic across runs (a ``set`` would
    return the documents in arbitrary order).

    Args:
        documents: List of lists of documents, one inner list per query.

    Returns:
        list: The unique documents, deserialized with ``loads``.
    """
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys de-duplicates while keeping insertion order.
    unique_serialized = dict.fromkeys(serialized)
    return [loads(doc) for doc in unique_serialized]

# Retrieve: generate the query variants, run the retriever once per variant
# (`retriever.map()`), then merge the results with `get_unique_union`.
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})
docs

In [None]:
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# RAG
# Prompt: answer only from the retrieved documents, refuse questions that are
# unrelated to the documentation, and point to the helpdesk when the answer
# is unknown.
template = """
Comportati come un assistente che risponde alle domande del cliente. 
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuaziore l'azione desiderata. 

Se la domanda non è inerente al contesto della documentazione, rispondi soltanto che la domanda non è pertinente al contesto. 
Non rispondere mai alle domande non inerenti alla documentazione. Rispondere solamente che: "La domanda non è pertinente in questo contesto."

Se non sai rispondere alle domande relative alla documentazione, rispondi soltanto che non detieni queste informationi e che è necessario contattare l'assistenza del helpdesk.
In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto.

Domanda relativa al software Panthera: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(temperature=0)

# {context} receives the output of the multi-query retrieval chain as-is;
# {question} is taken from the "question" key of the input dict.
final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

pprint.pprint(final_rag_chain.invoke({"question":question}))

In [None]:
# Question unrelated to the documentation
question_out_of_scope = "Entro quando devo liquidare l'iva in Italia?"
pprint.pprint(final_rag_chain.invoke({"question":question_out_of_scope}))

In [None]:
# Question about the documentation
question1 = "Come funziona la gestione delle finestre modali in Chrome?"
pprint.pprint(final_rag_chain.invoke({"question" : question1}))

In [None]:
# Two questions in one request: the first about the documentation, the second unrelated
question_out_of_scope = "Come funziona la gestione delle finestre modali in Chrome? Chi è Giulio Cesare? "
pprint.pprint(final_rag_chain.invoke({"question":question_out_of_scope}))

In [None]:
# Two questions in a single request
questions_multiple = "Come funziona la gestione delle finestre modali in Chrome? E come posso fare il rinnovo dei certificati dello sdi? "
pprint.pprint(final_rag_chain.invoke({"question":questions_multiple}))

## Query translation - RAG Fusion - rerank the document

In [None]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """
    Fuse several ranked document lists with Reciprocal Rank Fusion (RRF).

    Every appearance of a document at 0-based position ``rank`` in one of the
    lists adds ``1 / (rank + k)`` to that document's score. Documents are
    identified by their serialized form (``dumps``).

    Args:
        results: List of ranked document lists, one per query.
        k (int): Constant of the RRF formula; larger values flatten the
            difference between ranks.

    Returns:
        list of tuple: ``(document, fused_score)`` pairs sorted by descending
        score; documents with equal scores keep first-seen order.
    """
    fused_scores = {}

    for ranked_docs in results:
        for rank, doc in enumerate(ranked_docs):
            # The serialized document is the dictionary key
            doc_str = dumps(doc)
            fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

    ordered = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(loads(doc_str), score) for doc_str, score in ordered]

# Same multi-query retrieval as before, but the per-query results are fused
# and re-ranked with reciprocal rank fusion.
retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
docs = retrieval_chain_rag_fusion.invoke({"question": question})
len(docs)

In [None]:
# NOTE: `template` is whatever string an earlier cell assigned last
# (the multi-query RAG prompt when the notebook is run top to bottom).
prompt = ChatPromptTemplate.from_template(template)

# {context} receives the (document, score) pairs produced by the RAG-fusion
# retrieval chain as-is.
rerank_rag_chain = (
    {"context": retrieval_chain_rag_fusion, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

pprint.pprint(rerank_rag_chain.invoke({"question":question}))

# Query translation - Decomposition and recursive answering

In [None]:
from langchain.prompts import ChatPromptTemplate

# Decomposition prompt: asks for three sub-questions of the input question
# that can be answered independently in the context of Panthera.
template = """Sei un assistente utile che genera più sotto-domande relative a una domanda di input. \n
L'obiettivo è suddividere l'input in un insieme di sotto-problemi che possono essere risolti isolatamente, nel contesto del software Panthera. \n
Genera più query di ricerca relative a: {question} \n
Output (3 query):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# LLM
llm = ChatOpenAI(temperature=0)

# Chain: prompt -> LLM -> plain string -> list of sub-questions.
# The completion is split on newlines; blank lines are discarded so that no
# empty sub-question is later sent to the retriever or the model.
generate_queries_decomposition = (
    prompt_decomposition
    | llm
    | StrOutputParser()
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

# Run
questions = generate_queries_decomposition.invoke({"question":question})

In [None]:
# Original question followed by the generated sub-questions
pprint.pprint(question)
pprint.pprint(questions)

In [None]:
# Prompt for recursive answering: the question, the question/answer pairs
# collected so far ({q_a_pairs}) and the retrieved context.
template = """Ecco la domanda a cui devi rispondere:

\n --- \n {question} \n --- \n

Ecco le eventuali coppie di domande + risposte disponibili:

\n --- \n {q_a_pairs} \n --- \n

Ecco il contesto aggiuntivo rilevante per la domanda:

\n --- \n {context} \n --- \n

Usa il contesto sopra e le coppie di domande + risposte per rispondere alla domanda: \n {question}
"""

decomposition_prompt = ChatPromptTemplate.from_template(template)

In [None]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

def format_qa_pair(question, answer):
    """Format one question/answer pair as two labelled lines, stripped of outer whitespace."""
    return f"Domanda: {question}\nRisposta: {answer}\n\n".strip()

# llm
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# The chain does not depend on the loop variable, so it is built once.
# {context} is retrieved for the current sub-question; {q_a_pairs} carries the
# answers accumulated from the previous sub-questions.
rag_chain = (
    {"context": itemgetter("question") | retriever,
     "question": itemgetter("question"),
     "q_a_pairs": itemgetter("q_a_pairs")}
    | decomposition_prompt
    | llm
    | StrOutputParser())

q_a_pairs = ""
answer = None  # stays None when there is no sub-question to answer
for q in questions:
    # Skip blank entries left over from splitting the model output on newlines
    if not q.strip():
        continue

    answer = rag_chain.invoke({"question":q,"q_a_pairs":q_a_pairs})
    q_a_pair = format_qa_pair(q,answer)
    q_a_pairs = q_a_pairs + "\n---\n"+  q_a_pair

# The answer to the last sub-question, produced with all earlier Q/A pairs as context
pprint.pprint(answer)

Non-recursive answering

In [None]:
def retrieve_and_rag(question,prompt_rag,sub_question_generator_chain):
    """
    Answer every sub-question of ``question`` with its own retrieval + generation step.

    Uses the module-level ``retriever`` and ``llm``.

    Args:
        question (str): The original user question.
        prompt_rag: Prompt with ``{context}`` and ``{question}`` placeholders.
        sub_question_generator_chain: Chain that maps ``{"question": ...}`` to
            a list of sub-question strings.

    Returns:
        tuple: ``(rag_results, sub_questions)`` — the answers and the
        sub-questions they belong to, aligned by position. Blank
        sub-questions are dropped from both lists.
    """
    generated = sub_question_generator_chain.invoke({"question":question})
    # Blank lines in the generator output are not questions.
    sub_questions = [sub_question for sub_question in generated if sub_question.strip()]

    # The answering chain is the same for every sub-question: build it once.
    answer_chain = prompt_rag | llm | StrOutputParser()

    rag_results = []
    for sub_question in sub_questions:
        # `invoke` replaces the deprecated `get_relevant_documents`
        retrieved_docs = retriever.invoke(sub_question)
        answer = answer_chain.invoke({"context": retrieved_docs,
                                      "question": sub_question})
        rag_results.append(answer)

    return rag_results,sub_questions

# Prompt used to answer each sub-question from its retrieved documents
template = """
Comportati come un assistente che risponde alle domande del cliente. 
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se non sai rispondere, di soltanto che non detieni queste informationi e che è necessario contattare l'assistenza del helpdesk.

Domanda relativa al software Panthera: {question}
"""
prompt_rag = ChatPromptTemplate.from_template(template)
# Generate the sub-questions for `question` and answer each one independently.
# NOTE: this overwrites the module-level `questions` with the new sub-questions.
answers, questions = retrieve_and_rag(question, prompt_rag, generate_queries_decomposition)

In [None]:
def format_qa_pairs(questions, answers):
    """Format numbered question/answer pairs (starting at 1) as one text block, stripped of outer whitespace."""
    numbered_pairs = enumerate(zip(questions, answers), start=1)
    blocks = [
        f"Domanda {i}: {question}\nRisposta {i}: {answer}\n\n"
        for i, (question, answer) in numbered_pairs
    ]
    return "".join(blocks).strip()

# All sub-question answers, formatted as one text block
context = format_qa_pairs(questions, answers)

# Prompt: synthesize the final answer from the sub-question Q/A pairs
template = """Ecco un set di domande e risposte sull'argomento:

{context}

Usale per rispondere alla domanda: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# NOTE: this rebinds `final_rag_chain`, which an earlier cell defined as the
# multi-query RAG chain; here it takes "context" and "question" directly.
final_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

pprint.pprint(final_rag_chain.invoke({"context":context,"question":question}))

Other query-translation techniques that could be implemented are:

- Step-back prompting: given a few-shot set of examples showing how to abstract a query, the model rewrites the user's question in a more abstract form, broadening both the retrieved context and the response. However, I think this only confuses the model and does not give the customer an on-point answer.
    
- HyDE: a carefully prompted model is asked to write a hypothetical document that answers the question; this hypothetical document is then compared with the documents in the dataset to retrieve the most similar ones, and the response is generated from those. However, because the subject of interest is very specific to our software, creating the hypothetical document is difficult and could confuse the model.

Other possible useful extensions:

- Logical routing: store each cluster in its own vector store and route a user query to the most relevant data source (one per cluster).

- Semantic routing: choose among cluster-specific prompts based on the similarity between the query and each prompt.

- Query structuring: use metadata (e.g. the Panthera version) together with different vector stores; the metadata could be applied only when the user specifies it explicitly, or it could be extracted from every user request.

# Indexing

## Indexing - Multi representation Indexing

In [None]:
from langchain_community.document_loaders import DataFrameLoader

# Rows of cluster 0, reduced to the 'text' and 'file' columns
cluster = data_df.loc[data_df.cluster == 0, ['text', 'file']].copy()

# 'text' becomes the page content of each loaded document
loader = DataFrameLoader(cluster, page_content_column="text")
docs = loader.load()

In [None]:
# Summarization chain: take a document's page content, ask the model for a
# detailed summary and return it as a string.
chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Riassumi dettagliatamente i seguenti documenti:\n\n{doc}")
    | ChatOpenAI(model="gpt-3.5-turbo",max_retries=0)
    | StrOutputParser()
)

# One summary per document, with at most 5 concurrent requests
summaries = chain.batch(docs, {"max_concurrency": 5})

In [None]:
import uuid
from langchain_core.documents import Document
from langchain.storage import InMemoryByteStore
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.retrievers.multi_vector import MultiVectorRetriever

# Vector store that indexes the summaries
vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=OpenAIEmbeddings(),
)

# Storage layer for the full parent documents
store = InMemoryByteStore()
id_key = "doc_id"

# Retriever tying both stores together through `id_key`
retriever = MultiVectorRetriever(vectorstore=vectorstore, byte_store=store, id_key=id_key)

# One fresh UUID per parent document
doc_ids = [str(uuid.uuid4()) for _ in docs]

# Each summary carries the id of the document it was generated from
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(doc_ids, summaries)
]

# Add the summaries to the vector store and the parent documents to the doc store
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [None]:
# Search the summaries directly.
# NOTE(review): `query` is the sample question assigned in the FAISS retrieval
# example much earlier in the notebook, not `question` — confirm this is intended.
sub_docs = vectorstore.similarity_search(query,k=3)
sub_docs