# Imports

In [None]:
import os
import warnings

# Enable LangSmith tracing for the LangChain calls made in this notebook.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# Both API keys must already be exported in the environment that launched the kernel.
# The previous self-assignment (os.environ[k] = os.getenv(k)) changed nothing when the key
# was set and raised "TypeError: str expected, not NoneType" when it was not, so check
# explicitly and name the missing variables instead.
_missing_keys = [key for key in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY') if not os.getenv(key)]
if _missing_keys:
    warnings.warn(
        "Missing environment variable(s): " + ", ".join(_missing_keys)
        + ". Export them before starting the kernel.",
        stacklevel=2,
    )

In [None]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Load data

In [None]:
import pandas as pd
import pprint
import transformers

In [None]:
# Path of the CSV file that holds the dataset.
filename_all_data_dict = "./Files/final_dataset.csv"

# Read the CSV without treating any row as a header and force the column names to
# 'file' and 'text'; the first row read this way is then dropped
# (presumably it is the CSV's own header line — TODO confirm).
data_df = pd.read_csv(filename_all_data_dict, names = ['file', 'text'], header = None)
data_df = data_df.drop(index = 0)
data_df

# RAG

In [None]:
from langchain_community.document_loaders import DataFrameLoader

# Build LangChain documents from the DataFrame, taking the page content from the 'text' column.
loader = DataFrameLoader(data_df, page_content_column="text")
docs_data = loader.load()
# Show the first loaded document as a sanity check.
docs_data[0]

# Choosing the dimension of chunks

In [None]:
import tiktoken
import matplotlib.pyplot as plt

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens produced for `string` by the tiktoken encoding named `encoding_name`."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))

# Calculate the number of tokens for each document
docs_text = [d.page_content for d in docs_data]
counts = [num_tokens_from_string(d, "cl100k_base") for d in docs_text]
print(len(counts))

# Plotting the histogram of token counts
plt.figure(figsize=(10, 6))
plt.hist(counts, bins=30, color="blue", edgecolor="black", alpha=0.7)
plt.title("Histogram of Token Counts")
plt.xlabel("Token Count")
plt.ylabel("Frequency")
plt.grid(axis="y")

# Display the histogram (the previous bare `plt.show` only referenced the function without calling it)
plt.show()

Most documents have fewer than 10000 tokens. 

We could choose the chunk size based on the majority of documents, for which a chunk size between 2000 and 5000 works well: this way most documents are chunked into manageable sizes without excessive fragmentation. We could use a chunk size of 4096, an input size (in tokens) accepted by some good models (e.g. GPT-3.5). Note, however, that `RecursiveCharacterTextSplitter` measures `chunk_size` in characters by default, so the value 4096 used below is a character count, not a token count. Also, given that this is technical documentation, it is better to carry some longer context over from one chunk to the next: we first try with an overlap of 600, which can then be extended if the results are not satisfying.

In [None]:
# Split
# Possible improvements - future hypertuning of chunk_size and chunk_overlap to improve results, and trying different splitters
# NOTE(review): the splitter's default length function presumably counts characters, so
# chunk_size=4096 would be a character count rather than a token count — confirm this is intended.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=600)
splits = text_splitter.split_documents(docs_data)
# Inspect the first six chunks and the total number of chunks
pprint.pprint(splits[0:6])
pprint.pprint(len(splits))

# Basic RAG

In [None]:
# Embed - if we need more efficiency try different databases in which to save the documents and index them
# Possible improvements - try different embeddings
embd = OpenAIEmbeddings()
from langchain_community.vectorstores import FAISS
# Embed every chunk, build the FAISS store in memory, then save it to the local "faiss_index" folder
vectorstore = FAISS.from_documents(documents=splits, embedding=embd)
vectorstore.save_local("faiss_index")

In [None]:
# Reload the store saved above from the "faiss_index" folder, using the same embedding object.
# NOTE(review): allow_dangerous_deserialization=True presumably enables pickle-based loading —
# only load index folders created by this notebook; confirm against the FAISS loader docs.
new_vectorstore = FAISS.load_local("faiss_index", embd, allow_dangerous_deserialization=True)
new_vectorstore

In [None]:
# Size of the reloaded index (its `ntotal` attribute)
new_vectorstore.index.ntotal

In [None]:
# Index
# Retriever over the reloaded store, configured with k=4 through search_kwargs
# Data driven changes - tune 'k', the number of retrieved documents given the query
retriever = new_vectorstore.as_retriever(search_kwargs={"k": 4})

In [None]:
# Two sample questions: only the last assignment takes effect (swap the lines to try the other one).
question = "Come funziona la gestione delle finestre modali in Chrome?"
question = "Quando mi conviene gestire un articolo a PSO rispetto a pianificazione?"
# The retriever is used as a Runnable elsewhere in this notebook (piped with `|`), so call it
# through `invoke` instead of the deprecated `get_relevant_documents`.
docs = retriever.invoke(question)
docs

In [None]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Prompt (in Italian): act as a customer assistant, answer only from the documents injected as
# {context}, be concise and step by step, and when several questions are asked answer only the
# ones covered by the documentation. {question} receives the user's question.
template = """
Comportati come un assistente che risponde alle domande del cliente.
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.
Quando spieghi che cosa è o cosa significa un certo elemento richiesto, non parlarne come se fosse un problema.

In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto.

Domanda relativa al software Panthera: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt

In [None]:
# LLM - the used model (gpt-3.5-turbo with temperature 0)
model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
# TODO: max_tokens is not set here

# Post-processing
def format_docs(docs):
    """Concatenate the `page_content` of every document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Chain: the incoming question is sent to the retriever (whose documents are joined by
# format_docs into the prompt's {context}) and passed through unchanged as {question};
# the filled prompt goes to the model and the reply is parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

# Question
pprint.pprint(rag_chain.invoke(question))

In [None]:
# Probe: a question unrelated to the documentation (the prompt asks to answer only from the documents)
question_out_of_scope = "Quando è morto Giulio Cesare?"
pprint.pprint(rag_chain.invoke(question_out_of_scope))

In [None]:
# Probe: one documentation question combined with one unrelated question in the same input
multiple_questions = "Come funziona la gestione delle finestre modali in Chrome? Chi è Giulio Cesare?"
pprint.pprint(rag_chain.invoke(multiple_questions))

In [None]:
# Probe: two questions in the same input, both about invoices
multiple_valid_questions = "Cosa significa che una fattura è in mancata consegna? Il cliente ha ricevuto la fattura?"
pprint.pprint(rag_chain.invoke(multiple_valid_questions))

In [None]:
# Probe: a question that names a specific release (fast update 5.0.03)
fastupdate_question = "Che novità ci sono relative al workflow nel fast update 5.0.03?"
pprint.pprint(rag_chain.invoke(fastupdate_question))

In [None]:
# Probe: a troubleshooting question asking both for the cause and for the fix
q = "Perché la nota di credito non sta aggiungendo più il bollo e come risolvere questo problema?"
pprint.pprint(rag_chain.invoke(q))

# Advanced RAG

## Query rewriting

In [None]:
from langchain.prompts import ChatPromptTemplate

# Query-rewriting prompt: asks the model for five Italian rephrasings of the user question,
# restricted to ERP (Panthera) documentation or computer-science topics, ignoring out-of-scope
# questions, and returned as a bullet list with one rewritten question per line.
# Note: this rebinds the notebook-level name `template`.
template = """
You are an AI language model assistant. Your task is to generate five different versions, in Italian, of the given user question to retrieve relevant documents from a vector database. 
The context of our application is related to Enterprise Resource Planning (ERP) software's technical manuals (specifically Panthera software) or, more generally, topics related to computer science, including system configuration,
module functionality, troubleshooting, and implementation guidelines.
Your goal is to generate multiple perspectives on the question to help the user overcome limitations of distance-based similarity search while focusing strictly on the context of ERP software documentation
or relevant computer science topics.
In cases where the user provides multiple questions, only respond to the relevant ones related to ERP documentation or computer science. Provide these alternative questions separated by newlines.
Before generating alternatives, ensure the user's question is related to ERP technical documentation or relevant computer science topics. 
If any of the questions are out of scope or irrelevant to ERP manuals or computer science topics, disregard them entirely. 
You don't need to ignore all the questions, but only the ones that are out of scope.
Provide the created alternative questions separated by newlines, and structure the output to contain only the rewritten questions in a bullet list.

Original question: {question}
"""

prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

def _split_queries(raw_output: str) -> list[str]:
    """Turn the model's bullet list into clean query strings.

    The prompt asks for a bullet list separated by newlines, so a plain split on "\n"
    yields bullet markers and possibly empty strings. Blank lines are dropped and leading
    bullet markers ("-", "*", "•") are removed, so that no empty or marker-prefixed
    string is sent to the retriever.

    Parameters:
    - raw_output: the text returned by the model.

    Returns:
    - The non-empty rewritten questions, in the order the model produced them.
    """
    cleaned = (line.strip().lstrip("-*•").strip() for line in raw_output.split("\n"))
    return [query for query in cleaned if query]


# Rewrite chain: prompt -> model -> plain string -> list of rewritten questions
generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | _split_queries
)

In [None]:
# Inspect the rewritten questions generated for the current sample question
generate_queries.invoke({"question": question})

In [None]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """
    Return the de-duplicated union of several lists of retrieved documents.

    Parameters:
    - documents: a list of document lists (one list per generated query).

    Returns:
    - The distinct documents, in order of first appearance across the flattened lists.
    """
    # Serialise every document to a string so that equal documents collapse to the same hashable key
    serialized_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # dict.fromkeys de-duplicates while keeping first-seen order; the previous set() returned
    # the documents in an arbitrary order that could change from one run to the next.
    return [loads(doc) for doc in dict.fromkeys(serialized_docs)]

# Retrieve: generate the rewritten questions, run the retriever once per question (`.map()`),
# then merge the per-question results into one de-duplicated list
retrieval_chain = generate_queries | retriever.map() | get_unique_union
# Note: this rebinds `docs` from the basic retrieval cell above
docs = retrieval_chain.invoke({"question":question})
docs

In [None]:
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# RAG
# Prompt: same Italian answering instructions as the basic RAG prompt.
# This rebinds the notebook-level names `template` and `prompt`.
template = """
Comportati come un assistente che risponde alle domande del cliente.
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.
Quando spieghi che cosa è o cosa significa un certo elemento richiesto, non parlarne come se fosse un problema.

In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto.

Domanda relativa al software Panthera: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# No model name is passed here, unlike `model` in the basic RAG section, so the library default applies
llm = ChatOpenAI(temperature=0)

# Chain with query rewriting. The retrieved documents are joined by format_docs, as in the
# basic chain, so the prompt receives their plain text; previously the list of Document
# objects was interpolated as-is, which put its repr (metadata included) into {context}.
rewriting_rag_chain = (
    {"context": retrieval_chain | format_docs,
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

pprint.pprint(question)
pprint.pprint(rewriting_rag_chain.invoke({"question":question}))

In [None]:
# Probe the rewriting chain with the out-of-scope question
pprint.pprint(rewriting_rag_chain.invoke({"question":question_out_of_scope}))

In [None]:
pprint.pprint(rewriting_rag_chain.invoke({"question":multiple_questions})) # Fails with a too-long-context error if k >= 4 (try re-ranking and selecting the best ones)

In [None]:
# Inspect the rewritten questions generated for the two-question input
generate_queries.invoke({"question": multiple_valid_questions})

In [None]:
# Probe the rewriting chain with the two-question input
pprint.pprint(rewriting_rag_chain.invoke({"question":multiple_valid_questions}))

## Rerank the documents

In [None]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60, top_n: int = 3):
    """
    Fuse several ranked document lists with Reciprocal Rank Fusion (RRF).

    Every occurrence of a document at zero-based position `rank` in one of the lists adds
    1 / (rank + k) to that document's fused score.

    Parameters:
    - results: a list of ranked document lists (one per query), best match first.
    - k: smoothing constant of the RRF formula; larger values flatten the difference between ranks.
    - top_n: how many of the best-scoring documents to return (default 3, the value that was
             previously hard-coded); pass None to return all of them.

    Returns:
    - A list of (document, fused_score) tuples sorted by descending score, truncated to `top_n`.
    """
    # Fused score per unique document, keyed by the document's serialised form
    fused_scores = {}

    for docs in results:
        for rank, doc in enumerate(docs):
            # Serialise the document so it can be used as a dictionary key
            doc_str = dumps(doc)
            fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

    # Highest fused score first; the sort is stable, so ties keep first-seen order
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    return reranked_results[:top_n]

# Retrieve a wider candidate set (k=20) per rewritten question and let the fusion step pick the best ones.
# Note: this retriever is built on `vectorstore` (the store created in memory in the embedding cell),
# not on `new_vectorstore` (the copy reloaded from disk) used by `retriever`.
reranking_retriever = vectorstore.as_retriever(search_kwargs={"k": 20})
retrieval_chain_rag_fusion = generate_queries | reranking_retriever.map() | reciprocal_rank_fusion
# Note: `docs` is rebound here and now holds (document, score) tuples
docs = retrieval_chain_rag_fusion.invoke({"question": question})
docs

In [None]:
prompt = ChatPromptTemplate.from_template(template)


def _format_ranked_docs(ranked_docs):
    """Join the page contents of (document, score) pairs, best-ranked first, dropping the scores."""
    return "\n\n".join(doc.page_content for doc, _score in ranked_docs)


# Chain with query rewriting and re-ranking. The fusion step returns (document, score) tuples;
# they are reduced to plain text before reaching the prompt, instead of interpolating the repr
# of the tuple list into {context}.
rerank_rag_chain = (
    {"context": retrieval_chain_rag_fusion | _format_ranked_docs,
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

pprint.pprint(question)
pprint.pprint(rerank_rag_chain.invoke({"question":question}))

In [None]:
# Probe the re-ranking chain with the out-of-scope question
pprint.pprint(rerank_rag_chain.invoke({"question":question_out_of_scope}))

In [None]:
# Probe the re-ranking chain with the mixed in-scope / out-of-scope input
pprint.pprint(rerank_rag_chain.invoke({"question":multiple_questions}))

# Raptor Indexing

In [None]:
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import umap
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from sklearn.mixture import GaussianMixture

# Fixed seed for reproducibility. In the functions below it is only used as the default
# `random_state` of get_optimal_clusters; the UMAP calls and GMM_cluster's own default do not use it.
RANDOM_SEED = 224
# Re-creates the embedding object (rebinds the notebook-level `embd`)
embd = OpenAIEmbeddings()

### --- Code from citations referenced above (added comments and docstrings) --- ###


def global_cluster_embeddings(
    embeddings: np.ndarray,
    dim: int,
    n_neighbors: Optional[int] = None,
    metric: str = "cosine",
) -> np.ndarray:
    """
    Reduce the dimensionality of all embeddings at once with UMAP.

    Parameters:
    - embeddings: The input embeddings as a numpy array.
    - dim: The number of components of the reduced space.
    - n_neighbors: Optional; the UMAP neighbourhood size. When omitted it is set to the
                   integer part of sqrt(len(embeddings) - 1).
    - metric: The distance metric handed to UMAP.

    Returns:
    - The result of UMAP's fit_transform on the embeddings.
    """
    neighbor_count = n_neighbors
    if neighbor_count is None:
        neighbor_count = int((len(embeddings) - 1) ** 0.5)
    reducer = umap.UMAP(n_neighbors=neighbor_count, n_components=dim, metric=metric)
    return reducer.fit_transform(embeddings)


def local_cluster_embeddings(
    embeddings: np.ndarray, dim: int, num_neighbors: int = 10, metric: str = "cosine"
) -> np.ndarray:
    """
    Reduce the dimensionality of a subset of embeddings with UMAP (used inside one global cluster).

    Parameters:
    - embeddings: The input embeddings as a numpy array.
    - dim: The number of components of the reduced space.
    - num_neighbors: The UMAP neighbourhood size (fixed default of 10).
    - metric: The distance metric handed to UMAP.

    Returns:
    - The result of UMAP's fit_transform on the embeddings.
    """
    reducer = umap.UMAP(n_neighbors=num_neighbors, n_components=dim, metric=metric)
    return reducer.fit_transform(embeddings)


def get_optimal_clusters(
    embeddings: np.ndarray, max_clusters: int = 50, random_state: int = RANDOM_SEED
) -> int:
    """
    Pick the number of Gaussian Mixture components with the lowest Bayesian Information Criterion (BIC).

    Candidate counts run from 1 up to (but not including) min(max_clusters, len(embeddings)).

    Parameters:
    - embeddings: The input embeddings as a numpy array.
    - max_clusters: Exclusive upper bound on the number of clusters to try.
    - random_state: Seed passed to every GaussianMixture fit.

    Returns:
    - The candidate count with the lowest BIC, as a plain int. When there is no candidate to
      compare (fewer than two embeddings, or max_clusters below 2) the function returns 1
      instead of failing on an empty BIC list.
    """
    max_clusters = min(max_clusters, len(embeddings))
    if max_clusters < 2:
        # np.arange(1, max_clusters) would be empty and np.argmin would raise ValueError
        return 1

    candidate_counts = np.arange(1, max_clusters)
    bics = [
        GaussianMixture(n_components=n, random_state=random_state)
        .fit(embeddings)
        .bic(embeddings)
        for n in candidate_counts
    ]
    return int(candidate_counts[np.argmin(bics)])


def GMM_cluster(embeddings: np.ndarray, threshold: float, random_state: int = 0):
    """
    Soft-cluster embeddings with a Gaussian Mixture Model.

    The number of components comes from get_optimal_clusters (called with its own default
    seed). Each embedding is assigned to every component whose membership probability is
    strictly greater than `threshold`, so it may get several labels or none.

    Parameters:
    - embeddings: The input embeddings as a numpy array.
    - threshold: The probability threshold for assigning an embedding to a cluster.
    - random_state: Seed for the final GaussianMixture fit.

    Returns:
    - A tuple (labels, n_clusters): `labels` is a list with one array of component indices
      per embedding, `n_clusters` is the number of components that were fitted.
    """
    n_clusters = get_optimal_clusters(embeddings)
    mixture = GaussianMixture(n_components=n_clusters, random_state=random_state)
    mixture.fit(embeddings)

    labels = []
    for membership in mixture.predict_proba(embeddings):
        labels.append(np.where(membership > threshold)[0])
    return labels, n_clusters


def perform_clustering(
    embeddings: np.ndarray,
    dim: int,
    threshold: float,
) -> List[np.ndarray]:
    """
    Cluster embeddings in two stages: a global pass over all embeddings, then a local pass
    inside each global cluster.

    Both passes reduce the dimensionality with UMAP and then call GMM_cluster, which assigns an
    embedding to every mixture component whose probability exceeds `threshold`; an embedding can
    therefore end up in several clusters, or in none (its array then stays empty).

    Parameters:
    - embeddings: The input embeddings as a numpy array.
    - dim: The target dimensionality for UMAP reduction.
    - threshold: The probability threshold for assigning an embedding to a cluster in GMM.

    Returns:
    - A list with one numpy array per embedding, holding the IDs of the local clusters the
      embedding belongs to. Local IDs are offset by the number of local clusters found in the
      previously processed global clusters, so they are unique across global clusters.
    """
    if len(embeddings) <= dim + 1:
        # Avoid clustering when there's insufficient data
        # (every embedding is put in the single cluster 0)
        return [np.array([0]) for _ in range(len(embeddings))]

    # Global dimensionality reduction
    reduced_embeddings_global = global_cluster_embeddings(embeddings, dim)
    # Global clustering
    global_clusters, n_global_clusters = GMM_cluster(
        reduced_embeddings_global, threshold
    )

    # One (initially empty) array of cluster IDs per embedding. np.array([]) is a float array,
    # so the IDs appended below are stored as floats.
    all_local_clusters = [np.array([]) for _ in range(len(embeddings))]
    # Running count of local clusters found so far, used as the ID offset
    total_clusters = 0

    # Iterate through each global cluster to perform local clustering
    for i in range(n_global_clusters):
        # Extract embeddings belonging to the current global cluster
        # (boolean mask: True where global cluster i is among the embedding's labels)
        global_cluster_embeddings_ = embeddings[
            np.array([i in gc for gc in global_clusters])
        ]

        if len(global_cluster_embeddings_) == 0:
            continue
        if len(global_cluster_embeddings_) <= dim + 1:
            # Handle small clusters with direct assignment
            local_clusters = [np.array([0]) for _ in global_cluster_embeddings_]
            n_local_clusters = 1
        else:
            # Local dimensionality reduction and clustering
            reduced_embeddings_local = local_cluster_embeddings(
                global_cluster_embeddings_, dim
            )
            local_clusters, n_local_clusters = GMM_cluster(
                reduced_embeddings_local, threshold
            )

        # Assign local cluster IDs, adjusting for total clusters already processed
        for j in range(n_local_clusters):
            local_cluster_embeddings_ = global_cluster_embeddings_[
                np.array([j in lc for lc in local_clusters])
            ]
            # Recover the positions of these embeddings in the full `embeddings` array by exact
            # value comparison: every local embedding is compared with every original embedding,
            # and [1] keeps the matching row indices of `embeddings`. Rows of `embeddings` that
            # are exactly equal to each other all match.
            indices = np.where(
                (embeddings == local_cluster_embeddings_[:, None]).all(-1)
            )[1]
            for idx in indices:
                all_local_clusters[idx] = np.append(
                    all_local_clusters[idx], j + total_clusters
                )

        total_clusters += n_local_clusters

    return all_local_clusters


### --- Langchain code below --- ###

def embed(texts):
    """
    Embed a list of texts with the notebook-level `embd` object.

    Parameters:
    - texts: List[str], the texts to embed.

    Returns:
    - numpy.ndarray: the output of `embd.embed_documents(texts)` converted to a numpy array.
    """
    return np.array(embd.embed_documents(texts))


def embed_cluster_texts(texts):
    """
    Embed the texts, cluster the embeddings, and collect everything in one DataFrame.

    Clustering is done by `perform_clustering` with a reduced dimensionality of 10 and a
    membership threshold of 0.1.

    Parameters:
    - texts: List[str], the texts to process.

    Returns:
    - pandas.DataFrame with the columns 'text' (the input texts), 'embd' (one embedding per
      row) and 'cluster' (the cluster labels returned for each text).
    """
    vectors = embed(texts)
    labels = perform_clustering(vectors, 10, 0.1)

    frame = pd.DataFrame()
    frame["text"] = texts
    frame["embd"] = list(vectors)  # one array per row rather than a 2-D block
    frame["cluster"] = labels
    return frame


def fmt_txt(df: pd.DataFrame) -> str:
    """
    Join the 'text' column of a DataFrame into one string.

    Parameters:
    - df: DataFrame with a 'text' column of strings.

    Returns:
    - All texts, in row order, joined by the delimiter "--- --- \\n --- --- ".
    """
    separator = "--- --- \n --- --- "
    return separator.join(df["text"].tolist())


def embed_cluster_summarize_texts(
    texts: List[str], level: int
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Embed and cluster a list of texts, then summarize the content of every cluster.

    For each cluster the member texts are joined with fmt_txt, cut to their first 4000
    whitespace-separated words (a rough guard for the model's input limit; when the cut
    applies, the kept words are re-joined with single spaces), split into chunks with
    RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=600), summarized chunk by
    chunk with the notebook-level `model`, and the chunk summaries are joined with spaces.

    Parameters:
    - texts: A list of text documents to be processed.
    - level: An integer that is only recorded in the 'level' column of the summary DataFrame.

    Returns:
    - Tuple containing two DataFrames:
      1. `df_clusters`: the original texts, their embeddings, and cluster assignments.
      2. `df_summary`: one row per cluster with the columns 'summaries', 'level' and 'cluster'.
         It has no rows when no text was assigned to any cluster.
    """
    # Upper bound, in whitespace-separated words, on the text summarized per cluster
    max_words = 4000

    # Embed and cluster the texts, resulting in a DataFrame with 'text', 'embd', and 'cluster' columns
    df_clusters = embed_cluster_texts(texts)

    # One record per (text, cluster) pair: a text assigned to several clusters is repeated
    expanded_list = [
        {"text": row["text"], "embd": row["embd"], "cluster": cluster}
        for _, row in df_clusters.iterrows()
        for cluster in row["cluster"]
    ]

    # Naming the columns keeps 'cluster' available even when no pair was produced
    expanded_df = pd.DataFrame(expanded_list, columns=["text", "embd", "cluster"])

    # Retrieve unique cluster identifiers for processing
    all_clusters = expanded_df["cluster"].unique()

    print(f"--Generated {len(all_clusters)} clusters--")

    # Summarization
    template = """Fornisci un riassunto dettagliato della documentazione fornita. 
    Documentazione:
    {context}"""
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model | StrOutputParser()

    # The splitter does not depend on the cluster, so build it once
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=600)

    summaries = []
    cluster_labels = []
    for cluster_id in all_clusters:
        df_cluster = expanded_df[expanded_df["cluster"] == cluster_id]
        formatted_txt = fmt_txt(df_cluster)

        # Personal addition to respect the model's limit: keep only the first `max_words` words
        words = formatted_txt.split()
        if len(words) > max_words:
            formatted_txt = ' '.join(words[:max_words])  # Truncate

        # Split - compare splitting and truncation results
        formatted_txt_split = text_splitter.split_text(formatted_txt)

        # Summarize each chunk and combine all summaries into a single summary for this cluster
        combined_summary = " ".join(
            chain.invoke({"context": txt}) for txt in formatted_txt_split
        )

        summaries.append(combined_summary)
        cluster_labels.append(cluster_id)

    # Create a DataFrame to store summaries with their corresponding cluster and level
    df_summary = pd.DataFrame(
        {
            "summaries": summaries,
            "level": [level] * len(summaries),
            "cluster": cluster_labels,
        }
    )

    pprint.pprint(df_summary)

    return df_clusters, df_summary


def recursive_embed_cluster_summarize(
    texts: List[str], level: int = 1, n_levels: int = 3
) -> Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Build the summary tree level by level: embed, cluster and summarize `texts`, then repeat
    on the resulting summaries.

    Recursion stops once `level` reaches `n_levels` or the current level produced at most one
    distinct cluster.

    Parameters:
    - texts: List[str], texts to be processed at this level.
    - level: int, current recursion level (starts at 1).
    - n_levels: int, maximum depth of recursion.

    Returns:
    - Dict[int, Tuple[pd.DataFrame, pd.DataFrame]] mapping each processed level to its
      (clusters DataFrame, summaries DataFrame) pair.
    """
    # Process the current level and record its output
    df_clusters, df_summary = embed_cluster_summarize_texts(texts, level)
    results = {level: (df_clusters, df_summary)}

    can_go_deeper = level < n_levels and df_summary["cluster"].nunique() > 1
    if not can_go_deeper:
        return results

    # The summaries of this level become the input texts of the next one
    deeper_results = recursive_embed_cluster_summarize(
        df_summary["summaries"].tolist(), level + 1, n_levels
    )
    results.update(deeper_results)
    return results

In [None]:
def associate_files_with_level_1(data_df: pd.DataFrame, results: Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]):
    """
    Attach the source 'file' name to every row of the level-1 clusters DataFrame.

    The lookup is an exact match on the text: rows whose 'text' does not appear in `data_df`
    get a missing value, and when the same text appears several times in `data_df` the last
    file wins. The level-1 clusters DataFrame and the `results` dictionary are modified in place.

    Parameters:
    - data_df: The original DataFrame containing 'file' and 'text' columns.
    - results: A dictionary containing clustering results at each level.

    Returns:
    - results: The same dictionary, whose level-1 clusters DataFrame now has a 'file' column.
    """
    first_level = 1
    df_clusters, df_summary = results[first_level]

    # text -> file lookup built from the original data
    lookup_source = data_df.reset_index(drop=True)
    text_to_file = lookup_source.set_index('text')['file'].to_dict()

    # New 'file' column on the level-1 clusters
    df_clusters["file"] = df_clusters["text"].map(text_to_file)

    results[first_level] = (df_clusters, df_summary)
    return results


def display_level_1_clusters(results: Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]):
    """
    Print the level-1 clustering results, listing the 'file' names grouped by cluster.

    The DataFrames stored in `results` are left untouched: the string form of the cluster
    labels needed for grouping is computed on a separate Series (previously the stored
    'cluster' column was overwritten with strings as a side effect of displaying it).

    Parameters:
    - results: A dictionary containing clustering results at each level; the level-1 clusters
      DataFrame must already have a 'file' column.
    """
    # Only display for level 1
    level = 1
    df_clusters, df_summary = results[level]

    print(f"--- Level {level} ---")

    # Debug: Print df_clusters structure and types
    print("Cluster DataFrame:")
    print(df_clusters.head())
    print("Data types:")
    print(df_clusters.dtypes)

    # String form of the cluster labels, used only as the grouping key
    cluster_keys = df_clusters['cluster'].astype(str)

    # Group by cluster to show all files in the same cluster together
    cluster_groups = df_clusters.groupby(cluster_keys)["file"].apply(list).to_dict()

    # Display the cluster assignments for each cluster
    for cluster_id, files in cluster_groups.items():
        print(f"Cluster {cluster_id}: Files -> {files}")

    # Show the summaries generated for each cluster at this level
    #print("\nCluster Summaries:")
    #for idx, row in df_summary.iterrows():
    #    print(f"Cluster {row['cluster']} Summary (Level {row['level']}):\n{row['summaries']}")
    #    print("---")


# Associate 'file' names with level 1 cluster results
#results_with_level_1_files = associate_files_with_level_1(data_df, results)

# Call the function to display clusters for level 1 using file names
#display_level_1_clusters(results_with_level_1_files)


In [None]:
from langchain.docstore.document import Document

# Wrap every text in a LangChain Document.
# NOTE(review): `all_texts` is not assigned in any cell visible in this notebook (nor is the
# `results` used by the commented-out calls above); the cell that runs the recursive
# summarization and collects its texts appears to be missing — restore it before this cell.
documents = [Document(page_content=text) for text in all_texts]

In [None]:
# Split
# Possible improvements - future hypertuning of chunk_size and chunk_overlap to improve results, and trying different splitters
# Note: this rebinds `text_splitter` and `splits` from the first split cell
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=600)
splits = text_splitter.split_documents(documents)
# Inspect the first six chunks and the total number of chunks
pprint.pprint(splits[0:6])
pprint.pprint(len(splits))

In [None]:
#vectorstore_raptor_faiss = FAISS.from_documents(documents=splits, embedding=embd)
#vectorstore_raptor_faiss.save_local("raptor_faiss_index")

In [None]:
# Now, use all_texts to build the vectorstore with Chroma
#vectorstore_raptor = Chroma.from_documents(documents=splits, embedding=embd, persist_directory="raptor_chroma_index")
# vectorstore_raptor.persist()
#len(vectorstore_raptor.get()['documents'])

In [None]:
# Load the RAPTOR index from the "raptor_faiss_index" folder. The code that builds and saves
# that folder is commented out in the cell above, so the folder must already exist on disk.
# Note: this rebinds `new_vectorstore`, which previously held the basic "faiss_index" store.
new_vectorstore = FAISS.load_local("raptor_faiss_index", embd, allow_dangerous_deserialization=True)
new_vectorstore.index.ntotal

In [None]:
# Retriever over the RAPTOR index, configured with k=4 like the basic retriever
retriever_raptor = new_vectorstore.as_retriever(search_kwargs={"k": 4})

In [None]:
from langchain import hub
from langchain_core.runnables import RunnablePassthrough

# Post-processing
def format_docs(docs):
    """Join the `page_content` of the given documents, separated by a blank line."""
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)


# Chain: same shape as the basic RAG chain, but retrieving from the RAPTOR index.
# `prompt` and `model` are the notebook-level objects bound by earlier cells
# (`prompt` was last assigned in the re-ranking section, `model` in the basic RAG section).
raptor_rag_chain = (
    {"context": retriever_raptor | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

# Question
pprint.pprint(question)
pprint.pprint(raptor_rag_chain.invoke(question))

In [None]:
# Question: variant of the modal-window question that also mentions GPO
question2 = "Come funziona la gestione delle finestre modali in chrome con GPO?"
pprint.pprint(raptor_rag_chain.invoke(question2))

In [None]:
# Probe the RAPTOR chain with a question unrelated to the documentation
# (same text as `question_out_of_scope` above)
question4 = "Quando è morto Giulio Cesare?"
pprint.pprint(raptor_rag_chain.invoke(question4))

In [None]:
# Probe the RAPTOR chain with a broad "how does it work" question about purchase price lists
question5 = "Come avviene la gestione dei listini di acquisto?"
pprint.pprint(raptor_rag_chain.invoke(question5))

In [None]:
# Probe the RAPTOR chain with the two-question input defined in the basic RAG section
pprint.pprint(raptor_rag_chain.invoke(multiple_valid_questions))

In [None]:
# Probe the RAPTOR chain with the troubleshooting question `q` defined in the basic RAG section
pprint.pprint(raptor_rag_chain.invoke(q))