# Retrieval and chunking experiments, with data preprocessing

In [None]:
# Load the original dataset for RAG
import pandas as pd

# Read the pipe-delimited dataset with explicit column names, then drop the row
# labelled 0 (the first line of the file - presumably its header row; verify).
csv_name = "Gemini_augmented_dataset.csv"
data_df = pd.read_csv(
    csv_name,
    delimiter="|",
    names=['file_name', 'updated_text'],
).drop(index=0)
data_df

# RecursiveCharacterTextSplitter

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import DataFrameLoader
import pprint

# Chunking configuration: chunk_size=3000 with a chunk_overlap of 500
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=500,
    chunk_size=3000,
)

# Turn the dataframe rows into documents whose page content is the "updated_text" column
loader = DataFrameLoader(data_df, page_content_column="updated_text")
docs_data = loader.load()

In [None]:
# Chunk every document on its own so that the chunk counter restarts for each document
splits = []
for doc in docs_data:
    # start=1 makes the stored chunk numbers 1-based within the document
    for chunk_number, chunk in enumerate(text_splitter.split_documents([doc]), start=1):
        chunk.metadata["chunk_number"] = chunk_number
        splits.append(chunk)

# Show the first six chunks, followed by the total number of chunks
pprint.pprint(splits[:6])
pprint.pprint(len(splits))

In [None]:
# Spot check: print the text of the chunk at index 10 (raises IndexError if fewer than 11 chunks exist)
pprint.pprint(splits[10].page_content)

# Semantic Chunking

Find break points between sequential sentences - A walk method. I started at the first sentence, got the embedding, then compared it to sentence #2, then compared #2 and #3 and so on. I was looking for "break points" where embedding distance was large. If it was above a threshold, then I considered it the start of a new semantic section. I originally tried taking embeddings of every sentence, but this turned out to be too noisy. So I ended up taking groups of 3 sentences (a window), then got an embedding, then dropped the first sentence, and added the next one. This worked out a bit better.

In [None]:
# Use the page content of the first loaded document as the sample text ("essay")
# for the semantic-chunking walkthrough
essay = docs_data[0].page_content
pprint.pprint(essay)

In [None]:
import re

# Split on whitespace that follows sentence-ending punctuation ('.', '?' or '!').
# The text is stripped first and empty fragments are discarded, so trailing whitespace
# (or an empty essay) cannot produce an empty "sentence" that would later be embedded.
single_sentences_list = [
    fragment
    for fragment in re.split(r'(?<=[.?!])\s+', essay.strip())
    if fragment
]
print(f"{len(single_sentences_list)} sentences were found")

In [None]:
# One record per sentence: its text plus its position in the essay
sentences = [
    dict(sentence=text, index=position)
    for position, text in enumerate(single_sentences_list)
]
sentences[:3]

In [None]:
def combine_sentences(sentences, buffer_size=1):
    """Attach a sliding-window text to every sentence record.

    For each record, the window is made of up to ``buffer_size`` sentences before it,
    the sentence itself, and up to ``buffer_size`` sentences after it, joined with
    single spaces. The window is stored under the ``'combined_sentence'`` key.

    Args:
        sentences: list of dicts, each holding the sentence text under ``'sentence'``.
        buffer_size: number of neighbouring sentences taken on each side.

    Returns:
        The same list object, with every dict updated in place.
    """
    total = len(sentences)
    for position, record in enumerate(sentences):
        # Clamp the window to the list bounds instead of testing every index
        preceding = [
            sentences[j]['sentence']
            for j in range(max(position - buffer_size, 0), position)
        ]
        following = [
            sentences[j]['sentence']
            for j in range(position + 1, min(position + 1 + buffer_size, total))
        ]
        record['combined_sentence'] = ' '.join(preceding + [record['sentence']] + following)

    return sentences

# buffer_size=1: each window holds the previous, current and next sentence (up to 3 sentences)
sentences = combine_sentences(sentences, buffer_size=1)

In [None]:
# Preview the first three records, which now carry the 'combined_sentence' key
sentences[:3]

Now I want to get embeddings for the combined sentences, so we can get the distances between the groups of 3 and find breakpoints.

In [None]:
from FlagEmbedding import BGEM3FlagModel
# Load the 'BAAI/bge-m3' embedding model with use_fp16=True (half-precision option);
# this module-level object is what M3EmbeddingFP16 calls into
model_fp16 = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

In [None]:
class M3EmbeddingFP16:
    """Adapter exposing the module-level ``model_fp16`` encoder as an embedding object.

    It provides ``embed_documents`` and ``embed_query`` (the two method names of
    LangChain's embeddings interface) and is also directly callable, so it can be
    handed to code that expects either an embeddings object or a plain function.
    """

    def embed_documents(self, texts):
        """Return the ``'dense_vecs'`` entry produced by ``model_fp16.encode(texts)``.

        Args:
            texts: whatever ``model_fp16.encode`` accepts; in this notebook a list of strings.
        """
        return model_fp16.encode(texts)['dense_vecs']

    def embed_query(self, text):
        """Return the dense vector of one query string.

        The string is encoded as a one-element batch and the single resulting
        vector is returned.
        """
        return model_fp16.encode([text])['dense_vecs'][0]

    def __call__(self, texts):
        """Calling the instance is equivalent to ``embed_documents(texts)``."""
        return self.embed_documents(texts)

In [None]:
from langchain_community.vectorstores import FAISS

# Embed all sentence windows in one call
embd = M3EmbeddingFP16()
window_texts = [record['combined_sentence'] for record in sentences]
embeddings = embd(window_texts)

In [None]:
# Store each window embedding on the sentence record at the same position
for position in range(len(sentences)):
    sentences[position]['combined_sentence_embedding'] = embeddings[position]

In [None]:
# Preview the first three records, which now carry the 'combined_sentence_embedding' key
sentences[0:3]

Compute the cosine distances between sequential embedding pairs to see where the break points are.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_distances(sentences):
    """Compute the cosine distance between each sentence window and the next one.

    Each record's ``'combined_sentence_embedding'`` is compared with that of the
    following record; the distance is ``1 - cosine_similarity``.

    Args:
        sentences: list of dicts that each hold ``'combined_sentence_embedding'``.

    Returns:
        A flat list with ``len(sentences) - 1`` distances (empty for zero or one
        record).

    Side effects:
        Every record gets a ``'distance_to_next'`` key; the last record has no
        successor, so its value is ``None``.
    """
    distances = []

    # Nothing to compare and no "last record" to annotate
    if not sentences:
        return distances

    for current, following in zip(sentences, sentences[1:]):
        # cosine_similarity works on 2-D inputs, hence the one-row wrappers and [0][0]
        similarity = cosine_similarity(
            [current['combined_sentence_embedding']],
            [following['combined_sentence_embedding']],
        )[0][0]

        # Convert similarity to distance
        distance = 1 - similarity

        distances.append(distance)
        current['distance_to_next'] = distance

    # The last sentence has no successor
    sentences[-1]['distance_to_next'] = None

    return distances

In [None]:
# Flat list of cosine distances between consecutive sentence windows;
# the call also writes 'distance_to_next' onto each sentence record
distances = calculate_cosine_distances(sentences)

In [None]:
# Preview the first three records, which now carry the 'distance_to_next' key
sentences[:3]

In [None]:
import matplotlib.pyplot as plt
# calculate_cosine_distances returns a flat list of distances, so plot the whole
# series; indexing it with [0] would pass a single scalar to plt.plot.
plt.plot(distances)

It's interesting to see sections where distances are smaller and then areas of larger distances. What stands out to me most is the outliers which are spread out.
There are many ways to chunk up the essay based on these distances, but I'm going to consider any distance above the 95th percentile of distances as a break point. This is the only parameter we'll need to configure.

In [None]:
import numpy as np

# `distances` is used exactly as returned by calculate_cosine_distances: a flat list with
# one cosine distance per pair of consecutive sentence windows. It must not be re-indexed
# with [0] here - that would collapse it to a single float, break len()/enumerate() below
# and make the cell non-re-runnable.
plt.plot(distances)

plt.xlim(0, len(distances))

# We need to get the distance threshold that we'll consider an outlier
# We'll use numpy .percentile() for this
breakpoint_percentile_threshold = 95
breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold) # If you want more chunks, lower the percentile cutoff
plt.axhline(y=breakpoint_distance_threshold, color='r', linestyle='-');

# Indices of the distances above the threshold. A breakpoint at index i means the text
# is split right after sentence i.
indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]

# N breakpoints produce N + 1 chunks
num_distances_above_theshold = len(indices_above_thresh)
plt.text(x=(len(distances)*.01), y = 0, s=f"{num_distances_above_theshold + 1} Chunks");

plt.title("PG Essay Chunks Based On Embedding Breakpoints")
plt.xlabel("Index of sentences in essay (Sentence Position)")
plt.ylabel("Cosine distance between sequential sentences")
plt.show()

In [None]:
# Walk the breakpoints in order: each breakpoint index closes the chunk that started at
# `start_index`, and the breakpoint sentence itself is the last sentence of that chunk.
chunks = []
start_index = 0

for end_index in indices_above_thresh:
    chunks.append(' '.join(d['sentence'] for d in sentences[start_index:end_index + 1]))
    # The next chunk begins right after the breakpoint sentence
    start_index = end_index + 1

# Sentences after the final breakpoint (if any) form the closing chunk
if start_index < len(sentences):
    chunks.append(' '.join(d['sentence'] for d in sentences[start_index:]))

In [None]:
# Preview the last three chunks, labelled with their real position in `chunks`
# (enumerating the slice directly would always label them 0, 1, 2).
buffer = 200  # number of characters shown from the start and from the end of each chunk
first_shown = max(len(chunks) - 3, 0)

for i, chunk in enumerate(chunks[first_shown:], start=first_shown):
    print (f"Chunk #{i}")
    print (chunk[:buffer].strip())
    print ("...")
    print (chunk[-buffer:].strip())
    print ("\n")

In [None]:
import pandas as pd

# Persist the chunks as a single-column ("File-VEN"), pipe-separated CSV file
csv_file_path = "trial_semantic_chunks.csv"

df = pd.DataFrame(chunks, columns=["File-VEN"])
display(df.head(3))
df.to_csv(csv_file_path, index=False, sep="|")

# Echo the output path as the cell result
csv_file_path

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader


# The chunks file is written with sep="|", so the reader must use the same delimiter.
# With the csv module's default "," every comma inside a chunk would split the row
# into several fields and corrupt the loaded documents.
loader = CSVLoader(file_path=csv_file_path, csv_args={"delimiter": "|"})
data = loader.load()
data[:3]

In [None]:
from langchain_community.vectorstores import FAISS

# Build a FAISS index over the loaded chunk documents, embedding them with the bge-m3 wrapper
embd = M3EmbeddingFP16()
vectorstore = FAISS.from_documents(data, embd)

In [None]:
# Expose the vector store as a retriever; search_kwargs k=4 asks for 4 results per query
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

In [None]:
# Sample Italian-language question used to exercise the retriever (and, later, the RAG chain)
question = "Come posso decidere se nel calcolo della percentuale di saturazione del contratto vadano considerate anche la quantità in previsione?"
# Fetch the documents the retriever returns for the question and display them
retrieved_docs = retriever.invoke(question)
retrieved_docs

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt
# The Italian template below has two placeholders: {context} (the retrieved documents)
# and {question}. In summary it instructs the model to act as a customer-support
# assistant, answer only from the supplied documents, be concise and step-by-step,
# avoid repetition, and answer only the questions covered by the documentation.
template = """
Comportati come un assistente che risponde alle domande del cliente.
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.
Evita troppe ripetizioni nella risposta fornita.
Quando spieghi che cosa è o cosa significa un certo elemento richiesto, non parlarne come se fosse un problema.

In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto.

Domanda relativa al software Panthera: {question}
"""

# Build the chat prompt from the template string
prompt = ChatPromptTemplate.from_template(template)
# NOTE(review): this bare expression is not the last statement of the cell, so it displays nothing
prompt

# Post-processing
def format_docs(splits):
    """Join the ``page_content`` of every document in ``splits`` with a blank line in between."""
    separator = "\n\n"
    contents = [doc.page_content for doc in splits]
    return separator.join(contents)

In [None]:
import os
from langchain_openai import ChatOpenAI
# Turn on LangSmith tracing and point it at the hosted endpoint
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# Both API keys must already be present in the environment. Copying os.getenv(...) back
# into os.environ changes nothing when the variable exists and fails with an opaque
# TypeError (os.environ only accepts str values) when it does not, so check explicitly
# and report which variable is missing.
missing_keys = [
    name
    for name in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY')
    if name not in os.environ
]
if missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(missing_keys)
    )

In [None]:
# LLM - the used model: OpenAI chat model "gpt-4o" created with temperature=0
model = ChatOpenAI(model_name="gpt-4o", temperature=0)
# max_token
# NOTE(review): the "max_token" comment above looks like a leftover reminder to set a token limit - confirm or remove

# Post-processing
# NOTE(review): format_docs is also defined in the prompt cell earlier in this notebook;
# this later definition shadows that one.
def format_docs(docs):
    """Return the page contents of ``docs`` as one string, separated by blank lines."""
    page_texts = (document.page_content for document in docs)
    return "\n\n".join(page_texts)

# Chain
# The input question is sent down two branches: "context" runs the retriever and formats
# its documents with format_docs, while "question" passes the input through unchanged.
# The resulting mapping fills the prompt, which is sent to the model; StrOutputParser
# is the last step of the pipeline.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

# Question
# Run the whole chain on the sample question and pretty-print the result
pprint.pprint(rag_chain.invoke(question))

# Data preprocessing steps

## Remove only the index

In [None]:
import importlib
import Data_preprocessing
# Reload the module so the kernel uses its current on-disk version
importlib.reload(Data_preprocessing)

# Initialize the Preprocessing object
preprocessing = Data_preprocessing.Preprocessing()

# Clean the essay text using the clean_text_template method
# NOTE(review): every preprocessing cell in this notebook makes this same call on the same
# `essay`; any difference between their outputs presumably comes from editing
# Data_preprocessing between runs - confirm, because a fresh "Restart & Run All" would
# then produce identical results in all of them.
cleaned_essay_1 = preprocessing.clean_text_template(essay)

# Print the cleaned text
pprint.pprint(cleaned_essay_1)


## Remove index and tables

In [None]:
import importlib
import Data_preprocessing
# Reload the module so the kernel uses its current on-disk version
importlib.reload(Data_preprocessing)

# Initialize the Preprocessing object
preprocessing = Data_preprocessing.Preprocessing()

# Clean the essay text using the clean_text_template method
# NOTE(review): this is the same call, on the same `essay`, as in the other preprocessing
# cells; the "index and tables" behaviour named in the heading presumably depends on the
# state of Data_preprocessing when the cell was run - confirm.
cleaned_essay_2 = preprocessing.clean_text_template(essay)

# Print the cleaned text
pprint.pprint(cleaned_essay_2)

## Remove index and tables and flatten lists

In [None]:
import importlib
import Data_preprocessing
# Reload the module so the kernel uses its current on-disk version
importlib.reload(Data_preprocessing)

# Initialize the Preprocessing object
preprocessing = Data_preprocessing.Preprocessing()

# Clean the essay text using the clean_text_template method
# NOTE(review): this is the same call, on the same `essay`, as in the other preprocessing
# cells; the list-flattening behaviour named in the heading presumably depends on the
# state of Data_preprocessing when the cell was run - confirm.
cleaned_essay_3 = preprocessing.clean_text_template(essay)

# Print the cleaned text
pprint.pprint(cleaned_essay_3)

## Remove unwanted patterns and normalize whitespace

In [None]:
import importlib
import Data_preprocessing
# Reload the module so the kernel uses its current on-disk version
importlib.reload(Data_preprocessing)

# Initialize the Preprocessing object
preprocessing = Data_preprocessing.Preprocessing()

# Clean the essay text using the clean_text_template method
# NOTE(review): this is the same call, on the same `essay`, as in the other preprocessing
# cells; the pattern-removal / whitespace behaviour named in the heading presumably depends
# on the state of Data_preprocessing when the cell was run - confirm.
cleaned_essay_4 = preprocessing.clean_text_template(essay)

# Print the cleaned text
pprint.pprint(cleaned_essay_4)

# Monitor the data augmentation with Gemini

In [None]:
# Monitor how many files were processed
import os

def count_final_output_files(directory):
    """Count the entries of ``directory`` whose name ends with ``'_final_output.txt'``.

    Only the top level of the directory is inspected (``os.listdir``).

    Args:
        directory: path of the directory to scan.

    Returns:
        The number of matching entry names.
    """
    suffix = '_final_output.txt'
    return sum(1 for entry in os.listdir(directory) if entry.endswith(suffix))

# Number of "*_final_output.txt" entries currently in the augmented-documents folder
count_final_output_files('./Doc_Panthera_Augmented')