In [None]:
# Load the original dataset for RAG
import pandas as pd

# Pipe-delimited file holding one document per row
csv_name = "Gemini_augmented_dataset.csv"

# Column names are supplied explicitly, so the first line of the file is read
# as an ordinary row (label 0); it is discarded right after loading.
data_df = pd.read_csv(
    csv_name,
    delimiter="|",
    names=['file_name', 'updated_text'],
).drop(index=0)
data_df

In [None]:
from langchain_community.document_loaders import DataFrameLoader
import pprint

# Turn every DataFrame row into a document whose page content is taken from
# the "updated_text" column.
# NOTE(review): the remaining column(s) presumably end up in each document's
# metadata — confirm against the DataFrameLoader documentation.
loader = DataFrameLoader(data_df, page_content_column="updated_text")
docs_data = loader.load()

In [None]:
# Only the first loaded document is used as the working text in the cells
# below; print it for a quick visual check.
essay = docs_data[0].page_content
pprint.pprint(essay)

# Data preprocessing steps

## Auxiliary functions and models

In [None]:
def combine_sentences(sentences, buffer_size=1):
    """Attach a window of neighbouring sentences to every sentence dict.

    For each entry, the text under ``'sentence'`` is joined (single spaces)
    with up to ``buffer_size`` sentences before it and up to ``buffer_size``
    sentences after it. The result is stored under ``'combined_sentence'``.

    Args:
        sentences: list of dicts, each holding a ``'sentence'`` string.
        buffer_size: number of neighbours to include on each side.

    Returns:
        The same list object; its dicts are updated in place.
    """
    # A non-positive buffer contributes no neighbours, like an empty range().
    reach = max(buffer_size, 0)
    total = len(sentences)

    for position, entry in enumerate(sentences):
        # Clamp the window to the bounds of the list
        first = max(position - reach, 0)
        last = min(position + reach, total - 1)
        window = sentences[first:last + 1]
        entry['combined_sentence'] = ' '.join(item['sentence'] for item in window)

    return sentences

In [None]:
from FlagEmbedding import BGEM3FlagModel
# Load the 'BAAI/bge-m3' embedding model once; it is shared by the embedding
# wrapper class defined in the next cell.
# NOTE(review): use_fp16=True presumably enables half-precision inference —
# confirm against the FlagEmbedding documentation.
model_fp16 = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

In [None]:
class M3EmbeddingFP16:
    """Thin adapter around the module-level ``model_fp16`` BGE-M3 model.

    Exposes the ``embed_documents`` / ``embed_query`` pair used by LangChain
    embedding consumers, and is also callable so that an instance can be
    handed over wherever a plain embedding function is expected.
    """

    def embed_documents(self, texts):
        """Return the ``'dense_vecs'`` output of the model for ``texts``."""
        return model_fp16.encode(texts)['dense_vecs']

    def embed_query(self, text):
        """Return the dense vector of a single query string.

        The query is embedded as a one-element batch so that it goes through
        exactly the same path as the documents.
        """
        return self.embed_documents([text])[0]

    def __call__(self, texts):
        """Alias of :meth:`embed_documents` (callable-style usage)."""
        return self.embed_documents(texts)

In [None]:
from langchain_community.vectorstores import FAISS

# Single embedder instance reused throughout the notebook: it embeds the
# sentence windows and is later passed to FAISS as the embedding object.
embd = M3EmbeddingFP16()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_distances(sentences):
    """Compute the cosine distance between each pair of consecutive sentences.

    Every dict in ``sentences`` must hold its vector under
    ``'combined_sentence_embedding'``. The distance (``1 - cosine similarity``)
    between entry ``i`` and entry ``i + 1`` is stored on entry ``i`` under
    ``'distance_to_next'``; the last entry gets ``None`` because it has no
    successor.

    Args:
        sentences: list of sentence dicts (modified in place).

    Returns:
        List of ``len(sentences) - 1`` distances, or an empty list when
        ``sentences`` holds fewer than two entries.
    """
    distances = []

    # Nothing to compare and no "last" entry to annotate: avoid sentences[-1]
    # raising IndexError on an empty list.
    if not sentences:
        return distances

    for current, following in zip(sentences, sentences[1:]):
        # cosine_similarity expects 2-D inputs, hence the one-row wrappers
        similarity = cosine_similarity(
            [current['combined_sentence_embedding']],
            [following['combined_sentence_embedding']],
        )[0][0]

        # Convert similarity to distance
        distance = 1 - similarity

        distances.append(distance)
        current['distance_to_next'] = distance

    # The last sentence has no successor
    sentences[-1]['distance_to_next'] = None

    return distances

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt
# Italian instruction text with two placeholders: {context} receives the
# formatted retrieved documents and {question} the user's question (both are
# filled in by the RAG chain built later in the notebook).
template = """
Comportati come un assistente che risponde alle domande del cliente.
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.
Evita troppe ripetizioni nella risposta fornita.
Quando spieghi che cosa è o cosa significa un certo elemento richiesto, non parlarne come se fosse un problema.

In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto.

Domanda relativa al software Panthera: {question}
"""

# Build the chat prompt object from the template string
prompt = ChatPromptTemplate.from_template(template)
# NOTE(review): this bare expression is not the last statement of its cell
# (a function definition follows), so it does not display anything.
prompt

# Post-processing
def format_docs(splits):
    """Merge the ``page_content`` of every document into one string.

    Consecutive documents are separated by a blank line; an empty input
    yields an empty string.
    """
    contents = [piece.page_content for piece in splits]
    separator = "\n\n"
    return separator.join(contents)

In [None]:
import os
from langchain_openai import ChatOpenAI
# LangSmith tracing configuration
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# Both API keys must already be present in the environment.
# Writing os.getenv(NAME) back into os.environ[NAME] changes nothing when the
# variable is set, and raises an opaque TypeError when it is not (os.environ
# only accepts str values), so check for their presence explicitly instead.
_missing_env_vars = [
    name
    for name in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY')
    if os.getenv(name) is None
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# LLM - the used model
# Chat model ("gpt-4o", temperature=0) that serves as the generation step of
# the RAG chain assembled later in the notebook.
generative_model = ChatOpenAI(model_name="gpt-4o", temperature=0)

## Data preprocessing analysis

In [None]:
# Free-text labels describing this run; they are stored with each result row
# ("description" and "chunking" columns) when the results CSV is written.
# NOTE(review): the label says "No preprocessing" while a later cell calls
# clean_text_template on the essay — confirm the label matches what that
# method actually does.
preprocessing_description = "No preprocessing"
chunking_description = "Semantic chunking"

In [None]:
import importlib
import Data_preprocessing
# Reload the local module so that edits made to Data_preprocessing after the
# first import are picked up without restarting the kernel.
importlib.reload(Data_preprocessing)

# Initialize the Preprocessing object
preprocessing = Data_preprocessing.Preprocessing()

# Clean the essay text using the clean_text_template method
# (the cleaning rules live in the Data_preprocessing module, not in this notebook)
cleaned_essay = preprocessing.clean_text_template(essay)

# Print the cleaned text
pprint.pprint(cleaned_essay)

In [None]:
import re

# Split the text at whitespace that follows '.', '?' or '!'.
# The text is stripped first: whitespace after the final punctuation mark
# would otherwise produce an empty trailing "sentence" that still gets
# windowed and embedded.
single_sentences_list = re.split(r'(?<=[.?!])\s+', cleaned_essay.strip())
print (f"{len(single_sentences_list)} sentences were found")

# One dict per sentence, keeping its position in the text
sentences = [{'sentence': x, 'index' : i} for i, x in enumerate(single_sentences_list)]

# Add the neighbouring-sentence window ('combined_sentence') to every dict
sentences = combine_sentences(sentences, buffer_size=1)

# Embed all windows in one batch, then attach each vector to its sentence
embeddings = embd([x['combined_sentence'] for x in sentences])
for i, sentence in enumerate(sentences):
    sentence['combined_sentence_embedding'] = embeddings[i]

pprint.pprint(sentences[0])

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Cosine distance between each pair of consecutive sentence windows
distances = calculate_cosine_distances(sentences)

# Distances above this percentile are treated as outliers, i.e. breakpoints.
# Lower the percentile cutoff to obtain more chunks.
breakpoint_percentile_threshold = 95
breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)

# Positions whose distance exceeds the cutoff: these are the places where the
# text will be split. Their count is the number of breakpoints.
indices_above_thresh = [
    position
    for position, value in enumerate(distances)
    if value > breakpoint_distance_threshold
]
num_distances_above_theshold = len(indices_above_thresh)

# Distance curve with the cutoff drawn as a horizontal red line
plt.plot(distances)
plt.xlim(0, len(distances))
plt.axhline(y=breakpoint_distance_threshold, color='r', linestyle='-');

# N breakpoints produce N + 1 chunks
plt.text(x=(len(distances)*.01), y = 0, s=f"{num_distances_above_theshold + 1} Chunks");

plt.title("Chunks Based On Embedding Breakpoints")
plt.xlabel("Index of sentences")
plt.ylabel("Cosine distance between sequential sentences")
plt.show()

In [None]:
# Text of every chunk, in document order
chunks = []

# Position of the first sentence that has not been assigned to a chunk yet
start_index = 0

# Each breakpoint closes a chunk that ends with the sentence at that position
for index in indices_above_thresh:
    stop = index + 1
    chunks.append(' '.join(entry['sentence'] for entry in sentences[start_index:stop]))
    start_index = stop

# Whatever follows the last breakpoint forms the final chunk
if start_index < len(sentences):
    chunks.append(' '.join(entry['sentence'] for entry in sentences[start_index:]))

# Show the last three chunks (numbered from 0 within this preview)
for i, chunk in enumerate(chunks[-3:]):
    print(f"Chunk #{i}")
    pprint.pprint(chunk)
    print("\n")

In [None]:
import pandas as pd
from langchain_community.document_loaders.csv_loader import CSVLoader

# Create a DataFrame with one row per chunk (a single column, labelled 0)
df = pd.DataFrame(chunks)
display(df[:3])

# Save the DataFrame to a pipe-delimited CSV file
csv_file_path = "temporary_file.csv"
df.to_csv(csv_file_path, sep = "|", index=False)

# Read the file back with the SAME delimiter it was written with. With the
# loader's default comma delimiter, any chunk containing a comma would be
# split into several fields and its text mangled.
loader = CSVLoader(file_path=csv_file_path, csv_args={"delimiter": "|"})
data = loader.load()
data[:3]

In [None]:
# Embed every chunk document with `embd` and index the vectors in FAISS
vectorstore = FAISS.from_documents(documents=data, embedding=embd)
# Number of documents the retriever returns per query; also written to the
# results CSV as "retrieved_doc_k".
k = 4
retriever = vectorstore.as_retriever(search_kwargs={"k": k})

In [None]:
# Test question (Italian); it is reused unchanged for the RAG chain and for
# the results CSV.
question = "Come posso decidere se nel calcolo della percentuale di saturazione del contratto vadano considerate anche la quantità in previsione?"
# Retrieval on its own, to inspect which chunks are returned for the question
retrieved_docs = retriever.invoke(question)
pprint.pprint(retrieved_docs)

In [None]:
# Chain
# The input question goes to the retriever, whose documents are merged by
# format_docs into {context}; the same question is passed through unchanged
# as {question}. The filled prompt is sent to the model and the reply is
# parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | generative_model
    | StrOutputParser()
)

# Question
# Note: this runs retrieval again inside the chain, independently of the
# standalone `retrieved_docs` lookup.
rag_chain_response = rag_chain.invoke(question)

In [None]:
import os
import json
import pandas as pd

# Manually convert Document objects to dictionaries so they can be serialized
retrieved_docs_dict = [
    {"metadata": doc.metadata, "page_content": doc.page_content}
    for doc in retrieved_docs
]

# The new entry to be added (one row of the results file)
new_entry = {
    "question": question,
    "retrieved_doc": json.dumps(retrieved_docs_dict),  # Convert the list of dicts to JSON string
    "retrieved_doc_k": k,
    "rag_chain_response": rag_chain_response,
    "chunking": chunking_description,
    "description": preprocessing_description
}

# Define output file path
output_file = "retrieved_docs_rag_chain_results.csv"

# Convert the new entry into a one-row DataFrame; its columns follow the key
# order of `new_entry`, which is the column layout of the results file.
new_entry_df = pd.DataFrame([new_entry])

# Append to the existing results when the file is already there. Otherwise
# start from the new row alone: concatenating with an empty DataFrame is
# avoided because its dtype handling is deprecated in recent pandas versions.
if os.path.exists(output_file):
    existing_df = pd.read_csv(output_file, delimiter="|")
    df = pd.concat([existing_df, new_entry_df], ignore_index=True)
else:
    df = new_entry_df

# Re-running the cell with identical inputs must not create duplicate rows
df = df.drop_duplicates(ignore_index=True)

# Rewrite the whole file (pipe-delimited, with header)
df.to_csv(output_file, mode="w", header=True, index=False, sep="|")
print(f"Data successfully saved to {output_file}")

# Analyze the results with different preprocessing steps

In [None]:
import csv
import pandas as pd

# File path (same as the output file from the previous script)
input_file = "retrieved_docs_rag_chain_results.csv"

# Read the CSV into a DataFrame (pipe-delimited, matching how it was written)
df = pd.read_csv(input_file, delimiter="|")

# Display the DataFrame
display(df)

In [None]:
import json

# Access the first row (index 0)
row = df.iloc[0]

# Deserialize the JSON string back into a Python object (list of dictionaries)
# Each dictionary carries the "metadata" and "page_content" keys written when
# the results were saved.
retrieved_doc = json.loads(row["retrieved_doc"])

# Print every retrieved document of that row, one block per document
for doc in retrieved_doc:
    print(f"  Metadata: {doc['metadata']}")
    pprint.pprint(f"  Page Content: {doc['page_content']}")
    print("-" * 80)  # Print a separator line for better readability