# Import libraries

In [None]:
import pandas as pd
import os
from langchain_community.document_loaders import DataFrameLoader
import pprint
from FlagEmbedding import BGEM3FlagModel
from langchain_community.vectorstores import FAISS
from sklearn.metrics.pairwise import cosine_similarity
from langchain.prompts import ChatPromptTemplate

import os
from langchain_openai import ChatOpenAI
# Enable LangChain tracing and point the client at the LangSmith endpoint.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# The API keys must already be provided by the environment. Re-assigning
# `os.getenv(name)` back into `os.environ[name]` is a no-op when the variable
# is set and raises a TypeError when it is not (os.environ only accepts str),
# so check for presence and emit an actionable warning instead.
import warnings

for _required_key in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY'):
    if not os.getenv(_required_key):
        warnings.warn(
            f"Environment variable {_required_key} is not set; "
            "calls that depend on it will fail until it is exported."
        )

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

import os
import google.generativeai as genai
from IPython.display import Markdown

# Configure the google.generativeai SDK with the key read from the
# GOOGLE_API_KEY environment variable (os.getenv yields None when it is unset).
genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
from langchain_google_genai import ChatGoogleGenerativeAI

# Auxiliary functions

## Data preprocessing

In [None]:
def combine_sentences(sentences, buffer_size=1):
    """Attach a context window of neighbouring sentences to every entry.

    For each dict in ``sentences`` the texts of up to ``buffer_size``
    preceding entries, the entry itself and up to ``buffer_size`` following
    entries are joined with single spaces and stored under the
    ``'combined_sentence'`` key.

    Args:
        sentences: List of dicts, each holding its text under ``'sentence'``.
            The dicts are updated in place.
        buffer_size: Number of neighbours to include on each side.

    Returns:
        The same ``sentences`` list that was passed in.
    """
    total = len(sentences)

    for position, entry in enumerate(sentences):
        # Clamp the window to the list bounds so the first and last
        # sentences simply get fewer neighbours.
        first_before = max(0, position - buffer_size)
        last_after = min(total, position + 1 + buffer_size)

        before = [sentences[k]['sentence'] for k in range(first_before, position)]
        after = [sentences[k]['sentence'] for k in range(position + 1, last_after)]

        entry['combined_sentence'] = ' '.join(before + [entry['sentence']] + after)

    return sentences

In [None]:
# Load the BAAI/bge-m3 model once at module level so every embedding call
# reuses it. `use_fp16=True` requests half-precision inference (presumably to
# cut memory use and latency; confirm against the FlagEmbedding docs).
model_fp16 = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

In [None]:
class M3EmbeddingFP16:
    """Thin adapter that exposes the module-level ``model_fp16`` as an embedder.

    Instances are callable, so they can be handed to code that expects a
    plain embedding function as well as to code that calls
    ``embed_documents`` explicitly.
    """

    def embed_documents(self, texts):
        """Encode ``texts`` with ``model_fp16`` and return its ``'dense_vecs'`` entry."""
        encoded = model_fp16.encode(texts)
        return encoded['dense_vecs']

    def __call__(self, texts):
        """Delegate to :meth:`embed_documents`."""
        return self.embed_documents(texts)


# Shared embedder instance used by the chunking and retrieval cells.
embd = M3EmbeddingFP16()

In [None]:
def calculate_cosine_distances(sentences):
    """Compute the cosine distance between each sentence and its successor.

    Every dict in ``sentences`` must carry its vector under
    ``'combined_sentence_embedding'``. The distance to the following
    sentence is written back under ``'distance_to_next'``; the last
    sentence has no successor and gets ``None``.

    Args:
        sentences: List of sentence dicts, updated in place.

    Returns:
        A list with ``len(sentences) - 1`` distances (empty when there are
        fewer than two sentences).
    """
    distances = []

    # Nothing to compare and no "last sentence" to annotate; indexing
    # sentences[-1] below would raise IndexError on an empty list.
    if not sentences:
        return distances

    for current, following in zip(sentences, sentences[1:]):
        similarity = cosine_similarity(
            [current['combined_sentence_embedding']],
            [following['combined_sentence_embedding']],
        )[0][0]

        # Cosine distance is the complement of cosine similarity.
        distance = 1 - similarity
        distances.append(distance)
        current['distance_to_next'] = distance

    # The final sentence has no neighbour to its right.
    sentences[-1]['distance_to_next'] = None

    return distances

## Generative part

In [None]:
# Prompt: Italian-language instructions for a customer-support assistant that
# must answer only from the supplied documents. The template exposes two
# placeholders: `{context}` (the documents to answer from) and `{question}`
# (the user's question about the Panthera software).
template = """
Comportati come un assistente che risponde alle domande del cliente.
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}
Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.
Evita troppe ripetizioni nella risposta fornita.
Quando spieghi che cosa è o cosa significa un certo elemento richiesto, non parlarne come se fosse un problema.

In caso di più domande rispondi solo a quelle inerenti alla documentazione e rimani a disposizione per altre domande sull'argomento, specificando,
invece, che le altre domande non sono state trovate pertinenti in questo contesto.

Domanda relativa al software Panthera: {question}
"""

# Build the chat prompt object from the raw template string.
prompt = ChatPromptTemplate.from_template(template)
# Bare expression kept from the notebook; it is not the last statement of the
# cell, so it does not produce any cell output.
prompt

# Post-processing
def format_docs(splits):
    """Concatenate the ``page_content`` of every item in ``splits``.

    Consecutive contents are separated by one blank line. An empty
    iterable yields an empty string.
    """
    contents = [split.page_content for split in splits]
    separator = "\n\n"
    return separator.join(contents)

In [None]:
# LLM: OpenAI chat model "gpt-4o" with temperature 0.
# NOTE: `generative_model` is reassigned to a ChatGoogleGenerativeAI instance
# in a later cell, so whichever of the two cells ran last determines the model
# actually used.
generative_model = ChatOpenAI(model_name="gpt-4o", temperature=0)

# Import dataset

In [None]:
# The dataset lives in a sibling folder of the current working directory:
# <parent of cwd>/Doc_Panthera_Augmented/augmented_dataset_final_outputs.csv
# os.path.join is used instead of string concatenation so the separator is
# correct on every platform and no doubled separator appears when the parent
# directory is the filesystem root.
path = os.getcwd()
csv_input_path = os.path.join(
    os.path.dirname(path),
    "Doc_Panthera_Augmented",
    "augmented_dataset_final_outputs.csv",
)
# Notebook cell output: show the resolved path.
csv_input_path

In [None]:
# Read the CSV file at `csv_input_path` into a DataFrame (decoded as UTF-8)
df = pd.read_csv(csv_input_path, encoding='utf-8')

# Render the DataFrame in the notebook output to check the contents
# (`display` is provided by the IPython/Jupyter environment)
display(df)

# Clean data

In [None]:
# Turn the DataFrame rows into LangChain documents; the "Text" column
# supplies each document's page_content.
loader = DataFrameLoader(df, page_content_column="Text")
docs_data = loader.load()

In [None]:
# Gemini chat model with temperature 0.
# NOTE: this rebinds `generative_model`, replacing the ChatOpenAI instance
# assigned in the earlier "LLM" cell.
generative_model = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro-latest",
    temperature=0
)

In [None]:
import importlib
import Data_preprocessing
# Reload the project-local module so that edits made to it since the first
# import are picked up without restarting the kernel.
importlib.reload(Data_preprocessing)

# Initialize the Preprocessing object
preprocessing = Data_preprocessing.Preprocessing()

# Iterate through each document in docs_data and clean the text.
# The documents are modified in place: page_content is overwritten with the
# output of `clean_text_template`.
for doc in docs_data:
    cleaned_content = preprocessing.clean_text_template(doc.page_content)
    doc.page_content = cleaned_content

# Print all cleaned documents for a visual check.
pprint.pprint(docs_data)

In [None]:
import re
import numpy as np
import pandas as pd
import pprint
from langchain_community.document_loaders.csv_loader import CSVLoader
from sklearn.metrics.pairwise import cosine_similarity

# Function to chunk a document semantically based on cosine distances
def chunk_document(doc, embd, buffer_size=1, breakpoint_percentile_threshold=95, max_seq_length=8000):
    """Split ``doc.page_content`` into chunks at large semantic jumps.

    The text is split into sentences on whitespace that follows ``.``, ``?``
    or ``!``. Each sentence is embedded together with ``buffer_size``
    neighbours on either side, and the cosine distance between consecutive
    embeddings is computed. A chunk boundary is placed after every sentence
    whose distance to the next one is strictly above the
    ``breakpoint_percentile_threshold``-th percentile of all distances.

    Args:
        doc: Object exposing ``metadata`` and ``page_content`` attributes.
        embd: Callable that maps a list of strings to a sequence of
            embedding vectors, one per string.
        buffer_size: Number of neighbouring sentences added on each side
            before embedding.
        breakpoint_percentile_threshold: Percentile (0-100) of the distance
            distribution used as the split threshold.
        max_seq_length: Maximum number of *characters* of each combined
            sentence passed to ``embd``; longer inputs are truncated. The
            chunk text itself is never truncated.

    Returns:
        A list of chunk strings. A document with a single sentence is
        returned as one chunk without calling ``embd``.
    """
    # Split text into sentences
    pprint.pprint(doc.metadata)
    single_sentences_list = re.split(r'(?<=[.?!])\s+', doc.page_content)
    print(f"{len(single_sentences_list)} sentences were found")

    # With fewer than two sentences there are no consecutive pairs, hence no
    # distances: the percentile of an empty list is undefined, and embedding
    # the text would be wasted work. The whole document is a single chunk.
    if len(single_sentences_list) < 2:
        return list(single_sentences_list)

    # Create sentence dictionaries
    sentences = [{'sentence': x, 'index': i} for i, x in enumerate(single_sentences_list)]

    # Combine sentences with buffer (e.g., 1 sentence buffer to smooth chunks)
    sentences = combine_sentences(sentences, buffer_size=buffer_size)

    # Cap the embedding input length (in characters); slicing is a no-op for
    # strings that are already short enough.
    for sentence in sentences:
        sentence['combined_sentence'] = sentence['combined_sentence'][:max_seq_length]

    # Embed all combined sentences in one call and attach each vector to its sentence
    embeddings = embd([x['combined_sentence'] for x in sentences])
    for i, sentence in enumerate(sentences):
        sentence['combined_sentence_embedding'] = embeddings[i]

    # Calculate cosine distances between consecutive sentences
    distances = calculate_cosine_distances(sentences)

    # Set distance threshold for breakpoints
    breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)

    # Indices of sentences after which a new chunk starts
    indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]

    # Create chunks based on breakpoints; the breakpoint sentence itself
    # closes the current chunk.
    start_index = 0
    chunks = []
    for index in indices_above_thresh:
        group = sentences[start_index:index + 1]
        chunks.append(' '.join(d['sentence'] for d in group))
        start_index = index + 1

    # Handle last chunk if any sentences remain
    if start_index < len(sentences):
        chunks.append(' '.join(d['sentence'] for d in sentences[start_index:]))

    return chunks

# Assuming docs_data is a list of Document objects
all_chunks = []

for doc in docs_data:
    # Chunk each document semantically
    chunks = chunk_document(doc, embd, buffer_size=1, breakpoint_percentile_threshold=95)
    all_chunks.extend(chunks)

# Create a DataFrame with the chunks.
# NOTE: this rebinds `df`, replacing the dataset DataFrame loaded earlier.
df = pd.DataFrame(all_chunks, columns=['chunk'])
display(df[:3])

# Save the DataFrame to a CSV file, using "|" as the field separator
csv_file_path = "semantic_chunks_augmented_dataset.csv"
csv_delimiter = "|"
df.to_csv(csv_file_path, sep=csv_delimiter, index=False, encoding="utf-8")

# Load the data back with the CSVLoader. The delimiter and encoding must match
# the ones used when writing: with the loader's default comma delimiter every
# comma inside a chunk would be treated as a column break, and the platform
# default encoding is not guaranteed to be UTF-8.
loader = CSVLoader(
    file_path=csv_file_path,
    csv_args={"delimiter": csv_delimiter},
    encoding="utf-8",
)
data = loader.load()
data[:3]

In [None]:
# Load a previously saved FAISS index from the local "augmented_faiss_index"
# folder, using `embd` as the embedding function for queries.
# SECURITY: allow_dangerous_deserialization=True opts in to unpickling data
# stored with the index. Only load index folders that you created yourself or
# otherwise trust.
new_vectorstore = FAISS.load_local("augmented_faiss_index", embd, allow_dangerous_deserialization=True)
# Notebook cell output: the store object and the number of indexed vectors.
new_vectorstore,  new_vectorstore.index.ntotal

In [None]:
 # Index
# Data driven changes - change 'k' the number of retrieved documents given the query
retriever = new_vectorstore.as_retriever(search_kwargs={"k": 4})
question = "Quando mi conviene gestire un articolo a PSO rispetto a pianificazione?"
# `invoke` is the Runnable entry point for retrievers; it replaces the
# deprecated `get_relevant_documents` and returns the same list of documents.
docs = retriever.invoke(question)
docs