Basic RAG pipeline with Gemini: 
- no image-augmented dataset
- no pre-processing
- recursive chunking with baseline values: large chunks of size 3000 with an overlap of 500
- dense retriever with k = 6
- bge-m3
- no question re-writing
- no reranking
- no RRF
- simple prompt, without preprocessing
- Gemini LLM

# Import libraries

In [None]:
import pandas as pd
import pprint

# Import model

In [None]:
import os
import google.generativeai as genai
from IPython.display import Markdown

# Configure the google.generativeai client with the key read from the
# GOOGLE_API_KEY environment variable (os.getenv returns None when it is unset).
genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model used as the generation step of the RAG chain.
# temperature=0 is presumably chosen to keep answers as repeatable as possible.
model_gemini = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro-latest",
    temperature=0
)

# Load dataset - vectorstore

In [None]:
from FlagEmbedding import BGEM3FlagModel
from langchain_community.vectorstores import FAISS

# Shared BGE-M3 encoder instance, created with use_fp16=True; it is the model
# that M3EmbeddingFP16 delegates to.
model_fp16 = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

class M3EmbeddingFP16:
    """Thin adapter exposing the module-level ``model_fp16`` encoder as an embedder.

    Instances can be used either through ``embed_documents`` or by calling
    the instance directly; both paths return the same result.
    """

    def embed_documents(self, texts):
        """Encode ``texts`` with ``model_fp16`` and return its ``'dense_vecs'`` entry."""
        encoded = model_fp16.encode(texts)
        return encoded['dense_vecs']

    def __call__(self, texts):
        """Make the instance callable by forwarding to ``embed_documents``."""
        return self.embed_documents(texts)
    
# Embedding object passed to FAISS.load_local when the saved index is loaded.
embd = M3EmbeddingFP16()

In [None]:
# Contains the documents without any data preprocessing steps.
# NOTE(review): allow_dangerous_deserialization=True explicitly opts in to
# unsafe deserialization of the saved index — only load index folders you trust.
vectorstore = FAISS.load_local("local_model_index", embd, allow_dangerous_deserialization=True)
# Cell output: the store object together with the `ntotal` count of its index.
vectorstore, vectorstore.index.ntotal

In [None]:
# Dense retriever over the FAISS store. k is set to 6 so the code agrees with
# the "dense retriever with k = 6" setup described at the top of this notebook
# and with the 'Dense-6' label that is stored alongside the results.
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})

# Baseline - Gemini Basic RAG

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Italian prompt for the generation step. It tells the model to act as a
# customer assistant, to answer only from the documents injected as {context},
# and to be concise while explaining the required actions step by step.
# {question} is the customer's question about the Panthera software.
first_basic_template = """   
Comportati come un assistente che risponde alle domande del cliente.   
Rispondi alla domanda basandoti solo sui seguenti documenti: {context}.   

Rispondi in modo conciso e chiaro, spiegando passo passo al cliente le azioni necessarie da effettuare.   
Se possibile, dai indicazioni dettagliate al cliente, su come risolvere il problema o effettuare l'azione desiderata.

Domanda relativa al software Panthera: {question}   
"""

# Prompt object built from the template; it exposes the {context} and
# {question} placeholders that the chain fills in.
generation_prompt = ChatPromptTemplate.from_template(first_basic_template)

In [None]:
# Post-processing
def baseline_format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

# Chain
# The chain input (the question) is piped through the retriever and
# baseline_format_docs to fill {context}, while RunnablePassthrough forwards
# the same input unchanged as {question}. The filled prompt is sent to Gemini
# and StrOutputParser converts the model reply into a plain string.
basic_rag_chain = (
    {"context": retriever | baseline_format_docs, "question": RunnablePassthrough()}
    | generation_prompt
    | model_gemini
    | StrOutputParser()
)

# Evaluate the baseline Basic RAG pipeline on a small testset

In [None]:
import pandas as pd

# Load the saved test set from CSV; the cells below read its `question`
# and `answer` columns.
eval_df = pd.read_csv('filtered_matching_questions.csv')

# Show the loaded DataFrame in the notebook output
display(eval_df)

In [None]:
import importlib
import E2E_Evaluation_metrics
importlib.reload(E2E_Evaluation_metrics)
from E2E_Evaluation_metrics import RAGEvaluator
from E2E_Evaluation_metrics import SemScoreQueryRewriting

# Metric helpers from the local E2E_Evaluation_metrics module, which is
# reloaded just above so that edits to it are picked up by this session.
evaluator = RAGEvaluator()
semscore = SemScoreQueryRewriting()

In [None]:
import pandas as pd

def generate_answers(generation_chain, df, model_name, chunking_type, preprocessing, retriever, techniques):
    """
    Run the generation chain on every question and tag each row with the run setup.

    Parameters:
    - generation_chain (object): Object exposing ``invoke(question)``; it is called
      once per row, in row order.
    - df (pd.DataFrame): Input frame with a ``question`` column. It is not modified.
    - model_name, chunking_type, preprocessing, retriever, techniques: Labels
      describing the run. They are stored unchanged in the ``model``, ``chunking``,
      ``preprocessing``, ``retriever`` and ``advanced_techniques`` columns.

    Returns:
    - pd.DataFrame: Copy of ``df`` with ``generated_answer`` and the five
      metadata columns added.
    """
    # Work on a copy so the caller's frame is left untouched.
    new_df = df.copy()

    # Answers are collected and assigned by position rather than written back
    # with label-based `.at[idx, ...]`: a label write hits every row sharing
    # that label, so a frame with a non-unique index would get its answers
    # overwritten.
    answers = [generation_chain.invoke(question) for question in new_df["question"]]
    new_df['generated_answer'] = answers

    # The run metadata is the same for every row, so broadcast it once
    # instead of re-writing it on each loop iteration.
    new_df['model'] = model_name
    new_df['chunking'] = chunking_type
    new_df['preprocessing'] = preprocessing
    new_df['retriever'] = retriever
    new_df['advanced_techniques'] = techniques

    return new_df

In [None]:
# Generate an answer for every test question with the baseline chain and
# record this run's configuration labels on each row.
df = generate_answers(basic_rag_chain, eval_df, 'Gemini', 'Recursive Baseline', 'No Preprocessing and no augmentation', 'Dense-6', 'No advanced techniques, but prompt engineering')

In [None]:
# Metric columns removed from the evaluator output before it is merged with
# the answers (default `columns_to_drop` of process_evaluation_and_metrics).
columns_to_drop = ['BLEU', 'ROUGE-2', 'ROUGE-L', 'BERT P', 'BERT R', 'Perplexity', 'Diversity']

def _is_blank(value):
    """Return True for values that carry no text: None, NaN/NA or an empty value."""
    if isinstance(value, str):
        return value == ""
    if value is None or value is pd.NA:
        return True
    # Empty CSV cells are loaded as float NaN, which is truthy and would
    # otherwise slip past a plain `not value` check.
    if isinstance(value, float) and value != value:
        return True
    return not value


def evaluate_responses(eval_df, evaluator):
    """
    Score every generated answer against its reference answer.

    Parameters:
    - eval_df (pd.DataFrame): Frame with ``generated_answer`` and ``answer`` columns.
    - evaluator (object): Object exposing ``evaluate_all(response, reference)``;
      each result becomes one row of the returned frame.

    Returns:
    - pd.DataFrame: One row of metrics per evaluated input row, indexed with the
      index label of the row it was computed from. Rows whose response or
      reference is empty are skipped and therefore absent from the result.
    """
    kept_labels = []
    results = []
    for label, row in eval_df.iterrows():
        response = row['generated_answer']
        reference = row['answer']

        # Nothing to compare when either side is empty: skip the row.
        if _is_blank(response) or _is_blank(reference):
            continue

        results.append(evaluator.evaluate_all(response, reference))
        kept_labels.append(label)

    # Keep the source row labels so a later index-aligned merge (for example
    # pd.concat(..., axis=1)) attaches each metric row to the question it
    # belongs to, even when some rows were skipped.
    return pd.DataFrame(results, index=kept_labels)


def process_evaluation_and_metrics(data_frame, model_name, evaluator = evaluator, semscore = semscore, columns_to_drop = columns_to_drop):
    """
    Score the generated answers and return them next to their metrics.

    Parameters:
    - data_frame (pd.DataFrame): Frame holding the reference answers (``answer``)
      and the generated ones (``generated_answer``).
    - model_name (str): Forwarded as ``model_name`` to ``semscore.compute_sem_score``.
    - evaluator (object): Passed to ``evaluate_responses`` to compute the metrics.
    - semscore (object): Object exposing ``compute_sem_score``; the first element
      of its result provides the ``Cosine_Similarity`` values.
    - columns_to_drop (list): Metric columns removed from the evaluator output;
      names that are not present are ignored.

    Returns:
    - pd.DataFrame: ``data_frame`` concatenated column-wise with the kept metrics
      and a ``SemScore`` column.
    """
    # Metrics from the evaluator, without the columns we do not report.
    metrics = evaluate_responses(data_frame, evaluator).drop(columns=columns_to_drop, errors="ignore")

    # Semantic similarity between each reference answer and generated answer.
    similarity_table, _ = semscore.compute_sem_score(
        data_frame,
        model_name=model_name,
        reference="answer",
        response="generated_answer",
    )
    metrics["SemScore"] = similarity_table["Cosine_Similarity"]

    # Place the metrics side by side with the original rows.
    return pd.concat([data_frame, metrics], axis=1)

In [None]:
# Score the generated answers; 'BAAI/bge-m3' is forwarded as the model name
# for the semantic-similarity score. Note that this rebinds `eval_df` from the
# raw test set to the merged results frame.
eval_df = process_evaluation_and_metrics(
    data_frame=df, 
    model_name='BAAI/bge-m3'
)

display(eval_df)

In [None]:
# Optionally, save the per-question results (answers, run labels and metrics)
# to a CSV file, without writing the index column
eval_df.to_csv('ResultsOnTestset/Baseline.csv', index=False)

In [None]:
import pandas as pd

def compute_average_value(df, output_file):
    """
    Average the run's metrics, append them as one row to a CSV file and print them.

    Parameters:
    - df (pd.DataFrame): Per-question results with the ``ROUGE-1``, ``BERT F1`` and
      ``SemScore`` metric columns and the ``model``, ``chunking``, ``preprocessing``,
      ``retriever`` and ``advanced_techniques`` label columns.
    - output_file (str or path-like): CSV file to append to. Its parent folder is
      created when missing, and the header row is written only when the file
      does not exist yet or is empty.

    Returns:
    - None
    """
    # Local import keeps this cell runnable on its own.
    import os

    # Compute the averages
    mean_rouge = df['ROUGE-1'].mean()
    mean_bert = df['BERT F1'].mean()
    mean_sem = df['SemScore'].mean()

    # Distinct run labels (arrays of the unique values found in each column)
    model = df['model'].unique()
    chunking = df["chunking"].unique()
    preprocessing = df['preprocessing'].unique()
    retriever = df['retriever'].unique()
    advanced_techniques = df["advanced_techniques"].unique()

    # One summary row for this run
    results = {
        'Model': model,
        'Chunking': chunking,
        'Preprocessing': preprocessing,
        'Retriever': retriever,
        'Advanced Techniques': advanced_techniques,
        'Mean ROUGE-1': mean_rouge,
        'Mean BERT F1': mean_bert,
        'Mean SemScore': mean_sem
    }
    results_df = pd.DataFrame([results])

    # Decide whether the header is needed using only public APIs. An existing
    # but empty file still needs one, otherwise the CSV would end up headerless.
    write_header = True
    if isinstance(output_file, (str, os.PathLike)):
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            # Appending fails when the target folder is missing, so create it.
            os.makedirs(parent_dir, exist_ok=True)
        write_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0

    # Append the results to the CSV file (created on first use)
    results_df.to_csv(output_file, mode='a', header=write_header, index=False)

    # Print the results
    print("Model:", model, "with chunking of type:", chunking, "that uses", 
          preprocessing, retriever, advanced_techniques)
    print(f"Mean ROUGE-1: {mean_rouge}")
    print(f"Mean BERT F1: {mean_bert}")
    print(f"Mean SemScore: {mean_sem}")

In [None]:
# Append this run's mean scores to the summary CSV and print them.
compute_average_value(eval_df, "ResultsMeanScore/Baseline.csv")