# In [None]:
import pandas as pd
import numpy as np
import string
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms import Ollama
from qdrant_client.http import models
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from rouge_score import rouge_scorer
from langchain.load import dumps, loads
from operator import itemgetter
import warnings

# Suppress UserWarning from transformers related to Flash Attention
# (the filter only matches warnings attributed to the BERT modeling module).
warnings.filterwarnings("ignore", category=UserWarning, module="transformers.models.bert.modeling_bert")

# Constants
# Input CSVs: the company corpus and the question / expected-answer reference set.
COMPANY_DATA = './data/companies_data.csv'
REFERENCE_DATA = './data/references.csv'
# Vector size used when the Qdrant collection is created.
# NOTE(review): presumably has to equal the output dimension of MODEL_NAME - confirm.
VECTOR_SIZE = 1024
# Chunking parameters shared by both text splitters below.
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 200
# Model name handed to the Ollama LLM wrapper.
LLM_PRETRAINED_MODEL = "llama3.1"
# Model name handed to HuggingFaceBgeEmbeddings.
MODEL_NAME = "BAAI/bge-large-en"
# Qdrant endpoint and the collection the document chunks are stored in.
QDRANT_URL = "http://localhost:6333"
COLLECTION_NAME = "vector_db"
def pre_process(df):
    """Return the first two columns of *df* with ASCII punctuation removed.

    Only the first two columns are kept; any further columns are dropped and
    the result is a new DataFrame with a default index.

    Args:
        df: DataFrame with at least two columns.

    Returns:
        A new DataFrame holding the two cleaned columns under their original
        names. Cells that are not strings (for example missing values) are
        passed through unchanged instead of raising.

    Raises:
        IndexError: if *df* has fewer than two columns.
    """
    cols = df.columns
    # Build the deletion table once rather than once per cell.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _clean(value):
        # Only strings have .translate(); NaN/None cells would otherwise crash.
        if isinstance(value, str):
            return value.translate(strip_punctuation)
        return value

    return pd.DataFrame({
        cols[0]: [_clean(value) for value in df[cols[0]].tolist()],
        cols[1]: [_clean(value) for value in df[cols[1]].tolist()],
    })

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two equal-length vectors.

    Args:
        vec1: First vector (any sequence accepted by ``np.dot``).
        vec2: Second vector, same length as *vec1*.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
        vector has zero norm (the ratio is undefined there and would
        otherwise evaluate to NaN).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard the 0/0 case so one degenerate embedding cannot turn the
    # averaged similarity into NaN.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

def evaluate_result(result, ground_truth):
    """Score *result* against *ground_truth* with the module-level ``scorer``.

    Args:
        result: Generated answer text.
        ground_truth: Reference answer text (passed to the scorer first).

    Returns:
        A ``(rouge1, rouge2, rougeL)`` tuple of F-measure values.
    """
    rouge = scorer.score(ground_truth, result)
    return tuple(rouge[metric].fmeasure for metric in ('rouge1', 'rouge2', 'rougeL'))

def get_unique_union(documents: list[list]):
    """Return the de-duplicated union of several lists of retrieved docs.

    Args:
        documents: One list of documents per query.

    Returns:
        A flat list with each distinct document once, in the order it was
        first seen across the input lists.
    """
    # Flatten list of lists, and convert each Document to string so that
    # equal documents collapse onto the same key.
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # dict.fromkeys de-duplicates while keeping first-seen order; iterating a
    # set of strings instead would give an order that changes between runs.
    unique_docs = list(dict.fromkeys(flattened_docs))
    return [loads(doc) for doc in unique_docs]

# Reference set: questions and their expected answers, cleaned by pre_process
df_references = pd.read_csv(REFERENCE_DATA)
df_references = pre_process(df_references)
questions, answers = df_references['question'], df_references['expected_answer']

# Company corpus: cleaned by pre_process, then one row kept per companyName
df = pre_process(pd.read_csv(COMPANY_DATA)).drop_duplicates(subset=['companyName'])
company_names, descriptions = df['companyName'].tolist(), df['description'].tolist()

# Split each labelled field into chunks with the character splitter
text_splitter = CharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
company_name_tokens = [
    text_splitter.split_text('companyName: ' + company)
    for company in company_names
]
description_tokens = [
    text_splitter.split_text('description: ' + text)
    for text in descriptions
]

# One Document per company: name chunks followed by description chunks,
# newline-joined, tagged with the company's position as its id
corpus = [
    Document(page_content="\n".join(name_chunks + desc_chunks), metadata={"id": doc_id})
    for doc_id, (name_chunks, desc_chunks) in enumerate(zip(company_name_tokens, description_tokens))
]

# Re-split the assembled documents with the recursive splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
splits = text_splitter.split_documents(corpus)

# Generator LLM served through Ollama (original note: LLAMA 3.1, 8B parameters)
llm = Ollama(model=LLM_PRETRAINED_MODEL)

# Embedding model: runs on the 'cuda' device, embeddings are not normalized
model_kwargs = dict(device='cuda')
encode_kwargs = dict(normalize_embeddings=False)
embeddings = HuggingFaceBgeEmbeddings(
    model_name=MODEL_NAME, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# Qdrant client over HTTP (gRPC not preferred)
client = QdrantClient(url=QDRANT_URL, prefer_grpc=False)

# Create the collection only when it is missing
collection_missing = not client.collection_exists(COLLECTION_NAME)
if collection_missing:
    vector_params = models.VectorParams(
        size=VECTOR_SIZE,  # Adjust according to your needs
        distance=models.Distance.COSINE,
    )
    client.create_collection(collection_name=COLLECTION_NAME, vectors_config=vector_params)

# Embed the chunks and store them in the collection.
# NOTE(review): this runs on every execution; if the collection already holds
# these chunks they may be inserted again - confirm whether it should be
# recreated between runs.
db = Qdrant.from_documents(
    splits, embeddings, url=QDRANT_URL, prefer_grpc=False, collection_name=COLLECTION_NAME
)

# Retriever returning the top 2 matches per query
retriever = db.as_retriever(search_kwargs={"k": 2})

# Multi-query: ask the LLM for alternative phrasings of the user question
template = """You are an AI language model assistant. Your task is to generate three 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)


def _split_queries(text):
    """Turn the LLM output into a list with one query per non-blank line."""
    # Blank or whitespace-only lines (e.g. spacing between the generated
    # questions) would otherwise be sent to the retriever as empty queries.
    return [line.strip() for line in text.split("\n") if line.strip()]


generate_queries = (
    prompt_perspectives
    | llm
    | StrOutputParser()
    | _split_queries
)
# Chain of multi-query generation: retrieve per query, then take the unique
# union of all retrieved docs
retrieval_chain = generate_queries | retriever.map() | get_unique_union

# Initialize rouge scorer and the per-question metric accumulators
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1_scores = []
rouge2_scores = []
rougel_scores = []

cosine_similarities = []

# Main prompt template. It does not depend on the question, so the prompt and
# the chain are built once instead of on every loop iteration. The leading
# spaces inside the string are kept so the prompt text itself is unchanged.
template = """You are an AI assistant that you know some companies and their description. Consider companyName and its description as a pair to answer the question. Some questions give you some information about a company and ask you its name, some questions ask some details about a company name. Answer the question in a single sentence. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that I don't know, don't try to make up an answer.
    {context}
    Question: {question}
    """

prompt = ChatPromptTemplate.from_template(template)

# The chain runs the multi-query retrieval itself to fill {context}, so no
# separate retrieval call is needed per question (a second call would repeat
# the query-generation and retrieval work and discard the result).
final_rag_chain = (
    {"context": retrieval_chain,
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

# Answer every reference question and score it against its expected answer
for idx, (question, ground_truth) in enumerate(zip(questions, answers)):
    print(f"Question {idx} :")
    print(question + "?")
    print("GROUND TRUTH:")
    print(ground_truth)

    print("RAG Multi-Query RESULT:")
    result = final_rag_chain.invoke({"question": question})
    print(result)
    print('-' * 99)

    rouge1, rouge2, rougel = evaluate_result(result, ground_truth)
    rouge1_scores.append(rouge1)
    rouge2_scores.append(rouge2)
    rougel_scores.append(rougel)

    # Cosine similarity between the embeddings of the answer and the reference
    result_embedding = embeddings.embed_query(result)
    ground_truth_embedding = embeddings.embed_query(ground_truth)
    cosine_sim = cosine_similarity(result_embedding, ground_truth_embedding)
    cosine_similarities.append(cosine_sim)

# Report the mean of each metric over all questions
print("Evaluation: ")
print(f"ROUGE-1: {np.mean(rouge1_scores):.3f}")
print(f"ROUGE-2: {np.mean(rouge2_scores):.3f}")
print(f"ROUGE-L: {np.mean(rougel_scores):.3f}")

print('.' * 99)
print(f"COSINE_SIMILARITY: {np.mean(cosine_similarities):.3f}")
