In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import torch
import os


In [None]:
# This is our RAG pipeline, organized behind a single entry-point function.

# Loaded models keyed by (embedding_model_name, model_name, device) so that
# repeated queries (e.g. answering a batch of questions in a loop) do not
# reload the weights from disk on every call.
_RAG_MODEL_CACHE = {}


def _load_rag_models(embedding_model_name, model_name, device):
    """Return (embedding_model, tokenizer, generator_model), loading each combination only once."""
    cache_key = (embedding_model_name, model_name, str(device))
    if cache_key not in _RAG_MODEL_CACHE:
        # Half precision is only requested on GPU; on CPU float16 is slow and,
        # depending on the PyTorch version, not implemented for every op.
        dtype = torch.float16 if device.type == "cuda" else torch.float32
        embedding_model = SentenceTransformer(embedding_model_name).to(device)
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        generator_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype).to(device)
        _RAG_MODEL_CACHE[cache_key] = (embedding_model, tokenizer, generator_model)
    return _RAG_MODEL_CACHE[cache_key]


def query_answering_system(query, document_dataset, embedding_model_name='BAAI/bge-small-en', model_name="Qwen/Qwen2.5-0.5B", k=3, embedding_cache_path='./document_embeddings.npy'):
    """
    Given a query and a document dataset, this function retrieves relevant documents and generates an answer.

    Parameters:
    - query (str): The user query.
    - document_dataset (str): Path to the CSV file containing documents with 'title' in column index 1 and 'text' in column index 2.
    - embedding_model_name (str): Name of the sentence embedding model.
    - model_name (str): Name of the language model.
    - k (int): Number of top similar documents to retrieve (must be >= 1; if it exceeds the number of documents, all documents are used).
    - embedding_cache_path (str): Path to store/load document embeddings.

    Returns:
    - answer (str): Generated response based on retrieved documents.

    Raises:
    - ValueError: If k is smaller than 1 or the dataset contains no rows.
    """
    # A non-positive k would make the `[-k:]`-style slice select *every* document
    # (k == 0) or an unintended subset (k < 0), so reject it up front.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Check if GPU is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type == "cuda":
        torch.cuda.empty_cache()

    # Load pre-trained models (cached across calls)
    embedding_model, tokenizer, generator_model = _load_rag_models(embedding_model_name, model_name, device)

    # Load dataset; columns are addressed by position (1 = title, 2 = text)
    df = pd.read_csv(document_dataset, encoding='utf-8')
    documents = [f"title: {d[1]}.  text: {d[2]}" for d in df.values.tolist()]
    if not documents:
        raise ValueError(f"No documents found in {document_dataset!r}")

    # Function to generate embeddings
    def generate_embeddings(texts, show_progress_bar=True):
        return embedding_model.encode(texts, show_progress_bar=show_progress_bar, batch_size=160, device=device)

    # The query is embedded first so its dimensionality can be used to validate the cache.
    query_embedding = generate_embeddings([query], show_progress_bar=False)

    # Generate/load document embeddings. A cached file is only trusted when its
    # shape matches the current dataset size and embedding dimension; this catches
    # a cache built from a different dataset size or embedding model (it cannot
    # detect edited document text with an unchanged row count).
    documents_embedding = None
    if os.path.exists(embedding_cache_path):
        cached_embedding = np.load(embedding_cache_path)
        if cached_embedding.shape == (len(documents), query_embedding.shape[1]):
            documents_embedding = cached_embedding
    if documents_embedding is None:
        documents_embedding = generate_embeddings(documents)
        # Writing through a file handle keeps the exact path: np.save(path, ...)
        # would append ".npy" to paths lacking it, so the exists() check above
        # would never find the cache again.
        with open(embedding_cache_path, "wb") as cache_file:
            np.save(cache_file, documents_embedding)

    # Retrieve top-k similar documents (highest cosine similarity first)
    similarities = cosine_similarity(query_embedding, documents_embedding)[0]
    most_similar_indices = np.argsort(similarities)[::-1][:k]
    retrieved_docs = [documents[i] for i in most_similar_indices]

    # Construct the prompt
    prompt = "Given the following documents:\n"
    prompt += "\n".join(f"{i+1}. {doc}" for i, doc in enumerate(retrieved_docs))
    prompt += f"\n\nUser query: {query}\n\n"
    prompt += "Based on the above documents, provide a concise, clear, and logically structured answer to the user's query.\n"
    prompt += "Also please give me the basis for your answer."

    # Generate response
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(generator_model.device)

    generated_ids = generator_model.generate(**model_inputs, max_new_tokens=512)
    # Drop the prompt tokens so only the newly generated continuation is decoded
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    answer = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return answer


In [None]:
# Example usage of the RAG pipeline: ask one question against the 1K news corpus
# and show the generated answer.
document_dataset = "./data/1K_news.csv"
query = "What option do civil servants in Malaysia have for their working hours during Ramadan, according to Communications Minister Fahmi Fadzil?"
result = query_answering_system(query=query, document_dataset=document_dataset)
print(result)

In [None]:
from openai import OpenAI

# Build the client with the key taken from the environment. Only the `OpenAI`
# class is imported here, so the module-level `openai.api_key = ...` form would
# raise NameError; passing the key to the constructor is sufficient.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Work on the first 50 articles only. `.copy()` yields an independent frame so
# that adding columns later does not operate on a slice of the full dataset.
df = pd.read_csv('./data/1K_news.csv', encoding='utf-8').head(50).copy()

# Ensure df contains the column the question generator reads from
if "text" not in df.columns:
    raise KeyError("Expected a 'text' column in ./data/1K_news.csv")

def generate_question(text):
    """
    Ask the chat model for one short, specific question answerable from a news article.

    Relies on the module-level `client` for the API call.

    Parameters:
    - text (str): The news content the question must be based on.

    Returns:
    - question (str): The generated question with surrounding whitespace removed;
      an empty string if the response carries no text content.
    """
    # Each instruction sentence ends with a trailing space: adjacent string
    # literals are concatenated verbatim, so without it the sentences run
    # together ("...content.Also, ...").
    prompt = (
        "Generate a simple, easy-to-understand and not too long question based on the following news content. "
        "Also, the question must be specific and clear. For example, instead of using 'servants,' it should specify which region or country’s servants are being referred to. "
        "Additionally, the question should not be too difficult, and the answer must be explicitly contained within the following News content.\n\n"
        f"News content:\n{text}\n\n"
        "Question:"
    )

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": prompt
            }
        ]
    )

    # The message content may be None (e.g. when the model returns no text),
    # in which case calling .strip() directly would raise AttributeError.
    content = completion.choices[0].message.content
    return (content or "").strip()

# Apply the function to generate one question per article
df["generated_question"] = df["text"].apply(generate_question)

# Save the results to a CSV file
questions_path = "./data/50_news_questions.csv"
df.to_csv(questions_path, index=False, encoding='utf-8')

# Report the path that was actually written
print(f"Question generation completed. Results saved to {questions_path}")


In [None]:
df = pd.read_csv('./data/50_news_questions.csv', encoding='utf-8')
document_dataset = "./data/1K_news.csv"

# Answer every generated question with the RAG pipeline. Responses are collected
# in a list and assigned as a whole column afterwards, which avoids growing the
# frame cell-by-cell and keeps `query` bound to the question only.
responses = []
for i, query in enumerate(df['generated_question']):
    responses.append(query_answering_system(query, document_dataset))
    print(i)  # progress indicator: index of the question just answered

df['generated_response'] = responses

df.to_csv("./data/50_news_QA.csv", index=False, encoding='utf-8')