In [None]:
# !pip install -q -U transformers
!pip install -q transformers==4.50.0  # for using llm model's "microsoft/Phi-4-mini-instruct"
!pip install -q -U sentence-transformers

In [None]:
import transformers
# Show which transformers version is actually imported, so it can be compared with the pinned install.
print(f"Transformers version: {transformers.__version__}")

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Read two parquet files of the "rag-datasets/rag-mini-wikipedia" dataset through hf:// URLs:
# df_infos comes from the passages file, df_questions from the test file.
df_infos = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet")
df_questions = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet")

In [None]:
# Expose the passage column under the name "text" (the column name later handed to database_creator).
df_infos = df_infos.rename(columns={"passage": "text"})
# Replace the questions' index with a fresh default one, dropping the old index.
df_questions = df_questions.reset_index(drop=True)
# Separate copy that will receive the predictions, leaving df_questions untouched.
df_predict = df_questions.copy()

In [None]:
# Name of the sentence-embedding model handed to SentenceTransformer below.
model_similarity_name= "all-mpnet-base-v2"
# Name of the causal LM later passed to llm_loader; the commented lines are
# alternative model ids left in place for switching.
# llm_model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# llm_model_name = "Qwen/Qwen3-1.7B"
llm_model_name = "microsoft/Phi-4-mini-instruct"
# Module-level embedding model; database_creator and retrieve read this global directly.
model_similarity = SentenceTransformer(model_similarity_name)

In [None]:
def llm_loader(model_name:str):
  """Load a causal language model and its tokenizer by name.

  Args:
    model_name: Identifier passed unchanged to both `from_pretrained` calls.

  Returns:
    A `(model, tokenizer)` tuple, in that order.
  """
  # NOTE(review): trust_remote_code=True is set for the model load only (the
  # tokenizer load does not opt in); confirm it is still required for the
  # selected checkpoint, since it allows checkpoint-provided code to execute.
  llm = AutoModelForCausalLM.from_pretrained(
      model_name,
      trust_remote_code=True,
      device_map='auto',
  )
  llm_tokenizer = AutoTokenizer.from_pretrained(model_name)
  return llm, llm_tokenizer

In [None]:
def llm_generator(model, tokenizer, query:str, vector_database, max_new_tokens=200, top_k=3):
  """Answer `query` with the LLM, using passages retrieved from `vector_database` as context.

  Args:
    model: Causal LM exposing `.device` and `.generate(...)`.
    tokenizer: Tokenizer exposing `apply_chat_template`, `__call__` and `batch_decode`.
    query: The question to answer.
    vector_database: Store forwarded to `build_contextual_prompt`.
    max_new_tokens: Upper bound on the number of newly generated tokens.
    top_k: Number of passages to retrieve for the prompt context.

  Returns:
    The decoded completion as a string, with the prompt tokens removed.
  """
  # Forward top_k to the prompt builder; it used to be accepted here but never
  # passed on, so retrieval always fell back to the builder's default.
  messages = build_contextual_prompt(query, vector_database, top_k=top_k)
  chat_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
  encoded_data = tokenizer(chat_text, return_tensors="pt").to(model.device)
  generated_ids = model.generate(
          **encoded_data,
          max_new_tokens=max_new_tokens
      )
  # generate() returns prompt + completion; strip the prompt prefix from each sequence.
  completion_ids = [
          output_ids[len(input_ids):]
          for input_ids, output_ids in zip(encoded_data["input_ids"], generated_ids)
      ]
  return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

In [None]:
def database_creator(df:pd.DataFrame, col_name:str):
  """Build the in-memory vector store from one column of `df`.

  Args:
    df: Frame holding the texts to index.
    col_name: Name of the column whose values are embedded.

  Returns:
    A list of `(text, embedding)` tuples in row order; empty when the column
    has no rows. Embeddings come from the module-level `model_similarity`.
  """
  texts = df[col_name].tolist()
  if not texts:
    # Nothing to embed; skip the encoder call entirely.
    return []
  # One batched encode call for the whole column instead of one call per row;
  # iterating the result yields one embedding per input text, in order.
  embeddings = model_similarity.encode(texts, convert_to_tensor=True)
  return list(zip(texts, embeddings))

In [None]:
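# NOTE: earlier loop-based draft of retrieve(), left commented out for reference only;
# the retrieve() actually used is the one defined in the next cell.
# def retrieve(query:str, vec_database, top_k=3):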
#   query_embedding = model_similarity.encode(query)
#   similarity_scores = []
#   for info, embedding in vec_database:
#     cos_sim = util.cos_sim(query_embedding, embedding)
#     similarity_scores.append((info, cos_sim))
#     similarity_scores = sorted(similarity_scores, key=lambda x:x[1], reverse=True)
#   return similarity_scores[:top_k]

In [None]:
def retrieve(query:str, vector_database, top_k=3):
  """Look up the stored texts that best match `query`.

  Args:
    query: Text to search for; embedded with the module-level `model_similarity`.
    vector_database: Sequence of `(text, embedding)` pairs.
    top_k: Maximum number of hits requested from `util.semantic_search`.

  Returns:
    A list of `(text, score)` tuples in the order `util.semantic_search`
    reports them for the single query.
  """
  encoded_query = model_similarity.encode(query, convert_to_tensor=True)

  # Split the store into parallel lists so a hit's corpus_id indexes both.
  corpus_texts = []
  corpus_embeddings = []
  for entry in vector_database:
    corpus_texts.append(entry[0])
    corpus_embeddings.append(entry[1])

  # semantic_search returns one hit list per query; only one query is sent.
  hits = util.semantic_search(encoded_query, corpus_embeddings, top_k=top_k)[0]

  results = []
  for hit in hits:
    results.append((corpus_texts[hit["corpus_id"]], hit["score"]))
  return results

In [None]:
def build_contextual_prompt(query:str, vector_db, top_k=3):
  """Build the chat messages (system + user) for a retrieval-grounded answer.

  Args:
    query: The user's question.
    vector_db: Store handed to `retrieve`.
    top_k: Number of passages to retrieve and place in the context.

  Returns:
    A two-element list of `{"role", "content"}` dicts: the system instruction
    followed by a user message containing the retrieved context and the question.
  """
  hits = retrieve(query, vector_db, top_k)
  # retrieve yields (text, score) pairs; only the text goes into the prompt.
  context = '\n'.join(hit[0] for hit in hits)

  # The instruction is model-agnostic (it used to name a specific model that no
  # longer matches the configured LLM) and is assembled line by line so that
  # source-code indentation is not sent to the model as part of the prompt.
  instruction = "\n".join([
    "You are an advanced and knowledgeable assistant.",
    "Answer the user's question using **only** the provided context.",
    '- If the question can be answered with "Yes," "No," or a single word, respond with that word **and nothing else**.',
    "- Do **not** add any explanation, commentary, or additional information if a short answer suffices.",
    "- If the answer requires more than one word, provide a brief and precise response strictly based on the context.",
    "- Do not use any knowledge outside the provided context.",
    "- Always keep your answer clear, concise, and directly relevant to the question.",
  ])

  prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
  message = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": prompt}
  ]
  return message

In [None]:
# Load the LLM and tokenizer selected by llm_model_name; both are used by the prediction loop.
model, tokenizer = llm_loader(llm_model_name)

In [None]:
# Embed the "text" column of df_infos into the (text, embedding) store used for retrieval.
vector_database = database_creator(df_infos, "text")

In [None]:
# Generate one answer per question, in row order, then attach them as a new column.
predict = [
  llm_generator(model, tokenizer, question, vector_database, top_k=5, max_new_tokens=512)
  for question in df_predict['question']
]
df_predict['prediction'] = predict

In [None]:
# Write questions plus predictions to predict.csv in the working directory, without the index column.
df_predict.to_csv("predict.csv", index=False)