In [None]:
!pip install faiss-cpu bitsandbytes



In [None]:
from IPython.core.magic import register_cell_magic

@register_cell_magic
def skip(line, cell):
    """Cell magic ``%%skip``: accept the cell and do nothing with it.

    Neither the magic's argument line nor the cell body is used, so a cell
    that starts with ``%%skip`` produces no effect when run.
    """
    return None

In [None]:
import json
import os
import shutil

import faiss
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# Mount Google Drive into the Colab runtime so files under MyDrive can be read.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
source_folder = '/content/drive/MyDrive/NLP_Paper_Review/dataset'
destination_folder = '/content/dataset'

# Copy the folder and its contents. copytree creates the destination directory
# itself, and dirs_exist_ok=True lets this cell be re-run over an existing copy,
# so no separate exists()/makedirs() step is needed.
shutil.copytree(source_folder, destination_folder, dirs_exist_ok=True)

print(f"Files copied from {source_folder} to {destination_folder}")

Files copied from /content/drive/MyDrive/NLP_Paper_Review/dataset to /content/dataset


In [None]:
# Load the question/answer records that make up the retrieval corpus.
# Both files are opened with an explicit UTF-8 encoding so parsing does not
# depend on the platform's default encoding.
with open('dataset/training_data.json', 'r', encoding='utf-8') as f:
  train_data: list[dict] = json.load(f)

# Not currently part of the corpus; re-enable together with the sum below.
# with open('dataset/astronomy_augmented_1.json', 'r', encoding='utf-8') as f:
#   augmented_data_1: list[dict] = json.load(f)

with open('dataset/astronomy_augmented_2.json', 'r', encoding='utf-8') as f:
  augmented_data_2: list[dict] = json.load(f)

# Training records first, then the augmented ones.
data: list[dict] = train_data + augmented_data_2  # + augmented_data_1
len(data)

3472

In [None]:
# Peek at the field names of the first corpus record.
data[0].keys()

dict_keys(['question', 'choices', 'answer'])

In [None]:
# One retrieval chunk per record: the question followed by the text of its
# correct choice ('answer' is used as an index into 'choices').
chunks = [
  f"Question: {record['question']} \nAnswer: {record['choices'][record['answer']]}"
  for record in data
]

# Preview the first few chunks.
chunks[:5]


['Question: According to the article, what is an asteroid? \nAnswer: An object larger than a meteoroid that is neither a planet nor an identified comet, orbiting within the inner Solar System or co-orbital with Jupiter',
 'Question: What are the broad classifications of asteroids based on composition? \nAnswer: Carbonaceous, Metallic, Silicaceous',
 'Question: Where are the greatest number of known asteroids located? \nAnswer: Between the orbits of Mars and Jupiter',
 'Question: Which of the following spacecraft directly studied Vesta and Ceres? \nAnswer: Dawn',
 'Question: What potentially catastrophic event is associated with Near-Earth asteroids? \nAnswer: Colliding with Earth and causing mass extinction events']

In [None]:
# Encoder used for retrieval; every chunk is embedded once, with the vectors
# normalised at encode time.
embedding_model = SentenceTransformer("BAAI/bge-large-en")
chunk_embeddings = embedding_model.encode(
    chunks,
    normalize_embeddings=True,
)

In [None]:
# Width of a single embedding vector (size of the second axis of the chunk matrix).
dimension = chunk_embeddings.shape[1]
dimension

1024

In [None]:
# Flat inner-product index sized to the embedding width.
index = faiss.IndexFlatIP(dimension)
# Add every chunk embedding; rag() later uses the positions returned by
# index.search() to look the matching text up in `chunks`.
index.add(chunk_embeddings)

In [None]:
# Fetch the Hugging Face access token from Colab's user data rather than
# hard-coding it in the notebook.
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

In [None]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"  # Or another model from the paper
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)

# Request 8-bit weights through `quantization_config`: passing `load_in_8bit`
# straight to from_pretrained is deprecated and scheduled for removal.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    token=HF_TOKEN,
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Sentence encoder used by get_pred_indexes to compare a generated answer with the answer choices.
st_model = SentenceTransformer('all-MiniLM-L6-v2')
def get_pred_indexes(pred, dataset):
  """Map each free-text prediction to the index of its most similar answer choice.

  Args:
    pred: Sequence of predicted answer strings, aligned position by position
      with `dataset`.
    dataset: Sequence of records; the 'choices' entry of the record at the
      same position holds the candidate answer strings.

  Returns:
    A list with one int per prediction: the position within 'choices' whose
    embedding has the highest cosine similarity to the prediction's embedding.
  """
  pred_indexes = []
  # A distinct loop name keeps the `pred` argument from being rebound while
  # it is being iterated.
  for ind, pred_text in enumerate(pred):
    choices = dataset[ind]['choices']
    pred_embedding = st_model.encode([pred_text], normalize_embeddings=True)
    choice_embeddings = st_model.encode(choices, normalize_embeddings=True)
    # Row 0 holds the similarity of the single prediction to every choice.
    similarities = cosine_similarity(pred_embedding, choice_embeddings)[0]
    # Find the most similar choice
    pred_indexes.append(int(np.argmax(similarities)))
  return pred_indexes

# Baseline (no retrieval)

In [None]:
%%skip
def predict(question, options):
    """Answer a multiple-choice question with the LLM alone (no retrieved context).

    Args:
        question: The question text.
        options: Iterable of answer choices; each is written on its own line
            of the prompt, in the order given.

    Returns:
        The first line of the generated continuation with surrounding
        whitespace removed (an empty string if nothing was generated). This is
        free text, not a choice index.
    """
    # Format options
    options_str = "\n".join(f"{option}" for option in options)

    instructions = "Answer the multiple choice question."

    # Create prompt
    augmented_prompt = f"{instructions}\nQuestion: {question}\nAnswer choices:\n{options_str}\n\nYour Answer:"

    # Tokenize and move every input tensor to the model's device
    inputs = tokenizer(augmented_prompt, return_tensors="pt")
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Remember where the prompt ends so only the continuation is decoded
    prompt_length = inputs["input_ids"].shape[1]
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=50,
        # `temperature` only takes effect together with do_sample=True, which
        # this call never set, so the ineffective argument is dropped and
        # greedy decoding is requested explicitly.
        # NOTE(review): assumes the checkpoint's generation config does not
        # enable sampling by default - confirm.
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Extract only the newly generated tokens (the answer part)
    answer_tokens = outputs[0][prompt_length:]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    # Keep only the first line of the continuation
    answer_option = answer.strip().split('\n')[0]
    return answer_option

In [None]:
%%skip
# Single worked example for the baseline predictor.
query = "How have we been able to construct detailed maps of surface features on Venus?"
options = [
    "by studying Venus from Earth with powerful optical telescopes",
    "by landing spacecraft on the surface for close-up study",
    "by studying Venus with powerful optical telescopes on spacecraft that were sent to orbit Venus",
    "by using radar from spacecraft that were sent to orbit Venus",
]
response = predict(query, options)
print(response)

In [None]:
%%skip
# Load the MMLU astronomy splits. The `with` blocks close each file handle as
# soon as it has been parsed instead of leaving it open.
with open('dataset/mmlu_datasets/mmlu_astronomy_validation.json', encoding='utf-8') as f:
    validation = json.load(f)
with open('dataset/mmlu_datasets/mmlu_astronomy_test.json', encoding='utf-8') as f:
    test = json.load(f)

In [None]:
%%skip
from tqdm import tqdm
# Gold answer indices for the validation split.
val_actual = [sample['answer'] for sample in validation]

# One free-text baseline prediction per validation question.
val_pred = []
for sample in tqdm(validation, desc='Validating'):
  val_pred.append(predict(sample['question'], sample['choices']))

In [None]:
%%skip
# Preview the first five raw predictions.
val_pred[:5]

In [None]:
%%skip
# Convert each free-text prediction into a choice index, then display the
# predicted and gold indices together.
val_pred_indexes = get_pred_indexes(val_pred, validation)
val_pred_indexes, val_actual

In [None]:
%%skip
# Fraction of validation questions whose predicted index equals the gold index.
is_correct = np.array(val_pred_indexes) == np.array(val_actual)
accuracy = is_correct.mean()
accuracy

In [None]:
%%skip
# Build one row per validation question. The rows go into their own list so the
# corpus list `data` (which `chunks` was built from) is not overwritten.
rows = []
for ind, item in enumerate(validation):
    row = {'question': item['question']}
    # One column per answer choice: option_0, option_1, ...
    row.update({f'option_{i}': choice for i, choice in enumerate(item['choices'])})
    row['answer_idx'] = item['answer']
    row['predicted_idx'] = val_pred_indexes[ind]
    rows.append(row)

df = pd.DataFrame(rows)
df.to_csv('base_validation_results.csv', index=False)
# Last expression of the cell, so the preview is actually displayed.
df.head()

In [None]:
%%skip
# Gold answer indices for the test split.
test_actual = [item['answer'] for item in test]

# One free-text baseline prediction per test question. The progress bar is
# labelled 'Testing' because this loop runs over the test split.
test_pred = []
for item in tqdm(test, desc='Testing'):
  pred = predict(item['question'], item['choices'])
  test_pred.append(pred)

In [None]:
%%skip
# Convert each free-text prediction into a choice index, then display the
# first five predicted and gold indices.
test_pred_indexes = get_pred_indexes(test_pred, test)
test_pred_indexes[0:5], test_actual[0:5]

In [None]:
%%skip
# Fraction of test questions whose predicted index equals the gold index.
is_correct = np.array(test_pred_indexes) == np.array(test_actual)
accuracy = is_correct.mean()
accuracy

In [None]:
%%skip
import pandas as pd

# Build one row per test question. The rows go into their own list so the
# corpus list `data` (which `chunks` was built from) is not overwritten.
rows = []
for ind, item in enumerate(test):
    row = {'question': item['question']}
    # One column per answer choice: option_0, option_1, ...
    row.update({f'option_{i}': choice for i, choice in enumerate(item['choices'])})
    row['answer_idx'] = item['answer']
    row['predicted_idx'] = test_pred_indexes[ind]
    rows.append(row)

df = pd.DataFrame(rows)
df.to_csv('base_test_results.csv', index=False)
# Last expression of the cell, so the preview is actually displayed.
df.head()

# RAG (retrieval-augmented generation)

In [None]:
def rag(question, options, k=3):
    # Generate query embedding and normalize
    question_embedding = embedding_model.encode([question], normalize_embeddings=True)

    # Retrieve top-k similar chunks
    scores, indices = index.search(question_embedding, k)

    # Construct context from retrieved chunks
    context = "\n\nSample ".join([chunks[idx] for idx in indices[0]])

    # Format options
    options_str = "\n".join([f"{option}" for ind, option in enumerate(options)])

    instructions = "Answer the multiple choice question at the end."

    # Create prompt
    augmented_prompt = f"{instructions}\n\n-----Beginning of Context------:\n{context}\n------End of Context-------\n\n\nQuestion to Answer: {question}\nAnswer Choices:\n{options_str}\n\nWhat is your Answer?"

    # Generate response - move inputs to the same device as model
    inputs = tokenizer(augmented_prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Move inputs to model's device

    # Generate only new tokens (the answer)
    prompt_length = inputs["input_ids"].shape[1]
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=50,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

    # Extract only the newly generated tokens (the answer part)
    answer_tokens = outputs[0][prompt_length:]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    answer_option = answer.strip().split('\n')[0]
    return answer_option
    try:
      if answer_option[0] in ['1','2','3','4'] : return int(answer_option[0]) - 1
    except:
      print(answer_option)
    return answer_option


In [None]:
# Single worked example for the retrieval-augmented predictor.
query = "How have we been able to construct detailed maps of surface features on Venus?"
options = [
    "by studying Venus from Earth with powerful optical telescopes",
    "by landing spacecraft on the surface for close-up study",
    "by studying Venus with powerful optical telescopes on spacecraft that were sent to orbit Venus",
    "by using radar from spacecraft that were sent to orbit Venus",
]
response = rag(query, options)
print(response)



Answer: by using radar from spacecraft that were sent to orbit Venus


In [None]:
# Load the MMLU astronomy splits. The `with` blocks close each file handle as
# soon as it has been parsed instead of leaving it open.
with open('dataset/mmlu_datasets/mmlu_astronomy_validation.json', encoding='utf-8') as f:
    validation = json.load(f)
with open('dataset/mmlu_datasets/mmlu_astronomy_test.json', encoding='utf-8') as f:
    test = json.load(f)

In [None]:
# Gold answer indices for the validation split.
val_actual = [sample['answer'] for sample in validation]

# One free-text RAG prediction per validation question (3 retrieved chunks each).
val_pred = []
for sample in tqdm(validation, desc='Validating'):
  val_pred.append(rag(sample['question'], sample['choices'], k=3))

Validating: 100%|██████████| 16/16 [01:45<00:00,  6.57s/it]


In [None]:
# Preview the first five raw predictions.
val_pred[:5]

['Power emitted is 1/16 times as high; peak emission wavelength is 2 times longer.',
 'Answer: The minimization of gravitational potential energy.',
 "Answer: Large impacts fractured the Moon's lithosphere allowing lava to fill the impact basins.",
 'Answer: Its rotation axis is nearly perpendicular to the plane of the Solar System.',
 'Answer: The answer is 5 : 1.']

In [None]:
# Convert each free-text prediction into a choice index, then display the
# predicted and gold indices together.
val_pred_indexes = get_pred_indexes(val_pred, validation)
val_pred_indexes, val_actual

([3, 1, 3, 0, 2, 1, 3, 3, 0, 0, 1, 0, 3, 1, 3, 0],
 [3, 1, 3, 0, 2, 1, 3, 3, 0, 0, 1, 0, 2, 1, 3, 0])

In [None]:
# Fraction of validation questions whose predicted index equals the gold index.
is_correct = np.array(val_pred_indexes) == np.array(val_actual)
accuracy = is_correct.mean()
accuracy

np.float64(0.9375)

In [None]:
# Build one row per validation question. The rows go into their own list so the
# corpus list `data` (which `chunks` was built from) is not overwritten.
rows = []
for ind, item in enumerate(validation):
    row = {'question': item['question']}
    # One column per answer choice: option_0, option_1, ...
    row.update({f'option_{i}': choice for i, choice in enumerate(item['choices'])})
    row['answer_idx'] = item['answer']
    row['predicted_idx'] = val_pred_indexes[ind]
    rows.append(row)

df = pd.DataFrame(rows)
df.to_csv('rag_validation_results.csv', index=False)
# Last expression of the cell, so the preview is actually displayed.
df.head()

In [None]:
# Gold answer indices for the test split.
test_actual = [sample['answer'] for sample in test]

# One free-text RAG prediction per test question (3 retrieved chunks each).
test_pred = []
for sample in tqdm(test, desc='Testing'):
  test_pred.append(rag(sample['question'], sample['choices'], k=3))

Testing: 100%|██████████| 152/152 [19:04<00:00,  7.53s/it]


In [None]:
# Convert each free-text prediction into a choice index, then display the
# first five predicted and gold indices.
test_pred_indexes = get_pred_indexes(test_pred, test)
test_pred_indexes[0:5], test_actual[0:5]

([0, 3, 2, 1, 0], [0, 3, 2, 2, 3])

In [None]:
# Fraction of test questions whose predicted index equals the gold index.
is_correct = np.array(test_pred_indexes) == np.array(test_actual)
accuracy = is_correct.mean()
accuracy

np.float64(0.6118421052631579)

In [None]:
import pandas as pd

# Build one row per test question. The rows go into their own list so the
# corpus list `data` (which `chunks` was built from) is not overwritten.
rows = []
for ind, item in enumerate(test):
    row = {'question': item['question']}
    # One column per answer choice: option_0, option_1, ...
    row.update({f'option_{i}': choice for i, choice in enumerate(item['choices'])})
    row['answer_idx'] = item['answer']
    row['predicted_idx'] = test_pred_indexes[ind]
    rows.append(row)

df = pd.DataFrame(rows)
df.to_csv('rag_test_results.csv', index=False)
# Last expression of the cell, so the preview is actually displayed.
df.head()