In [None]:
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [14]:
import torch
from transformers import BartForSequenceClassification, BartTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

In [15]:
dataset = load_dataset('boolq')

Found cached dataset boolq (/root/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
model_name = 'facebook/bart-base'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForSequenceClassification.from_pretrained(model_name, num_labels=2)  # 2 labels: True/False

Downloading (…)olve/main/vocab.json: 0.00B [00:00, ?B/s]

Downloading (…)olve/main/merges.txt: 0.00B [00:00, ?B/s]

Downloading (…)lve/main/config.json: 0.00B [00:00, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias', 'classification_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def tokenize_and_format(examples):
    inputs = tokenizer(
        examples['question'],
        examples['passage'],
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=512
    )
    labels = [1 if answer==True else 0 for answer in examples['answer']]
    inputs['labels'] = torch.tensor(labels, dtype=torch.long)
    return inputs


In [None]:
train_dataset = dataset['train'].map(tokenize_and_format, batched=True)
valid_dataset = dataset['validation'].map(tokenize_and_format, batched=True)

In [8]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,  # Add weight decay for regularization
    learning_rate=5e-6,  # Lower the learning rate
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='epoch',
)


In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,0.6556,0.633051
2,0.627,0.611979
3,0.6021,0.627204


TrainOutput(global_step=3537, training_loss=0.6168063187754218, metrics={'train_runtime': 3646.2788, 'train_samples_per_second': 7.756, 'train_steps_per_second': 0.97, 'total_flos': 8673422187386880.0, 'train_loss': 0.6168063187754218, 'epoch': 3.0})

In [10]:
trainer.save_model('./boolq-bart-classifier')

In [24]:
from transformers import BartForSequenceClassification, BartTokenizer

saved_model_path = './boolq-bart-classifier'

modelZ = BartForSequenceClassification.from_pretrained(saved_model_path)
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


In [25]:
import torch

def predict(question, passage):
    inputs = tokenizer(question, passage, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = modelZ(**inputs)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=-1)
    predicted_label = torch.argmax(probabilities, dim=-1).item()
    return 'True' if predicted_label == 1 else 'False'

In [26]:
question = dataset['validation']["question"][0]
passage = dataset['validation']["passage"][0]

prediction = predict(question, passage)
print("Prediction:", prediction, "Answer:",dataset['validation']["answer"][0])

Prediction: False Answer: False


In [33]:
type(dataset['validation']["passage"][0])

str

In [31]:
count = 0
for i in range(100):
  question = dataset['validation']["question"][i]
  passage = dataset['validation']["passage"][i]

  prediction = predict(question, passage)
  print("No:",i,"Prediction:", prediction, "Answer:",dataset['validation']["answer"][0])
  if prediction == str(dataset['validation']["answer"][i]):
    count +=1

print("Accuracy with gold paragraphs:",count/100)

No: 0 Prediction: False Answer: False
No: 1 Prediction: True Answer: False
No: 2 Prediction: True Answer: False
No: 3 Prediction: True Answer: False
No: 4 Prediction: True Answer: False
No: 5 Prediction: True Answer: False
No: 6 Prediction: True Answer: False
No: 7 Prediction: False Answer: False
No: 8 Prediction: True Answer: False
No: 9 Prediction: True Answer: False
No: 10 Prediction: False Answer: False
No: 11 Prediction: False Answer: False
No: 12 Prediction: True Answer: False
No: 13 Prediction: True Answer: False
No: 14 Prediction: True Answer: False
No: 15 Prediction: True Answer: False
No: 16 Prediction: False Answer: False
No: 17 Prediction: True Answer: False
No: 18 Prediction: True Answer: False
No: 19 Prediction: True Answer: False
No: 20 Prediction: False Answer: False
No: 21 Prediction: True Answer: False
No: 22 Prediction: False Answer: False
No: 23 Prediction: True Answer: False
No: 24 Prediction: False Answer: False
No: 25 Prediction: True Answer: False
No: 26 Predict

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


No: 54 Prediction: False Answer: False
No: 55 Prediction: True Answer: False
No: 56 Prediction: True Answer: False
No: 57 Prediction: False Answer: False
No: 58 Prediction: True Answer: False
No: 59 Prediction: False Answer: False
No: 60 Prediction: True Answer: False
No: 61 Prediction: True Answer: False
No: 62 Prediction: True Answer: False
No: 63 Prediction: True Answer: False
No: 64 Prediction: True Answer: False
No: 65 Prediction: False Answer: False
No: 66 Prediction: True Answer: False
No: 67 Prediction: False Answer: False
No: 68 Prediction: True Answer: False
No: 69 Prediction: False Answer: False
No: 70 Prediction: False Answer: False
No: 71 Prediction: True Answer: False
No: 72 Prediction: False Answer: False
No: 73 Prediction: True Answer: False
No: 74 Prediction: True Answer: False
No: 75 Prediction: False Answer: False
No: 76 Prediction: True Answer: False
No: 77 Prediction: False Answer: False
No: 78 Prediction: True Answer: False
No: 79 Prediction: True Answer: False
No

In [34]:
count = 0
for i in range(100):
  question = dataset['validation']["question"][i]
  passage = ""

  prediction = predict(question, passage)
  print("No:",i,"Prediction:", prediction, "Answer:",dataset['validation']["answer"][0])
  if prediction == str(dataset['validation']["answer"][i]):
    count +=1

print("Accuracy without gold paragraphs:",count/100)

No: 0 Prediction: False Answer: False
No: 1 Prediction: False Answer: False
No: 2 Prediction: True Answer: False
No: 3 Prediction: False Answer: False
No: 4 Prediction: False Answer: False
No: 5 Prediction: False Answer: False
No: 6 Prediction: True Answer: False
No: 7 Prediction: False Answer: False
No: 8 Prediction: True Answer: False
No: 9 Prediction: True Answer: False
No: 10 Prediction: True Answer: False
No: 11 Prediction: False Answer: False
No: 12 Prediction: False Answer: False
No: 13 Prediction: False Answer: False
No: 14 Prediction: False Answer: False
No: 15 Prediction: True Answer: False
No: 16 Prediction: False Answer: False
No: 17 Prediction: False Answer: False
No: 18 Prediction: False Answer: False
No: 19 Prediction: False Answer: False
No: 20 Prediction: False Answer: False
No: 21 Prediction: False Answer: False
No: 22 Prediction: False Answer: False
No: 23 Prediction: False Answer: False
No: 24 Prediction: False Answer: False
No: 25 Prediction: True Answer: False
No:

In [None]:
!conda install -c conda-forge faiss-gpu

In [18]:
from transformers import RagTokenizer, RagTokenForGeneration
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from datasets import load_dataset
import torch
import faiss

In [19]:
# Define the model name
model_name = "facebook/rag-token-nq"

# Load the tokenizer
tokenizer = RagTokenizer.from_pretrained(model_name)

# Load the RAG model
model = RagTokenForGeneration.from_pretrained(model_name)

# Load the DPR encoders and tokenizers
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

Downloading pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/rag-token-nq were not used when initializing RagTokenForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.weight', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing RagTokenForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagTokenForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RagTokenForGeneration were not initialized from the model checkpoint at facebook/rag-token-nq and are newly initialized: ['rag.generator.lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

Downloading (…)lve/main/config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Downloading (…)lve/main/config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
!pip install apache-beam

In [39]:
# Load the dataset
dataset = load_dataset("wikipedia", '20220301.en', split="train")

Found cached dataset wikipedia (/root/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


In [None]:
# Create the text corpus
corpus = []
for example in dataset:
    corpus.append(example["text"])


# Encode the corpus
ctx_encodings = ctx_encoder(ctx_tokenizer(corpus, return_tensors="pt", padding=True, truncation=True)["input_ids"])

# Create a FAISS index
index = faiss.IndexFlatIP(ctx_encodings.size(-1))
index.add(ctx_encodings.cpu().numpy())

In [None]:
def generate_response(question: str, num_contexts: int = 5):
    # Encode the input question
    q_encoding = q_encoder(q_tokenizer(question, return_tensors="pt", padding=True, truncation=True)["input_ids"])

    # Retrieve the most relevant contexts using FAISS
    _, indices = index.search(q_encoding.cpu().numpy(), num_contexts)

    # Prepare the input for the RAG model
    input_ids = tokenizer.prepare_seq2seq_batch(
        [question] * num_contexts,
        context_input_ids=torch.tensor([ctx_tokenizer(corpus[idx.item()])["input_ids"] for idx in indices[0]]),
        return_tensors="pt",
        padding=True,
        truncation=True,
    )["input_ids"]

    # Generate a response using the RAG model
    generated_output = model.generate(input_ids)

    # Decode the generated tokens
    response = tokenizer.decode(generated_output[0], skip_special_tokens=True)
    return response

# Example usage
question = "What is the capital of France?"
print(generate_response(question))
