In [1]:
!pip install transformers
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import torch
import nltk
import random
from tqdm import tqdm
import json




In [2]:
import requests

def download_squad(version=1.1):
    assert version in [1.1, 2.0], "Version must be either 1.1 or 2.0"
    base_url = f"https://rajpurkar.github.io/SQuAD-explorer/dataset/"
    train_file = f"train-v{version}.json"
    dev_file = f"dev-v{version}.json"

    for file in [train_file, dev_file]:
        url = base_url + file
        response = requests.get(url)
        with open(file, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {file}")

def load_squad(version=2.0):
    train_file = f"train-v{version}.json"
    dev_file = f"dev-v{version}.json"

    with open(train_file, 'r', encoding='utf-8') as f:
        train_data = json.load(f)

    with open(dev_file, 'r', encoding='utf-8') as f:
        dev_data = json.load(f)

    return {'train': train_data['data'], 'validation': dev_data['data']}

In [3]:
download_squad(2.0)
squad = load_squad(2.0)

Downloaded train-v2.0.json
Downloaded dev-v2.0.json


In [4]:

nltk.download('punkt')
from tqdm import tqdm

def preprocess_squad(dset):
    examples = []

    for article in tqdm(dset):
        for paragraph in article['paragraphs']:
            context = paragraph['context']

            for qa in paragraph['qas']:
                question = qa['question']
                answers = qa.get('answers', [])

                if len(answers) == 0 or answers[0]['text'] == "":
                    # Handle unanswerable questions
                    # You can decide how to handle these. Here's an example:
                    examples.append({'question': question, 'sentence': context, 'label': 0})
                    continue

                # Processing answerable questions
                answer_start = answers[0]['answer_start']
                answer_text = answers[0]['text']
                answer_end = answer_start + len(answer_text)

                pos_sent = None
                sent_start = 0
                sentences = nltk.sent_tokenize(context)
                for sentence in sentences:
                    sent_end = sent_start + len(sentence)
                    if sent_start <= answer_start and sent_end >= answer_end:
                        pos_sent = sentence
                        break
                    sent_start = sent_end

                # Create positive and negative examples
                if pos_sent:
                    examples.append({'question': question, 'sentence': pos_sent, 'label': 1})

                    neg_sentences = [s for s in sentences if s != pos_sent]
                    if neg_sentences:
                        neg_sent = random.choice(neg_sentences)
                        examples.append({'question': question, 'sentence': neg_sent, 'label': 0})

    return examples



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:

train_examples  =   preprocess_squad(squad['train'])
train_dloader   =   DataLoader(train_examples, shuffle=True,batch_size=8)

val_examples    =   preprocess_squad(squad['validation'])


100%|██████████| 442/442 [00:23<00:00, 18.58it/s]
100%|██████████| 35/35 [00:01<00:00, 24.78it/s]


In [6]:
from pprint import pprint
pprint(train_examples[:5])

[{'label': 1,
  'question': 'When did Beyonce start becoming popular?',
  'sentence': 'Born and raised in Houston, Texas, she performed in various '
              'singing and dancing competitions as a child, and rose to fame '
              "in the late 1990s as lead singer of R&B girl-group Destiny's "
              'Child.'},
 {'label': 0,
  'question': 'When did Beyonce start becoming popular?',
  'sentence': "Their hiatus saw the release of Beyoncé's debut album, "
              'Dangerously in Love (2003), which established her as a solo '
              'artist worldwide, earned five Grammy Awards and featured the '
              'Billboard Hot 100 number-one singles "Crazy in Love" and "Baby '
              'Boy".'},
 {'label': 1,
  'question': 'What areas did Beyonce compete in when she was growing up?',
  'sentence': 'Born and raised in Houston, Texas, she performed in various '
              'singing and dancing competitions as a child, and rose to fame '
              "in th

In [7]:
!pip install sentence-transformers
from sentence_transformers import CrossEncoder


Collecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.3.1


In [None]:

from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sentence_transformers import InputExample
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator


kf = KFold(n_splits=5)  # 5-fold cross-validation
fold_results = {}
def format_to_example(data):
  return [InputExample(texts=[item['question'], item['sentence']], label=item['label']) for item in data]

train_examples2 = format_to_example(train_examples)
val_examples2 = format_to_example(val_examples)
evaluator = CEBinaryClassificationEvaluator.from_input_examples(train_examples2)
train_dloader = DataLoader(train_examples2, shuffle=True, batch_size=8)

    # Initialize Cross-Encoder with the 'distilroberta-base' model
cross_model = CrossEncoder('distilroberta-base', num_labels=1)

    # Configure the training
num_epochs = 4  # Adjust the number of epochs based on your needs
warmup_steps = int(len(train_dloader) * num_epochs * 0.1)  # 10% of train data for warm-up


    # Train model
cross_model.fit(train_dataloader=train_dloader,
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          evaluation_steps=1000,  # Adjust as needed
          output_path=f'relevancy/model')

    # Save training and validation loss for plotting
print("training_loss: ", cross_model.history('train_loss'), "validation_loss: ", cross_model.history('val_loss'))



# for fold, (train_ids, val_ids) in enumerate(kf.split(dataset)):
#     print(f"Training on fold {fold+1}")

#     def format_to_example(data):
#         return [InputExample(texts=[item['question'], item['sentence']], label=item['label']) for item in data]

#     train_examples2 = format_to_example(train_examples)
#     val_examples2 = format_to_example(val_examples)

#     train_dloader = DataLoader(train_examples2, shuffle=True, batch_size=8)

#     # Initialize Cross-Encoder with the 'distilroberta-base' model
#     cross_model = CrossEncoder('distilroberta-base', num_labels=1)

#     # Configure the training
#     num_epochs = 4  # Adjust the number of epochs based on your needs
#     warmup_steps = int(len(train_dloader) * num_epochs * 0.1)  # 10% of train data for warm-up


#     # Train model
#     cross_model.fit(train_dataloader=train_dloader,
#               evaluator=evaluator,
#               epochs=num_epochs,
#               warmup_steps=warmup_steps,
#               evaluation_steps=1000,  # Adjust as needed
#               output_path=f'relevancy/model_fold_{fold+1}')

#     # Save training and validation loss for plotting
#     fold_results[fold] = {
#         "training_loss": cross_model.history('train_loss'),
#         "validation_loss": cross_model.history('val_loss')
#     }

# Plotting the results
# for fold in fold_results:
#     plt.plot(fold_results[fold]["training_loss"], label=f"Training Loss Fold {fold+1}")
#     plt.plot(fold_results[fold]["validation_loss"], label=f"Validation Loss Fold {fold+1}")

# plt.title("Training and Validation Loss per Fold")
# plt.xlabel("Epochs")
# plt.ylabel("Loss")
# plt.legend()
# plt.show()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/24667 [00:00<?, ?it/s]

In [None]:
context, question   =   squad['validation'][4]['context'], squad['validation'][4]['question']

sentences   =   nltk.sent_tokenize(context)

print(question)
print("---- \n")
for sentence in sentences:
    print(sentence)
    print(cross_model.predict([sentence, question]))
    print("\n")