In [10]:
!pip install transformers
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import torch
import nltk
import random
from tqdm import tqdm
import json




In [14]:
import requests

def download_squad(version=1.1):
    """Download the SQuAD train and dev JSON files into the working directory.

    Args:
        version: SQuAD release to fetch, either 1.1 or 2.0. An int ``2`` is
            accepted and treated as ``2.0``.

    Raises:
        AssertionError: if ``version`` is not 1.1 or 2.0.
        requests.HTTPError: if the server answers with an error status.

    Side effects:
        Writes ``train-v<version>.json`` and ``dev-v<version>.json`` to the
        current directory and prints one line per downloaded file.
    """
    assert version in [1.1, 2.0], "Version must be either 1.1 or 2.0"
    # `2` passes the check above (2 == 2.0) but would format as "2" and build
    # a file name that does not exist; normalise so it always reads "2.0".
    version = float(version)
    base_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"

    for file in (f"train-v{version}.json", f"dev-v{version}.json"):
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(base_url + file, timeout=60)
        # Fail loudly instead of saving an HTML error page under a .json name.
        response.raise_for_status()
        with open(file, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {file}")

def load_squad(version=2.0):
    """Load previously downloaded SQuAD train/dev files from the working directory.

    Args:
        version: SQuAD release whose files should be read; it is interpolated
            into ``train-v<version>.json`` / ``dev-v<version>.json``. An int
            such as ``2`` is treated as ``2.0``.

    Returns:
        dict with keys ``'train'`` and ``'validation'``, each holding the
        ``'data'`` entry of the corresponding JSON file.

    Raises:
        FileNotFoundError: if either file is missing.
    """
    # The files on disk carry a decimal version ("train-v2.0.json"); an int
    # would otherwise format as "2" and point at a file that is never written.
    if isinstance(version, int):
        version = float(version)

    splits = {}
    for split_name, file_prefix in (('train', 'train'), ('validation', 'dev')):
        with open(f"{file_prefix}-v{version}.json", 'r', encoding='utf-8') as f:
            splits[split_name] = json.load(f)['data']

    return splits

In [15]:
# One name for the release so the download and the load cannot drift apart.
SQUAD_VERSION = 2.0

# Fetch the train/dev JSON files, then read them back into memory.
download_squad(SQUAD_VERSION)
squad = load_squad(SQUAD_VERSION)

Downloaded train-v2.0.json
Downloaded dev-v2.0.json


In [23]:

# `nltk.sent_tokenize` needs the Punkt sentence-splitting data. Older NLTK
# releases load the 'punkt' package; newer releases look for 'punkt_tab'
# instead and raise LookupError without it, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
from tqdm import tqdm

def _sentence_spans(context):
    """Split `context` into sentences and locate each one inside `context`.

    Returns a list of ``(sentence, start, end)`` tuples. When the sentence is
    found verbatim, ``context[start:end] == sentence``.
    """
    spans = []
    cursor = 0
    for sentence in nltk.sent_tokenize(context):
        # The tokenizer drops the whitespace between sentences, so the next
        # sentence does not start where the previous one ended; search for it.
        start = context.find(sentence, cursor)
        if start == -1:
            # Not found verbatim: fall back to assuming it follows directly.
            start = cursor
        end = start + len(sentence)
        spans.append((sentence, start, end))
        cursor = end
    return spans


def preprocess_squad(dset, seed=None):
    """Turn SQuAD articles into question/sentence classification examples.

    For every answerable question, the sentence that fully contains the first
    listed answer becomes a positive example (label 1), and one other sentence
    of the same paragraph, picked at random, becomes a negative example
    (label 0). Answerable questions whose answer does not fit inside a single
    sentence are skipped. Questions with no answer (or an empty answer text)
    yield one label-0 example whose 'sentence' is the whole paragraph context.

    Args:
        dset: iterable of article dicts with a 'paragraphs' list; each
            paragraph has a 'context' string and a 'qas' list; each qa has a
            'question' and optionally 'answers' (dicts with 'text' and
            'answer_start', a character offset into the context).
        seed: optional seed for the negative-sentence choice. ``None`` (the
            default) uses the global ``random`` state.

    Returns:
        list of dicts with keys 'question', 'sentence' and 'label'.
    """
    chooser = random if seed is None else random.Random(seed)
    examples = []

    for article in tqdm(dset):
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            # Tokenised lazily and at most once per paragraph, not per question.
            spans = None

            for qa in paragraph['qas']:
                question = qa['question']
                answers = qa.get('answers', [])

                if len(answers) == 0 or answers[0]['text'] == "":
                    # Unanswerable question: pair it with the full context.
                    examples.append({'question': question, 'sentence': context, 'label': 0})
                    continue

                # Only the first listed answer is used.
                answer_start = answers[0]['answer_start']
                answer_end = answer_start + len(answers[0]['text'])

                if spans is None:
                    spans = _sentence_spans(context)

                pos_sent = None
                for sentence, sent_start, sent_end in spans:
                    if sent_start <= answer_start and answer_end <= sent_end:
                        pos_sent = sentence
                        break

                if pos_sent:
                    examples.append({'question': question, 'sentence': pos_sent, 'label': 1})

                    neg_sentences = [s for s, _, _ in spans if s != pos_sent]
                    if neg_sentences:
                        neg_sent = chooser.choice(neg_sentences)
                        examples.append({'question': question, 'sentence': neg_sent, 'label': 0})

    return examples



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [24]:

def _to_input_examples(records):
    """Wrap preprocess_squad dicts as sentence-transformers InputExample objects.

    The CrossEncoder training loop and the CE evaluators read `.texts` and
    `.label` from every item, which plain dicts do not provide. Texts are
    ordered (question, sentence).
    """
    from sentence_transformers import InputExample

    return [
        InputExample(texts=[record['question'], record['sentence']], label=record['label'])
        for record in records
    ]


train_examples  =   _to_input_examples(preprocess_squad(squad['train']))
train_dloader   =   DataLoader(train_examples, shuffle=True, batch_size=8)

val_examples    =   _to_input_examples(preprocess_squad(squad['validation']))

100%|██████████| 442/442 [00:15<00:00, 29.35it/s]
100%|██████████| 35/35 [00:00<00:00, 35.31it/s]


In [None]:
from sentence_transformers import CrossEncoder

# Checkpoint the cross-encoder is initialised from.
base_checkpoint = 'distilroberta-base'
# num_labels=1 is passed through to CrossEncoder (presumably a single-score
# output per pair; confirm against the library docs).
cross_model = CrossEncoder(base_checkpoint, num_labels=1)

In [None]:
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator

# Evaluator built from the SQuAD validation examples; it is handed to
# `cross_model.fit` as `evaluator=` further down.
# NOTE(review): `from_input_examples` presumably expects InputExample objects
# (items exposing `.texts` / `.label`); confirm `val_examples` has that form.
evaluator   =   CEBinaryClassificationEvaluator.from_input_examples(val_examples)

In [None]:
import wandb
# Weights & Biases project that receives the metrics logged by callback_model.
WANDB_PROJECT = "Contextual_Compressor"
wandb.init(project=WANDB_PROJECT)

def callback_model(score, epoch, steps):
    """Send one evaluation result to Weights & Biases.

    Used as the `callback` argument of `cross_model.fit`. The three values are
    logged under the keys "train/epoch", "train/steps" and "train/score".
    """
    metrics = {}
    metrics["train/epoch"] = epoch
    metrics["train/steps"] = steps
    metrics["train/score"] = score
    wandb.log(metrics)

In [None]:
num_epochs  =   4
warmup_prop =   0.1    # fraction of all training steps spent warming up
eval_steps  =  1000

# `warmup_steps` is a count of optimiser steps; the proportion times the step
# total is a float, so truncate it to the integer the API documents.
total_train_steps = num_epochs * len(train_dloader)
warmup_steps = int(warmup_prop * total_train_steps)

cross_model.fit(train_dataloader=train_dloader,
        evaluator=evaluator,
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        evaluation_steps=eval_steps,
        callback=callback_model,
        show_progress_bar=True)

In [None]:
# Each entry of squad['validation'] is an article; the context and its
# questions live one level down, under article['paragraphs'][i] (the same
# layout preprocess_squad walks). Use the first paragraph of article 4 and
# its first question as the demo input.
paragraph = squad['validation'][4]['paragraphs'][0]
context, question = paragraph['context'], paragraph['qas'][0]['question']

sentences   =   nltk.sent_tokenize(context)

# Score every sentence in one batched call. Pairs are ordered question first,
# matching the field order of the examples built by preprocess_squad.
scores = cross_model.predict([[question, sentence] for sentence in sentences])

print(question)
print("---- \n")
for sentence, score in zip(sentences, scores):
    print(sentence)
    print(score)
    print("\n")