In [18]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
import pandas as pd

In [None]:
# Both splits are stored as JSON-lines files (one record per line, hence lines=True).
split_paths = {
    'train': '/content/train.jsonl',
    'validation': '/content/validation.jsonl',
}
train_data = pd.read_json(split_paths['train'], lines=True)
validate_data = pd.read_json(split_paths['validation'], lines=True)

In [None]:
# Keep only the posts whose first tag is 'multi'.
def _keep_first_tag(frame, wanted='multi'):
    """Return the rows of ``frame`` whose first tag equals ``wanted``.

    The ``tags`` column is collapsed from a list of tags to its first element.
    A value that is already a plain string is left untouched, so re-running
    the cell does not turn 'multi' into 'm' and silently empty the frame.

    Args:
        frame: DataFrame with a ``tags`` column holding lists of tags (or,
            after a previous run of this cell, single tag strings).
        wanted: tag value to keep.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``frame`` is not modified.
    """
    first_tag = frame['tags'].apply(
        lambda tags: tags if isinstance(tags, str) else tags[0]
    )
    return frame.assign(tags=first_tag)[first_tag == wanted].reset_index(drop=True)


train_data = _keep_first_tag(train_data)
validate_data = _keep_first_tag(validate_data)

In [None]:
# Preview the first rows of the filtered training split.
train_data.head()

In [None]:
# data processing: turn each post into a (question + context, spoiler) text pair
def create_df(dataset):
    """Build seq2seq training records from a clickbait-spoiling DataFrame.

    For every row the question is the first entry of ``postText``, the context
    is all ``targetParagraphs`` concatenated without a separator, and the
    target is the ``spoiler`` parts joined with single spaces.

    Rows are read positionally, so the frame's index does not have to be
    0..n-1.  The spoiler start offset is not computed: it never reached the
    returned records.

    Args:
        dataset: DataFrame with columns ``postText`` (list of str),
            ``targetParagraphs`` (list of str) and ``spoiler`` (list of str).

    Returns:
        list of ``{'text': str, 'summary': str}`` dicts, one per row, in row
        order.
    """
    records = []
    rows = zip(dataset['postText'], dataset['targetParagraphs'], dataset['spoiler'])
    for post_text, paragraphs, spoiler_parts in rows:
        # ''.join is linear; repeated += on a str is quadratic in the worst case
        context = ''.join(paragraphs)
        records.append({
            'text': post_text[0] + " " + context,
            'summary': ' '.join(spoiler_parts)
        })

    return records

In [None]:
import json

# Turn each split into seq2seq records, then persist both record lists as JSON.
train_json = create_df(train_data)
validate_json = create_df(validate_data)

for json_path, records in (
    ('train_json.json', train_json),
    ('validate_json.json', validate_json),
):
    with open(json_path, 'w') as f:
        json.dump(records, f)



In [None]:
from huggingface_hub import notebook_login
# SECURITY: a Hugging Face access token used to be pasted in a comment here.
# It has been removed and must be revoked. Never store tokens in the notebook;
# type the token into the login prompt instead.
notebook_login()

In [None]:
from datasets import load_dataset, load_metric
from transformers import BartForConditionalGeneration, BartTokenizerFast, Trainer, TrainingArguments
from transformers import default_data_collator
import numpy as np

def collate(data):
    """Tokenize a batch of ``{"text", "summary"}`` records for BART fine-tuning.

    Args:
        data: list of examples, each a mapping with string fields ``"text"``
            (model input) and ``"summary"`` (target).

    Returns:
        dict with ``input_ids`` and ``attention_mask`` for the encoder and
        ``labels`` for the decoder; every tensor has one row per example.
    """
    # Use every example of the batch; indexing data[0] silently dropped the rest.
    sources = [example["text"] for example in data]
    targets = [example["summary"] for example in data]

    # Tokenize each side once instead of once per returned key.
    encoded_sources = tokenizer(sources, padding=True, truncation=True, max_length=512, return_tensors='pt')
    encoded_targets = tokenizer(targets, padding=True, truncation=True, max_length=512, return_tensors='pt')

    # Padding positions must not contribute to the loss; -100 is the label
    # value the cross-entropy loss ignores.
    labels = encoded_targets["input_ids"].masked_fill(
        encoded_targets["attention_mask"] == 0, -100
    )

    # decoder_input_ids are deliberately not passed: when only `labels` is
    # given, BartForConditionalGeneration builds them by shifting the labels
    # one step right.  Feeding the unshifted labels as decoder input would let
    # the decoder see the very token it is asked to predict.
    return {
        "input_ids": encoded_sources["input_ids"],
        "attention_mask": encoded_sources["attention_mask"],
        "labels": labels,
    }
   
# Re-load the two JSON exports as Hugging Face datasets, one named split per file.
train_splits = load_dataset('json', data_files={'train': 'train_json.json'})
train_dataset = train_splits["train"]
print(train_dataset[0])  # sanity check: first training record

validation_splits = load_dataset('json', data_files={'validation': 'validate_json.json'})
validate_dataset = validation_splits["validation"]
print(validate_dataset.shape)  # sanity check: size of the validation split

# Tokenizer and seq2seq model are both restored from the same pretrained checkpoint.
bart_checkpoint = "facebook/bart-base"
tokenizer = BartTokenizerFast.from_pretrained(bart_checkpoint)
model = BartForConditionalGeneration.from_pretrained(bart_checkpoint)

# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_options = dict(
    # where checkpoints are written, and how often to log / save
    output_dir="./results",
    logging_steps=500,
    save_steps=500,
    push_to_hub=False,
    # optimisation schedule
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # run evaluation once per epoch
    evaluation_strategy="epoch",
    # collate() reads the raw "text"/"summary" columns, so they must not be stripped
    remove_unused_columns=False,
)
training_args = TrainingArguments(**training_options)

# Define the metric for evaluation
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in favour
# of the separate `evaluate` package — confirm it exists in the installed version.
metric = load_metric("bleu")

# Define the function to compute the metric
def compute_metrics(pred):
    """Score teacher-forced predictions against the references with BLEU.

    Args:
        pred: evaluation output exposing ``predictions`` (the logits, or a
            tuple whose first element is the logits, shaped
            ``(examples, positions, vocab)``) and ``label_ids`` shaped
            ``(examples, positions)`` with -100 marking ignored positions.

    Returns:
        Whatever ``metric.compute`` returns for the decoded token lists.
    """
    raw_predictions = pred.predictions
    logits = raw_predictions[0] if isinstance(raw_predictions, tuple) else raw_predictions
    # Most likely token at every position; keeps the (examples, positions) layout
    # instead of flattening all examples into one sequence.
    pred_ids = np.argmax(logits, axis=-1)

    label_ids = np.asarray(pred.label_ids)
    is_target = label_ids != -100
    pad_id = tokenizer.pad_token_id

    # -100 is not a vocabulary id, so swap it for the pad token before decoding.
    label_ids = np.where(is_target, label_ids, pad_id)
    if pred_ids.shape == is_target.shape:
        # Tokens predicted at ignored positions are noise; drop them as well.
        pred_ids = np.where(is_target, pred_ids, pad_id)

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # BLEU expects word tokens: one token list per prediction, and a list of
    # reference token lists (here exactly one) per prediction.
    predictions = [text.split() for text in pred_texts]
    references = [[text.split()] for text in label_texts]
    return metric.compute(predictions=predictions, references=references)

# Wire the model, both datasets, the custom collation function and the metric
# callback into a single Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=collate,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
)

# Fine-tune on the training split.
trainer.train()

# Score the fine-tuned model on the validation split (BLEU via compute_metrics);
# as the last expression of the cell, the metrics dict is displayed.
trainer.evaluate()