In [1]:
!pip install evaluate rouge-score sentence-transformers > /dev/null 2>&1;

In [2]:
from datasets import load_dataset
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
import evaluate
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [3]:
rouge=evaluate.load("rouge")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

device(type='cuda')

In [4]:
answersumm = load_dataset("alexfabbri/answersumm")

README.md:   0%|          | 0.00/9.74k [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/24.8M [00:00<?, ?B/s]

validation.jsonl:   0%|          | 0.00/4.43M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/8.76M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2783 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [5]:
train_data=answersumm['train']
query_dataset=[]
ref_summ_dataset=[]
sentence_dataset=[]
for sample in train_data:
    ref_summ=sample['summaries'][0][0]
    query=sample['question']['question']
    for ans in sample['answers']:
        for sent in ans['sents']:
            sentence_dataset.append(sent['text'])
            ref_summ_dataset.append(ref_summ)
            query_dataset.append(query)

In [6]:
def compute_rouge_score(sent,ref_summary):
    results=rouge.compute(predictions=[sent], references=[ref_summary])
    return [results['rouge1'], results['rouge2'], results['rougeL']]

In [7]:
class RelRegDataset(Dataset):
    def __init__(self, max_len, queries, sentences, summaries):
        self.max_len = max_len
        self.queries=queries
        self.summaries=summaries
        self.sentences=sentences
        def __len__(self):
            return len(self.queries)
        def __getitem__(self, idx):
            sentence = self.sentences[idx]
            query = self.queries[idx]
            summary = self.summaries[idx]
            return {
                'query': self.queries[idx],
                'summary': self.summary[idx],
                'sentence': self.sentence[idx]
            }

In [8]:
class RelRegModelTT(nn.Module):
    def __init__(self, model_name="multi-qa-mpnet-base-cos-v1"):
        super(RelRegModelTT, self).__init__()
        self.encoder = SentenceTransformer(model_name)
        def forward(self, query, sentence):
            query_embedding = self.encoder.encode(query)
            sentence_embedding = self.encoder.encode(sentence)
            return self.encoder(query_embedding, sentence_embedding)

In [9]:
model=SentenceTransformer("multi-qa-mpnet-base-cos-v1")
model=model.to(device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
test_data=answersumm['test']
query_test_dataset=[]
ref_summ_test_dataset=[]
sentence_test_dataset=[]
for sample in test_data:
    ref_summ=sample['summaries'][0][0]
    query=sample['question']['question']
    for ans in sample['answers']:
        for sent in ans['sents']:
            sentence_test_dataset.append(sent['text'])
            ref_summ_test_dataset.append(ref_summ)
            query_test_dataset.append(query)

In [11]:
from transformers import BartForConditionalGeneration, BartTokenizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [None]:
model_name = "facebook/bart-large"
tokenizer2 = BartTokenizer.from_pretrained(model_name)
modelBART=torch.load('../input/bart-ft2/BART_FT2.pth')

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

  modelBART=torch.load('../input/bart-ft2/BART_FT2.pth')


In [13]:
def generate_summary(input_text):
    inputs = tokenizer2(input_text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = modelBART.generate(inputs["input_ids"].to(device), max_length=256, min_length=10, length_penalty=2.0, num_beams=4)
    summary = tokenizer2.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [14]:
curr_data=answersumm['test']
scores_5_cosine=[]
scores_10_cosine=[]
scores_15_cosine=[]
scores_5_eucledian=[]
scores_10_eucledian=[]
scores_15_eucledian=[]
for sample in tqdm(curr_data):
    ref_summ=sample['summaries'][0][1]
    query=sample['question']['question']
    sents=[]
    temp_scores_cosine=[]
    temp_scores_eucledian=[]
    for ans in sample['answers']:
        for sent in ans['sents']:
            sentence = sent['text']
            query_embedding = model.encode(query,show_progress_bar=False)
            sentence_embedding = model.encode(sentence,show_progress_bar=False)
            sents.append(sent['text'])
            
            temp_scores_cosine.extend(cosine_similarity(query_embedding.reshape(1, -1),sentence_embedding.reshape(1,-1)).tolist())
            temp_scores_eucledian.extend(euclidean_distances(query_embedding.reshape(1, -1),sentence_embedding.reshape(1,-1)).tolist())
    sorted_strings_cosine = [string for _, string in sorted(zip(temp_scores_cosine, sents), reverse=True)]
    sorted_strings_eucledian = [string for _, string in sorted(zip(temp_scores_cosine, sents))]
    inp_str=""
    for s in sorted_strings_cosine[:5]:
        inp_str+=s
    scores_5_cosine.append(compute_rouge_score(generate_summary(inp_str),ref_summ))
    inp_str=""
    for s in sorted_strings_cosine[:10]:
        inp_str+=s
    scores_10_cosine.append(compute_rouge_score(generate_summary(inp_str),ref_summ))
    inp_str=""
    for s in sorted_strings_cosine[:15]:
        inp_str+=s
    scores_15_cosine.append(compute_rouge_score(generate_summary(inp_str),ref_summ))
    inp_str=""
    for s in sorted_strings_eucledian[:5]:
        inp_str+=s
    scores_5_eucledian.append(compute_rouge_score(generate_summary(inp_str),ref_summ))
    inp_str=""
    for s in sorted_strings_eucledian[:10]:
        inp_str+=s
    scores_10_eucledian.append(compute_rouge_score(generate_summary(inp_str),ref_summ))
    inp_str=""
    for s in sorted_strings_eucledian[:15]:
        inp_str+=s
    scores_15_eucledian.append(compute_rouge_score(generate_summary(inp_str),ref_summ))

100%|██████████| 1000/1000 [1:12:58<00:00,  4.38s/it]


In [16]:
a,b,c=0,0,0
for x in scores_5_cosine:
    a+=x[0]
    b+=x[1]
    c+=x[2]
a/len(scores_5_cosine),b/len(scores_5_cosine),c/len(scores_5_cosine)

(0.23144844626588743, 0.06546615940531919, 0.1737365983473697)

In [17]:
a,b,c=0,0,0
for x in scores_10_cosine:
    a+=x[0]
    b+=x[1]
    c+=x[2]
a/len(scores_10_cosine),b/len(scores_10_cosine),c/len(scores_10_cosine)

(0.23935174943288806, 0.07007853177814942, 0.18029960125166022)

In [18]:
a,b,c=0,0,0
for x in scores_15_cosine:
    a+=x[0]
    b+=x[1]
    c+=x[2]
a/len(scores_5_cosine),b/len(scores_5_cosine),c/len(scores_5_cosine)

(0.24218765318326532, 0.0690588222492297, 0.17943941500000937)

In [19]:
a,b,c=0,0,0
for x in scores_5_eucledian:
    a+=x[0]
    b+=x[1]
    c+=x[2]
a/len(scores_5_cosine),b/len(scores_5_cosine),c/len(scores_5_cosine)

(0.10622871238777794, 0.011692612915443728, 0.08472420029944985)

In [20]:
a,b,c=0,0,0
for x in scores_10_eucledian:
    a+=x[0]
    b+=x[1]
    c+=x[2]
a/len(scores_5_cosine),b/len(scores_5_cosine),c/len(scores_5_cosine)

(0.13918807314054718, 0.021061568468094462, 0.10721749062717931)

In [21]:
a,b,c=0,0,0
for x in scores_15_eucledian:
    a+=x[0]
    b+=x[1]
    c+=x[2]
a/len(scores_5_cosine),b/len(scores_5_cosine),c/len(scores_5_cosine)

(0.1618418405569008, 0.030161258737256336, 0.12277615174146161)