In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files under MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import zipfile

# Archive on Drive and the directory its members are unpacked into.
zip_file_path = '/content/drive/MyDrive/BrainDead/PubMed.zip'
extract_path = '/content/drive/MyDrive/BrainDead'

# Unpack every member; the context manager closes the archive afterwards.
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print(f"Unzipped contents to: {extract_path}")


Unzipped contents to: /content/drive/MyDrive/BrainDead


In [None]:
import pandas as pd

# Load the PubMed train/test splits from Drive into DataFrames.
# NOTE(review): the directory name ends with a space ('Text Summarization /') and is not the
# folder the archive was extracted to ('/content/drive/MyDrive/BrainDead') — confirm the location.
train_df = pd.read_csv('/content/drive/MyDrive/Text Summarization /PubMed/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Text Summarization /PubMed/test.csv')

In [None]:
# Preview the first rows of the training split (columns: article, abstract).
train_df.head()

Unnamed: 0,article,abstract
0,a recent systematic analysis showed that in 20...,background : the present study was carried out...
1,it occurs in more than 50% of patients and may...,backgroundanemia in patients with cancer who a...
2,"tardive dystonia ( td ) , a rarer side effect ...",tardive dystonia ( td ) is a serious side effe...
3,"lepidoptera include agricultural pests that , ...",many lepidopteran insects are agricultural pes...
4,syncope is caused by transient diffuse cerebra...,we present an unusual case of recurrent cough ...


In [None]:
# Preview the first rows of the test split (columns: article, abstract).
test_df.head()

Unnamed: 0,article,abstract
0,anxiety affects quality of life in those livin...,research on the implications of anxiety in par...
1,small non - coding rnas are transcribed into m...,"small non - coding rnas include sirna , mirna ..."
2,ohss is a serious complication of ovulation in...,objective : to evaluate the efficacy and safet...
3,congenital adrenal hyperplasia ( cah ) refers ...,congenital adrenal hyperplasia is a group of a...
4,type 1 diabetes ( t1d ) results from the destr...,objective(s):pentoxifylline is an immunomodula...


In [None]:
# Rename the raw columns to the names used by the rest of the notebook.
train_df = train_df.rename(columns={'article': 'Document', 'abstract': 'Summary'})
test_df = test_df.rename(columns={'article': 'Document', 'abstract': 'Summary'})

# A cell only renders its last expression, so a bare `train_df.head()` in the
# middle of the cell is silently discarded; display both previews explicitly.
display(train_df.head())
display(test_df.head())


Unnamed: 0,Document,Summary
0,anxiety affects quality of life in those livin...,research on the implications of anxiety in par...
1,small non - coding rnas are transcribed into m...,"small non - coding rnas include sirna , mirna ..."
2,ohss is a serious complication of ovulation in...,objective : to evaluate the efficacy and safet...
3,congenital adrenal hyperplasia ( cah ) refers ...,congenital adrenal hyperplasia is a group of a...
4,type 1 diabetes ( t1d ) results from the destr...,objective(s):pentoxifylline is an immunomodula...


In [None]:
# prompt: print the length of train and test dataset

# Row counts of the full (unsampled) splits.
print("Train dataset length:", train_df.shape[0])
print("Test dataset length:", test_df.shape[0])


Train dataset length: 119924
Test dataset length: 6658


In [None]:
# Draw seeded random subsets of each split; the seed keeps the selection repeatable.
train_df1 = train_df.sample(n=30_000, random_state=42)
test_df1 = test_df.sample(n=2_000, random_state=42)

# Confirm the subset sizes.
print("Train dataset length (sampled):", train_df1.shape[0])
print("Test dataset length (sampled):", test_df1.shape[0])


Train dataset length (sampled): 30000
Test dataset length (sampled): 2000


In [None]:
# Preview the sampled training subset; the index shows the rows' positions in the original split.
train_df1.head()

Unnamed: 0,Document,Summary
32536,long - term synaptic plasticity is thought to ...,understanding the spatiotemporal organization ...
543,californium-252 is an artificial element with ...,background : in neutron interaction with matte...
46953,ewing 's sarcoma is a malignant nonosteogenic ...,ewing 's sarcoma is the second most common mal...
3580,conventional endodontic treatment has experien...,the aim of the present roentgenographic in vit...
95214,choroidal osteoma ( choroidal osseous choristo...,"choroidal osteoma is a rare , benign tumor , u..."


In [None]:
import re

def preprocess_text(text):
    """
    Normalize a raw text field to lowercase letters and whitespace only.

    Parameters:
    - text: Value to clean. Non-string values are converted with str().
      Missing values (None, or a float NaN such as pandas typically produces
      for empty CSV cells) yield an empty string rather than the literal
      words "none" / "nan".

    Returns:
    - The cleaned string: every character that is not an ASCII letter or
      whitespace is removed, the rest is lowercased, and each newline is
      replaced by a space.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    # Drop digits, punctuation and any other non-letter, non-whitespace character.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    text = text.replace('\n', ' ')
    return text

In [None]:
# Clean both text columns of both sampled splits with preprocess_text
# (train first, then test; Document before Summary).
for frame in (train_df1, test_df1):
    for column in ('Document', 'Summary'):
        frame[column] = frame[column].apply(preprocess_text)

In [None]:
from transformers import BertModel, T5ForConditionalGeneration, T5Tokenizer # Import T5Tokenizer
import torch

# Load the pretrained BERT encoder and the T5 sequence-to-sequence model once at module level;
# the HybridModel class defined below wraps these two instances.
bert_model = BertModel.from_pretrained('bert-base-uncased')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')

class HybridModel(torch.nn.Module):
    """
    BERT-guided T5 model: BERT attention picks key tokens, T5 runs on them.

    For every document, the last BERT layer's attention from position 0
    ([CLS]) is averaged over heads, the `top_k` most-attended tokens are
    decoded back to text, and that text is the T5 encoder input.

    Parameters:
    - device: Device both sub-models are moved to. When None, CUDA is used if
      available and CPU otherwise; it is resolved here rather than read from
      a notebook global that may not exist yet.
    - top_k: Number of most-attended tokens kept per document (default 5).
    """

    def __init__(self, device=None, top_k=5):
        super(HybridModel, self).__init__()
        # Local import: the cell defining this class does not import
        # BertTokenizer at top level, which raised NameError on a fresh kernel.
        from transformers import BertTokenizer

        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.top_k = top_k
        self.bert = bert_model
        self.t5 = t5_model
        # NOTE(review): tokenizer checkpoint is 't5-base' while the wrapped model
        # is loaded from 't5-small'; presumably both share one vocabulary — confirm.
        self.t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        # Move models to the device
        self.bert.to(device)
        self.t5.to(device)

    def extract_key_phrases(self, bert_outputs, input_ids, k=None):
        """
        Decode the most-attended tokens of each example separately.

        Parameters:
        - bert_outputs: BERT outputs exposing `.attentions` (requires the model
          to be called with output_attentions=True).
        - input_ids: Token IDs that were fed to BERT, one row per example.
        - k: Number of tokens to keep per example; defaults to `self.top_k`.

        Returns:
        - A list with one decoded string per example, tokens ordered from
          highest to lowest attention.
        """
        # Use the last layer's attention weights, averaged across all heads.
        attention_weights = bert_outputs.attentions[-1]
        averaged_attention = torch.mean(attention_weights, dim=1)

        # Attention paid by position 0 ([CLS]) to every position.
        cls_attention = averaged_attention[:, 0, :]

        if k is None:
            k = self.top_k
        # torch.topk raises when k exceeds the sequence length; clamp for short inputs.
        k = min(k, cls_attention.size(1))
        _, top_indices = torch.topk(cls_attention, k=k, dim=1)

        # Row-wise lookup of the selected token IDs.
        key_token_ids = torch.gather(input_ids, 1, top_indices)
        return [self.bert_tokenizer.decode(row) for row in key_token_ids]

    def extract_key_sentences(self, bert_outputs, input_ids):
        """
        Extract key tokens based on BERT's attention weights, as one string.

        Parameters:
        - bert_outputs: Outputs from the BERT model (with attentions).
        - input_ids: Input IDs for the BERT model.

        Returns:
        - A single string: the per-example key-token texts of the whole batch
          joined by spaces. This merges the batch into one text; call
          `extract_key_phrases` when one text per example is needed.
        """
        return " ".join(self.extract_key_phrases(bert_outputs, input_ids))

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Run BERT token selection followed by a teacher-forced T5 pass.

        Parameters:
        - input_ids: BERT token IDs, one row per document.
        - attention_mask: BERT attention mask matching `input_ids`.
        - labels: Optional T5 target token IDs, one row per document. When
          omitted, the T5 input IDs themselves are used as targets.

        Returns:
        - The T5 logits, with one row per input document.
        """
        # BERT's output is only used through top-k indices and text decoding,
        # so no gradient can reach it; skip building its autograd graph.
        with torch.no_grad():
            bert_outputs = self.bert(input_ids, attention_mask=attention_mask, output_attentions=True)
        key_phrases = self.extract_key_phrases(bert_outputs, input_ids)

        # One T5 input row per document keeps the batch dimension aligned with
        # the labels; padding lets rows of different length form one tensor.
        t5_inputs = self.t5_tokenizer(key_phrases, return_tensors="pt", padding=True).to(input_ids.device)
        t5_input_ids = t5_inputs['input_ids']
        t5_attention_mask = t5_inputs['attention_mask']

        # Teacher forcing: T5 derives its decoder inputs from `labels`.
        targets = t5_input_ids if labels is None else labels
        t5_outputs = self.t5(input_ids=t5_input_ids, attention_mask=t5_attention_mask, labels=targets)

        return t5_outputs.logits

# Build the combined model; it wraps the module-level bert_model and t5_model loaded above.
hybrid_model = HybridModel()

In [None]:
from transformers import BertTokenizer
import torch

# Module-level BERT tokenizer used by the tokenization helpers defined in this notebook.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Number of texts passed to the tokenizer per call (later cells rebind this name).
batch_size = 32

# Function to tokenize in batches
def tokenize_in_batches(data, max_length=512, batch_size=32):
    """
    Tokenize texts with the module-level BERT tokenizer in fixed-size chunks.

    Parameters:
    - data: Sequence of strings (must support len() and slicing).
    - max_length: Every sequence is padded or truncated to this many tokens.
    - batch_size: Number of texts per tokenizer call. It is an explicit
      parameter (default 32) so the chunking does not silently change when
      other cells rebind the notebook-level `batch_size` variable.

    Returns:
    - A list with one tokenizer output per chunk, each holding PyTorch tensors.
    """
    return [
        bert_tokenizer(data[start:start + batch_size],
                       return_tensors="pt",
                       max_length=max_length,
                       padding="max_length",
                       truncation=True)
        for start in range(0, len(data), batch_size)
    ]


# Tokenize the cleaned documents of both sampled splits for BERT.
# NOTE(review): a later cell reassigns bert_inputs_train / bert_inputs_test from
# tokenize_in_aligned_batches, so this work appears to be repeated — confirm it is still needed.
bert_inputs_train = tokenize_in_batches(list(train_df1['Document']))
bert_inputs_test = tokenize_in_batches(list(test_df1['Document']))

In [None]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): the HybridModel cell earlier in the notebook already references `device`,
# so on a fresh kernel this cell has to run before that one.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from transformers import T5Tokenizer

# Module-level T5 tokenizer used to turn summaries into label token IDs.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Number of texts passed to the tokenizer per call (later cells rebind this name).
batch_size = 32

# Function to tokenize in batches
def tokenize_labels_in_batches(data, max_length=128, batch_size=32):
    """
    Tokenize summary texts with the module-level T5 tokenizer in fixed-size chunks.

    Parameters:
    - data: Sequence of strings (must support len() and slicing).
    - max_length: Every sequence is padded or truncated to this many tokens.
    - batch_size: Number of texts per tokenizer call. It is an explicit
      parameter (default 32) so the chunking does not silently change when
      other cells rebind the notebook-level `batch_size` variable.

    Returns:
    - A list with one tokenizer output per chunk, each holding PyTorch tensors.
    """
    return [
        t5_tokenizer(data[start:start + batch_size],
                     return_tensors="pt",
                     max_length=max_length,
                     padding="max_length",
                     truncation=True)
        for start in range(0, len(data), batch_size)
    ]

# Tokenize training and test summary labels in batches
# NOTE(review): a later cell reassigns t5_labels_train / t5_labels_test from
# tokenize_in_aligned_batches, so this work appears to be repeated — confirm it is still needed.
t5_labels_train = tokenize_labels_in_batches(list(train_df1['Summary']))
t5_labels_test = tokenize_labels_in_batches(list(test_df1['Summary']))

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
def tokenize_in_aligned_batches(bert_data, t5_data, batch_size=16, max_length=512, t5_max_length=128):
    """
    Tokenize documents (BERT) and summaries (T5) in matching chunks.

    Chunk i of both returned lists covers the same slice of examples, so a
    document and its summary always share a chunk index and row position.

    Parameters:
    - bert_data: Sequence of document strings for the BERT tokenizer.
    - t5_data: Sequence of summary strings for the T5 tokenizer; must have
      the same length as `bert_data`.
    - batch_size: Number of examples per chunk.
    - max_length: Padded/truncated token length of the BERT inputs.
    - t5_max_length: Padded/truncated token length of the T5 labels.

    Returns:
    - (bert_inputs, t5_labels): two lists of tokenizer outputs, one entry per chunk.

    Raises:
    - ValueError: if the two sequences differ in length, which would pair
      documents with the wrong summaries or leave some without one.
    """
    if len(bert_data) != len(t5_data):
        raise ValueError(
            f"bert_data and t5_data must have the same length, "
            f"got {len(bert_data)} and {len(t5_data)}"
        )

    bert_inputs = []
    t5_labels = []

    for start in range(0, len(bert_data), batch_size):
        stop = start + batch_size

        bert_inputs.append(bert_tokenizer(bert_data[start:stop],
                                          return_tensors="pt",
                                          max_length=max_length,
                                          padding="max_length",
                                          truncation=True))

        t5_labels.append(t5_tokenizer(t5_data[start:stop],
                                      return_tensors="pt",
                                      max_length=t5_max_length,
                                      padding="max_length",
                                      truncation=True))

    return bert_inputs, t5_labels

# Tokenize data in aligned batches: chunk i of the BERT inputs and chunk i of the T5 labels
# come from the same rows of the sampled DataFrame.
bert_inputs_train, t5_labels_train = tokenize_in_aligned_batches(list(train_df1['Document']), list(train_df1['Summary']))
bert_inputs_test, t5_labels_test = tokenize_in_aligned_batches(list(test_df1['Document']), list(test_df1['Summary']))

class SummarizationDataset(torch.utils.data.Dataset):
    """
    Per-example view over pre-tokenized BERT inputs and T5 labels stored in chunks.

    Parameters:
    - bert_inputs: List of chunked BERT tokenizer outputs (each indexable by
      'input_ids' and 'attention_mask').
    - t5_labels: List of chunked T5 tokenizer outputs (each indexable by
      'input_ids'), aligned chunk-for-chunk with `bert_inputs`.
    """

    def __init__(self, bert_inputs, t5_labels):
        self.t5_labels = t5_labels
        self.bert_inputs = bert_inputs

    def __len__(self):
        # Total number of examples across all chunks.
        return sum(len(chunk['input_ids']) for chunk in self.bert_inputs)

    def __getitem__(self, idx):
        # The first chunk's size is taken as the size of every chunk, which
        # maps a flat index to (chunk, row-within-chunk).
        chunk_size = len(self.bert_inputs[0]['input_ids'])
        batch_idx, item_idx = divmod(idx, chunk_size)

        bert_chunk = self.bert_inputs[batch_idx]
        return {
            'bert_input_ids': bert_chunk['input_ids'][item_idx],
            'bert_attention_mask': bert_chunk['attention_mask'][item_idx],
            't5_labels': self.t5_labels[batch_idx]['input_ids'][item_idx],
        }

# Wrap the tokenized chunks in per-example datasets and shuffled DataLoaders.
batch_size = 4

train_dataset = SummarizationDataset(bert_inputs_train, t5_labels_train)
test_dataset = SummarizationDataset(bert_inputs_test, t5_labels_test)

train_data_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_data_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)


In [None]:
# Instantiate the dataset and dataloader
# NOTE(review): this cell repeats, statement for statement, the dataset/DataLoader construction
# at the end of the preceding cell; it rebuilds the same objects — confirm whether it can be dropped.
train_dataset = SummarizationDataset(bert_inputs_train, t5_labels_train)
batch_size = 4
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = SummarizationDataset(bert_inputs_test, t5_labels_test)
batch_size = 4
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)


In [None]:
# prompt: save the test and train_data_loader using pickle

import pickle

# Serialize each DataLoader to its own file on Drive (train first, then test).
for loader, target_path in (
    (train_data_loader, '/content/drive/MyDrive/Text Summarization/train_data_loader.pkl'),
    (test_data_loader, '/content/drive/MyDrive/Text Summarization/test_data_loader.pkl'),
):
    with open(target_path, 'wb') as f:
        pickle.dump(loader, f)


In [None]:
# prompt: load the train_data_loader and test_data_loader

import pickle

# Read back the files written by the save cell. The names must match what that
# cell writes (train_data_loader.pkl / test_data_loader.pkl); other names raise
# FileNotFoundError on a fresh run.
# NOTE: pickle.load can execute arbitrary code stored in the file — only load
# pickles that this notebook produced itself.
with open('/content/drive/MyDrive/Text Summarization/train_data_loader.pkl', 'rb') as f:
    train_data_loader = pickle.load(f)

with open('/content/drive/MyDrive/Text Summarization/test_data_loader.pkl', 'rb') as f:
    test_data_loader = pickle.load(f)


In [None]:
# Training loop
# BERT only selects tokens (top-k indices, then text decoding), so no gradient
# reaches it; only the T5 parameters are optimized.
optimizer = torch.optim.AdamW(hybrid_model.t5.parameters(), lr=1e-4)
pad_token_id = hybrid_model.t5_tokenizer.pad_token_id

for epoch in range(5):
    hybrid_model.train()
    total_loss = 0

    for batch in train_data_loader:
        input_ids = batch["bert_input_ids"].to(device)
        attention_mask = batch["bert_attention_mask"].to(device)
        labels = batch["t5_labels"].to(device)

        optimizer.zero_grad()

        # Token selection is non-differentiable, so skip BERT's autograd graph.
        with torch.no_grad():
            bert_outputs = hybrid_model.bert(input_ids, attention_mask=attention_mask, output_attentions=True)

        # Last-layer attention from position 0 ([CLS]), averaged over heads;
        # keep the 5 most-attended tokens of EACH document so the T5 batch has
        # one row per document and lines up with `labels`.
        cls_attention = bert_outputs.attentions[-1].mean(dim=1)[:, 0, :]
        top_indices = cls_attention.topk(k=5, dim=1).indices
        key_token_ids = torch.gather(input_ids, 1, top_indices)
        key_texts = [hybrid_model.bert_tokenizer.decode(row) for row in key_token_ids]

        # padding=True lets rows of different token length form one tensor.
        t5_inputs = hybrid_model.t5_tokenizer(key_texts, return_tensors="pt", padding=True).to(input_ids.device)
        t5_input_ids = t5_inputs['input_ids']
        t5_attention_mask = t5_inputs['attention_mask']

        # Teacher forcing on the reference summaries; -100 marks padded label
        # positions so they are excluded from the loss T5 computes.
        t5_targets = labels.masked_fill(labels == pad_token_id, -100)
        t5_outputs = hybrid_model.t5(input_ids=t5_input_ids, attention_mask=t5_attention_mask, labels=t5_targets)

        loss = t5_outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_data_loader)}")


AssertionError: Batch sizes must match

In [None]:
def calculate_accuracy(model, data_loader, device):
    """
    Compute the fraction of label positions the model predicts exactly.

    Parameters:
    - model: Callable taking (input_ids, attention_mask) and returning logits
      whose last dimension indexes classes/vocabulary. If its forward accepts
      a `labels` keyword, the batch labels are passed as well.
    - data_loader: Iterable of dict batches. Each batch provides the inputs
      under 'bert_input_ids' / 'bert_attention_mask' / 't5_labels', or under
      the plain names 'input_ids' / 'attention_mask' / 'labels'.
    - device: Device the batch tensors are moved to.

    Returns:
    - correct / total over all label elements; 0.0 when the loader is empty.

    Raises:
    - KeyError: if a batch has neither spelling of a required field.
    - ValueError: if the predictions and labels differ in shape.
    """
    import inspect

    def _field(batch, *names):
        # Return the first of `names` present in the batch.
        for name in names:
            if name in batch:
                return batch[name]
        raise KeyError(f"batch has none of the keys {names}")

    try:
        forward_params = inspect.signature(getattr(model, "forward", model)).parameters
        accepts_labels = "labels" in forward_params
    except (TypeError, ValueError):
        accepts_labels = False

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = _field(batch, "bert_input_ids", "input_ids").to(device)
            attention_mask = _field(batch, "bert_attention_mask", "attention_mask").to(device)
            labels = _field(batch, "t5_labels", "labels").to(device)

            if accepts_labels:
                outputs = model(input_ids, attention_mask, labels=labels)
            else:
                outputs = model(input_ids, attention_mask)

            # Classes/vocabulary are on the last axis, for both (batch, classes)
            # and (batch, seq, vocab) logits.
            predictions = torch.argmax(outputs, dim=-1)
            if predictions.shape != labels.shape:
                raise ValueError(
                    f"prediction shape {tuple(predictions.shape)} does not match "
                    f"label shape {tuple(labels.shape)}"
                )
            correct += (predictions == labels).sum().item()
            # Count every label element, so sequence labels are scored per token.
            total += labels.numel()

    return correct / total if total else 0.0

In [None]:
# Calculate training accuracy
# This cell runs at top level, so its statements must not be indented.
# `epoch` and `total_loss` are the values left by the last iteration of the training loop.
hybrid_model.eval()
train_accuracy = calculate_accuracy(hybrid_model, train_data_loader, device)

# Calculate test accuracy
test_accuracy = calculate_accuracy(hybrid_model, test_data_loader, device)

print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_data_loader)}")
print(f"Epoch {epoch+1}, Train Accuracy: {train_accuracy}")
print(f"Epoch {epoch+1}, Test Accuracy: {test_accuracy}")