In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/PS2/Train.csv')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119924 entries, 0 to 119923
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   article   117232 non-null  object
 1   abstract  119924 non-null  object
dtypes: object(2)
memory usage: 1.8+ MB
None


In [None]:
import re

df = df.dropna()

df = df.sample(frac=0.1, random_state=42)

print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 11723 entries, 74429 to 119693
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   article   11723 non-null  object
 1   abstract  11723 non-null  object
dtypes: object(2)
memory usage: 274.8+ KB
None


In [None]:
!pip install transformers



In [None]:
import torch.nn as nn
from transformers import BertTokenizer, BertModel, AutoTokenizer
import re

class DocumentProcessor:
    def __init__(self, tokenizer_name="allenai/scibert_scivocab_uncased"):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def process_text(self, text):
      text = text.strip()  # Keep existing whitespace removal
      text = text.replace('\n', '.')
      return text

    def process_documents(self, documents):
        return [self.process_text(doc) for doc in documents]

    def process_file(self, file_path):
        """Process a single text file."""
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            return self.process_text(text)

    def process_files(self, file_paths):
        """Process multiple text files."""
        return [self.process_file(file_path) for file_path in file_paths]

    def load_from_dataframe(self, df, text_column):
        return df[text_column].tolist()

class Chunker:
    def __init__(self, chunk_size=1000, chunk_overlap=200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

    def chunk_text(self, text):
        """Split text into overlapping chunks."""
        tokens = self.tokenizer.tokenize(text)
        chunks = []

        start = 0
        while start < len(tokens):
            end = min(start + self.chunk_size, len(tokens))
            chunk_tokens = tokens[start:end]
            chunk_text = self.tokenizer.convert_tokens_to_string(chunk_tokens)
            chunks.append(chunk_text)

            # Move start pointer with overlap
            start += self.chunk_size - self.chunk_overlap

        return chunks

In [None]:
processor = DocumentProcessor()
processed_text = processor.process_text("A well-organized paragraph supports or develops a single controlling idea, which is expressed in a sentence called the topic sentence. A topic sentence has several important functions: it substantiates or supports an essay’s thesis statement; it unifies the content of a paragraph and directs the order of the sentences; and it advises the reader of the subject to be discussed and how the paragraph will discuss it. Readers generally look to the first few sentences in a paragraph to determine the subject and perspective of the paragraph. That’s why it’s often best to put the topic sentence at the very beginning of the paragraph. In some cases, however, it’s more effective to place another sentence before the topic sentence—for example, a sentence linking the current paragraph to the previous one, or one providing background information.")
print(processed_text)  # Output: this is an example with unnecessary characters

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

A well-organized paragraph supports or develops a single controlling idea, which is expressed in a sentence called the topic sentence. A topic sentence has several important functions: it substantiates or supports an essay’s thesis statement; it unifies the content of a paragraph and directs the order of the sentences; and it advises the reader of the subject to be discussed and how the paragraph will discuss it. Readers generally look to the first few sentences in a paragraph to determine the subject and perspective of the paragraph. That’s why it’s often best to put the topic sentence at the very beginning of the paragraph. In some cases, however, it’s more effective to place another sentence before the topic sentence—for example, a sentence linking the current paragraph to the previous one, or one providing background information.


In [None]:
chunker = Chunker(chunk_size=100, chunk_overlap=10)  # Adjust chunk size and overlap
chunks = chunker.chunk_text(processed_text)

# Print the chunks to verify (for demonstration)
for i, chunk in enumerate(chunks):
    print(f"Chunk {i + 1}: {chunk}")

Chunk 1: a well - organized paragraph supports or develops a single controlling idea, which is expressed in a sentence called the topic sentence. a topic sentence has several important functions : it substantiates or supports an essay ’ s thesis statement ; it unifies the content of a paragraph and directs the order of the sentences ; and it advises the reader of the subject to be discussed and how the paragraph will discuss it. readers generally look to the first few sentences in a paragraph to determine the subject and perspective of
Chunk 2: in a paragraph to determine the subject and perspective of the paragraph. that ’ s why it ’ s often best to put the topic sentence at the very beginning of the paragraph. in some cases, however, it ’ s more effective to place another sentence before the topic sentence — for example, a sentence linking the current paragraph to the previous one, or one providing background information.


In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

class ExtractiveSummarizer(nn.Module):  # Renamed class
    def __init__(self, model_name="allenai/scibert_scivocab_uncased", max_length=512):
        super(ExtractiveSummarizer, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)  # Tokenizer is now part of the class
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.max_length = max_length

        # Classification layer for sentence importance
        self.classifier = nn.Linear(self.model.config.hidden_size, 1).to(self.device)

        # Section embedding layer (inspired by SciBERTSUM)
        self.section_embeddings = nn.Embedding(10, self.model.config.hidden_size).to(self.device)

    def forward(self, text, section_ids=None):  # Modified forward method
        # Tokenize the input text
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=self.max_length)

        # Move inputs to the device
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Get model outputs
        outputs = self.model(**inputs)
        sequence_output = outputs.last_hidden_state

        # Add section embeddings if provided
        if section_ids is not None:
            section_embeddings = self.section_embeddings(section_ids)
            sequence_output = sequence_output + section_embeddings.unsqueeze(1).expand_as(sequence_output)

        # Use [CLS] representation for classification
        sentence_vectors = sequence_output[:, 0, :]
        scores = self.classifier(sentence_vectors)
        return scores


class SentenceCombiner:
    def __init__(self, model_name="allenai/scibert_scivocab_uncased"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def get_sentence_embeddings(self, sentences):
        """Get SciBERT embeddings for a list of sentences."""
        inputs = self.tokenizer(sentences, return_tensors="pt", padding=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
            embeddings = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token embedding
        return embeddings.cpu().numpy()  # Convert to NumPy array

    def remove_redundant(self, sentences, similarity_threshold=0.8):
        """Remove redundant sentences based on similarity."""
        if not sentences:
            return []

        # Get embeddings and compute similarity matrix
        embeddings = self.get_sentence_embeddings(sentences)
        sim_matrix = cosine_similarity(embeddings)

        # Track which sentences to keep
        to_keep = [True] * len(sentences)

        # Identify redundant sentences
        for i in range(len(sentences)):
            if not to_keep[i]:
                continue

            for j in range(i+1, len(sentences)):
                if to_keep[j] and sim_matrix[i, j] > similarity_threshold:
                    to_keep[j] = False

        # Return non-redundant sentences
        return [s for i, s in enumerate(sentences) if to_keep[i]]

In [None]:
large_sentence = "Summer is a season of warmth, sunshine, and vibrant energy. It is a time when nature is at its peak, with blossoming flowers, lush greenery, and longer days. The summer season typically spans from June to August in the Northern Hemisphere, bringing with it a sense of excitement and leisure. One of the defining characteristics of summer is the warm and pleasant weather. The sun shines brightly, filling the days with abundant light and providing the perfect conditions for outdoor activities. People flock to parks, beaches, and outdoor spaces to bask in the sun, soak up the vitamin D, and engage in various recreational pursuits. Summer is also a time for vacations and travel. Families plan trips to destinations near and far, exploring new places and creating lasting memories. It is a chance to break away from the routines of daily life and experience new cultures, cuisines, and landscapes. Whether it’s a beachside getaway, a mountain retreat, or a city adventure, summer vacations offer a much-needed escape and rejuvenation. In addition to leisure and travel, summer is a season of celebration. Festivals and events fill the air with joy and merriment. From lively music festivals to colorful cultural celebrations, summer brings people together in the spirit of unity and enjoyment. It is a time for outdoor concerts, food fairs, and fireworks, igniting a sense of community and creating a festive ambiance. While summer brings joy and excitement, it also presents its challenges. The high temperatures and intense heat can be uncomfortable and potentially harmful. It is important to stay hydrated, seek shade, and protect oneself from the sun’s rays. Sunscreen, hats, and lightweight clothing become essential accessories for sun protection. In conclusion, the summer season is a time of vibrant energy, leisure, and celebration. It offers an opportunity to enjoy the warmth of the sun, explore new destinations, and create cherished memories. While the heat may require some precautions, summer is a season to embrace the outdoors, connect with nature, and savor the joys of life."

# Split the large sentence into smaller sentences using a basic approach (you might need more sophisticated sentence segmentation)
sentences = processor.process_text(large_sentence)
sentences = chunker.chunk_text(sentences)
print(sentences)
# 1. Use ExtractiveSummarizer to get sentence scores:
summarizer = ExtractiveSummarizer()
scores = summarizer(sentences)  # Assuming summarizer can handle a list of sentences
print(scores)

# 2. Process scores to select important sentences (example logic):
# (In a real scenario, you would likely use more advanced selection techniques)
top_sentences = [sentences[i] for i in torch.topk(scores.squeeze(), k=3).indices.tolist()]  # Select top 2 sentences

# 3. Use SentenceCombiner to remove redundancy (if needed):
combiner = SentenceCombiner()
final_summary = combiner.remove_redundant(top_sentences, similarity_threshold=0.8)

# Print the final summary:
print("Final Summary:", " ".join(final_summary))

['summer is a season of warmth, sunshine, and vibrant energy. it is a time when nature is at its peak, with blossoming flowers, lush greenery, and longer days. the summer season typically spans from june to august in the northern hemisphere, bringing with it a sense of excitement and leisure. one of the defining characteristics of summer is the warm and pleasant weather. the sun shines brightly, filling the days with abundant light and providing the perfect conditions', 'the days with abundant light and providing the perfect conditions for outdoor activities. people flock to parks, beaches, and outdoor spaces to bask in the sun, soak up the vitamin d, and engage in various recreational pursuits. summer is also a time for vacations and travel. families plan trips to destinations near and far, exploring new places and creating lasting memories. it is a chance to break away from the routines of daily life and experience new cultures, cuisines, and landscapes', 'experience new cultures, cu

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

tensor([[-0.0211],
        [ 0.2560],
        [ 0.1830],
        [ 0.0704],
        [-0.0927],
        [ 0.1044]], device='cuda:0', grad_fn=<AddmmBackward0>)


model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Final Summary: the days with abundant light and providing the perfect conditions for outdoor activities. people flock to parks, beaches, and outdoor spaces to bask in the sun, soak up the vitamin d, and engage in various recreational pursuits. summer is also a time for vacations and travel. families plan trips to destinations near and far, exploring new places and creating lasting memories. it is a chance to break away from the routines of daily life and experience new cultures, cuisines, and landscapes , and savor the joys of life.


In [None]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# import torch

# class AbstractiveSummarizer:
#     def __init__(self, model_name="t5-base", max_length=5000, min_length = 1000):
#         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
#         self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
#         self.max_length = max_length

#     def summarize(self, text, max_length=1000, min_length=40):
#         """Generate an abstractive summary of the text."""
#         inputs = self.tokenizer("summarize: " + text, return_tensors="pt",
#                                max_length=self.max_length, truncation=True).to(self.device)

#         summary_ids = self.model.generate(
#             inputs["input_ids"],
#             max_length=max_length,
#             min_length=min_length,
#             num_beams=4,
#             no_repeat_ngram_size=2,
#             early_stopping=True
#         )

#         summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
#         return summary


import torch.nn as nn # Import nn module
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

class AbstractiveSummarizer(nn.Module): # Inherit from nn.Module
    def __init__(self, model_name="t5-base", max_length=5000, min_length = 1000):
        super(AbstractiveSummarizer, self).__init__() # Call super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
        self.max_length = max_length

    def summarize(self, text, max_length=1000, min_length=40):
        """Generate an abstractive summary of the text."""
        inputs = self.tokenizer("summarize: " + text, return_tensors="pt",
                               max_length=self.max_length, truncation=True).to(self.device)

        summary_ids = self.model.generate(
            inputs["input_ids"],
            max_length=max_length,
            min_length=min_length,
            num_beams=4,
            no_repeat_ngram_size=2,
            early_stopping=True
        )

        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary

# Example usage:
text = "Summer is a season of warmth, sunshine, and vibrant energy. It is a time when nature is at its peak, with blossoming flowers, lush greenery, and longer days. The summer season typically spans from June to August in the Northern Hemisphere, bringing with it a sense of excitement and leisure. One of the defining characteristics of summer is the warm and pleasant weather. The sun shines brightly, filling the days with abundant light and providing the perfect conditions for outdoor activities. People flock to parks, beaches, and outdoor spaces to bask in the sun, soak up the vitamin D, and engage in various recreational pursuits. Summer is also a time for vacations and travel. Families plan trips to destinations near and far, exploring new places and creating lasting memories. It is a chance to break away from the routines of daily life and experience new cultures, cuisines, and landscapes. Whether it’s a beachside getaway, a mountain retreat, or a city adventure, summer vacations offer a much-needed escape and rejuvenation. In addition to leisure and travel, summer is a season of celebration. Festivals and events fill the air with joy and merriment. From lively music festivals to colorful cultural celebrations, summer brings people together in the spirit of unity and enjoyment. It is a time for outdoor concerts, food fairs, and fireworks, igniting a sense of community and creating a festive ambiance. While summer brings joy and excitement, it also presents its challenges. The high temperatures and intense heat can be uncomfortable and potentially harmful. It is important to stay hydrated, seek shade, and protect oneself from the sun’s rays. Sunscreen, hats, and lightweight clothing become essential accessories for sun protection. In conclusion, the summer season is a time of vibrant energy, leisure, and celebration. It offers an opportunity to enjoy the warmth of the sun, explore new destinations, and create cherished memories. While the heat may require some precautions, summer is a season to embrace the outdoors, connect with nature, and savor the joys of life."

summarizer = AbstractiveSummarizer()
summary = summarizer.summarize(" ".join(final_summary))

print(summary)  # To see the output, run the code.

people flock to parks, beaches, and outdoor spaces to bask in the sun . summer is also a time for vacations and travel. families plan trips to destinations near and far.


In [None]:
class HybridSummarizer:
    def __init__(self,
                 extractive_model_name="allenai/scibert_scivocab_uncased",
                 abstractive_model_name="t5-base",
                 chunk_size=100,
                 chunk_overlap=20,
                 top_k_sentences=3,
                 similarity_threshold=0.8,
                 max_length=5000):

        # Initialize your existing classes
        self.processor = DocumentProcessor(tokenizer_name=extractive_model_name)
        self.chunker = Chunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        self.extractive_summarizer = ExtractiveSummarizer(model_name=extractive_model_name, max_length=max_length)
        self.sentence_combiner = SentenceCombiner(model_name=extractive_model_name)
        self.abstractive_summarizer = AbstractiveSummarizer(model_name=abstractive_model_name, max_length=max_length)

        # Hyperparameters
        self.top_k_sentences = top_k_sentences
        self.similarity_threshold = similarity_threshold


    def to(self, device):
        """Move the internal models to the specified device."""
        self.extractive_summarizer.to(device)
        self.abstractive_summarizer.to(device)
        self.sentence_combiner.model.to(device) # Move the model in SentenceCombiner
        return self  # Return self for chaining

    def generate_extractive_summary(self, text):
        """Generate an extractive summary (for the dataset)."""
        processed_text = self.processor.process_text(text)
        chunks = self.chunker.chunk_text(processed_text)

        scores = self.extractive_summarizer(chunks)

        # Check if scores is 0-dimensional (only one chunk)
        if scores.numel() == 1:
            top_sentence_indices = [0]  # Just take the first (and only) chunk
        else:
            # Ensure k does not exceed the number of chunks
            k = min(self.top_k_sentences, scores.numel())
            top_sentence_indices = torch.topk(scores.squeeze(), k=k).indices.tolist()

        top_sentences = [chunks[i] for i in top_sentence_indices]

        # Optionally, use self.sentence_combiner.remove_redundant() here
        return " ".join(top_sentences)

    def generate_summary(self, text):
        """Generate the final summary using a hybrid approach."""

        # 1. Preprocessing
        processed_text = self.processor.process_text(text)

        # 2. Chunking
        chunks = self.chunker.chunk_text(processed_text)
        print(chunks)

        # 3. Extractive Summarization
        scores = self.extractive_summarizer(chunks)
        print(scores)
        print(scores.numel())

        # Check if scores is 0-dimensional (only one chunk)
        if scores.numel() == 1:
            top_sentence_indices = [0]  # Just take the first (and only) chunk
        else:
            # Ensure k does not exceed the number of chunks
            k = min(self.top_k_sentences, scores.numel())
            top_sentence_indices = torch.topk(scores.squeeze(), k=k).indices.tolist()

        top_sentences = [chunks[i] for i in top_sentence_indices]

        # 4. Redundancy Removal
        filtered_sentences = self.sentence_combiner.remove_redundant(top_sentences, similarity_threshold=self.similarity_threshold)

        # 5. Abstractive Summarization
        final_summary = self.abstractive_summarizer.summarize(" ".join(filtered_sentences))

        return final_summary

    def generate_summary_file(self, file):
        """Generate the final summary using a hybrid approach."""

        # 1. Preprocessing
        processed_text = self.processor.process_file(file)
        print(processed_text)
        # 2. Chunking
        chunks = self.chunker.chunk_text(processed_text)
        print(chunks)

        # 3. Extractive Summarization
        scores = self.extractive_summarizer(chunks)
        print(scores)
        print(scores.numel())

        # Check if scores is 0-dimensional (only one chunk)
        if scores.numel() == 1:
            top_sentence_indices = [0]  # Just take the first (and only) chunk
        else:
            # Ensure k does not exceed the number of chunks
            k = min(self.top_k_sentences, scores.numel())
            top_sentence_indices = torch.topk(scores.squeeze(), k=k).indices.tolist()

        top_sentences = [chunks[i] for i in top_sentence_indices]

        # 4. Redundancy Removal
        filtered_sentences = self.sentence_combiner.remove_redundant(top_sentences, similarity_threshold=self.similarity_threshold)

        print(filtered_sentences)
        # 5. Abstractive Summarization
        final_summary = self.abstractive_summarizer.summarize(" ".join(filtered_sentences))

        return final_summary

In [None]:
summarizer = HybridSummarizer(chunk_size=150, chunk_overlap=30, top_k_sentences=15)

# Generate a summary
text = "This is a long and detailed article about the latest advancements in artificial intelligence. It covers various topics such as machine learning, deep learning, natural language processing, and computer vision. Researchers are making significant progress in these areas, leading to innovative applications in diverse fields."
file = "/content/drive/MyDrive/Summarize.txt"
summary = summarizer.generate_summary_file(file)
print(summary)

According to Tech Jury, despite a number of cool apps and tips for successful time management, only 17% of people track their time. 50% of people have never thought about time waste, even though they are always late and running out of time. Time management is a skill. It helps people handle their daily duties without burnout and severe exhaustion. The N.I.L.C. includes time management on the list of top ten demanded soft skills that employees require in 2022. Why is it so important to manage one’s time correctly? Stephen Covey once said, “The key is not spending time, but in investing it”. It means that proper timing guarantees a person’s success in many life areas...Career Trend names three negative aspects that occur when a person is not able to follow a schedule and be flexible. First off, one risks delaying the task performance all the time. People who got used to procrastination start doing assignments and duties at the very last moment. As a result, they sacrifice quality for the

In [None]:
!pip install datasets
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch

class HybridSummarizationDataset(Dataset):
    def __init__(self, dataset, summarizer):
        self.dataset = dataset
        self.summarizer = summarizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        # If 'article' is a list, join it into a single string
        article = item['article']
        if isinstance(article, list):
            article = ' '.join(article)

        extractive_summary = self.summarizer.generate_extractive_summary(article)
        return {
            "article": article,  # Use the joined article
            "extractive_summary": extractive_summary,
            "summary": item['summary'],
        }

# 5. Define Loss Function and Optimizer:
# Define a combined loss function for both extractive and abstractive parts
def combined_loss_function(extractive_scores, abstractive_loss):
    # ... (Your logic to combine the losses) ...
    # Example:
    return extractive_scores.mean() + abstractive_loss


# Example usage (replace with your data loading and training logic):
# Assuming you have a pandas DataFrame 'df' with 'article' and 'summary' columns
df = pd.read_csv('/content/drive/MyDrive/PS2/Train.csv') # Replace with your actual data loading
df = df.dropna()
df = df.sample(frac=0.1, random_state=42)
train_data = pd.DataFrame({"article": df['article'], "summary": df['abstract']})
train_dataset = Dataset.from_pandas(train_data)


# 2. Instantiate HybridSummarizer (same as before):
summarizer = HybridSummarizer(
    chunk_size=150,
    chunk_overlap=30,
    top_k_sentences=15,
)

# 4. Create Dataset and DataLoader:
train_dataset = HybridSummarizationDataset(train_dataset, summarizer)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)  # Adjust batch_size as needed


all_parameters = list(summarizer.extractive_summarizer.parameters()) + \
                 list(summarizer.abstractive_summarizer.parameters())

# Create the optimizer using the combined parameters
optimizer = AdamW(all_parameters, lr=1e-5)

# 6. Training Loop:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summarizer.to(device)  # Move the model to the device

for epoch in range(3):  # Adjust number of epochs as needed
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()

        # Extractive part
        chunks = [summarizer.chunker.chunk_text(summarizer.processor.process_text(text)) for text in batch['article']]
        extractive_scores = [summarizer.extractive_summarizer(chunk) for chunk in chunks]

        # Abstractive part
        inputs = ["summarize: " + summary for summary in batch['extractive_summary']]
        model_inputs = summarizer.abstractive_summarizer.tokenizer(inputs, max_length=512, truncation=True, padding=True, return_tensors="pt").to(device)
        with summarizer.abstractive_summarizer.tokenizer.as_target_tokenizer():
            labels = summarizer.abstractive_summarizer.tokenizer(batch['summary'], max_length=128, truncation=True, padding=True, return_tensors="pt").to(device)

        outputs = summarizer.abstractive_summarizer.model(**model_inputs, labels=labels["input_ids"])
        abstractive_loss = outputs.loss

        # Combined loss
        loss = combined_loss_function(torch.cat(extractive_scores), abstractive_loss)

        loss.backward()
        optimizer.step()


# 7. Save the Fine-tuned Model:
torch.save(summarizer.state_dict(), "./fine_tuned_hybrid_summarizer.pth")



OutOfMemoryError: CUDA out of memory. Tried to allocate 92.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 24.12 MiB is free. Process 4997 has 14.71 GiB memory in use. Of the allocated memory 14.25 GiB is allocated by PyTorch, and 351.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)