# Text Summarization using Hugging Face

### Loading Dependencies

In [1]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
import torch
# Download the NLTK 'punkt' package; NLTK reports it as already up-to-date when it is present locally.
nltk.download('punkt')
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM 
from datasets import load_dataset, load_from_disk, load_metric

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aveshverma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# NOTE: `model` holds the checkpoint identifier (a string), not the network itself;
# the loaded network lives in `model_pegasus`.
model = 'google/pegasus-cnn_dailymail'

# Tokenizer and seq2seq model are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model)
model_pegasus = model_pegasus.to("cpu")  # explicit CPU placement

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1. Training the model (fine-tuning)
2. Inference: loading the pretrained model, then generating predictions after fine-tuning

### Loading Dataset

In [8]:
# Load the 'samsum' dataset; the result is indexed by split name in the cells below.
dataset = load_dataset('samsum')

- SAMSum is a dataset made specifically for summarization tasks: each record pairs a dialogue with its summary.

In [None]:
# Bare expression: renders the dataset object's repr as the cell output.
dataset

In [None]:
# Print the first five training examples: each dialogue followed by its summary.
for i in range(5):
    # Fetch a single row. Indexing the column first (dataset["train"]["dialogue"][i])
    # reads the entire column on every iteration just to keep one element.
    example = dataset["train"][i]
    print(f"{i+1}: Dialogue: ")
    print(example["dialogue"])
    print("--------------")
    print("Summary: ")
    print(example["summary"])
    print("----------------------------------------------------------------")


In [None]:
# List the names of the available splits, one per line.
for split in dataset.keys():
    print(split)

In [None]:
# Number of examples in each split, in the dataset's own split order.
split_lengths = [len(split_data) for split_data in dataset.values()]
split_lengths

In [None]:
# Show which columns the training split exposes.
dataset["train"].column_names

### Text Preprocessing

In [None]:
def convert_examples_to_features(data_in_batch, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of dialogue/summary pairs into model-ready features.

    Uses the module-level ``tokenizer``.

    Args:
        data_in_batch: Mapping with "dialogue" and "summary" entries; each entry
            is handed to the tokenizer as-is (a batch of texts when this
            function is used with ``map(..., batched=True)``).
        max_input_length: Maximum number of tokens kept per dialogue; longer
            dialogues are truncated. Defaults to 1024.
        max_target_length: Maximum number of tokens kept per summary; longer
            summaries are truncated. Defaults to 128.

    Returns:
        dict with three keys: "input_ids" and "attention_mask" taken from the
        dialogue encoding, and "labels" holding the summary's input ids.
    """
    # Truncation keeps every sequence within its max_length.
    input_encoding = tokenizer(
        data_in_batch["dialogue"], max_length=max_input_length, truncation=True
    )
    target_encoding = tokenizer(
        data_in_batch["summary"], max_length=max_target_length, truncation=True
    )
    return {
        "input_ids": input_encoding["input_ids"],
        "attention_mask": input_encoding["attention_mask"],
        # The tokenized summaries become the training targets.
        "labels": target_encoding["input_ids"],
    }

Now we need to apply our function to the entire dataset using `map`.

In [None]:
# Apply the tokenization function to the dataset in batches (batched=True).
dataset_en = dataset.map(convert_examples_to_features, batched=True)

In [None]:
# Inspect the training split after mapping.
dataset_en["train"]

In [None]:
# Token ids of the training example at index 1, i.e. the second dialogue.
# The row is indexed first so only that example is read, not the whole "input_ids" column.
# The variable name is kept for the following cell; the values are token ids.
example_embedding = dataset_en["train"][1]["input_ids"]
example_embedding

In [None]:
# Number of token ids in the example.
len(example_embedding)

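The list above holds the integer token IDs for one dialogue. These are not yet embedding vectors: the model's embedding layer maps each ID to a vector. Token IDs in this form are what a model consumes directly for text generation, RAG, etc.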

In [None]:
# Attention mask of the training example at index 1, i.e. the second dialogue.
# The row is indexed first so only that example is read, not the whole "attention_mask" column.
example_attn_mask = dataset_en["train"][1]["attention_mask"]
example_attn_mask

The attention mask is a binary mask that indicates which tokens in the sequence should be attended to and which should be ignored. It is used during the self-attention mechanism to control the flow of information between different positions in the input sequence.

##### Finetuning

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq


In [None]:
# Collator built from the tokenizer and the model; it is handed to the Trainer as `data_collator`.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_pegasus)

In [None]:
# Hyper-parameters for a single fine-tuning epoch.
training_args = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    # Step counts are integers; a very large value means no intermediate
    # checkpoint is written during this one-epoch run.
    save_steps=1_000_000,
    # With a per-device batch size of 1, 16 accumulated steps form one update.
    gradient_accumulation_steps=16,
)

trainer = Trainer(
    model=model_pegasus,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    # Fit on the training split. The test split must stay unseen during
    # training, otherwise any later evaluation on it is contaminated.
    # For a quick smoke run, subsample the training split
    # (e.g. dataset_en["train"].select(range(N))) rather than training on test.
    train_dataset=dataset_en["train"],
    eval_dataset=dataset_en["validation"],
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()