# Summarize the text

In [None]:
#for using huggingface datasets
!pip install datasets

In [None]:
# import dependencies

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

## Dataset Summary
DialogSum is a large-scale dialogue summarization dataset, consisting of 13,460 (Plus 100 holdout data for topic generation) dialogues with corresponding manually labeled summaries and topics.

In [None]:
# Load the DialogSum dataset (dialogues with baseline human summaries)
# by its Hugging Face dataset identifier.

huggfaceDataset = "knkarthick/dialogsum"
dataset = load_dataset(huggfaceDataset)

## Check the DialogSum dataset

In [None]:

# Column names of the loaded dataset (bare expression so the notebook displays the value)
dataset.column_names

In [None]:
# Number of rows in the loaded dataset
dataset.num_rows

In [None]:
# Number of columns in the loaded dataset
dataset.num_columns

In [None]:
# Display the training split
dataset['train']

In [None]:
# Inspect one training example (index 999): the dialogue text
print(dataset['train'][999]['dialogue'])

In [None]:
# Summary stored with training example 999
print(dataset['train'][999]['summary'])

In [None]:
# Topic stored with training example 999
dataset['train'][999]['topic']

In [None]:
# Indices of the two test-split examples to display.
example_indices = [500, 1200]

# Visual separator between sections: a run of 99 asterisks.
dash_line = '*' * 99

test_split = dataset['test']
for example_number, index in enumerate(example_indices, start=1):
    # Look the record up once and read both fields from it.
    record = test_split[index]
    print(dash_line)
    print('Example ', example_number, '\n')
    print(dash_line)
    print('INPUT DIALOGUE:', '\n')
    print(record['dialogue'], '\n')
    print(dash_line)
    print('BASELINE HUMAN SUMMARY:', '\n')
    print(record['summary'], '\n')
    print(dash_line)
    print()

# [FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)

## Overview
FLAN-T5 was released in the paper *Scaling Instruction-Finetuned Language Models*. It is an enhanced version of T5 that has been fine-tuned on a mixture of tasks.

An example of FLAN-T5 from huggingface

The goal of this code is to use a pre-trained sequence-to-sequence language model to generate text: specifically, to complete or continue a given text prompt as a helpful assistant would.

In [None]:
# Quick smoke test of FLAN-T5 using the small checkpoint.
# AutoModelForSeq2SeqLM / AutoTokenizer are already imported in the imports cell.
# The objects get `demo_` names so this cell never rebinds the notebook-wide
# `model` / `tokenizer`, which are loaded from `google/flan-t5-base` further down.
demo_checkpoint = "google/flan-t5-small"
demo_model = AutoModelForSeq2SeqLM.from_pretrained(demo_checkpoint)
demo_tokenizer = AutoTokenizer.from_pretrained(demo_checkpoint)

# Tokenize the prompt into PyTorch tensors and let the model continue it.
demo_inputs = demo_tokenizer("A step by step recipe to make bolognese pasta:", return_tensors='pt')
demo_outputs = demo_model.generate(**demo_inputs)

# batch_decode returns one string per generated sequence.
print(demo_tokenizer.batch_decode(demo_outputs, skip_special_tokens=True))

### Now, step by step: how to use this model to summarize



In [None]:
# Load the FLAN-T5 base checkpoint as a sequence-to-sequence model.
# `model_name` is kept in a variable so the tokenizer can be loaded from the same checkpoint.


model_name='google/flan-t5-base'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Download the tokenizer for the **FLAN-T5** model using `AutoTokenizer.from_pretrained()` method. Parameter `use_fast` switches on fast tokenizer. At this stage, there is no need to go into the details of that, but you can find the tokenizer parameters in the [documentation](https://huggingface.co/docs/transformers/v4.28.1/en/model_doc/auto#transformers.AutoTokenizer).

In [None]:
# Tokenizer for the same checkpoint as the model; use_fast=True switches on the fast tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

`Text Input → Tokenization → Numerical Encoding → Vector Representation → Decoding → Text Output`


## Detailed Technical Breakdown

1. Text Acquisition & Preprocessing

Input Reception: Capture raw textual input from source (user query, document, API request)

Normalization: Standardize casing, remove extraneous whitespace, handle special characters

Sanitization: Filter inappropriate content, validate input boundaries

2. Tokenization Phase

Segmentation: Divide continuous text into discrete linguistic units (tokens)

Methodology: Employ subword tokenization (e.g., WordPiece, Byte-Pair Encoding, or SentencePiece, which is what T5 models use)

Special Tokens: Insert model-specific control tokens (e.g., [CLS], [SEP], [PAD] for BERT-style models; `</s>` and `<pad>` for T5-style models)

3. Numerical Encoding

Vocabulary Mapping: Convert each token to corresponding integer ID from pretrained vocabulary

Vector Creation: Generate tensor representation for batch processing

Attention Masks: Create binary masks distinguishing actual tokens from padding

4. Model Processing (Vector Operations)

Embedding Lookup: Convert token IDs to dense vector representations

Neural Transformation: Apply transformer architecture (self-attention, feed-forward layers)

Contextualization: Generate context-aware representations via multi-head attention

5. Decoding & Text Reconstruction

Token Generation: Produce output token IDs through autoregressive sampling

Detokenization: Map numerical IDs back to string tokens

Post-processing: Remove special tokens, reconstruct original formatting

In [None]:
# Round trip: text -> token ids -> text.
txt = "Test for encoding and decoding ?"

# Encode to PyTorch tensors and take the id sequence of the single sentence.
txt_encoded = tokenizer(txt, return_tensors='pt')
token_ids = txt_encoded["input_ids"][0]

# Turn the ids back into a string, leaving special tokens out.
txt_decoded = tokenizer.decode(token_ids, skip_special_tokens=True)

print('encoded text:', token_ids, sep='\n')
print('\ntxt_decoded:', txt_decoded, sep='\n')



In [None]:
# Display the complete tokenizer output, not only `input_ids`
txt_encoded

In [None]:
# Tokenize three sentences as one batch. padding=True pads the shorter
# sentences so the whole batch can be returned as a single tensor.
sentences = ["What time is it?", "Hello world", "How are you?"]
batch_encoded = tokenizer(sentences, return_tensors='pt', padding=True)

print(batch_encoded["input_ids"])
# The printed tensor has one row per sentence. The concrete id values depend on
# the vocabulary of the tokenizer loaded in this notebook (FLAN-T5), so no
# sample values are hard-coded here; ids such as 101/102 are BERT-style
# [CLS]/[SEP] markers and are not what this tokenizer emits.

# Access each sentence separately:
print(batch_encoded["input_ids"][0])  # First sentence token IDs
print(batch_encoded["input_ids"][1])  # Second sentence token IDs
print(batch_encoded["input_ids"][2])  # Third sentence token IDs

## It's time to explore how well the base LLM summarizes dialogues without any prompt engineering.

In [None]:
# Test-split examples to run through the model.
example_indices = [500, 1200]

for example_number, index in enumerate(example_indices, start=1):
    record = dataset['test'][index]
    dialogue = record['dialogue']
    summary = record['summary']

    # The raw dialogue is the entire model input: no instruction is added.
    inputs = tokenizer(dialogue, return_tensors='pt')
    generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)
    # generate() returns a batch; decode its first (only) sequence.
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print(dash_line, '\n')
    print('Example ', example_number, '\n')
    print(dash_line)
    print(f'INPUT PROMPT:\n{dialogue}', '\n')
    print(dash_line, '\n')
    print(f'BASELINE HUMAN SUMMARY:\n{summary}', '\n')
    print(dash_line, '\n')
    print(f'MODEL GENERATION - WITHOUT PROMPT ENGINEERING:\n{output}\n', '\n')

## Prompt engineering

In [None]:
# Zero-shot inference: wrap each dialogue in an explicit summarization instruction.
for i, index in enumerate(example_indices):

  dialogue = dataset['test'][index]['dialogue']
  summary = dataset['test'][index]['summary']

  # The template text is passed to the tokenizer exactly as written,
  # including its blank lines and leading spaces.
  prompt = f"""

  Summarize the following conversation.

  {dialogue}

  Summary:
  """

  inputs = tokenizer(prompt, return_tensors='pt')
  generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)
  # Store the decoded text under `output`, the name the final print reads.
  # (It used to be stored as `outputs`, so the print below showed a stale
  # `output` left over from an earlier cell instead of this generation.)
  output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

  print(dash_line,'\n')
  print('Example ', i + 1,'\n')
  print(dash_line,'\n')
  print(f'INPUT PROMPT:\n{prompt}','\n')
  print(dash_line,'\n')
  print(f'BASELINE HUMAN SUMMARY:\n{summary}','\n')
  print(dash_line,'\n')
  print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n','\n')


# Zero Shot Inference with the Prompt Template from FLAN-T5

In [None]:
# Zero-shot inference using a "Dialogue: ... what was going on?" template.
for example_number, index in enumerate(example_indices, start=1):
  record = dataset['test'][index]
  dialogue = record['dialogue']
  summary = record['summary']

  # Template text is passed to the tokenizer exactly as written.
  prompt = f"""
  Dialogue:
  {dialogue}
  what was going on?
  """

  inputs = tokenizer(prompt, return_tensors='pt')
  generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)
  # generate() returns a batch; decode its first (only) sequence.
  outputs = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

  print(dash_line, '\n')
  print('Example ', example_number, '\n')
  print(dash_line, '\n')
  print(f'input prompt :\n{prompt}')
  print(dash_line, '\n')
  print(f'base line human summary:\n{summary}\n')
  print(dash_line, '\n')
  print(f'model generation - zero shot: \n{outputs}\n')





# 4 -  One Shot Inference with the Prompt Template from FLAN-T5

In [None]:
def build_prompt(example_indices_full, example_index_to_summarize, data=None):
  """Build a one-/few-shot prompt in the "Dialogue: ... what was going on?" format.

  Every index in ``example_indices_full`` contributes one solved example
  (the dialogue followed by its summary). The dialogue at
  ``example_index_to_summarize`` is appended last, with the answer left open
  for the model to complete.

  Args:
      example_indices_full: Indices of the solved examples to include, in
          order. May be empty, in which case only the open question is built.
      example_index_to_summarize: Index of the dialogue to be summarized.
      data: Optional indexable collection of records that have ``'dialogue'``
          and ``'summary'`` keys. Defaults to ``dataset['test']``.

  Returns:
      The assembled prompt string.
  """
  if data is None:
      data = dataset['test']

  prompt=''

  for index in example_indices_full:
      dialogue = data[index]['dialogue']
      summary = data[index]['summary']

      prompt += f"""

Dialogue:
{dialogue}

what was going on?
{summary}

      """

  # The open question is appended once, after ALL solved examples. It used to
  # sit inside the loop together with the `return`, so only the first example
  # was ever included and an empty index list produced None.
  dialogue = data[example_index_to_summarize]['dialogue']

  prompt += f"""

Dialogue:
{dialogue}

what was going on?
"""
  return prompt


In [None]:
# One-shot setup: index 40 is passed as the solved example and
# index 200 as the dialogue the model should summarize.
example_indices_full = [40]
example_index_to_summarize = 200

one_shot_prompt = build_prompt(example_indices_full, example_index_to_summarize)
print(f'one shot prompt: {one_shot_prompt}')

In [None]:
# Human summary of the dialogue being summarized, shown for comparison.
summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(one_shot_prompt, return_tensors='pt')
generated_ids = model.generate(inputs['input_ids'], max_new_tokens=50)
# Store the decoded text under `output`, the name the final print reads.
# (It used to be stored as `outputs`, so the print below showed a stale
# `output` from an earlier cell instead of the one-shot generation.)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)


print(dash_line ,'\n')
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n','\n')
print(dash_line,'\n')
print(f'MODEL GENERATION - ONE SHOT:\n{output}','\n')

# Few Shot Inference with the Prompt Template from FLAN-T5

In [None]:
# Few-shot setup: request three solved examples (indices 40, 80, 120)
# ahead of the dialogue to summarize (index 200).
example_indices_full = [40, 80, 120]
example_index_to_summarize = 200

few_shot_prompt = build_prompt(example_indices_full, example_index_to_summarize)

# Show the assembled prompt for inspection.
print(few_shot_prompt)

In [None]:

# Human summary of the dialogue being summarized, shown for comparison.
summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(few_shot_prompt, return_tensors='pt')
generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)
# generate() returns a batch; decode its first (only) sequence.
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Every section is printed the same way: the text, then a trailing '\n' argument.
for section in (
    dash_line,
    f'BASELINE HUMAN SUMMARY:\n{summary}\n',
    dash_line,
    f'MODEL GENERATION - FEW SHOT:\n{output}',
):
    print(section, '\n')