In [1]:
import torch
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)


mps


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,GenerationConfig

In [3]:
dialog_dataset_name = 'knkarthick/dialogsum'

dialog_dataset = load_dataset(dialog_dataset_name)

print(type(dialog_dataset))

<class 'datasets.dataset_dict.DatasetDict'>


In [4]:
print(dialog_dataset.keys())

dict_keys(['train', 'validation', 'test'])


In [5]:
print(dialog_dataset.items())

dict_items([('train', Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 12460
})), ('validation', Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 500
})), ('test', Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 1500
}))])


In [6]:
print(dialog_dataset['train'][:2])

{'id': ['train_0', 'train_1'], 'dialogue': ["#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.", "#Person1#

In [7]:
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name,use_fast=True)


In [8]:
sentence = "Hello, how are you?"
encoded = tokenizer(sentence, return_tensors="pt")
print(encoded)

{'input_ids': tensor([[8774,    6,  149,   33,   25,   58,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


In [9]:
print(tokenizer.decode(encoded["input_ids"][0]))

Hello, how are you?</s>


In [10]:

example_indices = [40,200]
dash_line = "-"*80
for i, index in enumerate(example_indices):
    dialogue = dialog_dataset['test'][index]['dialogue']
    summary = dialog_dataset['test'][index]['summary']
    
    inputs = tokenizer(dialogue, return_tensors='pt')
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"], 
            max_new_tokens=50,
        )[0], 
        skip_special_tokens=True
    )
    
    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print(f'INPUT PROMPT:\n{dialogue}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}')
    print(dash_line)
    print(f'MODEL GENERATION - WITHOUT PROMPT ENGINEERING:\n{output}\n')
    

--------------------------------------------------------------------------------
Example  1
--------------------------------------------------------------------------------
INPUT PROMPT:
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
--------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
--------------------------------------------------------------------------------
MODEL GENERATION - WITHOUT PROMPT ENGINEERING:
Person1: It's ten to nine.

--------------------------------------------------------------------------------
Example  2
---------------

In [11]:
for i,index in enumerate(example_indices):
    dialogue = dialog_dataset['test'][index]['dialogue']
    summary = dialog_dataset['test'][index]['summary']

    prompt = f"""
    Dialogue:

    {dialogue}

    What was going on?
    """
    
    inputs = tokenizer(prompt, return_tensors='pt')
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"], 
            max_new_tokens=50,
        )[0], 
        skip_special_tokens=True
    )
    
    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print(f'INPUT PROMPT:\n{dialogue}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}')
    print(dash_line)
    print(f'MODEL GENERATION - WITHOUT PROMPT ENGINEERING:\n{output}\n')

--------------------------------------------------------------------------------
Example  1
--------------------------------------------------------------------------------
INPUT PROMPT:
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
--------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
--------------------------------------------------------------------------------
MODEL GENERATION - WITHOUT PROMPT ENGINEERING:
Tom is late for the train.

--------------------------------------------------------------------------------
Example  2
---------------

In [12]:
def make_prompt(example_indices, index_to_summarize):
    prompt = ''
    for index in example_indices:
        dialogue = dialog_dataset['test'][index]['dialogue']
        summary = dialog_dataset['test'][index]['summary']
    

        prompt += f"""
        Dialogue:

        {dialogue}
        
        Summary:

        {summary}
        """
    dialogue_to_summarize = dialog_dataset['test'][index_to_summarize]['dialogue']
    prompt += f"""
    Dialogue to summarize:

    {dialogue_to_summarize}

    what is going on?
    """
    
    return prompt


        

In [13]:
example_indices_full = [40]
example_index_to_summarize = 200

one_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

print(one_shot_prompt)


        Dialogue:

        #Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

        Summary:

        #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
        
    Dialogue to summarize:

    #Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need 

In [14]:
summary = dialog_dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(one_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
    )[0], 
    skip_special_tokens=True
)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ONE SHOT:\n{output}')

--------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

--------------------------------------------------------------------------------
MODEL GENERATION - ONE SHOT:
#Person1 wants to upgrade his computer. #Person2 wants to add a painting program to his software. #Person1 wants to add a CD-ROM drive.


In [15]:
example_indices_full = [40, 80, 120]
example_index_to_summarize = 200

few_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

print(few_shot_prompt)


        Dialogue:

        #Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

        Summary:

        #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
        
        Dialogue:

        #Person1#: May, do you mind helping me prepare for the picnic?
#Person2#: Sure. Have you checked the weather report?
#Person1#: Yes. It says it will be sunny all day. No sign of rain at all. This is your father's favorite sausage. Sandwiches for you and Daniel.
#Person2#: No, thanks Mom. I'd like some toast and chicken wings.
#Person1#: Okay. Please take some fruit salad and crackers for me.
#Person2#: Done. Oh, don't forget to take napkins disposable plates, 

In [16]:
summary = dialog_dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
    )[0], 
    skip_special_tokens=True
)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}')

Token indices sequence length is longer than the specified maximum sequence length for this model (812 > 512). Running this sequence through the model will result in indexing errors


--------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

--------------------------------------------------------------------------------
MODEL GENERATION - FEW SHOT:
#Person1 wants to upgrade his computer. #Person2 wants to add a painting program to his software. #Person1 wants to add a CD-ROM drive.


In [33]:
generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.5,top_p=0.7)

inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config=generation_config,
    )[0], 
    skip_special_tokens=True
)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}')



--------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

--------------------------------------------------------------------------------
MODEL GENERATION - FEW SHOT:
#Person1: You might consider upgrading your system. #Person2: You might also consider adding a painting program to your software. #Person1: You might also consider adding a faster processor, more memory and a
