In [1]:
!pip install --upgrade pip
!pip install --disable-pip-version-check torch torchdata --quiet
!pip install transformers==4.27.2 datasets==2.11.0 --quiet


[0m

In [9]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

In [10]:
huggingface_dataset_name="knkarthick/dialogsum"
dataset=load_dataset(huggingface_dataset_name)

Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading and preparing dataset csv/knkarthick--dialogsum to /Users/bhavishachaudhari/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/bhavishachaudhari/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
example_indices=[40,200]
dash_line='-'.join('' for x in range(100))
for i,index in enumerate(example_indices):
    print(dash_line)
    print('Example ',i+1)
    print(dash_line)
    print('INPUT DIALOGUE: ')
    print(dataset['test'][index]['dialogue'])
    print(dash_line)
    print('BASELINE HUMAN SUMMARY:')
    print(dataset['test'][index]['summary'])
    print(dash_line)
    print()

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT DIALOGUE: 
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------

---------------------------------------------------------------------------------------------------
Ex

In [14]:
model_name='google/flan-t5-base'
model=AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [17]:
tokenizer=AutoTokenizer.from_pretrained(model_name ,use_fast=True)

Downloading tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [18]:
sentence="What time is it,Tom?"
sentence_encoded = tokenizer(sentence,return_tensors='pt')
sentence_decoded = tokenizer.decode(sentence_encoded["input_ids"][0],skip_special_tokens=True)
print("ENCODED SENTENCE: ")
print(sentence_encoded["input_ids"][0])
print("\nDECODED SENTENCE: ")
print(sentence_decoded)

ENCODED SENTENCE: 
tensor([ 363,   97,   19,   34,    6, 3696,   51,   58,    1])

DECODED SENTENCE: 
What time is it,Tom?


In [22]:
for i,index in enumerate(example_indices):
    dialogue = dataset['test'][index]['dialogue']
    summary= dataset['test'][index]['summary']
    inputs=tokenizer(dialogue,return_tensors='pt')
    output=tokenizer.decode(model.generate(inputs["input_ids"],max_new_tokens=50,)[0],skip_special_tokens=True)
    print(dash_line)
    print('Example ',i+1)
    print(dash_line)
    print(f'INPUT PROMPT:\n{dialogue}')
    print(dash_line)
    print('BASELINE HUMAN SUMMARY:')
    print(dataset['test'][index]['summary'])
    print(f'MODEL GENERATION -WITHOUT PROMPT ENGINEERING:\n{output}\n')

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
MODEL GENERATION -WITHOUT PROMPT ENGINEERING:
Person1: It's ten to nine.

---------------------------------------------------------------------------------------------------
Example  2
---------------------

In [24]:
for i,index in enumerate(example_indices):
    dialogue = dataset['test'][index]['dialogue']
    summary= dataset['test'][index]['summary']
    prompt =f"""
Summarize the following conversation.
{dialogue}
summary:
    """
    inputs=tokenizer(prompt,return_tensors='pt')
    output=tokenizer.decode(model.generate(inputs["input_ids"],max_new_tokens=50,)[0],skip_special_tokens=True)
    print(dash_line)
    print('Example ',i+1)
    print(dash_line)
    print(f'INPUT PROMPT:\n{prompt}')
    print(dash_line)
    print('BASELINE HUMAN SUMMARY:')
    print(dataset['test'][index]['summary'])
    print(f'MODEL GENERATION -ZERO SHOT:\n{output}\n')

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
summary:
    
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
MODEL GENERATION -ZERO SHOT:
The train is about to leave.

----------------------------------------------------------------------------------------------

In [26]:
#one shot
def make_prompt(example_indices_full,example_index_to_summarize):
    prompt=''
    for index in example_indices_full:
        dialogue=dataset['test'][index]['dialogue']
        summary=dataset['test'][index]['summary']
        prompt+= f"""
Dialogue:

{dialogue}

what was going on?
{summary}



"""
        dialogue=dataset['test'][example_index_to_summarize]['dialogue']
        prompt=f"""
Dialogue:

{dialogue}

what was going on?
"""
    return prompt


In [27]:
example_indices_full=[40]
example_index_to_summarize=200
one_shot_prompt=make_prompt(example_indices_full,example_index_to_summarize)
print(one_shot_prompt)


Dialogue:

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

what was going on?



In [33]:
summary=dataset['test'][example_index_to_summarize]['summary']
inputs=tokenizer(one_shot_prompt,return_tensors='pt')
output=tokenizer.decode(model.generate(inputs["input_ids"],max_new_tokens=50,)[0],skip_special_tokens=True)
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ONE SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ONE SHOT:
#Person1#: You could upgrade your system. #Person2#: That would be a bonus. #Person1#: You could also upgrade your hardware. #Person2#: You might also want to add


In [34]:
example_indices_full=[40,80,120]
example_index_to_summarize=200
few_shot_prompt=make_prompt(example_indices_full,example_index_to_summarize)
print(few_shot_prompt)


Dialogue:

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

what was going on?



In [35]:
summary=dataset['test'][example_index_to_summarize]['summary']
inputs=tokenizer(few_shot_prompt,return_tensors='pt')
output=tokenizer.decode(model.generate(inputs["input_ids"],max_new_tokens=50,)[0],skip_special_tokens=True)
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

---------------------------------------------------------------------------------------------------
MODEL GENERATION - FEW SHOT:
#Person1#: You could upgrade your system. #Person2#: That would be a bonus. #Person1#: You could also upgrade your hardware. #Person2#: You might also want to add


In [42]:
generation_config =GenerationConfig(max_new_tokens=60)
#generation_config =GenerationConfig(max_new_tokens=10)
#generation_config =GenerationConfig(max_new_tokens=50,do_sample=True,temperature=0.1)
#generation_config =GenerationConfig(max_new_tokens=60,do_sample=True,temperature=0.5)
#generation_config =GenerationConfig(max_new_tokens=60,do_sample=True,temperature=1.0)
inputs=tokenizer(few_shot_prompt,return_tensors='pt')
output=tokenizer.decode(model.generate(inputs["input_ids"],generation_config=generation_config,)[0],skip_special_tokens=True)
print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')

---------------------------------------------------------------------------------------------------
MODEL GENERATION - FEW SHOT:
#Person1#: You could upgrade your system. #Person2#: That would be a bonus. #Person1#: You could also upgrade your hardware. #Person2#: You might also want to add a CD-ROM drive.
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

