In [1]:
%load_ext autoreload
%autoreload 2
from data import load_articles, save_articles, load_summaries, save_summaries
from model_wrappers import GPTWrapper, ClaudeWrapper, Llama3Wrapper, Gemma2Wrapper
from tqdm import tqdm
from data import Summary, save_summaries, load_summaries, load_from_json, save_to_json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the article collection via the project helper and display how many
# entries it holds (the bare last expression is rendered as the cell output).
articles = load_articles()
len(articles)

2000

In [3]:
# Pick one article by its string key to use as a fixed sample for spot checks;
# the bare name on the last line displays it.
article = articles['35232142']
article



In [4]:
# Instantiate the wrappers for the API-hosted models.
# NOTE(review): the first argument is presumably the short id stored as
# Summary.source and the second the provider's model name — confirm in
# model_wrappers.
gpt35 = GPTWrapper('gpt35', 'gpt-3.5-turbo')
opus = ClaudeWrapper('claude', 'claude-3-opus-20240229')

In [7]:
# Instantiate the wrapper for the Llama-2 13B chat checkpoint.
# NOTE(review): a class named Llama3Wrapper is used with a Llama-2 checkpoint —
# confirm that its prompt formatting is appropriate for Llama-2.
llama2_13b = Llama3Wrapper('llama2_13b', 'meta-llama/Llama-2-13b-chat-hf')
# NOTE(review): the disabled line below labels a 9b checkpoint with the id
# 'gemma2_7b'; fix the id before re-enabling it.
# gemma2_9b = Gemma2Wrapper('gemma2_7b', 'google/gemma-2-9b')

Loading checkpoint shards: 100%|██████████| 3/3 [00:25<00:00,  8.37s/it]


In [5]:
# Load every stored summary (all sources) and display the total count.
summaries = load_summaries()
len(summaries)

14000

In [14]:
# Re-generate llama2_13b summaries whose text starts with one of the prefixes
# below, i.e. looks like an echoed instruction or a note instead of a summary.
#
# Differences from the naive "retry until clean" loop:
#   * retries are bounded, so an article the model never summarizes cleanly
#     cannot hang the cell forever;
#   * the summaries file is only rewritten after a summary was actually
#     regenerated (still checkpointing after each expensive model call),
#     instead of once per llama2_13b summary regardless of changes.
BAD_PREFIXES = ('Please provide ', 'Note: ', 'Hint: ', '[', '(', 'Please note')
MAX_RETRIES = 20  # upper bound on regeneration attempts per summary

summaries = load_summaries()
print(len(summaries))
unresolved_ids = []  # article ids whose summary is still bad after MAX_RETRIES
for summary in tqdm(summaries):
    if summary.source != llama2_13b.model_id:
        continue
    # str.startswith accepts a tuple and returns True if any prefix matches.
    if not summary.text.startswith(BAD_PREFIXES):
        continue
    for _ in range(MAX_RETRIES):
        summary.text = llama2_13b.summarize(articles[summary.article_id])
        if not summary.text.startswith(BAD_PREFIXES):
            break
    else:
        # Loop exhausted without a clean summary; keep the last attempt and report it.
        unresolved_ids.append(summary.article_id)
    save_summaries(summaries)

if unresolved_ids:
    print(f'{len(unresolved_ids)} summaries still start with a bad prefix after {MAX_RETRIES} retries: {unresolved_ids}')

14000


  0%|          | 0/14000 [00:00<?, ?it/s]

 15%|█▌        | 2128/14000 [00:28<05:17, 37.37it/s]  The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 14000/14000 [07:30<00:00, 31.06it/s] 


In [6]:
# Spot check: summarize the sample article with the Llama-2 13B wrapper.
# In the recorded run this returned an echoed instruction ('Please provide a
# one-sentence summary ...') rather than an actual summary.
llama2_13b.summarize(article)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'Please provide a one-sentence summary of the article in 30 words or less.'

In [9]:
# Spot check repeated on the same sample article; the recorded output here is a
# real summary prefixed with "Here's a one-sentence summary of the article:".
llama2_13b.summarize(article)

"Here's a one-sentence summary of the article:\n\nThe recent flooding in Scotland has caused significant damage, with the River Cree overflowing in Newton Stewart and many roads in Peeblesshire remaining badly affected by standing water."

Summary(dataset='xsum', article_id='35232142', source='llama3_8b', text='Flooding in Newton Stewart, Hawick, and Peebles has caused widespread damage, with many businesses and homes affected, and officials working to assess the full cost of the damage and implement flood prevention measures.')

In [13]:
# Generate one llama2_13b summary per article and append it to the stored
# summaries.
#
# The cell is resumable: articles that already have a summary from this model
# are skipped, so re-running it (e.g. after an interrupt) does not append
# duplicates. The loop variable is deliberately NOT called `article`, so the
# sample `article` bound earlier in the notebook is not overwritten.
summaries = load_summaries()
model_id = llama2_13b.model_id
# NOTE(review): assumes Summary.article_id and article.id use the same type
# (the stored summaries show string ids) — confirm in the data module.
already_summarized = {
    (existing.dataset, existing.article_id)
    for existing in summaries
    if existing.source == model_id
}
for source_article in tqdm(articles.values()):
    if (source_article.dataset, source_article.id) in already_summarized:
        continue
    summary = Summary(
        dataset=source_article.dataset,
        article_id=source_article.id,
        source=model_id,
        text=llama2_13b.summarize(source_article)
    )
    summaries.append(summary)
    # Checkpoint after every generated summary so an interrupted run loses
    # at most one model call.
    save_summaries(summaries)

 38%|███▊      | 760/2000 [35:00<50:46,  2.46s/it]  This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
100%|██████████| 2000/2000 [2:14:27<00:00,  4.03s/it]  


In [12]:
# Print a freshly generated summary for every article, separated by blank lines.
# NOTE(review): `llama3_8b` is not defined in any cell visible in this notebook,
# so this cell fails with a NameError on a fresh kernel unless it is created
# elsewhere. The recorded run was also interrupted manually (KeyboardInterrupt).
keys = list(articles.keys())
for key in tqdm(keys):
    
    print(llama3_8b.summarize(articles[key]))
    print()

Here are the highlights:

Inmates with severe mental illnesses are housed on the "forgotten floor" of Miami's pretrial detention facility.
Most are charged with drug-related or assault-related crimes, often due to confrontations with police.
Judge Steven Leifman advocates for justice and mental health treatment, not punishment, for these inmates.
A new mental health facility aims to provide long-term treatment, reducing recidivism and saving the state money.

Here are the highlights:

The Minneapolis bridge collapsed, sending cars into the Mississippi River and leaving many injured.
Gary Babineau, a survivor, described the chaos, saying "the whole bridge from one side to the other just completely gave way."
He helped rescue children from a school bus, saying "I just knew what I had to do at the moment."
Rescue efforts were slow and challenging, with many people trapped in the rubble.

Here are the highlights:

President Bush had five small polyps removed from his colon during a colonos

KeyboardInterrupt: 

In [19]:
# Converting old data
#
# Imports the legacy per-model response files
# ('<dataset>_train_<id>_responses.json') into the unified summaries store.
# NOTE: this cell is not idempotent — running it again appends the legacy
# summaries a second time.
ids = ['claude', 'gpt4', 'gpt35', 'human', 'llama']
# Legacy ids that are stored under a different source name in the new format.
renamed_sources = {'llama': 'llama2_7b'}

summaries = load_summaries()
# `legacy_id` (not `id`) so the builtin id() is not shadowed in the kernel.
for legacy_id in ids:
    source = renamed_sources.get(legacy_id, legacy_id)
    for dataset in ('cnn', 'xsum'):
        old_responses = load_from_json(f'{dataset}_train_{legacy_id}_responses.json')
        summaries.extend(
            Summary(dataset=dataset, article_id=key, source=source, text=old_responses[key])
            for key in old_responses
        )

save_summaries(summaries)

In [23]:
# Distinct source ids present in the loaded summaries.
{entry.source for entry in summaries}

{'claude', 'gpt35', 'gpt4', 'human', 'llama2_13b', 'llama2_7b', 'llama3_8b'}

In [21]:
import re
# Phrases that mark a boilerplate header line (matched case-insensitively,
# anywhere in the line) rather than summary content.
_BOILERPLATE_PHRASES = (
    "points from the summary",
    "highlights from the summary",
    "summary of the article",
    "highlights from the article",
    "highlight summaries of the",
    "highlights of the article",
)

# Case-sensitive line prefixes that mark model chatter instead of content.
_SKIPPED_PREFIXES = ("Note:", "Please")


def clean_summary(highlights):
    """Normalize a model-generated summary into plain, marker-free lines.

    The text is split on newlines and each line is processed independently.

    A line is dropped when it:
      * is blank,
      * contains one of the boilerplate phrases (case-insensitive),
      * ends with a colon (e.g. "Here are the highlights:"),
      * starts with "Note:" or "Please" (case-sensitive), or
      * becomes empty once list markers / trailing periods are removed.

    Every kept line has, in this order:
      * a leading "1. "-style number removed,
      * leading "*", "•" and space characters removed,
      * trailing periods removed (also a period directly before a closing
        double quote at the end of the line),
      * leading "-" and space characters removed.

    Note that marker removal uses str.lstrip, which strips a *set of
    characters*: a leading "-" or "*" is removed even when no space follows.

    Args:
        highlights: The raw summary text (str), possibly multi-line.

    Returns:
        The cleaned lines joined with "\n"; an empty string if nothing is left.
    """
    cleaned_lines = []
    for raw_line in highlights.split("\n"):
        line = raw_line.strip()
        if not line or line.endswith(":") or line.startswith(_SKIPPED_PREFIXES):
            continue
        lowered = line.lower()
        if any(phrase in lowered for phrase in _BOILERPLATE_PHRASES):
            continue

        line = re.sub(r"^\d+\.\s+", "", line)  # Remove initial numbers like "1. "
        line = line.lstrip("* ")  # Remove leading "*" markers
        line = line.lstrip("• ")  # Remove leading bullet points
        line = line.rstrip(".")  # Remove trailing periods
        if line.endswith('."'):
            # Remove the period before a final quotation mark.
            line = line[:-2] + '"'
        line = line.lstrip("- ").lstrip()  # Remove leading "-" markers

        # A line made only of markers/periods (e.g. "*" or "...") is now empty;
        # skip it so the result contains no blank lines.
        if line:
            cleaned_lines.append(line)

    return "\n".join(cleaned_lines)

In [22]:
# Keep only the summaries whose source is 'llama2_13b', then show how many there are.
llama_2_13_summaries = list(filter(lambda entry: entry.source == 'llama2_13b', summaries))
len(llama_2_13_summaries)

2000

In [28]:
# Reload the stored summaries, run clean_summary over the text of every
# 'llama3_8b' entry (other sources are left untouched), and write the
# whole collection back.
summaries = load_summaries()
for summary in summaries:
    if summary.source == 'llama3_8b':
        summary.text = clean_summary(summary.text)
save_summaries(summaries)

In [23]:
# Dry run: cleaned text for each llama2_13b summary, in the same order,
# without modifying the Summary objects themselves.
new_summaries = list(map(clean_summary, (entry.text for entry in llama_2_13_summaries)))

In [26]:
# Show every summary that cleaning reduced to nothing (or whitespace only):
# the original text first, then the cleaned result.
for old, new in zip(llama_2_13_summaries, new_summaries):
    if new.strip():
        continue  # cleaning left real content; nothing to inspect
    print(old.text)
    print('---------------------------------------------------------------------')
    print(new)
    print('\n-----------------------\n')

In [25]:
# Print all cleaned summaries, separated by a '---' line.
print('\n---\n'.join(new_summaries))

Flooding in Scotland has caused damage and disruption, with a retaining wall breach in Newton Stewart causing significant flooding, and a flood alert remaining in place across the Borders
---
A fire broke out at the Holiday Inn in Hope Street, engulfing two parked buses and destroying personal belongings, with the cause believed to be deliberate
---
Mercedes' Lewis Hamilton won the pole position for the Bahrain Grand Prix, beating teammate Nico Rosberg by just 0.077 seconds, with Ferrari's Sebastian Vettel and Kimi Raikkonen splitting the silver cars
---
Former scout leader John Edward Bates faces 22 charges of indecency with a child, including allegations of sexual abuse during his time as a scout leader in Lincolnshire and Cambridgeshire
---
A man receiving psychiatric treatment at a hospital in Istanbul threatened to shoot himself and others, leading to an evacuation of the facility and a negotiator to be deployed
---
Glasgow Warriors defeated Dragons 24-17 in a messy match, with Si