# Summarise Documents

In [8]:
# Load the autoreload extension and set mode 2, so imported modules are
# re-imported before each cell runs and edits to local .py files take effect.
%load_ext autoreload
%autoreload 2

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Set up Azure OpenAI

In [9]:
import os
import openai
from dotenv import load_dotenv

# Set up Azure OpenAI
# load_dotenv() reads a local .env file into the process environment so the
# os.getenv lookups below can see values stored there.
load_dotenv()
openai.api_type = "azure"
# Api base is the 'Endpoint' which can be found in Azure Portal where Azure OpenAI
# is created. It looks like https://xxxxxx.openai.azure.com/
# It is read from OPENAI_API_BASE (environment or .env) so the notebook does not
# have to be edited per deployment; when the variable is unset it falls back to
# the empty string, and requests will fail until an endpoint is supplied.
openai.api_base = os.getenv("OPENAI_API_BASE", "")
openai.api_version = "2022-12-01"
# The key is taken from the environment only; it is None when OPENAI_API_KEY is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")

True

## Load Data

In [10]:
import pandas as pd

# Read the tab-separated BBC news file. index_col=False tells pandas not to use
# the first column as the index.
df_orig = pd.read_csv(
    "../data/bbc-news-data.csv",
    sep="\t",
    index_col=False,
)

In [11]:
# Work on a deep copy so the frame loaded from disk (df_orig) stays untouched.
df = df_orig.copy(deep=True)
df

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...


## Create prompt

In [12]:
# Cue appended after the article text to request a "Tl;dr" summary. Written with
# explicit escapes so the leading space and the blank lines are visible.
prompt_postfix = " \n  \n\nTl;dr\n"

# Prompt for the first article: title, newline, body, then the cue.
first_title = df.loc[0, 'title']
first_content = df.loc[0, 'content']
prompt = "\n".join([first_title, first_content]) + prompt_postfix
prompt

'Ad sales boost Time Warner profit\n Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.  The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.  Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up 

## Request to API

In [13]:
# Request API
# Send the single prompt to the Azure deployment.
completion_args = dict(
  deployment_id="text-davinci-003",
  prompt=prompt,
  temperature=1,
  max_tokens=100,
  top_p=1.0,
  frequency_penalty=0.0,
  presence_penalty=1,
)
response = openai.Completion.create(**completion_args)

# The summary is the text of the first returned choice.
response['choices'][0]['text']

"Time Warner's quarterly profits jumped by 76% to $1.13 billion, benefiting from sales of high-speed internet connections and higher advertisement sales. Fourth quarter sales rose 2%, however AOL suffered a profit dip, but was offset by one-offs from the firm owning 8% of Google. Their film division took a hit from box office flops, but for the year their profits were still up 27% with revenue growth at 6.4%, offering 5% earnigs growth projections in the coming"

### Putting the code together

In [None]:
# Summarise every article in df: one completion request per row, with the
# generated text stored in `results` under the same index label as the article.
# Rows whose request fails are reported and left as NaN in `results`.
results = pd.DataFrame(columns=['summary'], index=df.index)

# prompt postfix: the "Tl;dr" cue appended after each article
# (explicit escapes keep the leading space and blank lines visible)
prompt_postfix = " \n  \n\nTl;dr\n"

for idx, title, content in zip(df.index, df['title'], df['content']):

  # build prompt
  prompt = title + "\n" + content + prompt_postfix

  try:
    # Request API
    response = openai.Completion.create(
      deployment_id="text-davinci-003", # has to be deployment_id
      prompt=prompt,
      temperature=1,
      max_tokens=100,
      top_p=1.0,
      frequency_penalty=0.0,
      presence_penalty=1
    )

    # Store the response with a single .loc call on the frame. The chained form
    # results['summary'].loc[idx] = ... assigns through an intermediate Series,
    # which pandas does not guarantee to write back (and never does under
    # Copy-on-Write).
    results.loc[idx, 'summary'] = response['choices'][0]['text']
  except Exception as err:
    # Best effort: keep going, but say which row failed. A bare `idx` inside the
    # loop is never displayed, so the index is put in the message instead.
    print(f"{idx=}: Unexpected {err=}, {type(err)=}")

### Results

In [15]:
# Display the frame of generated summaries (a single 'summary' column on df's index).
results

Unnamed: 0,summary
0,Time Warner's quarterly profits surged 76% to ...
1,The dollar has recently reached its highest le...
2,Yukos's owners are demanding repayment of a $9...
3,British Airways reported a 40% drop in profits...
4,Shares of Allied Domecq rose on speculation th...
...,...
2220,BT is introducing two initiatives to protect ...
2221,A new report shows that many computer users ac...
2222,\nIf the new European Directive on the Patenta...
2223,Amit Yoran has resigned from his post as direc...


### Adding results to dataframe

In [16]:
# Attach the summaries to the articles column-wise; rows are aligned on the index.
df_results = pd.concat((df, results), axis="columns")
df_results.shape
df_results

(2225, 5)

Unnamed: 0,category,filename,title,content,summary
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,Time Warner's quarterly profits surged 76% to ...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,The dollar has recently reached its highest le...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,Yukos's owners are demanding repayment of a $9...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,British Airways reported a 40% drop in profits...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,Shares of Allied Domecq rose on speculation th...
...,...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...,BT is introducing two initiatives to protect ...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...,A new report shows that many computer users ac...
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...,\nIf the new European Directive on the Patenta...
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...,Amit Yoran has resigned from his post as direc...


## Save results

In [18]:
# Write the articles together with their summaries to a tab-separated file.
fname = '../output/summaries.csv'
# to_csv does not create missing folders, so make sure the output directory
# exists first (no-op when it is already there).
os.makedirs(os.path.dirname(fname), exist_ok=True)
df_results.to_csv(fname, sep='\t')