# Extract Key Information

In [1]:
# Load the autoreload extension and enable mode 2 so that edited modules are
# re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

# Echo the value of every top-level expression in a cell, not only the last one.
# Later cells rely on this to show several results (e.g. a shape and a frame).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Set up Azure OpenAI

In [2]:
import os
import openai
from dotenv import load_dotenv

# Set up Azure OpenAI
# Pull variables from a local .env file into the process environment so the
# os.getenv calls below can see them.
load_dotenv()
openai.api_type = "azure"
# Api base is the 'Endpoint' which can be found in Azure Portal where Azure OpenAI
# is created. It looks like https://xxxxxx.openai.azure.com/
# It is read from OPENAI_API_BASE (next to the key) so the notebook does not have
# to be edited per environment; falls back to "" when the variable is unset.
openai.api_base = os.getenv("OPENAI_API_BASE", "")
openai.api_version = "2022-12-01"
openai.api_key = os.getenv("OPENAI_API_KEY")

True

## Load Data

In [3]:
import pandas as pd

# The source file is tab-separated; index_col=False stops pandas from using
# the first column as the row index.
df_orig = pd.read_csv(
    "../data/bbc-news-data.csv",
    sep="\t",
    index_col=False,
)

In [4]:
# Work on an independent deep copy so the frame loaded from disk stays untouched.
df = df_orig.copy(deep=True)
df

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...


## Create Prompt

In [5]:
# Instruction text placed ahead of the article: a leading space and newline,
# the indented instruction, then a trailing newline.
prompt_prefix = " \n  Extract keywords from this text\n"

# Prompt for the first article: instruction, title, newline, body.
prompt = "".join([prompt_prefix, df.loc[0, 'title'], "\n", df.loc[0, 'content']])
prompt

' \n  Extract keywords from this text\nAd sales boost Time Warner profit\n Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.  The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.  Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner inter

## Request to API

In [6]:
# Settings for the one-off trial request built from `prompt`.
request_args = dict(
  deployment_id="text-davinci-003",  # has to be deployment_id
  prompt=prompt,
  temperature=1,
  max_tokens=100,
  top_p=1.0,
  frequency_penalty=0.0,
  presence_penalty=0,
)
response = openai.Completion.create(**request_args)

# Text of the first (only) completion choice.
response['choices'][0]['text']

'\n\nKeywords: Time Warner, profit, quarterly, Google, AOL, internet, advertising, subscribers, Securities Exchange Commission, Lord of the Rings, box office, Richard Parsons, restate accounts, AOL Europe, Bertelsmann, advertising revenue.'

## Putting the Code Together

In [None]:
colname = 'keywords'
# One empty cell per article, aligned on df's index; rows whose request fails stay NaN.
results = pd.DataFrame(columns=[colname], index=df.index)

prompt_prefix = """ 
  Extract key words from this text
"""

# Walk the rows by index label so each response is stored against its own article.
for idx, title, content in zip(df.index, df['title'], df['content']):
  
  # build prompt
  prompt = prompt_prefix + title + "\n" + content

  try:
    # Request API
    response = openai.Completion.create(
      deployment_id="text-davinci-003", # has to be deployment_id
      prompt=prompt,
      temperature=1,
      max_tokens=100,
      top_p=1.0,
      frequency_penalty=0.0,
      presence_penalty=1
    )

    # Assign with a single .loc[row, column] call. The chained form
    # results[colname].loc[idx] = ... writes through an intermediate Series and
    # is not guaranteed to update `results` itself.
    results.loc[idx, colname] = response['choices'][0]['text']
  except Exception as err:
    # Best-effort batch: report which row failed and carry on with the next one.
    print(f"Unexpected {err=}, {type(err)=} (row {idx})")

## Results

In [8]:
# Show the keyword strings collected per article by the loop above.
results

Unnamed: 0,keywords
0,"\n\nKey words: Time Warner, Quarterly Profit, ..."
1,"\n\nKey Words: \nDollar, Euro, Federal Reserve..."
2,"\n\nKey words: \nYukos, Rosneft, Yugansk, Mena..."
3,"\n\nKey words: British Airways, Fuel Prices, P..."
4,\n\n•Pernod Ricard •Allied Domecq •Wall Street...
...,...
2220,\n\n- BT Modem Protection Program \n- rogue di...
2221,"\n\nKeywords: spam, e-mails, security warnings..."
2222,"\n\nKey words: software, legal action, Europea..."
2223,\n\n1. US cyber security \n2. Amit Yoran \n3. ...


## Add Results to DataFrame

In [9]:
# Put the keywords column beside the article columns, aligned on the row index.
df_results = pd.concat((df, results), axis="columns")
df_results.shape
df_results

(2225, 5)

Unnamed: 0,category,filename,title,content,keywords
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,"\n\nKey words: Time Warner, Quarterly Profit, ..."
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,"\n\nKey Words: \nDollar, Euro, Federal Reserve..."
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,"\n\nKey words: \nYukos, Rosneft, Yugansk, Mena..."
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,"\n\nKey words: British Airways, Fuel Prices, P..."
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,\n\n•Pernod Ricard •Allied Domecq •Wall Street...
...,...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...,\n\n- BT Modem Protection Program \n- rogue di...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...,"\n\nKeywords: spam, e-mails, security warnings..."
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...,"\n\nKey words: software, legal action, Europea..."
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...,\n\n1. US cyber security \n2. Amit Yoran \n3. ...


## Save Results

In [10]:
fname = '../output/keywords.csv'
# to_csv does not create missing parent folders, so make sure ../output exists first.
os.makedirs(os.path.dirname(fname), exist_ok=True)
# Written tab-separated, the same delimiter used when reading the input file.
df_results.to_csv(fname, sep='\t')