# Retrieve Information from Specific Data Corpus

In [97]:
# Reload edited modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Set up Azure OpenAI

In [98]:
import os
import openai
from dotenv import load_dotenv

# Set up Azure OpenAI
# load_dotenv() makes the variables of a local .env file visible to os.getenv below.
load_dotenv()
openai.api_type = "azure"
# Api base is the 'Endpoint' which can be found in Azure Portal where Azure OpenAI is created.
# It looks like https://xxxxxx.openai.azure.com/
# It is read from the environment (like the key below) so the notebook does not have to be
# edited per user; when OPENAI_API_BASE is unset it falls back to the empty string.
openai.api_base = os.getenv("OPENAI_API_BASE", "")
openai.api_version = "2022-12-01"
openai.api_key = os.getenv("OPENAI_API_KEY")

True

## Deploy a Language Model

In [99]:
# Look for an existing deployment that has succeeded and whose model supports embeddings
deployment_id = None
result = openai.Deployment.list()

for deployment in result.data:
    if deployment["status"] != "succeeded":
        continue

    # The deployment only names its model; the capabilities are read from the model object
    deployed_model = openai.Model.retrieve(deployment["model"])
    if not deployed_model["capabilities"]["embeddings"]:
        continue

    # First match wins
    deployment_id = deployment["id"]
    break

# If no suitable model is deployed, deploy one
if not deployment_id:
    # Reached both when nothing has succeeded and when no succeeded deployment
    # supports embeddings, so the message names both conditions.
    print('No succeeded deployment that supports embeddings found.')
    model = "text-similarity-davinci-001"

    # Now let's create the deployment
    print(f'Creating a new deployment with model: {model}')
    result = openai.Deployment.create(model=model, scale_settings={"scale_type":"standard"})
    deployment_id = result["id"]
    print(f'Successfully created {model} with deployment_id {deployment_id}')
else:
    print(f'Found a succeeded deployment that supports embeddings with id: {deployment_id}.')

Found a succeeded deployment that supports embeddings with id: deployment-89153abdfa934e1580296dbee586239b.


## Load Data
The next cell will load embeddings generated in notebook [01-get-embeddings.ipynb](./01-get-embeddings.ipynb).

In [100]:
import pandas as pd
# Tab-separated file holding the articles together with their pre-computed embeddings
fname = '../data/bbc-news-data-embedding.csv'
df_orig = pd.read_csv(
    fname,
    sep='\t',
    index_col=False,
)

In [101]:
import numpy as np

import ast

DEVELOPMENT = False  # Set this to True for development on small subset of data

if DEVELOPMENT:
    # Sub-sample for development
    df = df_orig.sample(n=20, replace=False, random_state=9) # Set sample size
else:
    df = df_orig

# Drop rows with NaN. dropna() returns a new frame and the explicit copy makes df
# independent of df_orig, so re-running this cell always starts from the loaded data.
df = df.dropna().copy()

# Convert the string form of each embedding into a numpy array.
# ast.literal_eval only parses Python literals, so unlike eval it cannot execute
# code that happens to be embedded in the data file.
df["embedding"] = df["embedding"].apply(ast.literal_eval).apply(np.array)
df

Unnamed: 0,category,filename,title,content,embedding
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,"[-0.0012276918860152364, 0.00733763724565506, ..."
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,"[0.0009311728645116091, 0.014099937863647938, ..."
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,"[-0.010487922467291355, 0.009665092453360558, ..."
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,"[0.0111119095236063, 0.004624682944267988, -0...."
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,"[-0.0021637482568621635, 0.005410161800682545,..."
...,...,...,...,...,...
2219,tech,396.txt,New consoles promise big problems,Making games for future consoles will require...,"[0.014879594556987286, 0.004789963364601135, -..."
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...,"[0.007671569474041462, 0.00624304823577404, -0..."
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...,"[0.0026338498573750257, 0.015989987179636955, ..."
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...,"[0.007126151118427515, 0.008495588786900043, -..."


## Count Tokens

In [103]:
import tiktoken
encoding = tiktoken.get_encoding('gpt2')

# Count the tokens of every article body.
# The counts are collected first and assigned as a whole column: the chained form
# df['token_count'].loc[idx] = ... writes to a temporary Series, which triggers
# SettingWithCopyWarning and is not guaranteed to update df itself.
token_counts = []
for idx, content in df['content'].items():
    token_counts.append(len(encoding.encode(content)))

# The list is in row order, so it lines up with df's rows
df['token_count'] = token_counts

df

Unnamed: 0,category,filename,title,content,embedding,token_count
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,"[-0.0012276918860152364, 0.00733763724565506, ...",553
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,"[0.0009311728645116091, 0.014099937863647938, ...",457
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,"[-0.010487922467291355, 0.009665092453360558, ...",339
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,"[0.0111119095236063, 0.004624682944267988, -0....",541
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,"[-0.0021637482568621635, 0.005410161800682545,...",382
...,...,...,...,...,...,...
2219,tech,396.txt,New consoles promise big problems,Making games for future consoles will require...,"[0.014879594556987286, 0.004789963364601135, -...",1215
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...,"[0.007671569474041462, 0.00624304823577404, -0...",506
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...,"[0.0026338498573750257, 0.015989987179636955, ...",432
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...,"[0.007126151118427515, 0.008495588786900043, -...",1208


## Find Articles with Similar Embeddings to that of the Question

In [104]:
import numpy as np

def get_embedding(text, deployment_id=deployment_id):
    """
    Request the embedding of `text` from the given deployment and return it as a numpy array.
    Only the first entry of the response's "data" list is used.
    """
    response = openai.Embedding.create(deployment_id=deployment_id, input=text)
    first_item = response["data"][0]
    return np.array(first_item["embedding"])

def vector_similarity(x, y):
    """
    Similarity score of two vectors, computed as their dot product.
    The notebook relies on OpenAI embeddings having length 1, in which case
    the dot product equals the cosine similarity.
    """
    return np.dot(x, y)

def order_document_sections_by_query_similarity(query, contexts):
    """
    Embed `query`, score it against every pre-calculated article embedding in
    `contexts` (iterated with .items()), and rank the articles.
    Returns a list of (similarity, doc_index) tuples, highest similarity first.
    """
    query_embedding = get_embedding(query)

    scored_documents = []
    for doc_index, doc_embedding in contexts.items():
        score = vector_similarity(query_embedding, doc_embedding)
        scored_documents.append((score, doc_index))

    # Tuples sort by score first, then by index; reversed for descending relevance
    scored_documents.sort(reverse=True)
    return scored_documents

## Retrieve Relevant Articles 

In [105]:
def retrieve_relevant_documents(query, contexts=df['embedding'], top_n=3):
    """
    Print the `top_n` articles whose embeddings are most similar to `query`.

    query: text to embed and compare against the articles.
    contexts: object whose .items() yields (index, embedding) pairs; defaults to the
        df['embedding'] column as it exists when this cell is executed.
    top_n: how many of the best matches to print; defaults to 3, the value that
        used to be hard-coded.

    Returns None. The article text is looked up in the module-level `df`.
    """
    # find text most similar to the query
    ranked = order_document_sections_by_query_similarity(query=query, contexts=contexts)

    # print the top_n matches with their scores
    for similarity, doc_index in ranked[:top_n]:
        print(f'similarity score:   {similarity}')
        print(df['content'].loc[doc_index], '\n')

    return

## About Tokens

In [106]:
# Prefix placed before each context section when the prompt is assembled
SEPARATOR = "\n* "
# NOTE(review): tiktoken's model table appears to map text-davinci-003 to "p50k_base"
# rather than "gpt2" — confirm which encoding is intended before relying on exact counts.
ENCODING = "gpt2"  # encoding for text-davinci-003

encoding = tiktoken.get_encoding(ENCODING)
# Token cost of one separator; added per section when the prompt budget is computed
separator_len = len(encoding.encode(SEPARATOR))

# Bare expression: shown as the cell's output
f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

## Construct Prompt
Add relevant context to the query prompt.

In [118]:
MAX_SECTION_LEN = 1000 # Set maximum token for context text

def construct_prompt(query: str, context_embeddings: pd.DataFrame, df: pd.DataFrame) -> str:
    """
    Build a question-answering prompt whose context consists of the articles most similar to the query.

    Articles are visited in descending similarity order. A running total of each article's
    token_count plus separator_len is kept, and collection stops at the first article that
    pushes the total above MAX_SECTION_LEN. The indexes of the selected articles are printed.
    """
    ranked_sections = order_document_sections_by_query_similarity(query, context_embeddings)

    picked_texts = []
    picked_indexes = []
    tokens_used = 0

    for _, section_index in ranked_sections:
        row = df.loc[section_index]

        # Charge the article and its separator before checking the budget
        tokens_used += row['token_count'] + separator_len
        if tokens_used > MAX_SECTION_LEN:
            break

        flattened = row.content.replace("\n", " ")
        picked_texts.append(SEPARATOR + flattened)
        picked_indexes.append(str(section_index))

    print(f"Selected {len(picked_texts)} document sections, with indexes:")
    print("\n".join(picked_indexes))

    header = """Answer the question truthfully using context, if unsure, say "I don't know."\n\nContext:\n"""
    context_block = "".join(picked_texts)
    prompt = header + context_block + "\n\n Q: " + query + "\n A:"

    return prompt

In [119]:
# Build the prompt for this query and show it
query = 'News about stock market.'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)

Selected 4 document sections, with indexes:
364
266
273
282
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

*  The owner of the technology-dominated Nasdaq stock index plans to sell shares to the public and list itself on the market it operates.  According to a registration document filed with the Securities and Exchange Commission, Nasdaq Stock Market plans to raise $100m (£52m) from the sale. Some observers see this as another step closer to a full public listing. However Nasdaq, an icon of the 1990s technology boom, recently poured cold water on those suggestions.  The company first sold shares in private placements during 2000 and 2001. It technically went public in 2002 when the stock started trading on the OTC Bulletin Board, which lists equities that trade only occasionally. Nasdaq will not make money from the sale, only investors who bought shares in the private placings, the filing documents said. The Nasdaq is made up shares in technolo

## Retrieve Information

In [120]:
def retrieve_information(prompt, deployment_id="text-davinci-003"):
    """
    Send `prompt` to a completion deployment and print the generated text.

    prompt: full prompt text, e.g. the string returned by construct_prompt.
    deployment_id: id of the deployment to query; defaults to "text-davinci-003",
        the value that used to be hard-coded.

    Returns None. Any exception raised while requesting or reading the completion is
    caught and reported with print() instead of propagating.
    """
    try:
        # Request API
        response = openai.Completion.create(
            deployment_id=deployment_id, # has to be deployment_id
            prompt=prompt,
            temperature=1,
            max_tokens=100,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=1
        )

        # Only the text of the first choice is shown
        result = response['choices'][0]['text']
        print(result)
    except Exception as err:
        # Report the error itself only. Printing `idx` here depended on a loop variable
        # leaked from another cell and raised NameError whenever it did not exist.
        print(f"Unexpected {err=}, {type(err)=}")

    return

## Query Samples

In [121]:
# Build the prompt, show it, then ask the completion model
query = 'News about stock market.'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 4 document sections, with indexes:
364
266
273
282
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

*  The owner of the technology-dominated Nasdaq stock index plans to sell shares to the public and list itself on the market it operates.  According to a registration document filed with the Securities and Exchange Commission, Nasdaq Stock Market plans to raise $100m (£52m) from the sale. Some observers see this as another step closer to a full public listing. However Nasdaq, an icon of the 1990s technology boom, recently poured cold water on those suggestions.  The company first sold shares in private placements during 2000 and 2001. It technically went public in 2002 when the stock started trading on the OTC Bulletin Board, which lists equities that trade only occasionally. Nasdaq will not make money from the sale, only investors who bought shares in the private placings, the filing documents said. The Nasdaq is made up shares in technolo

In [122]:
# Build the prompt, show it, then ask the completion model
query = 'What is the state of the economy of the world?'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 3 document sections, with indexes:
93
72
321
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

*  The US economy has grown more than expected, expanding at an annual rate of 3.8% in the last quarter of 2004.  The gross domestic product figure was ahead of the 3.1% the government estimated a month ago. The rise reflects stronger spending by businesses on capital equipment and a smaller-than-expected trade deficit. GDP is a measure of a country's economic health, reflecting the value of the goods and services it produces.  The new GDP figure, announced by the Commerce Department on Friday, also topped the 3.5% growth rate that economists had forecast ahead of Friday's announcement. Growth was at an annual rate of 4% in the third quarter of 2004 and for the year it came in at 4.4%, the best figure in five years. However, the positive economic climate may lead to a rise in interest rates, with many expecting US rates to rise on 22 March. In th

In [123]:
# Build the prompt, show it, then ask the completion model
query = 'Summarise the state of the economy of the world?'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 3 document sections, with indexes:
72
97
93
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

*  Germany's economy shrank 0.2% in the last three months of 2004, upsetting hopes of a sustained recovery.  The figures confounded hopes of a 0.2% expansion in the fourth quarter in Europe's biggest economy. The Federal Statistics Office said growth for the whole of 2004 was 1.6%, after a year of contraction in 2003, down from an earlier estimate of 1.7%. It said growth in the third quarter had been zero, putting the economy at a standstill from July onward. Germany has been reliant on exports to get its economy back on track, as unemployment of more than five million and impending cuts to welfare mean German consumers have kept their money to themselves. Major companies including Volkswagen, DaimlerChrysler and Siemens have spent much of 2004 in tough talks with unions about trimming jobs and costs. According to the statistics office, Destatis, 

In [124]:
# Build the prompt, show it, then ask the completion model
query = 'What is the most talked about technology?'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 3 document sections, with indexes:
2085
2062
2154
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

*  More and more Americans are joining the internet's fast lane, according to official figures.  The number of people and business connected to broadband jumped by 38% in a year, said the US Federal Communications Commission (FCC). In a report, it said there were more than 32 million broadband connections by the end of June 2004. But the US is still behind compared to other nations, ranked 13th in the world by a UN telecoms body.  During his 2004 re-election campaign, President George W Bush pledge to ensure that affordable high-speed net access would be available to all Americans by 2007.  According to the report by the FCC, broadband is becoming increasingly popular, with people using it for research and shopping, as well as downloading music and watching video. The total number of people and businesses on broadband rose by to 32.5 million

In [125]:
# Build the prompt, show it, then ask the completion model
query = 'List all the celebrities.'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 2 document sections, with indexes:
601
604
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

*  US actor Jamie Foxx has been given two nominations for Golden Globe awards, with Meryl Streep, Morgan Freeman and Cate Blanchett also up for prizes.  The stars were shortlisted on Monday for supporting roles, with the main nominations still to come. Foxx has starred in Collateral and Ray. Clive Owen, David Carradine and Natalie Portman are also up for awards. The Golden Globes, Hollywood's second most prominent awards, are the first major nominations to be announced. Last year, The Lord Of the Rings: The Return Of the King was named best drama movie while Lost In Translation won best musical or comedy. Sean Penn, Charlize Theron, Tim Robbins and Renee Zellweger all won acting awards - mirroring the eventual Oscars outcome. The Golden Globes ceremony will take place on 16 January, with the Oscars following on 27 February. 
*  The Aviator has been

In [126]:
# Build the prompt, show it, then ask the completion model
query = 'What are the sports in the news?'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 3 document sections, with indexes:
309
1330
1549
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

*  Nike has reported its best second-quarter earnings, helped by strong demand for its athletic shoes and Converse sneakers.  The global sports giant said it posted a profit of $261.9m (£135.6m), for the three months to 30 November, up from $179.1m in the same period last year. Revenues increased 11% to $3.1bn, from $2.8bn for the same period in 2003. Nike, whose products are endorsed by Tiger Woods among other sports stars, said "demand continues to grow". The results came after a strong first quarter of the year for the firm based in Beaverton, Oregon.  Philip Knight, chairman and chief executive, said: "Nike's second-quarter revenues and earnings per share reached all-time high levels as a result of solid performance across our global portfolio. "Our businesses in the United States and emerging markets such as China, Russia and Turkey, com

In [127]:
# Build the prompt, show it, then ask the completion model
query = 'Tell me about all the football games.'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 2 document sections, with indexes:
1549
1621
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

*  All four of England's Champions League representatives have reached the knockout stages for the first time.  Arsenal and Chelsea are seeded as group winners, while runners-up Manchester United and Liverpool are not. Rules stipulate that teams from the same country or group will be kept apart in the draw on 17 December. The favourites are Chelsea and Barcelona, and Real Madrid, the two Milan sides, Juventus and Bayern Munich are among the 16 still in the hat.  Steven Gerrard's last-gasp wonder-strike secured qualification for  against Olympiakos on Wednesday evening.  AC Milan, Bayer Leverkusen, Internazionale, Juventus, Lyon.  who had already qualified, fielded a second-string side and went down 3-0 to Fenerbahce.  AC Milan, Bayer Leverkusen, Internazionale, Juventus, Monaco.  On Tuesday,  finished top of their group with a 5-1 win over the Ro

In [128]:
# Build the prompt, show it, then ask the completion model
query = 'Who won in the swimming contest?'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 3 document sections, with indexes:
1330
698
841
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

*  Teenager LaShawn Merritt ran the third fastest indoor 400m of all time at the Fayetteville Invitational meeting.  The world junior champion clocked 44.93 seconds to finish well clear of fellow American Bershawn Jackson in Arkansas. Only Michael Johnson has gone quicker, setting the world record of 44.63secs in 1995 and running 44.66secs in 1996. Kenyan Bernard Lagat missed out on the world record by 1.45secs as he ran the third quickest indoor mile ever to beat Canada's Nate Brannen by almost 10secs. The Olympic silver medallist's time of three minutes 49.89secs was inferior only to the 1997 world record of Moroccan Hicham El Guerrouj and former world record holder Eamonn Coghlan of Ireland's 3:49.78. Lagat was on course to break El Guerrouj's record through 1200m but could not maintain the pace over the final 400m. Ireland's  continued his

In [129]:
# Build the prompt, show it, then ask the completion model
query = 'Who won in the running competition?'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 3 document sections, with indexes:
1330
1354
1388
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

*  Teenager LaShawn Merritt ran the third fastest indoor 400m of all time at the Fayetteville Invitational meeting.  The world junior champion clocked 44.93 seconds to finish well clear of fellow American Bershawn Jackson in Arkansas. Only Michael Johnson has gone quicker, setting the world record of 44.63secs in 1995 and running 44.66secs in 1996. Kenyan Bernard Lagat missed out on the world record by 1.45secs as he ran the third quickest indoor mile ever to beat Canada's Nate Brannen by almost 10secs. The Olympic silver medallist's time of three minutes 49.89secs was inferior only to the 1997 world record of Moroccan Hicham El Guerrouj and former world record holder Eamonn Coghlan of Ireland's 3:49.78. Lagat was on course to break El Guerrouj's record through 1200m but could not maintain the pace over the final 400m. Ireland's  continued h