# Azure embeddings example


#### Load API + Key from .env file

In [1]:
import os
import json
from dotenv import load_dotenv
load_dotenv() 

True

#### Download dataset

#### download directly with curl or use python 
#### curl "https://raw.githubusercontent.com/Azure-Samples/Azure-OpenAI-Docs-Samples/main/Samples/Tutorials/Embeddings/data/bill_sum_data.csv" --output ../data/bill_sum_data.csv

In [30]:
def fetch_dataset(url_source, target_file):
    """Fetch the a dataset via URL

    This uses a particular URL for the dataset. The code
    is a simplified version of fetch_openml() in sklearn.

    """
    try:
        from urllib import urlretrieve
    except ImportError:
        from urllib.request import urlretrieve

    filename = target_file
    data_url = url_source
    urlretrieve(data_url, filename)


In [5]:
# download the dataset using urllib and save locally
sample_data = "https://raw.githubusercontent.com/Azure-Samples/Azure-OpenAI-Docs-Samples/main/Samples/Tutorials/Embeddings/data/bill_sum_data.csv"
target_dir = "../data"
target_file = "bill_sum_data.csv"

import os
os.makedirs(target_dir, exist_ok=True)
fetch_dataset(sample_data, os.path.join(target_dir, target_file))

#### Import libraries and list model

In [3]:
import openai
import re
import requests
import sys
from num2words import num2words
import os
import pandas as pd
import numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity
from transformers import GPT2TokenizerFast

API_KEY = os.getenv("AZURE_OPENAI_KEY") 
RESOURCE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT") 

openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = "2022-12-01"

url = openai.api_base + "/openai/deployments?api-version=2022-12-01"

r = requests.get(url, headers={"api-key": API_KEY})

print(r.text)

{
  "data": [
    {
      "scale_settings": {
        "scale_type": "standard"
      },
      "model": "text-davinci-002",
      "owner": "organization-owner",
      "id": "Test_Davinci",
      "status": "succeeded",
      "created_at": 1660002143,
      "updated_at": 1660002143,
      "object": "deployment"
    },
    {
      "scale_settings": {
        "scale_type": "standard"
      },
      "model": "text-search-curie-query-001",
      "owner": "organization-owner",
      "id": "text-search-curie-query-001",
      "status": "succeeded",
      "created_at": 1672986097,
      "updated_at": 1672986097,
      "object": "deployment"
    },
    {
      "scale_settings": {
        "scale_type": "standard"
      },
      "model": "text-search-curie-doc-001",
      "owner": "organization-owner",
      "id": "text-search-curie-doc-001",
      "status": "succeeded",
      "created_at": 1672986119,
      "updated_at": 1672986119,
      "object": "deployment"
    },
    {
      "scale_settings":

### Read in data

In [6]:
df = pd.read_csv(os.path.join(target_dir, target_file)) # example: df = pd.read_csv("data/bill_sum_data.csv");df
df

Unnamed: 0.1,Unnamed: 0,bill_id,text,summary,title,text_len,sum_len
0,0,110_hr37,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,National Science Education Tax Incentive for B...,To amend the Internal Revenue Code of 1986 to ...,8494,321
1,1,112_hr2873,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,Small Business Expansion and Hiring Act of 201...,To amend the Internal Revenue Code of 1986 to ...,6522,1424
2,2,109_s2408,SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR...,Requires the Director of National Intelligence...,A bill to require the Director of National Int...,6154,463
3,3,108_s1899,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,National Cancer Act of 2003 - Amends the Publi...,A bill to improve data collection and dissemin...,19853,1400
4,4,107_s1531,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,Military Call-up Relief Act - Amends the Inter...,A bill to amend the Internal Revenue Code of 1...,6273,278
5,5,107_hr4541,SECTION 1. RELIQUIDATION OF CERTAIN ENTRIES PR...,Requires the Customs Service to reliquidate ce...,To provide for reliquidation of entries premat...,11691,114
6,6,111_s1495,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,Service Dogs for Veterans Act of 2009 - Direct...,A bill to require the Secretary of Veterans Af...,5328,379
7,7,111_s3885,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,Race to the Top Act of 2010 - Directs the Secr...,A bill to provide incentives for States and lo...,16668,1525
8,8,113_hr1796,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,Troop Talent Act of 2013 - Directs the Secreta...,Troop Talent Act of 2013,15352,2151
9,9,103_hr1987,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,Taxpayer's Right to View Act of 1993 - Amends ...,Taxpayer's Right to View Act of 1993,5633,894


### Create subset

In [7]:
df_bills = df[['text', 'summary', 'title']]
df_bills

Unnamed: 0,text,summary,title
0,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,National Science Education Tax Incentive for B...,To amend the Internal Revenue Code of 1986 to ...
1,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,Small Business Expansion and Hiring Act of 201...,To amend the Internal Revenue Code of 1986 to ...
2,SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR...,Requires the Director of National Intelligence...,A bill to require the Director of National Int...
3,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,National Cancer Act of 2003 - Amends the Publi...,A bill to improve data collection and dissemin...
4,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,Military Call-up Relief Act - Amends the Inter...,A bill to amend the Internal Revenue Code of 1...
5,SECTION 1. RELIQUIDATION OF CERTAIN ENTRIES PR...,Requires the Customs Service to reliquidate ce...,To provide for reliquidation of entries premat...
6,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,Service Dogs for Veterans Act of 2009 - Direct...,A bill to require the Secretary of Veterans Af...
7,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,Race to the Top Act of 2010 - Directs the Secr...,A bill to provide incentives for States and lo...
8,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,Troop Talent Act of 2013 - Directs the Secreta...,Troop Talent Act of 2013
9,SECTION 1. SHORT TITLE.\r\n\r\n This Act ma...,Taxpayer's Right to View Act of 1993 - Amends ...,Taxpayer's Right to View Act of 1993


### Data cleaning

In [8]:
# s is input text
def normalize_text(s, sep_token = " \n "):
    s = re.sub(r'\s+',  ' ', s).strip()
    s = re.sub(r". ,","",s)
    # remove all instances of multiple spaces
    s = s.replace("..",".")
    s = s.replace(". .",".")
    s = s.replace("\n", "")
    s = s.strip()
    
    return s

df_bills['text'] = df_bills["text"].apply(lambda x : normalize_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bills['text'] = df_bills["text"].apply(lambda x : normalize_text(x))


#### remove any bills that are too long for the token limit (~2000 tokens).

In [9]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
df_bills['n_tokens'] = df_bills["text"].apply(lambda x: len(tokenizer.encode(x)))
df_bills = df_bills[df_bills.n_tokens<2000]
len(df_bills)

Token indices sequence length is longer than the specified maximum sequence length for this model (1480 > 1024). Running this sequence through the model will result in indexing errors
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bills['n_tokens'] = df_bills["text"].apply(lambda x: len(tokenizer.encode(x)))


12

In [10]:
df_bills

Unnamed: 0,text,summary,title,n_tokens
0,SECTION 1. SHORT TITLE. This Act may be cited ...,National Science Education Tax Incentive for B...,To amend the Internal Revenue Code of 1986 to ...,1480
1,SECTION 1. SHORT TITLE. This Act may be cited ...,Small Business Expansion and Hiring Act of 201...,To amend the Internal Revenue Code of 1986 to ...,1152
2,SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR...,Requires the Director of National Intelligence...,A bill to require the Director of National Int...,930
4,SECTION 1. SHORT TITLE. This Act may be cited ...,Military Call-up Relief Act - Amends the Inter...,A bill to amend the Internal Revenue Code of 1...,1048
5,SECTION 1. RELIQUIDATION OF CERTAIN ENTRIES PR...,Requires the Customs Service to reliquidate ce...,To provide for reliquidation of entries premat...,1846
6,SECTION 1. SHORT TITLE. This Act may be cited ...,Service Dogs for Veterans Act of 2009 - Direct...,A bill to require the Secretary of Veterans Af...,872
9,SECTION 1. SHORT TITLE. This Act may be cited ...,Taxpayer's Right to View Act of 1993 - Amends ...,Taxpayer's Right to View Act of 1993,946
12,SECTION 1. FINDINGS. The Congress finds the fo...,Amends the Marine Mammal Protection Act of 197...,To amend the Marine Mammal Protection Act of 1...,1223
14,SECTION 1. SHORT TITLE. This Act may be cited ...,Education and Training for Health Act of 2017 ...,Education and Training for Health Act of 2017,1596
16,SECTION 1. SHORT TITLE. This Act may be cited ...,Andrew Prior Act or Andrew's Law - Amends the ...,Andrew's Law,608


In [11]:
understand_tokenization = tokenizer.tokenize(df_bills.text[0])
understand_tokenization

['S',
 'ECTION',
 'Ġ1',
 '.',
 'ĠSH',
 'ORT',
 'ĠTIT',
 'LE',
 '.',
 'ĠThis',
 'ĠAct',
 'Ġmay',
 'Ġbe',
 'Ġcited',
 'Ġas',
 'Ġthe',
 'Ġ``',
 'National',
 'ĠScience',
 'ĠEducation',
 'ĠTax',
 'ĠIn',
 'cent',
 'ive',
 'Ġfor',
 'ĠBusiness',
 'es',
 'ĠAct',
 'Ġof',
 'Ġ2007',
 "''.",
 'ĠSEC',
 '.',
 'Ġ2',
 '.',
 'ĠCR',
 'EDIT',
 'S',
 'ĠFOR',
 'ĠC',
 'ER',
 'TAIN',
 'ĠCONTR',
 'IB',
 'UT',
 'IONS',
 'ĠBEN',
 'EF',
 'IT',
 'ING',
 'ĠSC',
 'IENCE',
 ',',
 'ĠTECH',
 'N',
 'OLOGY',
 ',',
 'ĠENG',
 'INE',
 'ER',
 'ING',
 ',',
 'ĠAND',
 'ĠM',
 'ATH',
 'EM',
 'AT',
 'ICS',
 'ĠED',
 'UC',
 'ATION',
 'ĠAT',
 'ĠTHE',
 'ĠELE',
 'MENT',
 'ARY',
 'ĠAND',
 'ĠSEC',
 'OND',
 'ARY',
 'ĠSCHOOL',
 'ĠLEVEL',
 '.',
 'Ġ(',
 'a',
 ')',
 'ĠIn',
 'ĠGeneral',
 '.--',
 'Sub',
 'part',
 'ĠD',
 'Ġof',
 'Ġpart',
 'ĠIV',
 'Ġof',
 'Ġsub',
 'chapter',
 'ĠA',
 'Ġof',
 'Ġchapter',
 'Ġ1',
 'Ġof',
 'Ġthe',
 'ĠInternal',
 'ĠRevenue',
 'ĠCode',
 'Ġof',
 'Ġ1986',
 'Ġ(',
 'rel',
 'ating',
 'Ġto',
 'Ġbusiness',
 'Ġrelated',
 'Ġcred

In [12]:
len(understand_tokenization)

1480

#### Embed the text documents and save the corresponding embedding. We embed each chunk using a doc model, in this case text-search-curie-doc-001
#### Note: engine name must match the deployment name

In [17]:
df_bills['curie_search'] = df_bills["text"].apply(lambda x : get_embedding(x, engine = 'text-search-curie-doc-001'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bills['curie_search'] = df_bills["text"].apply(lambda x : get_embedding(x, engine = 'text-search-curie-doc-001'))


In [14]:
df_bills

Unnamed: 0,text,summary,title,n_tokens,curie_search
0,SECTION 1. SHORT TITLE. This Act may be cited ...,National Science Education Tax Incentive for B...,To amend the Internal Revenue Code of 1986 to ...,1480,"[-0.019770914688706398, 0.011169900186359882, ..."
1,SECTION 1. SHORT TITLE. This Act may be cited ...,Small Business Expansion and Hiring Act of 201...,To amend the Internal Revenue Code of 1986 to ...,1152,"[-0.007850012741982937, 0.01001765951514244, 0..."
2,SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR...,Requires the Director of National Intelligence...,A bill to require the Director of National Int...,930,"[0.00012103027984267101, 0.011845593340694904,..."
4,SECTION 1. SHORT TITLE. This Act may be cited ...,Military Call-up Relief Act - Amends the Inter...,A bill to amend the Internal Revenue Code of 1...,1048,"[-0.005481021944433451, 0.00856819562613964, -..."
5,SECTION 1. RELIQUIDATION OF CERTAIN ENTRIES PR...,Requires the Customs Service to reliquidate ce...,To provide for reliquidation of entries premat...,1846,"[-0.008310390636324883, -0.004660653416067362,..."
6,SECTION 1. SHORT TITLE. This Act may be cited ...,Service Dogs for Veterans Act of 2009 - Direct...,A bill to require the Secretary of Veterans Af...,872,"[-0.017687108367681503, 0.011164870113134384, ..."
9,SECTION 1. SHORT TITLE. This Act may be cited ...,Taxpayer's Right to View Act of 1993 - Amends ...,Taxpayer's Right to View Act of 1993,946,"[0.0021867561154067516, -0.004219848196953535,..."
12,SECTION 1. FINDINGS. The Congress finds the fo...,Amends the Marine Mammal Protection Act of 197...,To amend the Marine Mammal Protection Act of 1...,1223,"[-0.015813011676073074, 0.009919906966388226, ..."
14,SECTION 1. SHORT TITLE. This Act may be cited ...,Education and Training for Health Act of 2017 ...,Education and Training for Health Act of 2017,1596,"[-0.0150684155523777, 0.005073960404843092, 0...."
16,SECTION 1. SHORT TITLE. This Act may be cited ...,Andrew Prior Act or Andrew's Law - Amends the ...,Andrew's Law,608,"[-0.011593054980039597, 0.022752899676561356, ..."


In [15]:
# search through the reviews for a specific product
def search_docs(df, user_query, top_n=3, to_print=True):
    embedding = get_embedding(
        user_query,
        engine="text-search-curie-query-001"
    )
    df["similarities"] = df.curie_search.apply(lambda x: cosine_similarity(x, embedding))

    res = (
        df.sort_values("similarities", ascending=False)
        .head(top_n)
    )
    if to_print:
        display(res)
    return res


res = search_docs(df_bills, "can i get information on cable company tax revenue", top_n=4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["similarities"] = df.curie_search.apply(lambda x: cosine_similarity(x, embedding))


Unnamed: 0,text,summary,title,n_tokens,curie_search,similarities
9,SECTION 1. SHORT TITLE. This Act may be cited ...,Taxpayer's Right to View Act of 1993 - Amends ...,Taxpayer's Right to View Act of 1993,946,"[0.0021867561154067516, -0.004219848196953535,...",0.36327
0,SECTION 1. SHORT TITLE. This Act may be cited ...,National Science Education Tax Incentive for B...,To amend the Internal Revenue Code of 1986 to ...,1480,"[-0.019770914688706398, 0.011169900186359882, ...",0.314105
1,SECTION 1. SHORT TITLE. This Act may be cited ...,Small Business Expansion and Hiring Act of 201...,To amend the Internal Revenue Code of 1986 to ...,1152,"[-0.007850012741982937, 0.01001765951514244, 0...",0.297908
18,SECTION 1. SHORT TITLE. This Act may be cited ...,This measure has not been amended since it was...,Veterans Entrepreneurship Act of 2015,1404,"[-0.020315825939178467, 0.0011716989101842046,...",0.295586


In [16]:
res["summary"][9]

"Taxpayer's Right to View Act of 1993 - Amends the Communications Act of 1934 to prohibit a cable operator from assessing separate charges for any video programming of a sporting, theatrical, or other entertainment event if that event is performed at a facility constructed, renovated, or maintained with tax revenues or by an organization that receives public financial support. Authorizes the Federal Communications Commission and local franchising authorities to make determinations concerning the applicability of such prohibition. Sets forth conditions under which a facility is considered to have been constructed, maintained, or renovated with tax revenues. Considers events performed by nonprofit or public organizations that receive tax subsidies to be subject to this Act if the event is sponsored by, or includes the participation of a team that is part of, a tax exempt organization."