In [1]:
from openai import AzureOpenAI
from pprint import pprint
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential
from typing import Any, Generator
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

In [2]:
import duckdb
db = duckdb.connect(database='db.duckdb')
db.sql('show tables')

┌───────────────┐
│     name      │
│    varchar    │
├───────────────┤
│ doc_id_lookup │
│ docs          │
│ upload        │
└───────────────┘

In [3]:
# import pandas as pd
# docs = pd.read_json("spira_docs.json", orient="index").reset_index().rename(columns={"index": "link"}).sort_values('text')
docs = db.sql('from docs order by content').df()
docs.head(10)

Unnamed: 0,link,text,parent,content
0,https://spiradoc.inflectra.com/HowTo-Guides/Us...,AWS CodeBuild,https://spiradoc.inflectra.com/HowTo-Guides/Us...,"AWS CodeBuild¶ Introduction¶ SpiraTest, SpiraT..."
1,https://spiradoc.inflectra.com/Spira-Administr...,AWS CodeBuild,https://spiradoc.inflectra.com/Spira-Administr...,"AWS CodeBuild¶ Introduction¶ SpiraTest, SpiraT..."
2,https://spiradoc.inflectra.com/Spira-User-Manu...,AWS CodeBuild,https://spiradoc.inflectra.com/Spira-User-Manual/,"AWS CodeBuild¶ Introduction¶ SpiraTest, SpiraT..."
3,https://spiradoc.inflectra.com/SpiraApps/../Bu...,AWS CodeBuild,https://spiradoc.inflectra.com/SpiraApps/,"AWS CodeBuild¶ Introduction¶ SpiraTest, SpiraT..."
4,https://spiradoc.inflectra.com/About/introduct...,AWS CodeBuild,https://spiradoc.inflectra.com/About/introduct...,"AWS CodeBuild¶ Introduction¶ SpiraTest, SpiraT..."
5,https://spiradoc.inflectra.com/Reporting/../Bu...,AWS CodeBuild,https://spiradoc.inflectra.com/Reporting/,"AWS CodeBuild¶ Introduction¶ SpiraTest, SpiraT..."
6,https://spiradoc.inflectra.com/SpiraPlan-Quick...,AWS CodeBuild,https://spiradoc.inflectra.com/SpiraPlan-Quick...,"AWS CodeBuild¶ Introduction¶ SpiraTest, SpiraT..."
7,https://spiradoc.inflectra.com/About/introduct...,Activating,https://spiradoc.inflectra.com/About/introduct...,Activating TaraVault¶ Introduction¶ TaraVault®...
8,https://spiradoc.inflectra.com/HowTo-Guides/Us...,Activating,https://spiradoc.inflectra.com/HowTo-Guides/Us...,Activating TaraVault¶ Introduction¶ TaraVault®...
9,https://spiradoc.inflectra.com/Spira-User-Manu...,Activating,https://spiradoc.inflectra.com/Spira-User-Manual/,Activating TaraVault¶ Introduction¶ TaraVault®...


In [4]:
def split_text_if_contains_a_heading_via_question_words(text: str) -> Generator[str, None, None]:
    """
    Split text into sections if it contains a heading via question words.
    """
    import re
    word_without_period_followed_by_question_word = re.compile(r'\b(\w+)\s+([Ww]hat|[Ww]here|[Ww]hen|[Ww]hy|[Hh]ow|[Ww]ho|[Ww]hich|[Ww]hose|[Ww]hom)\b')
    r = word_without_period_followed_by_question_word.search(text)
    if r:
        idx = r.start() + text[r.start():].find(" ") + 1
        yield text[:idx-1]
        yield text[idx:]
    else:
        yield [text]

In [5]:
sentence = "This is a test What is this?"
list(split_text_if_contains_a_heading_via_question_words(sentence))

['This is a test', 'What is this?']

In [6]:
db.sql("""
create or replace table doc_id_lookup as (
with

content as (
    select content
    from docs
    order by content
),

counts as (
    select
        *,
        length(content) as length_of_content_str,
        count(*) as count_of_content
    from content
    group by all
    order by content
)

select distinct
    row_number() over () as doc_id,
    content,
    count_of_content

from counts
order by content)
""")

# para = '¶'
# x = db.sql('from doc_id_lookup').df()['content'][100]
# l = [s.strip() for s in x.replace("  ", para).split(para)]
# lcopy = l.copy()

# new_l = []
# for i, l0 in enumerate(lcopy):
#     for item in list(split_text_if_contains_a_heading_via_question_words(l0)):

#         if len(item) > 1:
#             new_l += item
#         else:
#             new_l.append(*item)

# new_lcopy = new_l.copy()
# new_new_l = []
# for i, l0 in enumerate(new_lcopy):
#     try:
#         if (
#             (l0.title().replace('In', 'in').replace('The', 'the').replace('For', 'for').replace('And', 'and') == l0) 
#             and (len(l0) > 10)
#         ):
#             new_new_l.append(f"## {l0}")

#         elif l0 in [
#             'Introduction', 'Summary', 'Conclusion', 'References', 'Acknowledgements', 'Appendix'
#         ]:
#             new_new_l.append(f"## {l0}")

#         elif (
#             len(l0.split()) < 10
#         ) or (
#             len(l0.split()) < 20 and l0[-1] in [':', '?']
#         ):
#             new_new_l.append(f"## {l0}")
#         else:
#             new_new_l.append(f"{l0}")

#     except Exception as e:
#         print(f"{l0} failed with {e}")

# new_new_l

docs = db.sql("""
with

ids as (
    select distinct doc_id, content
    from doc_id_lookup
), 

joined as (
    select distinct
        ids.doc_id,
        first(docs.link) over (partition by ids.doc_id) as link,
        first(docs.text) over (partition by ids.doc_id) as text,
        first(docs.parent) over (partition by ids.doc_id) as parent,
        first(docs.content) over (partition by ids.doc_id) as content

    from docs
    left join ids
    on docs.content = ids.content
    order by
        doc_id,
        text,
        link

)

from joined""").df()

docs.head(10)

Unnamed: 0,doc_id,link,text,parent,content
0,1,https://spiradoc.inflectra.com/HowTo-Guides/Us...,AWS CodeBuild,https://spiradoc.inflectra.com/HowTo-Guides/Us...,"AWS CodeBuild¶ Introduction¶ SpiraTest, SpiraT..."
1,2,https://spiradoc.inflectra.com/About/introduct...,Activating,https://spiradoc.inflectra.com/About/introduct...,Activating TaraVault¶ Introduction¶ TaraVault®...
2,3,https://spiradoc.inflectra.com/Spira-Administr...,Keyboard Shortcuts,https://spiradoc.inflectra.com/Spira-Administr...,Appendix 1: Keyboard Shortcuts¶ SpiraPlan® inc...
3,4,https://spiradoc.inflectra.com/Spira-Administr...,Application Overview and Tips,https://spiradoc.inflectra.com/Spira-Administr...,"Application Overview¶ Spira is an easy to use,..."
4,5,https://spiradoc.inflectra.com/HowTo-Guides/Us...,Atlassian Bamboo,https://spiradoc.inflectra.com/HowTo-Guides/Us...,Atlassian Bamboo¶ This section outlines how to...
5,6,https://spiradoc.inflectra.com/Reporting/../Sp...,Automation Host Management,https://spiradoc.inflectra.com/Reporting/,Automation Host Management¶ Automation Host Li...
6,7,https://spiradoc.inflectra.com/Spira-User-Manu...,Available Report Tables,https://spiradoc.inflectra.com/Spira-User-Manual/,Available Custom Reports Tables¶ How to use t...
7,8,https://spiradoc.inflectra.com/Reporting/../Sp...,Azure OpenAI,https://spiradoc.inflectra.com/Reporting/,Azure OpenAI SpiraApp¶ Some of this SpiraApp'...
8,9,https://spiradoc.inflectra.com/HowTo-Guides/Us...,BDD,https://spiradoc.inflectra.com/HowTo-Guides/Us...,BDD¶ Some of this SpiraApp's functionality is...
9,10,https://spiradoc.inflectra.com/Spira-User-Manu...,BadBoy,https://spiradoc.inflectra.com/Spira-User-Manual/,BadBoy¶ Badboy is an automated website functio...


In [7]:
import spacy
from rake_nltk import Rake
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load("en_core_web_trf")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Unnamed: 0,doc_id,link,text,parent,content
0,1,https://spiradoc.inflectra.com/HowTo-Guides/Us...,AWS CodeBuild,https://spiradoc.inflectra.com/HowTo-Guides/Us...,"AWS CodeBuild¶ Introduction¶ SpiraTest, SpiraT..."
1,2,https://spiradoc.inflectra.com/About/introduct...,Activating,https://spiradoc.inflectra.com/About/introduct...,Activating TaraVault¶ Introduction¶ TaraVault®...
2,3,https://spiradoc.inflectra.com/Spira-Administr...,Keyboard Shortcuts,https://spiradoc.inflectra.com/Spira-Administr...,Appendix 1: Keyboard Shortcuts¶ SpiraPlan® inc...
3,4,https://spiradoc.inflectra.com/Spira-Administr...,Application Overview and Tips,https://spiradoc.inflectra.com/Spira-Administr...,"Application Overview¶ Spira is an easy to use,..."
4,5,https://spiradoc.inflectra.com/HowTo-Guides/Us...,Atlassian Bamboo,https://spiradoc.inflectra.com/HowTo-Guides/Us...,Atlassian Bamboo¶ This section outlines how to...
...,...,...,...,...,...
225,226,https://spiradoc.inflectra.com/HowTo-Guides/Us...,How to use this manual,https://spiradoc.inflectra.com/HowTo-Guides/Us...,Welcome to the SpiraPlan User Manual¶ How to ...
226,227,https://spiradoc.inflectra.com/Spira-Administr...,Worx,https://spiradoc.inflectra.com/Spira-Administr...,WorX SpiraApp¶ This SpiraApp lets you integrat...
227,228,https://spiradoc.inflectra.com/HowTo-Guides/Us...,Worksoft Certify,https://spiradoc.inflectra.com/HowTo-Guides/Us...,Worksoft Certify¶ Worksoft Certify is a test a...
228,229,https://spiradoc.inflectra.com/SpiraPlan-Quick...,ZAPTEST,https://spiradoc.inflectra.com/SpiraPlan-Quick...,ZAPTEST¶ ZAPTEST is a cross-platform and cross...


In [27]:
import pandas as pd
from tqdm import tqdm

def extract_entities(text: str) -> dict:
    """
    Extract entities from text.
    """
    try:
        doc = nlp(text)

        label_lists = {ent[1]:[] for ent in list({doc.ents})}
        entities = list({(
            ent.text,
            ent.label_
        ) for ent in doc.ents})

        for entity in entities:
            if entity[1] in ['ORDINAL', 'CARDINAL', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY']:
                continue
            elif entity[1] in label_lists:
                label_lists[entity[1]].append(entity[0])
            else:
                label_lists[entity[1]] = [entity[0]]

        # drop any labels that do not have any entities associated with them
        label_lists = {k: v for k, v in label_lists.items() if len(v) > 0}

        return label_lists
    except Exception as e:
        return {}

extracted = docs[['doc_id', 'content']].copy()
for row in tqdm(extracted.iterrows(), total=extracted.shape[0]):
    # print(row[1])
    # break
    en = extract_entities(row[1]['content'])
    en_list = [
        {
            "doc_id": row[1]['doc_id'],
            "entity": k,
            "entity_value": v
        } for k, v in en.items()
    
    ]
    en_df = pd.DataFrame(en_list)
    en_df.to_parquet(f"./entities/entity_id_{row[1]['doc_id']}.parquet", index=False)
    del en_df
    del en_list
    del en


extracted.head()

  0%|          | 0/230 [00:00<?, ?it/s]

100%|██████████| 230/230 [49:48<00:00, 13.00s/it]  


Unnamed: 0,doc_id,content
0,1,"AWS CodeBuild¶ Introduction¶ SpiraTest, SpiraT..."
1,2,Activating TaraVault¶ Introduction¶ TaraVault®...
2,3,Appendix 1: Keyboard Shortcuts¶ SpiraPlan® inc...
3,4,"Application Overview¶ Spira is an easy to use,..."
4,5,Atlassian Bamboo¶ This section outlines how to...


In [29]:
pd.read_parquet("./entities/entity_id_1.parquet")

Unnamed: 0,doc_id,entity,entity_value
0,1,PRODUCT,"[SpiraPlan, CodeBuild, Spira, SpiraTeam, Spira..."
1,1,ORG,"[Spira, CodeBuild, SNS, AWS CodeBuild, AWS]"


In [39]:
db.sql("""
with
raw as (
    from read_parquet('./entities/*.parquet')
),

unique_entity_types as (
    select distinct entity
    from raw
)

from raw
""")

┌────────┬─────────────┬───────────────────────────────────────────────────────────────────────────────────────────────┐
│ doc_id │   entity    │                                         entity_value                                          │
│ int64  │   varchar   │                                           varchar[]                                           │
├────────┼─────────────┼───────────────────────────────────────────────────────────────────────────────────────────────┤
│      1 │ PRODUCT     │ [SpiraPlan, CodeBuild, Spira, SpiraTeam, SpiraTest, SNS]                                      │
│      1 │ ORG         │ [Spira, CodeBuild, SNS, AWS CodeBuild, AWS]                                                   │
│     10 │ PRODUCT     │ [RemoteLaunch, SpiraTest, Linked, BadboyX, the Badboy Engine¶, SpiraTest/Team, BadBoy¶, the…  │
│     10 │ ORG         │ [Windows, Integration, Inflectra, SpiraTeam, Badboy, RemoteLaunch]                            │
│    100 │ PRODUCT     │ [OctoPe

In [None]:
db.sql("""
create or replace table extracted as (
    from extracted
)
""")

db.sql('from extracted').pl()

In [43]:
# df = pd.read_csv("./spira_qna_maker.tsv", delimiter="\t")
# df.columns = ['question', 'answer', 'source', 'metadata', 'suggested_questions', 'is_context_only', 'prompts', 'qa_id']
# updf = df[['qa_id', 'question', 'answer']].fillna('')
updf = db.sql('from upload').pl()
updf

qa_id,question,answer
i64,str,str
1,"""What is SpiraTeam?""","""SpiraPlan is an Application Li…"
1,"""What is Spira?""","""SpiraPlan is an Application Li…"
1,"""What is SpiraPlan?""","""SpiraPlan is an Application Li…"
2,"""How do I Request Access to Spi…","""All new Spira users need to su…"
2,"""How do I get Spira""","""All new Spira users need to su…"
…,…,…
179,"""How do I customize a workflow?""","""While the process to update th…"
179,"""Can I customize the artifact w…","""While the process to update th…"
180,"""Value missing in custom proper…","""Custom properties can be popul…"
180,"""List value blank""","""Custom properties can be popul…"


In [50]:
db.close()

# documents[:5]

In [31]:
# import duckdb
# db = duckdb.connect(database='db.duckdb')
# db.sql("create table upload as (from updf)")

In [9]:
upload = [
    {
        'id': str(i),
        'question': str(json['question'] if len(json['question']) > 0 else ''),
        'answer': str(json['answer'] if len(json['answer']) > 0 else '')
    }
    for i, json in enumerate(updf.to_dict(orient='records'))
]
upload[-2:]

[{'id': '282',
  'question': 'List value blank',
  'answer': 'Custom properties can be populated with a list of custom values (as opposed to a single date, boolean, or text value). A value in the list can be deactivated without deactivating the entire list itself. If this happens, the deactivated value will no longer display in the detail view of the affected artifacts. The value will continue to display in the list view.\\n\\nFor an illustration of the issue, see [this post](https://teams.microsoft.com/l/message/19:8d7c611c16884bfeb4a952a427452251@thread.skype/1667473265479?tenantId=0f087cc2-bc5d-40bf-8db4-41f99d0d1619&groupId=f9fb4dcc-4bd7-4b02-be38-a325f73fe3e0&parentMessageId=1667473265479&teamName=Spira%20Training%20and%20Support&channelName=Support&createdTime=1667473265479&allowXTenantAccess=false) from the Spira Support channel.'},
 {'id': '283',
  'question': 'Required field blank',
  'answer': 'Custom properties can be populated with a list of custom values (as opposed to a s

connected to blob client

 -> upload pdf files & maybe they can be indexed

In [138]:
# class IndexSearchClient():
#     """Class to interact with Azure AI Search Directly"""
#     def __init__(self,target_index=None) -> None:
#         self.key = azure_search_key
#         if target_index is None:
#             self.search_client = SearchClient(
#                 endpoint=azure_search_endpoint,
#                 index_name=azure_search_index_name,
#                 credential=AzureKeyCredential(self.key)
#             )
#         else:
#             self.search_client = SearchClient(
#                 endpoint=azure_search_endpoint,
#                 index_name=target_index,
#                 credential=AzureKeyCredential(self.key)
#             )
#         self.index_admin_client=SearchIndexClient(
#             endpoint=azure_search_endpoint,
#             credential=AzureKeyCredential(self.key)
#         )
    
#     def _get_results_iter(self) -> Generator[list, Any, None]:
#         response = self.search_client.search(search_text="*", top=100000).by_page()
#         for page in response:
#             page = list(page)
#             yield page

#     def search_index(self,query:str, fields=["content"]) -> str:
#         """Keyword Search of top 3"""
#         results = self.search_client.search(
#             search_text = query,
#             vector_queries=None,
#             select=fields,
#             top=3
#         )
#         content =''
#         for i in fields:
#             _content = [x.get(i) for x in results]
#             new = "\n\n".join(_content)
#             content= f'{content}\n{new}'
#         return content

#     def upload(self,docs:list[dict]) -> None:
#         """Upload docs to index"""
#         self.search_client.upload_documents(docs)
    
#     def get_all_index_data(self) -> list[dict]:
#         results = list()
#         _index = self._get_results_iter()
#         for page in _index:
#             for content in page:
#                 _del_keys = [x for x in content.keys() if x.find('@') != -1]
#                 for key in _del_keys:
#                     del content[key]
#                 results.append(content)

#         return results


In [51]:
# isc = IndexSearchClient()
# isc.get_all_index_data()

In [143]:
db.sql('from upload')

┌───────┬──────────────────────┬───────────────────────────────────────────────────────────────────────────────────────┐
│ qa_id │       question       │                                        answer                                         │
│ int64 │       varchar        │                                        varchar                                        │
├───────┼──────────────────────┼───────────────────────────────────────────────────────────────────────────────────────┤
│     1 │ What is SpiraTeam?   │ SpiraPlan is an Application Lifecycle Management (ALM) tool. Read how to get starte…  │
│     1 │ What is Spira?       │ SpiraPlan is an Application Lifecycle Management (ALM) tool. Read how to get starte…  │
│     1 │ What is SpiraPlan?   │ SpiraPlan is an Application Lifecycle Management (ALM) tool. Read how to get starte…  │
│     2 │ How do I Request A…  │ All new Spira users need to submit an Access Request from the IT Request Center. Th…  │
│     2 │ How do I get Spira   │

In [None]:
isc.upload()

In [52]:
# #*********************************************************************#
# # adjust "contentFields" to mirror the fields created in your index.
# # be sure to edit the index_name to reference your project-specific index.
# # your project may not need url or titleField. It will depend on whether you have put something similar in your index.
# ## IF YOU GET A 404 ERROR: Try replacing the authentication with "system_assigned_managed_identity"
# #*********************************************************************#

# class GPTParams:
#     azure_endpoint= "your azure openai endpoint" # - COMING SOON
#     api_key="your azure openai key" # - COMING SOON
#     api_version="your azure openai version" # you will create this
#     deployment_name = "your azure openai deployment" # - you will create this. will need to be gpt-4o


#     extra_body = dict(
#         data_sources = [dict(
#             type="azure_search",
#             parameters=dict(
#                 endpoint=azure_search_endpoint,
#                 index_name="your index name", ## your index name created above
#                 semantic_configuration="default", # Your Sematic Config Name
#                 # query_type="semantic", # should not need for this use case
#                 # filter="category eq 'CinciSource' or category eq 'CLD State Fact Sheets' or category eq 'CLD Procedures Manual'", # Needed to filter on Keyword filed
#                 topNDocuments=3,
#                 strictness=3,
#                 fieldsMapping=dict(
#                     contentFieldsSeparator="\n",
#                     contentFields=["content", "category"], #Dependent on the Index, this is project specific
#                     filepathField="url", #Dependent on the Index, this is project specific
#                     titleField="title", #Dependent on the Index, this is project specific
#                     vectorFields=[]
#                 ),
#                 authentication=dict(
#                     type="system_assigned_managed_identity"
#                     # type="api_key",
#                     #key=os.getenv("AZURE_KEY")
#                 )
#             )
#         )]
#     )

In [53]:

# class AiSearch():
#     def __init__(self):
#         self.messages=[]
#         self._start_client()

#     def _format_response(self,data)-> dict[str, list[dict]]:
#         response = data["choices"][0].get("message").get("content")
#         _citations = data["choices"][0].get("message").get("context").get("citations")
#         citations = [{k: v for k, v in d.items() if k != 'content'} for d in _citations]
#         return dict(
#             response=response,
#             citations=citations,
#             usage=dict(

#             )
#         )

#     def _start_client(self):
#         """
#         token_provider = get_bearer_token_provider(
#             DefaultAzureCredential(),
#             "https://cognitiveservices.azure.com/.default"
#         )
#         """
#         self.client = AzureOpenAI(
#         azure_endpoint=GPTParams.azure_endpoint,
#         #azure_ad_token_provider=token_provider,
#         api_key=GPTParams.api_key, # Updates to Azure 07/08 Broke API KEY Auth
#         api_version=GPTParams.api_version
#     )
 
#     def _create_message(self,query):
#         msg = dict(
#             role="user",
#             content=query
#         )
#         self.messages.append(msg)

#     def gpt_search(self, query) -> dict[str, object]:
#         self._create_message(query)
#         completion = self.client.chat.completions.create(
#             model=GPTParams.deployment_name,
#             messages=self.messages
#         )
#         return completion.to_dict()

#     def ai_search(self,query) -> dict[str, object]:
#         self._create_message(query)
#         completion = self.client.chat.completions.create(
#             model=GPTParams.deployment_name,
#             messages=[x for x in self.messages],
#             extra_body=GPTParams.extra_body
#         )
#         return self._format_response(completion.to_dict())

In [55]:
# # TEST YOUR CODE


# query = "how many pages is this document?" # insert your question here.
# client = AiSearch()
# pprint(client.gpt_search(query))
# #pprint(client.ai_search(query))
