In [160]:
from langchain.document_loaders import RecursiveUrlLoader, SitemapLoader
from bs4 import BeautifulSoup, Doctype, NavigableString, Tag, SoupStrainer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.utils.html import PREFIXES_TO_IGNORE_REGEX, SUFFIXES_TO_IGNORE_REGEX

from typing import Optional, Generator
import re
from pprint import pprint
import joblib

In [2]:
# Landing pages of the SpiraDocs sections to crawl; every URL keeps its trailing slash.
_SPIRADOC_HOST = "https://spiradoc.inflectra.com"
_SECTION_PATHS = (
    "Spira-User-Manual",
    "HowTo-Guides/Users-orientation",
    "SpiraPlan-Quick-Start-Guide",
    "Spira-Administration-Guide",
    "SpiraApps",
    "Reporting",
    "About/introduction-to-spira",
)
URL_LIST = [f"{_SPIRADOC_HOST}/{path}/" for path in _SECTION_PATHS]

In [97]:
import json
import pandas as pd

# Keep only the three QnA columns, then write each pair to its own JSON file
# named after the row's position in the frame.
df = pd.read_csv("./spira_qna_maker.tsv", sep="\t")[['QnaId', 'Question', 'Answer']]
for i, (_idx, qna) in enumerate(df.iterrows()):
    d = {"id": qna['QnaId'], "question": qna['Question'], "answer": qna['Answer']}
    with open(f"./rows/row_{i}.json", "w") as f:
        json.dump(d, f)

# df.assign(len=df.Answer.apply(lambda x: len(x))).sort_values('len', ascending=False).head(1)

In [153]:
import duckdb
# Open the project's DuckDB file. A forward slash keeps the path portable:
# a backslash before "d" is an invalid string escape and is only treated as a
# path separator on Windows.
db = duckdb.connect("data/db.duckdb")

# (Re)build the `entities` table from the parquet extracts: the entity_value
# list of each row is collapsed into a single ' | '-separated string, and
# WORK_OF_ART entities are dropped.
db.sql("""
create or replace table entities as (
    select 
        doc_id,
        entity,
        --entity_value as value
        reduce(entity_value, (x, y) -> x || ' | ' || y) as value

    from read_parquet('data/entities/*.parquet')
    where entity not in (
        'WORK_OF_ART'
    )
    order by doc_id
)
    
""")

# Preview the first rows of the rebuilt table.
db.sql('from entities limit 5')

┌────────┬─────────┬───────────────────────────────────────────────────────────────────────────────────────────────────┐
│ doc_id │ entity  │                                               value                                               │
│ int64  │ varchar │                                              varchar                                              │
├────────┼─────────┼───────────────────────────────────────────────────────────────────────────────────────────────────┤
│      1 │ PRODUCT │ SpiraPlan | CodeBuild | Spira | SpiraTeam | SpiraTest | SNS                                       │
│      1 │ ORG     │ Spira | CodeBuild | SNS | AWS CodeBuild | AWS                                                     │
│      2 │ PRODUCT │ TaraVault¶ | SpiraPlan | Spira | SpiraTeam | Subversion | Administration | SpiraPlan® | SpiraTe…  │
│      2 │ ORG     │ Inflectra | SpiraPlan | TaraVault                                                                 │
│      2 │ LAW     │ the User Ma

In [154]:
# Build the export frame, one row per distinct (document, entity, value):
#   doc_ids       - doc_id/content pairs taken from doc_id_lookup
#   join_docs     - attaches the remaining `docs` columns by matching on the full content text
#   join_entities - left-joins `entities` on doc_id, so documents without entities are kept
# NOTE(review): `first(link) over (partition by ...)` has no ORDER BY, so which
# link becomes `url` when several `docs` rows share a content is presumably
# arbitrary — confirm whether a deterministic ordering is needed.
spira_docs_df = db.sql("""
with

doc_ids as (
    select doc_id, content
    from doc_id_lookup
),

join_docs as (
    select
        doc_ids.doc_id,
        doc_ids.content,
        docs.* exclude (content)

    from doc_ids
    left join docs
    on doc_ids.content = docs.content
),

join_entities as (
    select
        join_docs.doc_id,
        join_docs.text as title,
        join_docs.content,
        first(join_docs.link) over (partition by join_docs.doc_id, entities.entity, entities.value) as url,
        entities.entity,
        entities.value

    from join_docs
    left join entities
    on join_docs.doc_id = entities.doc_id

    -- where url not like '%/../%'
)

select distinct * from join_entities order by doc_id, entity
""").df()
# Report the result size, then preview the first rows.
print(spira_docs_df.shape)
spira_docs_df.head()

(573, 6)


Unnamed: 0,doc_id,title,content,url,entity,value
0,1,AWS CodeBuild,"AWS CodeBuild¶ Introduction¶ SpiraTest, SpiraT...",https://spiradoc.inflectra.com/HowTo-Guides/Us...,ORG,Spira | CodeBuild | SNS | AWS CodeBuild | AWS
1,1,AWS CodeBuild,"AWS CodeBuild¶ Introduction¶ SpiraTest, SpiraT...",https://spiradoc.inflectra.com/HowTo-Guides/Us...,PRODUCT,SpiraPlan | CodeBuild | Spira | SpiraTeam | Sp...
2,2,Activating,Activating TaraVault¶ Introduction¶ TaraVault®...,https://spiradoc.inflectra.com/About/introduct...,LAW,the User Manual
3,2,Activating,Activating TaraVault¶ Introduction¶ TaraVault®...,https://spiradoc.inflectra.com/About/introduct...,ORG,Inflectra | SpiraPlan | TaraVault
4,2,Activating,Activating TaraVault¶ Introduction¶ TaraVault®...,https://spiradoc.inflectra.com/About/introduct...,PRODUCT,TaraVault¶ | SpiraPlan | Spira | SpiraTeam | S...


In [164]:
def _split_with_overlap(text, max_len=2000, overlap=500):
    """Split ``text`` into consecutive chunks of at most ``max_len`` characters.

    Each chunk starts ``max_len - overlap`` characters after the previous one,
    so neighbouring chunks share ``overlap`` characters. Text that already
    fits in ``max_len`` (including the empty string) is returned as a single
    chunk.

    Args:
        text: The string to split.
        max_len: Maximum length of a chunk.
        overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The list of chunks, in document order.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than ``max_len``
            (the window would never advance).
    """
    if not 0 <= overlap < max_len:
        raise ValueError("overlap must be non-negative and smaller than max_len")
    step = max_len - overlap
    chunks = []
    start = 0
    # Emit full-size windows while more than max_len characters remain ...
    while len(text) - start > max_len:
        chunks.append(text[start:start + max_len])
        start += step
    # ... and the remainder (at most max_len characters) as the final chunk.
    chunks.append(text[start:])
    return chunks


# Write one JSON file per (document, entity, chunk). The chunk index goes into
# the file name so that long documents produce several "_pt_<i>" files.
for _, row in spira_docs_df.iterrows():
    for i, c in enumerate(_split_with_overlap(row['content'])):
        d = {
            "id": row['doc_id'],
            "title": row['title'],
            "content": c,
            "entity": row['entity'],
            "value": row['value'],
            "url": row['url']
        }

        with open(f"./data/docs/{row['doc_id']}_entity_{row['entity']}_pt_{i}.json", "w") as f:
            json.dump(d, f)

In [120]:
# Preview the doc_id_lookup table (the output shows columns doc_id, content, count_of_content).
db.sql('from doc_id_lookup')

┌────────┬──────────────────────────────────────────────────────────────────────────────────────────┬──────────────────┐
│ doc_id │                                         content                                          │ count_of_content │
│ int64  │                                         varchar                                          │      int64       │
├────────┼──────────────────────────────────────────────────────────────────────────────────────────┼──────────────────┤
│      1 │ AWS CodeBuild¶ Introduction¶ SpiraTest, SpiraTeam, and SpiraPlan (from here on called …  │                7 │
│      2 │ Activating TaraVault¶ Introduction¶ TaraVault® is the secure source code and file host…  │                7 │
│      3 │ Appendix 1: Keyboard Shortcuts¶ SpiraPlan® includes an array of keyboard shortcuts to …  │                7 │
│      4 │ Application Overview¶ Spira is an easy to use, quick to configure, application designe…  │                7 │
│      5 │ Atlassian Bamboo¶ Thi

In [117]:
# Preview the entities table.
# NOTE(review): the captured output lists one value per row, unlike the
# ' | '-joined values produced by the `create or replace table entities` cell —
# presumably this cell was last run before that table was rebuilt.
db.sql('from entities')

┌────────┬─────────┬────────────────┐
│ doc_id │ entity  │     value      │
│ int64  │ varchar │    varchar     │
├────────┼─────────┼────────────────┤
│      1 │ PRODUCT │ SpiraPlan      │
│      1 │ PRODUCT │ CodeBuild      │
│      1 │ PRODUCT │ Spira          │
│      1 │ PRODUCT │ SpiraTeam      │
│      1 │ PRODUCT │ SpiraTest      │
│      1 │ PRODUCT │ SNS            │
│      1 │ ORG     │ Spira          │
│      1 │ ORG     │ CodeBuild      │
│      1 │ ORG     │ SNS            │
│      1 │ ORG     │ AWS CodeBuild  │
│      · │  ·      │       ·        │
│      · │  ·      │       ·        │
│      · │  ·      │       ·        │
│    230 │ ORG     │ Administration │
│    230 │ ORG     │ Zendesk¶       │
│    230 │ ORG     │ Spirateam      │
│    230 │ PRODUCT │ Zendesk        │
│    230 │ PRODUCT │ SpiraTeam®     │
│    230 │ PRODUCT │ Spira          │
│    230 │ PRODUCT │ SpiraTeam App  │
│    230 │ PRODUCT │ SpiraTeam      │
│    230 │ PRODUCT │ SpiraTeam¶     │
│    230 │ P

In [100]:
# Preview the docs table (columns link, text, parent, content). The captured
# rows appear to repeat the same page text/content several times.
db.sql('from docs')

┌──────────────────────┬────────────────┬──────────────────────┬───────────────────────────────────────────────────────┐
│         link         │      text      │        parent        │                        content                        │
│       varchar        │    varchar     │       varchar        │                        varchar                        │
├──────────────────────┼────────────────┼──────────────────────┼───────────────────────────────────────────────────────┤
│ https://spiradoc.i…  │ API Overview   │ https://spiradoc.i…  │ Using the Spira REST API¶ Overview¶ Spira has exten…  │
│ https://spiradoc.i…  │ API Overview   │ https://spiradoc.i…  │ Using the Spira REST API¶ Overview¶ Spira has exten…  │
│ https://spiradoc.i…  │ API Overview   │ https://spiradoc.i…  │ Using the Spira REST API¶ Overview¶ Spira has exten…  │
│ https://spiradoc.i…  │ API Overview   │ https://spiradoc.i…  │ Using the Spira REST API¶ Overview¶ Spira has exten…  │
│ https://spiradoc.i…  │ API Ove

In [31]:
import requests
# Fetch the User Manual landing page and parse it to inspect its markup.
# NOTE(review): verify=False turns off TLS certificate verification — confirm it is still needed.
# The timeout keeps the cell from hanging indefinitely if the server stops responding.
req = requests.get("https://spiradoc.inflectra.com/Spira-User-Manual/", verify=False, timeout=30)
soup = BeautifulSoup(req.text, 'html.parser')
soup




<!DOCTYPE html>

<html class="no-js" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<link href="https://spiradoc.inflectra.com/Spira-User-Manual/" rel="canonical"/>
<link href="Functionality-Overview/" rel="next"/>
<link href="../images/favicon.ico" rel="icon"/>
<meta content="mkdocs-1.6.0, mkdocs-material-9.5.19+insiders-4.53.8" name="generator"/>
<title>How to use this manual - SpiraDocs</title>
<link href="../assets/stylesheets/main.d5b5f0fd.min.css" rel="stylesheet"/>
<link href="../assets/stylesheets/palette.ab4e12ef.min.css" rel="stylesheet"/>
<link crossorigin="" href="https://fonts.gstatic.com" rel="preconnect"/>
<link href="https://fonts.googleapis.com/css?family=Nunito+Sans:300,300i,400,400i,700,700i%7CIBM+Plex+Mono:400,400i,700,700i&amp;display=fallback" rel="stylesheet"/>
<style>:root{--md-text-font:"Nunito Sans";--md-code-font:"IBM Plex Mono"}</style>
<link href="../stylesheets/spira.css" rel="stylesheet"/>
<

In [61]:
from typing import List, Dict
def get_one_file(url:str) -> List[Dict[str, str]]:
    """Collect the navigation links found on one documentation page.

    Args:
        url: Absolute URL of the page to fetch. It is also the base against
            which relative hrefs are resolved.

    Returns:
        One dict per ``<a class="md-nav__link">`` element that carries an
        ``href`` attribute, with the keys ``href`` (raw attribute value),
        ``link`` (absolute URL), ``parent`` (the ``url`` argument) and
        ``text`` (anchor text with surrounding whitespace stripped).
    """
    from urllib.parse import urljoin

    # NOTE(review): verify=False turns off TLS certificate verification.
    # The timeout keeps a stalled server from hanging the crawl.
    req = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(req.text, 'html.parser')
    className = "md-nav__link"
    return [
        {
            "href": a["href"],
            # urljoin resolves "./", "../" and absolute hrefs against the page
            # URL; plain concatenation would leave "../" segments in the link
            # and would mangle absolute hrefs.
            'link': urljoin(url, a["href"]),
            'parent': url,
            'text': a.text.strip()
        }
        # href=True skips anchors without an href instead of raising KeyError.
        for a in soup.find_all("a", {"class": className}, href=True)
    ]

In [62]:
# Gather the navigation entries of every section page into one flat list,
# in URL_LIST order, then show the first ten.
links = []
for url in URL_LIST:
    links.extend(get_one_file(url))

links[:10]



[{'href': './',
  'link': 'https://spiradoc.inflectra.com/Spira-User-Manual/',
  'parent': 'https://spiradoc.inflectra.com/Spira-User-Manual/',
  'text': 'How to use this manual'},
 {'href': 'Functionality-Overview/',
  'link': 'https://spiradoc.inflectra.com/Spira-User-Manual/Functionality-Overview/',
  'parent': 'https://spiradoc.inflectra.com/Spira-User-Manual/',
  'text': 'Application Overview and Tips'},
 {'href': 'Application-Wide/',
  'link': 'https://spiradoc.inflectra.com/Spira-User-Manual/Application-Wide/',
  'parent': 'https://spiradoc.inflectra.com/Spira-User-Manual/',
  'text': 'Common Elements Across the Application'},
 {'href': 'User-Product-Management/',
  'link': 'https://spiradoc.inflectra.com/Spira-User-Manual/User-Product-Management/',
  'parent': 'https://spiradoc.inflectra.com/Spira-User-Manual/',
  'text': 'User Management'},
 {'href': 'Product-Homepage/',
  'link': 'https://spiradoc.inflectra.com/Spira-User-Manual/Product-Homepage/',
  'parent': 'https://spirad



'\nWelcome to the SpiraPlan User Manual¶\n\nHow to use this manual\nThis documentation is designed for all users of SpiraTest, SpiraTeam, or SpiraPlan.\nIt can be read \'cover to cover\' or you can dip into a specific section for key information.   \nTo find the section you need, open the "User Manual" section from the site navigation to see all available chapters.\nThis manual is built around a few core areas:\n\nAn [overview of the functionality(./Functionality-Overview.md)\nYour [user profile and home page(./User-Product-Management.md)\nFeatures [common to many parts of the application(./Application-Wide.md)\nInformation about accessing the core data in SpiraPlan - which we store in areas called "Workspaces" Workspaces are hierarchical. Most data is stored in [products(./Product-Homepage.md). Products are grouped together in [programs(./Program-Homepage.md). Programs are grouped together in [portfolios(./Portfolio-Homepage.md). Portfolios are all grouped under the [enterprise(./Ente

In [77]:
from tqdm import tqdm
# Download every linked page and store the flattened text of its <article>
# element under the entry's 'content' key.
# Entries can repeat the same URL, so each distinct URL is downloaded only once.
_content_by_url = {}
# Wrapping the list itself (not the enumerate object) lets tqdm show a total.
for i, link in enumerate(tqdm(links)):
    page_url = link['link']
    if page_url not in _content_by_url:
        # NOTE(review): verify=False turns off TLS certificate verification.
        req = requests.get(page_url, verify=False, timeout=30)
        soup = BeautifulSoup(req.text, 'html.parser')
        article = soup.find('article')
        if article is None:
            # A page without <article> gets empty content rather than
            # aborting the whole crawl with an AttributeError.
            _content_by_url[page_url] = ""
        else:
            _content_by_url[page_url] = article.text.replace("\n", " ").replace("    ", " ").strip()
    links[i]['content'] = _content_by_url[page_url]

1676it [17:57,  1.56it/s]


In [82]:
# Spot-check the resolved URL of one scraped entry.
links[5]['link']

'https://spiradoc.inflectra.com/Spira-User-Manual/Requirements-Management/'

In [83]:
# Check which keys this entry carries (the output includes 'content').
links[5].keys()

dict_keys(['href', 'link', 'parent', 'text', 'content'])

In [84]:
# Index the scraped pages by absolute URL. Keys are unique, so when several
# entries share a URL the values of the last such entry are kept.
link_dict = {}
for page in links:
    link_dict[page['link']] = {
        'text': page['text'],
        'parent': page['parent'],
        'content': page['content'],
    }

In [None]:
# Download the spira_docs.json blob; the response is only bound to `req` here.
# NOTE(review): verify=False turns off TLS certificate verification — confirm it is still needed.
# The timeout keeps the cell from hanging indefinitely if the server stops responding.
req = requests.get("https://itcustomersupportkbsg.blob.core.windows.net/navigator-storage/spira_docs.json", verify=False, timeout=30)


In [86]:
import json
# Persist the URL-indexed pages as a single JSON document.
with open('links.json', 'w') as out_file:
    out_file.write(json.dumps(link_dict))

In [28]:
# Print the raw page text stored under 'page_content'.
# NOTE(review): `userman` is not defined in any visible cell — presumably it
# comes from a deleted or out-of-view cell; confirm this still runs on a fresh kernel.
print(userman['page_content'])

How to use this manual - SpiraDocs

          Skip to content
        

            SpiraDocs
          

            
              How to use this manual
            
          

            Initializing search
          

    Inflectra
  

    SpiraDocs
  

    Inflectra
  

    
  
    User Manual
  

    
  

            
  
    User Manual
  

          

    
  
    How to use this manual
  

    
  

    
  
    Application Overview and Tips
  

    
  

    
  
    Common Elements Across the Application
  

    
  

    
  
    User Management
  

    
  

    
  
    Products
  

    
  

            
  
    Products
  

          

    
  
    Homepage
  

    
  

    
  
    Planning
  

    
  

            
  
    Planning
  

          

    
  
    Requirements Management
  

    
  

    
  
    Release Management
  

    
  

    
  
    Document Management
  

    
  

    
  
    Planning Board
  

    
  

    
  
    Testing
  

    
  

            
  
    Testi