# Scrape Documents from Confluence

In [54]:
from jinja2.loaders import split_template_path
from langchain_community.document_loaders import ConfluenceLoader
from langchain_community.document_loaders.confluence import ContentFormat

loader = ConfluenceLoader(
    url="https://openimis.atlassian.net/wiki/",
    space_key="OP",
    include_comments=False,
    include_attachments=False,
    include_restricted_content=False,
    max_pages=2 * 1000,
    limit=100,
    content_format=ContentFormat.ANONYMOUS_EXPORT_VIEW,
    keep_markdown_format=True,
    keep_newlines=True
)

In [55]:
docs = loader.load()

In [59]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)


In [60]:
from pprint import pprint
def display(idx: int):
    pprint(docs[idx].metadata)
    print("\n\n")
    print(docs[idx].page_content)

In [61]:
def pretty_print(chunks):
    print(
        str('\n' + '='*50 + '\n').join(
            [
                chunk.page_content + '\n' +'-'*50 + '\n' + str(chunk.metadata) 
                for chunk in chunks
            ]
        )
    )

In [62]:
display(2)

{'id': '29786113',
 'source': 'https://openimis.atlassian.net/wiki/spaces/OP/pages/29786113/2018-03-09+Meeting+notes',
 'title': '2018-03-09 Meeting notes',
 'when': '2022-12-21T15:52:03.475Z'}




## Date

09 Mar 2018

## Attendees

* [Siddharth Srivastava](https://openimis.atlassian.net/wiki/people/5a8e2f51d481603598ad50f9?ref=confluence)
* Alexandre
* Saurav
* Alicia

## Goals

* Server
* Issue queue
* Wiki page
* Installation Manual
* Next call

## Discussion items

| Time | Item | Who | Notes |
| --- | --- | --- | --- |
|  | * Server * Issue queue * Wiki page * readthedocs * Installation manual * Next call |  | * Testing and dev instances to be placed on one server * Refreshing data on Demo version to be done per week to allow time for users to replicate their issues on the demo version * Provider is being approach by Saurav to remove restrictions on server to allow demo version to get operationalized * Jira issue queue:    + Split into two: Service desk and IT team management que

In [63]:
result_docs = []
for doc in docs:
    md_doc = markdown_splitter.split_text(doc.page_content)
    for i in range(len(md_doc)):
        md_doc[i].metadata = md_doc[i].metadata | doc.metadata
    result_docs.extend(md_doc)
    
len(result_docs)

6956

In [64]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=50,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)

split_result_docs = splitter.split_documents(result_docs)

  separators=["\n\n", "\n", "(?<=\. )", " ", ""]


In [66]:
len(split_result_docs)

7319

In [67]:
split_result_docs[-1]

Document(metadata={'title': 'Session II: Beneficiary management in health & social protection', 'id': '4162682951', 'source': 'https://openimis.atlassian.net/wiki/spaces/OP/pages/4162682951/Session+II+Beneficiary+management+in+health+social+protection', 'when': '2024-12-02T16:19:03.846Z'}, page_content="The latest release of openIMIS demonstrates its evolution into a versatile digital public good, supporting activities across the entire social protection delivery chain, including beneficiary registration, enrollment, benefit management, and payments. This progress reflects strategic collaboration with the World Bank's CORE-MIS platform, resulting in integrated functionalities for managing cash transfers and public works programs. With a modular, interoperable design, openIMIS can adapt to diverse program needs and contribute to Digital Public Infrastructure, including the development of a comprehensive social registry.  \nOver the last months the initiative started to gather requiremen

# Document Formatting & Chunking

In [68]:
# Archive the scraped docs
import pickle

with open("data.pkl", "wb") as fp:
    pickle.dump(split_result_docs, fp)

In [69]:
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

import os
import dotenv

In [70]:
dotenv.load_dotenv()

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [75]:
index = os.environ["PINECONE_INDEX_NAME"]
pinecone_api_key = os.environ["PINECONE_API_KEY"]

pc = Pinecone(api_key=pinecone_api_key)

In [76]:
pc.list_indexes()

{'indexes': []}

In [77]:
if index not in [idx["name"] for idx in pc.list_indexes()]:
    pc.create_index(name=index, dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1"))

In [78]:
pc.describe_index(index)

{'deletion_protection': 'disabled',
 'dimension': 1536,
 'host': 'damg7374-project-index-2byjwrh.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'damg7374-project-index',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}

In [79]:
pc_index = pc.Index(index)

In [80]:
vector_store = PineconeVectorStore(index=pc_index, embedding=embeddings)
vector_store.add_documents(documents=split_result_docs)

['b23086ce-99a7-4b1d-9aec-069ff2804d49',
 '69b0ea7e-16ac-4f62-91c6-7fbe5345426a',
 '51fbec61-5571-4f4b-ba91-2163b760a847',
 '2df4dea8-3f03-469e-ac5e-e8240d2d9903',
 'f2feb251-1bd2-48f4-b36a-b93e72a6a132',
 '75f3d213-a89e-44e4-9e7e-61630c198444',
 '10b45099-7bba-4e96-bff5-bcadc1e74695',
 '6888af69-e2e2-41ba-a81a-2c9e3c328651',
 'a8865faa-7317-46ec-8c75-17fe420ee9bd',
 '9b7f449a-6e55-45fa-b0b8-6f2b05082931',
 '380fdb1b-c0f5-477d-b00f-af5e6ac3971f',
 'cb15d2a2-c93f-460d-ae85-2c12ded7290e',
 '8cdafa6b-7af3-45e4-ba31-690959595491',
 '58514844-dbf1-4ed2-a13b-0efd21bca677',
 'd20adc7c-c2e7-4ac0-a274-d00865cefdaa',
 '6e796dee-1209-479d-899d-9350e9d4f529',
 '82dc8d60-05a7-43fb-a3fc-ac5fbfb6d43e',
 'c07a5e5c-ce3f-4ae5-b9f7-8752d81ef1a7',
 '2cdafa4d-3692-4589-a476-a55e36e1d156',
 '9d44058b-36d1-4b98-9cc7-f764d6e3b3f9',
 'b18ea586-ad82-4be5-b2fb-592d2dbb9d81',
 '20491f7c-43cc-49f9-8b8f-61142ef75c0d',
 'feaccb03-9efd-41ae-900e-908416ab4245',
 'f9be9086-2f2c-4481-84b6-85a1a46b1b9a',
 'df019072-1fc0-