In [2]:
%pip install pypdf

Note: you may need to restart the kernel to use updated packages.


In [34]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
import re
import os

data_path = '../../data/UNVXIS.05 Enable Internal Sales Orders.pdf'

loader = PyPDFLoader(data_path)

pages = []
async for page in loader.alazy_load():
    pages.append(page)

def preprocess_content(content):
    # Remove the specific phrase from the content
    return re.sub(r'Enable Internal Sales Orders \(Internal Sales Orders\) \d+ \(\d+\)', '', content)

def split_by_headers(documents, header_pattern, specific_headers):
    split_docs = []
    last_seen_header = None
    for doc in documents:
        # Preprocess the content to remove the specific phrase
        content = preprocess_content(doc.page_content)
        chunks = re.split(f'({header_pattern})', content)
        chunks = [chunk.strip() for chunk in chunks if chunk.strip()]
        for chunk in chunks:
            if chunk in specific_headers:
                last_seen_header = chunk
            else:
                split_docs.append(
                    Document(
                        page_content=chunk,
                        metadata={'header': last_seen_header}
                    )
                )
    return split_docs

# Define the specific headers you want to capture
specific_headers = ["Overview", "Tasks", "Setup", "Additional Information"]

# Define the header pattern (adjust the pattern based on your headers)
header_pattern = r'\b(?:' + '|'.join(specific_headers) + r')\b'

# Split the document by headers
docs = split_by_headers(pages, header_pattern, specific_headers)

# Print the first chunk's content and metadata
docs = docs[2:]

for doc in docs:
    print(doc.page_content)
    print(doc.metadata)
    print('-------------------')

Concept 
 
Process  
This configuration process explains how the system must be set up to enable internal sales 
orders. One major consideration is the Goods-in-transit (GIT) management. As explained in 
the detailed steps, this is controlled by a parameter in ‘Settings – Cost Accounting’ 
(CAS900), which is set Off. 
The project team should decide whether to activate this, as it will have an impact on internal 
sales orders accounting and “external”, or “regular” customer orders and purchase orders. 
Although the Goods-in-transit functionality has not been activated, the process descriptions 
assume that it has been. 
Input  
 
Output  
The settings and basic data are in place for using internal sales orders. 
The project team has chosen whether to activate the Goods-in-transit (GIT) functionality. 
Depending on this, GIT will be managed or not during the internal sales orders process.
{'header': 'Overview'}
-------------------
Manage General Settings 
1. Activate Internal Sales Order

In [35]:
from dotenv import load_dotenv
load_dotenv()

from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
ASTRA_DB_API_KEY = os.getenv('ASTRA_DB_API_KEY')
ASTRA_DB_ENDPOINT = os.getenv('ASTRA_DB_ENDPOINT')
ASTRA_DB_KEYSPACE = os.getenv('ASTRA_DB_KEYSPACE')

model = ChatOpenAI(model='gpt-4o')

embeddings = OpenAIEmbeddings()

In [37]:
from langchain_community.vectorstores import AstraDB

vectorstore = AstraDB(
    embedding=embeddings,
    token=ASTRA_DB_API_KEY,
    api_endpoint=ASTRA_DB_ENDPOINT,
    collection_name="not_metadata",
)

INFO     [astrapy.core.db] ASTRA_DB_KEYSPACE is not set. Defaulting to 'default_keyspace'
INFO     [astrapy.core.db] ASTRA_DB_KEYSPACE is not set. Defaulting to 'default_keyspace'


In [38]:
import uuid

doc_ids = [str(uuid.uuid4()) for _ in docs]

In [39]:
vectorstore.add_documents(documents=docs, doc_ids=doc_ids)



['ca3a4025f5f84fbc90878e2d544663bb',
 '7bfa5cf53fb8445bbbdcf0ddf18a7378',
 '960909173aee4178b228f0f62e587b58',
 '37be0e2d2a38412595d307afcc00f34a',
 '78b9b4dc54904f8eb53e32d1cd3556d6',
 '9466fd9c277e4519a35e137106dec01a',
 'e291963aa39b4d6cb25a486a4c44645e',
 '3a16eb75c281439d8935cc186a3a9e03',
 'c2441cdd494640c1b0970e24f52b2000',
 '5f4cc41a082b4be3b298df0d1aa09593',
 '99676bb9530e485aa9a48dc9f7e185f7']

In [40]:
import os
from astrapy import DataAPIClient

# Initialize the client and get a "Database" object
client = DataAPIClient(ASTRA_DB_API_KEY)
database = client.get_database(ASTRA_DB_ENDPOINT)
collection = database.get_collection("not_metadata")

# Retrieve all documents from the collection
documents = collection.find({})

# Print the retrieved documents
for doc in documents:
    print(doc["metadata"])
    print('-------------------')

INFO     [astrapy.cursors] creating iterator on 'not_metadata'
INFO     [astrapy.cursors] finished creating iterator on 'not_metadata'
INFO     [astrapy.collection] command=find on 'not_metadata'
INFO     [astrapy.collection] finished command=find on 'not_metadata'
{'header': 'Additional Information'}
-------------------
{'header': 'Tasks'}
-------------------
{'header': 'Tasks'}
-------------------
{'header': 'Overview'}
-------------------
{'header': 'Tasks'}
-------------------
{'header': 'Tasks'}
-------------------
{'header': 'Tasks'}
-------------------
{'header': 'Tasks'}
-------------------
{'header': 'Tasks'}
-------------------
{'header': 'Additional Information'}
-------------------
{'header': 'Setup'}
-------------------


In [None]:
  '''
    AttributeInfo(
        name="source",
        description="The source file path of the document",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page number of the document",
        type="integer",
    ),

    '''

In [None]:
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

metadata_field_info = [
    AttributeInfo(
        name="header",
        description="The header of the document page. One of ['overview', 'tasks', 'setup', 'additional information']",
        type="string",
    ),
]

document_content_description = "Content of the headers in the metadata"

retriever = SelfQueryRetriever.from_llm(
    model, vectorstore, document_content_description, metadata_field_info, verbose=True
)

results = retriever.invoke(
    "Can you retrieve all the documents for the tasks?"
)

# Print the results
for result in results:
    print(result.page_content)
    print(result.metadata)
    print('-------------------')

In [33]:
retriever.invoke(
    "Can you retrieve all the text under the header called Setup ?"
)

[]

In [15]:
from langchain.chains.query_constructor.base import (
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)

prompt = get_query_constructor_prompt(
    document_content_description,
    metadata_field_info,
)
output_parser = StructuredQueryOutputParser.from_components()
query_constructor = prompt | model | output_parser

response = query_constructor.invoke(
    {
        "query": "Can you retrieve all the documents for the tasks?"
    }
)

In [16]:
response

StructuredQuery(query=' ', filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='header', value='tasks'), limit=None)

In [13]:
print(prompt.format(query="dummy question"))

Your goal is to structure the user's query to match the request schema provided below.

<< Structured Request Schema >>
When responding use a markdown code snippet with a JSON object formatted in the following schema:

```json
{
    "query": string \ text string to compare to document contents
    "filter": string \ logical condition statement for filtering documents
}
```

The query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.

A logical condition statement is composed of one or more comparison and logical operation statements.

A comparison statement takes the form: `comp(attr, val)`:
- `comp` (eq | ne | gt | gte | lt | lte | contain | like | in | nin): comparator
- `attr` (string):  name of attribute to apply the comparison to
- `val` (string): is the comparison value

A logical operation statement takes the form `op(statement1, statement2, ...)`:
- `op` (and | or | not