In [38]:
import os
from pymongo import MongoClient
from typing import List, Dict

MONGO_URI = os.getenv("MONGODB_URI")
DB_NAME = os.getenv("MONGODB_DB_NAME")
collection_name = 'si'
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "containergenie.ai"
os.environ['USER_AGENT'] = 'chapter2-1'

####################################################################################

from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.tools.retriever import create_retriever_tool
from langchain.agents import create_openai_functions_agent, AgentExecutor

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

###################################################################################

# block included to check whether the whole chain works out or not
def fetch_data_from_mongodb(collection_name: str, query: Dict = None, limit: int = None) -> List[Dict]:
 
    client = MongoClient(MONGO_URI)
    db = client[DB_NAME]
    collection = db[collection_name]
    
    # Prepare the find operation
    find_operation = collection.find(query) if query else collection.find()
    
    # Fetch and return the data
    data = list(find_operation)
    
    # Close the connection
    client.close()
    
    return data

In [1]:
# API KEY를 환경변수로 관리하기 위한 설정 파일
from dotenv import load_dotenv

# API KEY 정보로드
load_dotenv()

True

In [39]:

## look for relevant parts in pdfs

PDF_loader = PyPDFLoader("./si_validation_story/resources/docs/cherry_comliance.pdf")


In [41]:
FILE_PATH = "./si_validation_story/resources/docs/cherry_comliance.pdf"

In [42]:
def show_metadata(docs):
    if docs:
        print("[metadata]")
        print(list(docs[0].metadata.keys()))
        print("\n[examples]")
        max_key_length = max(len(k) for k in docs[0].metadata.keys())
        for k, v in docs[0].metadata.items():
            print(f"{k:<{max_key_length}} : {v}")

## PyPDF

여기에서는 `pypdf`를 사용하여 PDF를 문서 배열로 로드하며, 각 문서는 `page` 번호와 함께 페이지 내용 및 메타데이터를 포함합니다.

In [43]:
# 설치
!pip install -qU pypdf

In [44]:
from langchain_community.document_loaders import PyPDFLoader

# 파일 경로 설정
loader = PyPDFLoader(FILE_PATH)

# PDF 로더 초기화
docs = loader.load()

# 문서의 내용 출력
print(docs[10].page_content[:300])

• Currency must be specified (preferably in USD or AED)  
• Indicate whether charges are prepaid or collect  
• For "Freight Collect" shipments, obtain prior approval from CHERRY's UAE 
office  
2. Operational Requirements  
2.1 Payment Terms  
• Freight collect shipments:  
o Require prior approval


In [45]:
# 메타데이터 출력
show_metadata(docs)

[metadata]
['source', 'page']

[examples]
source : ./si_validation_story/resources/docs/cherry_comliance.pdf
page   : 0


### PyPDF(OCR)

일부 PDF에는 스캔된 문서나 그림 내에 텍스트 이미지가 포함되어 있습니다. `rapidocr-onnxruntime` 패키지를 사용하여 이미지에서 텍스트를 추출할 수도 있습니다.

In [46]:
# 설치
!pip install -qU rapidocr-onnxruntime

In [47]:
# PDF 로더 초기화, 이미지 추출 옵션 활성화
loader = PyPDFLoader(FILE_PATH, extract_images=False)

# PDF 페이지 로드
docs = loader.load()

# 페이지 내용 접근
print(docs[10].page_content[:300])

• Currency must be specified (preferably in USD or AED)  
• Indicate whether charges are prepaid or collect  
• For "Freight Collect" shipments, obtain prior approval from CHERRY's UAE 
office  
2. Operational Requirements  
2.1 Payment Terms  
• Freight collect shipments:  
o Require prior approval


In [48]:
show_metadata(docs)

[metadata]
['source', 'page']

[examples]
source : ./si_validation_story/resources/docs/cherry_comliance.pdf
page   : 0


## PyMuPDF

**PyMuPDF** 는 속도 최적화가 되어 있으며, PDF 및 해당 페이지에 대한 자세한 메타데이터를 포함하고 있습니다. 페이지 당 하나의 문서를 반환합니다:

In [49]:
# 설치
!pip install -qU pymupdf

In [50]:
from langchain_community.document_loaders import PyMuPDFLoader

# PyMuPDF 로더 인스턴스 생성
loader = PyMuPDFLoader(FILE_PATH)

# 문서 로드
docs = loader.load()

# 문서의 내용 출력
print(docs[10].page_content[:300])

• 
Currency must be specified (preferably in USD or AED) 
• 
Indicate whether charges are prepaid or collect 
• 
For "Freight Collect" shipments, obtain prior approval from CHERRY's UAE 
office 
2. Operational Requirements 
2.1 Payment Terms 
• 
Freight collect shipments:  
o Require prior approval 


In [51]:
show_metadata(docs)

[metadata]
['source', 'file_path', 'page', 'total_pages', 'format', 'title', 'author', 'subject', 'keywords', 'creator', 'producer', 'creationDate', 'modDate', 'trapped']

[examples]
source       : ./si_validation_story/resources/docs/cherry_comliance.pdf
file_path    : ./si_validation_story/resources/docs/cherry_comliance.pdf
page         : 0
total_pages  : 379
format       : PDF 1.7
title        : 
author       : 윤경종(KJ Yoon)
subject      : 
keywords     : 
creator      : Microsoft Word
producer     : 
creationDate : D:20241003222425+00'00'
modDate      : D:20241003222425+00'00'
trapped      : 


In [56]:
# !pip install langchain-ai21

In [57]:
import os

os.environ["AI21_API_KEY"] = 'RNDF2u7swmF81gXDMlvmKoR665WiRGoR'

In [65]:
DATA = [doc.page_content for doc in docs]

In [66]:
DATA

['CHERRY Shipping Line Company Policy\r\nCHERRY Shipping Line - Comprehensive Company Policy\r\nCHERRY Shipping Line: UAE - Requirements and Restrictions\r\nCHERRY Shipping Line: United States of America (USA) - Requirements and Restrictions\r\nCHERRY Shipping Line: Qatar - Requirements and Restrictions\r\nCHERRY Shipping Line: Saudi Arabia - Requirements and Restrictions\r\nCHERRY Shipping Line: Jordan - Requirements and Restrictions\r\nCHERRY Shipping Line: Belgium - Requirements and Restrictions \r\nCHERRY Shipping Line: Canada - Requirements and Restrictions\r\nCHERRY Shipping Line: Germany - Requirements and Restrictions\r\nCHERRY Shipping Line: Netherlands – Requirements and Restrictions\r\nCHERRY Shipping Line: Australia - Requirements and Restrictions\r\nCHERRY Shipping Line: Singapore - Requirements and Restrictions\r\nCHERRY Shipping Line: Japan - Requirements and Restrictions\r\nCHERRY Shipping Line: China - Requirements and Restrictions\r\nCHERRY Shipping Line: United Kingd

In [71]:
from langchain_ai21 import AI21SemanticTextSplitter

# TEXT = DATA

semantic_text_splitter = AI21SemanticTextSplitter()
texts = DATA
documents = semantic_text_splitter.create_documents(
    texts=texts
)

print(f"The text has been split into {len(documents)} Documents.")
for doc in documents:
    print(f"metadata: {doc.metadata}")
    print(f"text: {doc.page_content}")
    print("====")

The text has been split into 523 Documents.
metadata: {'source_type': 'normal_text'}
text: CHERRY Shipping Line Company Policy
CHERRY Shipping Line - Comprehensive Company Policy
CHERRY Shipping Line: UAE - Requirements and Restrictions
CHERRY Shipping Line: United States of America (USA) - Requirements and Restrictions
CHERRY Shipping Line: Qatar - Requirements and Restrictions
CHERRY Shipping Line: Saudi Arabia - Requirements and Restrictions
CHERRY Shipping Line: Jordan - Requirements and Restrictions
CHERRY Shipping Line: Belgium - Requirements and Restrictions 
CHERRY Shipping Line: Canada - Requirements and Restrictions
CHERRY Shipping Line: Germany - Requirements and Restrictions
CHERRY Shipping Line: Netherlands – Requirements and Restrictions
CHERRY Shipping Line: Australia - Requirements and Restrictions
CHERRY Shipping Line: Singapore - Requirements and Restrictions
CHERRY Shipping Line: Japan - Requirements and Restrictions
CHERRY Shipping Line: China - Requirements and Res

In [73]:
# 단계 3: 임베딩(Embedding) 생성
embeddings = OpenAIEmbeddings()

In [74]:
# 단계 4: DB 생성(Create DB) 및 저장
# 벡터스토어를 생성합니다.
vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)

In [94]:
vectorstore.save_local("faiss_index_semantic")

In [75]:
for doc in vectorstore.similarity_search("consignee"):
    print(doc.page_content)

The Consignee or Notify or payer collect must sign a confirmation of taking over costs 
for all additional costs incurred and the match code must have an AR-Account with credit 
for collect shipments
CHERRY reserves the right to refuse such shipments without proper arrangements.
The Consignee or Notify or payer collect must sign a confirmation of taking over costs 
for all additional costs incurred and the match code must have an AR-Account with credit 
for collect shipments
3.2 Dangerous Goods
• Acceptance subject to IMDG regulations and CHERRY's internal policies
• Advance booking and approval required
• Proper documentation and labeling mandatory
3.3 Temperature-Controlled Cargo
• Temperature settings must be clearly specified on booking and B/L
• Special requirements may apply for certain commodities
3.4 Out of Gauge (OOG) Cargo
• Subject to special approval and may incur additional charges
• Detailed dimensions and weight distribution must be provided at time of booking
4. Customs

In [36]:
# 단계 5: 검색기(Retriever) 생성
# 문서에 포함되어 있는 정보를 검색하고 생성합니다.
retriever = vectorstore.as_retriever()

In [76]:
# 검색기에 쿼리를 날려 검색된 chunk 결과를 확인합니다.
retriever.invoke("중국 consignee의 필수정보는 무엇인가?")

[Document(metadata={'CHERRY Shipping Line': 'Comprehensive Company Policy', 'source_type': 'normal_text'}, page_content='1.3 Consignee Information\r\n- Full name and complete address are mandatory.\n\n- Contact information (phone and email) must be porvided.\n\n- For “To Order” B/Ls, the notify party must be a company located in the \r\ndestination country.\n\n- "To Order" Consignee:\r\n - Notify party must include CNPJ number.\n\n- Format: "TO ORDER OF [Bank Name]" in consignee field\r\n - Example: "TO ORDER OF BANCO DO BRASIL S.A."\r\n - Consignee must have a physical address in Brazil.\n\n- Format: "Company Name, Street Address, City, State, ZIP Code, Brazil"\r\n - Example: "ABC Importações Ltda., Av.\n\nPaulista 1000, São Paulo, SP, 01310-100, \r\nBrazil"\r\n - For Port of Discharging, China: \r\n USCI (Unified Social Credit Identifier) or OC (Organizing institution bar Code) \r\nshall be written down on Bill of Lading.'),
 Document(metadata={'CHERRY Shipping Line': 'Comprehensive 

In [78]:
data = fetch_data_from_mongodb(collection_name, {"bookingReference": "CHERRY20240911091202"})
data

[{'_id': ObjectId('66e1562f0abd52a6a79a3250'),
  'bookingReference': 'CHERRY20240911091202',
  'voyageDetails': {'vesselName': 'ARONIA 3',
   'voyageNumber': '2024041',
   'bound': 'E'},
  'routeDetails': {'placeOfReceipt': 'BUSAN, KOREA',
   'portOfLoading': 'BUSAN, KOREA',
   'portOfDischarge': 'VLADIVOSTOK, RUSSIA',
   'placeOfDelivery': 'VLADIVOSTOK, RUSSIA',
   'finalDestination': ''},
  'paymentDetails': {'freightPaymentTerms': 'PREPAID',
   'freightPayableAt': 'BUSAN, KOREA'},
  'documentationDetails': {'blType': 'SURRENDER',
   'numberOfOriginalBLs': 0,
   'numberOfCopies': 0},
  'partyDetails': {'shipper': {'name': 'LX PANTOS CO., LTD.',
    'onBehalfOf': 'LG H&H CO., LTD',
    'address': 'LG GWANGHWAMOON BUILDING, 92 SINMUNNO 2-GA, JONGNO-GU, SEOUL 03184, SOUTH KOREA',
    'telephone': '+82-2-3500-0001',
    'fax': '+82-2-3500-1000'},
   'consignee': {'name': 'LLC IC GA GROUP',
    'address': '62 SACCO AND VANZETTI STR. OFFICE 703, 620014,, EKATERINBURG RUSSIA',
    'companyN

In [85]:
prompt = """
You are tasked with verifying Company Compliance based on the provided SI information. 
The compliance must be validated according to the company policies listed in the provided sources. 
Your goal is to identify any compliance violations or discrepancies and output them in the following format:

- Company Policy -
1. [Compliance Issue] (Source [x] page [y])
2. [Compliance Issue] (Source [x] page [y])
3. [Compliance Issue] (Source [x] page [y])
...
[Source]
Source [x]: [Source Title] Page [y], Chapter [z] [Additional details if necessary, e.g., URL]

**SI Information:**
{question}

**Company Policy Sources:**
{context}
"""

In [86]:
# 단계 6: 프롬프트 생성(Create Prompt)
# 프롬프트를 생성합니다.
prompt = PromptTemplate.from_template(prompt)

In [87]:
# 단계 7: 언어모델(LLM) 생성
# 모델(LLM) 을 생성합니다.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

In [88]:
# 단계 8: 체인(Chain) 생성
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [92]:
data = """
[{'_id': ObjectId('66e1562f0abd52a6a79a3250'),
  'bookingReference': 'CHERRY20240911091202',
  'voyageDetails': {'vesselName': 'ARONIA 3',
   'voyageNumber': '2024041',
   'bound': 'E'},
  'routeDetails': {'placeOfReceipt': 'BUSAN, KOREA',
   'portOfLoading': 'BUSAN, KOREA',
   'portOfDischarge': 'VLADIVOSTOK, RUSSIA',
   'placeOfDelivery': 'VLADIVOSTOK, RUSSIA',
   'finalDestination': ''},
  'paymentDetails': {'freightPaymentTerms': 'PREPAID',
   'freightPayableAt': 'BUSAN, KOREA'},
  'documentationDetails': {'blType': 'SURRENDER',
   'numberOfOriginalBLs': 0,
   'numberOfCopies': 0},
  'partyDetails': {'shipper': {'name': 'LX PANTOS CO., LTD.',
    'onBehalfOf': 'LG H&H CO., LTD',
    'address': 'LG GWANGHWAMOON BUILDING, 92 SINMUNNO 2-GA, JONGNO-GU, SEOUL 03184, SOUTH KOREA',
    'telephone': '+82-2-3500-0001',
    'fax': '+82-2-3500-1000'},
   'consignee': {'name': 'LLC IC GA GROUP',
    'address': '62 SACCO AND VANZETTI STR. OFFICE 703, 620014,, EKATERINBURG RUSSIA',
    'companyNumber': '1126761020035'},
   'notifyParty': {'name': 'CJSC FM LOGISTICS CUSTOMS',
    'address': 'PAVELTSEVO DISTRICT NOVOE SHOSSE 34 BUILDING 4141720 DOLGOPRUDNIY RUSSIA'}},
  'shippingTerm': 'CY / CY',
  'hsCode': '330420',
  'commodityDescription': 'COSMETICS',
  'containers': [{'containerNumber': 'CRLU9908625',
    'sealNumber': '892083',
    'marksAndNumbers': 'NO MARK',
    'numberOfPackages': 10,
    'packageType': 'PALLETS',
    'cargoDescription': "SHIPPER'S LOAD, COUNT & WEIGHT, SOTW & SEAL SAID TO CONTAIN: COSMETICS",
    'grossWeight': 4829.0,
    'measurement': 12.0,
    'additionalInfo': 'COC : THE GROSS WEIGHT WITHOUT THE CONTAINER WEIGHT. THE CONTAINER IS PROPERTY OF CHERRY.'}],
  'totalShipment': {'totalContainers': 'ONE (45HG X1) CONTAINER ONLY',
   'totalPackages': 10,
   'packageType': 'PALLETS',
   'containerType': '45RHX1',
   'totalGrossWeight': 4829.0,
   'totalMeasurement': 12.0},
  'reeferSettings': {'containerNumber': 'CRLU9908625',
   'temperature': '+5.0ºC',
   'minTemperature': '+2.0°C',
   'maxTemperature': '+7.0°C',
   'ventilation': 'Closed',
   'humidity': '50% ~ 60%'},
  'additionalInformation': {'lcDetails': {'lcNumber': '0000101033738'},
   'certificateDetails': ['4112720070177X', '4112720070178X'],
   'originalBLDistribution': {'name': 'LX PANTOS CO., LTD.',
    'address': 'SAEMUNAN-RO 58, JONGNO-GU, SEOUL 03184, SOUTH KOREA',
    'telephone': '+82-2-3700-2110',
    'fax': '+82-2-3700-2000'},
   'originalInvoiceDistribution': {'name': 'LX PANTOS JAPAN INC.',
    'address': '25F TOKYO SANKEI BUILDING, 1-7-2, OTEMACHI, CHIYODA-KU, TKY 100-0004, JAPAN',
    'telephone': '03600061234',
    'fax': '81-3-6000-0001'},
   'onboardDate': 'OCT. 04. 2024',
   'additionalRemarks': 'Reefer Container, Item: COSMETICS, Temperature: +5°C, Value: USD50,000'}}]
"""

In [93]:
# 체인 실행(Run Chain)
# 문서에 대한 질의를 입력하고, 답변을 출력합니다.
question = data
response = chain.invoke(question)
print(response)

- Company Policy -
1. **Documentation Issue**: The shipment documentation indicates that there are 0 original Bills of Lading (B/Ls) and 0 copies, which violates the requirement of a minimum of three original copies for shipments (Source 1, page 1.1).
2. **Notify Party Issue**: The notify party, CJSC FM LOGISTICS CUSTOMS, is located in Russia, which is acceptable; however, the policy states that the notify party must be a company located in the destination country for certain shipments. This needs to be verified against the specific destination requirements (Source 4, page 1.4).
3. **Shipper Details Issue**: The shipper's contact information includes a telephone number but lacks an email address, which is required according to the policy (Source 2, page 1.2).
4. **Consignee Information Issue**: The consignee's details include a company number but do not provide a contact email, which is mandatory (Source 3, page 1.3).

[Source]
Source 1: CHERRY Shipping Line Comprehensive Company Polic