In [2]:
import os
from pymongo import MongoClient
from typing import List, Dict

# MongoDB connection settings, read from the process environment when this cell runs.
# NOTE(review): load_dotenv() is only called in a later cell, so values that exist
# solely in a .env file are still missing (None) at this point.
MONGO_URI = os.environ.get("MONGODB_URI")
DB_NAME = os.environ.get("MONGODB_DB_NAME")
collection_name = "si"

# LangChain tracing flag and project name, plus the USER_AGENT string, exported
# through the process environment in one step.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": "containergenie.ai",
        "USER_AGENT": "chapter2-1",
    }
)

####################################################################################

from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.tools.retriever import create_retriever_tool
from langchain.agents import create_openai_functions_agent, AgentExecutor

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

###################################################################################

# block included to check whether the whole chain works out or not
def fetch_data_from_mongodb(collection_name: str, query: Dict = None, limit: int = None) -> List[Dict]:
    """Fetch documents from a MongoDB collection and return them as a list.

    Connection settings come from the module-level ``MONGO_URI`` / ``DB_NAME``.
    Those are captured when the first cell runs, which is before ``load_dotenv()``
    is called, so when either is empty the ``MONGODB_URI`` / ``MONGODB_DB_NAME``
    environment variables are re-read at call time.

    Args:
        collection_name: Name of the collection to read from.
        query: MongoDB filter document; ``None`` or an empty dict matches every
            document.
        limit: Maximum number of documents to return. ``None`` returns every
            match (PyMongo also treats ``0`` as "no limit").

    Returns:
        The matching documents, fully materialised into a list.
    """
    uri = MONGO_URI or os.getenv("MONGODB_URI")
    db_name = DB_NAME or os.getenv("MONGODB_DB_NAME")

    client = MongoClient(uri)
    try:
        cursor = client[db_name][collection_name].find(query or {})
        if limit is not None:
            cursor = cursor.limit(limit)
        # Materialise inside the try block: the cursor cannot be read once the
        # client has been closed.
        return list(cursor)
    finally:
        # Always release the connection, even when the query raises.
        client.close()

In [3]:
# Settings file used to manage API keys as environment variables
from dotenv import load_dotenv

# Load the API key information
load_dotenv()

True

In [4]:

## look for relevant parts in pdfs
# NOTE(review): PDF_loader is not referenced again in the visible cells; the later
# loader cells build their own loaders from FILE_PATH, which holds the same path.

PDF_loader = PyPDFLoader("./si_validation_story/resources/docs/cherry_comliance.pdf")


In [5]:
# Path of the PDF that the loader cells below read.
FILE_PATH = "./si_validation_story/resources/docs/cherry_comliance.pdf"

In [6]:
def show_metadata(docs):
    """Print the metadata keys and key/value pairs of the first document.

    Args:
        docs: Sequence of objects exposing a ``metadata`` mapping. Only
            ``docs[0]`` is inspected.

    Nothing is printed when ``docs`` is empty.
    """
    if not docs:
        return

    metadata = docs[0].metadata
    print("[metadata]")
    print(list(metadata.keys()))
    print("\n[examples]")
    # default=0 keeps an empty metadata mapping from raising ValueError in max().
    max_key_length = max((len(k) for k in metadata.keys()), default=0)
    for k, v in metadata.items():
        # Left-align keys to the longest key so the values line up.
        print(f"{k:<{max_key_length}} : {v}")

## PyPDF

여기에서는 `pypdf`를 사용하여 PDF를 문서 배열로 로드하며, 각 문서는 `page` 번호와 함께 페이지 내용 및 메타데이터를 포함합니다.

In [7]:
# 설치
!pip install -qU pypdf

In [8]:
from langchain_community.document_loaders import PyPDFLoader

# Create a PyPDF loader for the file at FILE_PATH
loader = PyPDFLoader(FILE_PATH)

# Load the PDF into a list of documents
docs = loader.load()

# Print the first 300 characters of the document at index 10
print(docs[10].page_content[:300])

• Currency must be specified (preferably in USD or AED)  
• Indicate whether charges are prepaid or collect  
• For "Freight Collect" shipments, obtain prior approval from CHERRY's UAE 
office  
2. Operational Requirements  
2.1 Payment Terms  
• Freight collect shipments:  
o Require prior approval


In [9]:
# Print the metadata of the first loaded document
show_metadata(docs)

[metadata]
['source', 'page']

[examples]
source : ./si_validation_story/resources/docs/cherry_comliance.pdf
page   : 0


### PyPDF(OCR)

일부 PDF에는 스캔된 문서나 그림 내에 텍스트 이미지가 포함되어 있습니다. `rapidocr-onnxruntime` 패키지를 사용하여 이미지에서 텍스트를 추출할 수도 있습니다.

In [10]:
# 설치
!pip install -qU rapidocr-onnxruntime

In [11]:
# Initialise the PDF loader.
# NOTE(review): this section describes extracting text from images, but
# extract_images=False is passed here, so image extraction is NOT enabled in this
# run. Pass extract_images=True if that is what is intended.
loader = PyPDFLoader(FILE_PATH, extract_images=False)

# Load the PDF pages
docs = loader.load()

# Access the page content: print the first 300 characters of the document at index 10
print(docs[10].page_content[:300])

• Currency must be specified (preferably in USD or AED)  
• Indicate whether charges are prepaid or collect  
• For "Freight Collect" shipments, obtain prior approval from CHERRY's UAE 
office  
2. Operational Requirements  
2.1 Payment Terms  
• Freight collect shipments:  
o Require prior approval


In [12]:
# Print the metadata of the first document returned by the loader cell above
show_metadata(docs)

[metadata]
['source', 'page']

[examples]
source : ./si_validation_story/resources/docs/cherry_comliance.pdf
page   : 0


## PyMuPDF

**PyMuPDF** 는 속도 최적화가 되어 있으며, PDF 및 해당 페이지에 대한 자세한 메타데이터를 포함하고 있습니다. 페이지 당 하나의 문서를 반환합니다:

In [13]:
# 설치
!pip install -qU pymupdf

In [14]:
from langchain_community.document_loaders import PyMuPDFLoader

# Create a PyMuPDF loader instance for the file at FILE_PATH
loader = PyMuPDFLoader(FILE_PATH)

# Load the documents
docs = loader.load()

# Print the first 300 characters of the document at index 10
print(docs[10].page_content[:300])

• 
Currency must be specified (preferably in USD or AED) 
• 
Indicate whether charges are prepaid or collect 
• 
For "Freight Collect" shipments, obtain prior approval from CHERRY's UAE 
office 
2. Operational Requirements 
2.1 Payment Terms 
• 
Freight collect shipments:  
o Require prior approval 


In [15]:
# Print the metadata of the first document returned by the PyMuPDF loader cell above
show_metadata(docs)

[metadata]
['source', 'file_path', 'page', 'total_pages', 'format', 'title', 'author', 'subject', 'keywords', 'creator', 'producer', 'creationDate', 'modDate', 'trapped']

[examples]
source       : ./si_validation_story/resources/docs/cherry_comliance.pdf
file_path    : ./si_validation_story/resources/docs/cherry_comliance.pdf
page         : 0
total_pages  : 379
format       : PDF 1.7
title        : 
author       : 윤경종(KJ Yoon)
subject      : 
keywords     : 
creator      : Microsoft Word
producer     : 
creationDate : D:20241003222425+00'00'
modDate      : D:20241003222425+00'00'
trapped      : 


In [16]:
# !pip install langchain-ai21

In [17]:
import os
import warnings

# Credentials must not be hard-coded in the notebook. The key is expected to come
# from the environment (for example from the .env file read by load_dotenv()).
# NOTE(review): the literal key that was previously committed in this cell should
# be treated as leaked and rotated with the provider.
if not os.environ.get("AI21_API_KEY"):
    warnings.warn(
        "AI21_API_KEY is not set; define it in the environment or in .env "
        "before using any AI21 component."
    )

In [65]:
# Collect the text content of every loaded document into a plain list of strings
DATA = [doc.page_content for doc in docs]

In [18]:
# Show only a short preview instead of dumping the text of every page.
# NOTE(review): the NameError recorded under this cell comes from running it before
# the cell that defines DATA (execution counts 18 vs 65); re-run the notebook in order.
DATA[:3]

NameError: name 'DATA' is not defined

In [19]:
# from langchain_ai21 import AI21SemanticTextSplitter

# # TEXT = DATA

# semantic_text_splitter = AI21SemanticTextSplitter()
# texts = DATA
# documents = semantic_text_splitter.create_documents(
#     texts=texts
# )

# print(f"The text has been split into {len(documents)} Documents.")
# for doc in documents:
#     print(f"metadata: {doc.metadata}")
#     print(f"text: {doc.page_content}")
#     print("====")

In [20]:
# 단계 3: 임베딩(Embedding) 생성
# embeddings = OpenAIEmbeddings()

In [21]:
# 단계 4: DB 생성(Create DB) 및 저장
# 벡터스토어를 생성합니다.
# vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)

In [22]:
# vectorstore.save_local("faiss_index_semantic")

In [40]:
# Load a previously saved FAISS index from the local "faiss_index" directory.
# SECURITY NOTE(review): allow_dangerous_deserialization=True opts in to loading
# pickled data from that directory; only do this with index files you created
# yourself, never with files from an untrusted source.
vectorstore = FAISS.load_local("faiss_index", OpenAIEmbeddings(), allow_dangerous_deserialization=True)

In [41]:
# Print the text of each chunk the vector store returns for the query "consignee"
for doc in vectorstore.similarity_search("consignee"):
    print(doc.page_content)

consignee 1.5 Cargo Description • Detailed and accurate description of goods is mandatory • Vague descriptions are not acceptable • Must include:  o Precise and accurate commodity description o Number of packages o Type of packages (e.g., cartons, pallets, drums) • Prohibited terms:  o "Said to Contain"
consignee 1.5 Cargo Description • Detailed and accurate description of goods is mandatory • Vague descriptions are not acceptable • Must include:  o Precise and accurate commodity description o Number of packages o Type of packages (e.g., pallets, cartons, drums) • Prohibited terms:  o "Said to Contain"
• "SAME AS CONSIGNEE" is acceptable 1.5 Cargo Description • Detailed and accurate description of goods is mandatory • Generic terms like "Consolidated Cargo" or "Said to Contain" are not accepted • Must include:  o Precise commodity description o Number of packages o Type of packages (e.g., pallets, cartons, drums) 1.6 Harmonized System (HS) Codes • Mandatory • Codes used according to th

In [42]:
# Step 5: create the retriever
# It is used to look up information contained in the indexed documents.
retriever = vectorstore.as_retriever()

In [43]:
# Send a query to the retriever and inspect the retrieved chunks.
# (Query text: "What is the mandatory information for a Chinese consignee?")
retriever.invoke("중국 consignee의 필수정보는 무엇인가?")

[Document(metadata={'source': './si_validation_story/resources/docs/cherry_compliance.pdf', 'page': 113}, page_content='Number required for Korea-based consignees • It is required to have an address in Korea either for Consignee or Notify Party 1.4 Notify Party Details • Full name and complete address required'),
 Document(metadata={'source': './si_validation_story/resources/docs/cherry_compliance.pdf', 'page': 60}, page_content='• Unified Social Credit Code required for China-based shippers 1.3 Consignee Details • Full name and complete address required • Contact information:  o Phone number (mandatory) o Email address (mandatory) • Unified Social Credit Code required for China-based consignees • For personal imports, full name and ID number required 1.4 Notify Party Details • Full name and complete address required • Contact information:  o Phone number (mandatory) o Email address (mandatory) 1.5 Cargo Description • Detailed and accurate description of goods is mandatory in both Engl

In [44]:
# Send a query to the retriever and inspect the retrieved chunks.
# (Query text: "What is the mandatory information for a ship going to Russia?")
retriever.invoke("러시아로 가는 배의 필수정보는 무엇인가?")

[Document(metadata={'source': './si_validation_story/resources/docs/cherry_compliance.pdf', 'page': 160}, page_content='updated: [Insert date] 10. Key Contacts • Federal Customs Service of Russia: www.customs.gov.ru • Ministry of Transport of the Russian Federation: www.mintrans.gov.ru • Russian Maritime Register of Shipping: www.rs-class.org • Federal Service for Veterinary and Phytosanitary Surveillance: www.fsvps.gov.ru • Russian Union of Industrialists and Entrepreneurs: www.rspp.ru  Ñ CHERRY Shipping Line:  Turkey - Requirements and Restrictions 1. Documentation Requirements 1.1 Bill of Lading (B/L) Types • Original Bill of Lading (OBL) • Sea Waybill (SWB) • Electronic Bill of Lading (e-B/L) available upon request'),
 Document(metadata={'source': './si_validation_story/resources/docs/cherry_compliance.pdf', 'page': 155}, page_content='o Dangerous Goods Declaration (in Russian and English) o Safety Data Sheet (SDS) in Russian and English • Proper UN packaging and labeling mandatory

In [45]:
# Fetch the record(s) with this booking reference from the `collection_name`
# collection and display the result.
data = fetch_data_from_mongodb(collection_name, {"bookingReference": "CHERRY20240911091202"})
data

[{'_id': ObjectId('66e1562f0abd52a6a79a3250'),
  'bookingReference': 'CHERRY20240911091202',
  'voyageDetails': {'vesselName': 'ARONIA 3',
   'voyageNumber': '2024041',
   'bound': 'E'},
  'routeDetails': {'placeOfReceipt': 'BUSAN, KOREA',
   'portOfLoading': 'BUSAN, KOREA',
   'portOfDischarge': 'VLADIVOSTOK, RUSSIA',
   'placeOfDelivery': 'VLADIVOSTOK, RUSSIA',
   'finalDestination': ''},
  'paymentDetails': {'freightPaymentTerms': 'PREPAID',
   'freightPayableAt': 'BUSAN, KOREA'},
  'documentationDetails': {'blType': 'SURRENDER',
   'numberOfOriginalBLs': 0,
   'numberOfCopies': 0},
  'partyDetails': {'shipper': {'name': 'LX PANTOS CO., LTD.',
    'onBehalfOf': 'LG H&H CO., LTD',
    'address': 'LG GWANGHWAMOON BUILDING, 92 SINMUNNO 2-GA, JONGNO-GU, SEOUL 03184, SOUTH KOREA',
    'telephone': '+82-2-3500-0001',
    'fax': '+82-2-3500-1000'},
   'consignee': {'name': 'LLC IC GA GROUP',
    'address': '62 SACCO AND VANZETTI STR. OFFICE 703, 620014,, EKATERINBURG RUSSIA',
    'companyN

In [46]:
# Prompt template text for the compliance check. It contains two placeholders:
# {question} receives the SI information and {context} receives the company policy
# sources.
prompt = """
You are tasked with verifying Company Compliance based on the provided SI information. 
The compliance must be validated according to the company policies listed in the provided sources. 
Your goal is to identify any compliance violations or discrepancies and output them in the following format:

- Company Policy -
1. [Compliance Issue] (Source [x] page [y])
2. [Compliance Issue] (Source [x] page [y])
3. [Compliance Issue] (Source [x] page [y])
...
[Source]
Source [x]: [Source Title] Page [y], Chapter [z] [Additional details if necessary, e.g., URL]

**SI Information:**
{question}

**Company Policy Sources:**
{context}
"""

In [47]:
# Step 6: create the prompt
# Wrap the template text in a PromptTemplate. `prompt` is rebound from str to
# PromptTemplate here, so the isinstance guard keeps this cell idempotent:
# re-running it no longer passes a PromptTemplate back into from_template().
if isinstance(prompt, str):
    prompt = PromptTemplate.from_template(prompt)

In [48]:
# Step 7: create the language model (LLM)
# Instantiate the chat model.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

In [49]:
# Step 8: create the chain
# The value given to chain.invoke() is sent to the retriever (its result fills
# {context}) and is also passed through unchanged as {question}. The filled prompt
# then goes to the LLM, and the reply is parsed into a plain string.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [54]:
# Hard-coded text copy of an SI record, used as the input of the chain.
# NOTE(review): this rebinds `data`, which an earlier cell assigned to the list
# returned by fetch_data_from_mongodb(); from here on `data` is a str.
data = """
[{'_id': ObjectId('66e1562f0abd52a6a79a3250'),
  'bookingReference': 'CHERRY20240911091202',
  'voyageDetails': {'vesselName': 'ARONIA 3',
   'voyageNumber': '2024041',
   'bound': 'E'},
  'routeDetails': {'placeOfReceipt': 'BUSAN, KOREA',
   'portOfLoading': 'BUSAN, KOREA',
   'portOfDischarge': 'VLADIVOSTOK, RUSSIA',
   'placeOfDelivery': 'VLADIVOSTOK, RUSSIA',
   'finalDestination': ''},
  'paymentDetails': {'freightPaymentTerms': 'PREPAID',
   'freightPayableAt': 'BUSAN, KOREA'},
  'documentationDetails': {'blType': 'SURRENDER',
   'numberOfOriginalBLs': 0,
   'numberOfCopies': 0},
  'partyDetails': {'shipper': {'name': 'LX PANTOS CO., LTD.',
    'onBehalfOf': 'LG H&H CO., LTD',
    'address': 'LG GWANGHWAMOON BUILDING, 92 SINMUNNO 2-GA, JONGNO-GU, SEOUL 03184, SOUTH KOREA',
    'telephone': '+82-2-3500-0001',
    'fax': '+82-2-3500-1000'},
   'consignee': {'name': 'LLC IC GA GROUP',
    'address': '62 SACCO AND VANZETTI STR. OFFICE 703, 620014,, EKATERINBURG RUSSIA',
    'companyNumber': '1126761020035'},
   'notifyParty': {'name': 'CJSC FM LOGISTICS CUSTOMS',
    'address': 'PAVELTSEVO DISTRICT NOVOE SHOSSE 34 BUILDING 4141720 DOLGOPRUDNIY RUSSIA'}},
  'shippingTerm': 'CY / CY',
  'hsCode': '330420',
  'commodityDescription': 'COSMETICS',
  'containers': [{'containerNumber': 'CRLU9908625',
    'sealNumber': '892083',
    'marksAndNumbers': 'NO MARK',
    'numberOfPackages': 10,
    'packageType': 'PALLETS',
    'cargoDescription': "SHIPPER'S LOAD, COUNT & WEIGHT, SOTW & SEAL SAID TO CONTAIN: COSMETICS",
    'grossWeight': 4829.0,
    'measurement': 12.0,
    'additionalInfo': 'COC : THE GROSS WEIGHT WITHOUT THE CONTAINER WEIGHT. THE CONTAINER IS PROPERTY OF CHERRY.'}],
  'totalShipment': {'totalContainers': 'ONE (45HG X1) CONTAINER ONLY',
   'totalPackages': 10,
   'packageType': 'PALLETS',
   'containerType': '45RHX1',
   'totalGrossWeight': 4829.0,
   'totalMeasurement': 12.0},
  'reeferSettings': {'containerNumber': 'CRLU9908625',
   'temperature': '+5.0ºC',
   'minTemperature': '+2.0°C',
   'maxTemperature': '+7.0°C',
   'ventilation': 'Closed',
   'humidity': '50% ~ 60%'},
  'additionalInformation': {'lcDetails': {'lcNumber': '0000101033738'},
   'certificateDetails': ['4112720070177X', '4112720070178X'],
   'originalBLDistribution': {'name': 'LX PANTOS CO., LTD.',
    'address': 'SAEMUNAN-RO 58, JONGNO-GU, SEOUL 03184, SOUTH KOREA',
    'telephone': '+82-2-3700-2110',
    'fax': '+82-2-3700-2000'},
   'originalInvoiceDistribution': {'name': 'LX PANTOS JAPAN INC.',
    'address': '25F TOKYO SANKEI BUILDING, 1-7-2, OTEMACHI, CHIYODA-KU, TKY 100-0004, JAPAN',
    'telephone': '03600061234',
    'fax': '81-3-6000-0001'},
   'onboardDate': 'OCT. 04. 2024',
   'additionalRemarks': 'Reefer Container, Item: COSMETICS, Temperature: +5°C, Value: USD50,000'}}]
"""

In [55]:
# data = """
# ### **Booking Reference & Voyage Details**
# - **Booking Reference**: CHERRY20240911091202
# - **Vessel Name**: ARONIA 3
# - **Voyage Number**: 2024041
# - **Bound**: East (E)

# ### **Route Details**
# - **Place of Receipt**: Busan, Korea
# - **Port of Loading**: Busan, Korea
# - **Port of Discharge**: Vladivostok, Russia
# - **Place of Delivery**: Vladivostok, Russia

# ### **Payment Details**
# - **Freight Payment Terms**: Prepaid
# - **Freight Payable At**: Busan, Korea

# ### **Documentation Details**
# - **BL Type**: Surrender
# - **Number of Original BLs**: 0
# - **Number of Copies**: 0

# ### **Party Details**
# - **Shipper**: LX PANTOS CO., LTD. (on behalf of LG H&H CO., LTD.)
# - **Consignee**: LLC IC GA GROUP (Russia)
# - **Notify Party**: CJSC FM LOGISTICS CUSTOMS (Russia)

# ### **Shipping Term**
# - **CY / CY**: Container Yard to Container Yard

# ### **Commodity & HS Code**
# - **HS Code**: 330420 (Cosmetics)
# - **Commodity Description**: Cosmetics

# ### **Container Details**
# - **Container Number**: CRLU9908625
# - **Seal Number**: 892083
# - **Marks & Numbers**: No mark
# - **Number of Packages**: 10
# - **Package Type**: Pallets
# - **Cargo Description**: "Shipper's Load, Count & Weight, SOTW & Seal Said to Contain: Cosmetics"
# - **Gross Weight**: 4829.0 kg
# - **Measurement**: 12.0 CBM

# ### **Total Shipment**
# - **Total Containers**: ONE (45HG X1) Container Only
# - **Total Packages**: 10 Pallets
# - **Total Gross Weight**: 4829.0 kg
# - **Total Measurement**: 12.0 CBM
# - **Container Type**: 45RHX1

# ### **Reefer Settings**
# - **Container Number**: CRLU9908625
# - **Temperature**: +5.0ºC
# - **Min Temperature**: +2.0°C
# - **Max Temperature**: +7.0°C
# - **Ventilation**: Closed
# - **Humidity**: 50% ~ 60%

# ### **Additional Information**
# - **LC Number**: 0000101033738
# - **Certificates**: 4112720070177X, 4112720070178X
# - **Original BL Distribution**: LX PANTOS CO., LTD.
# - **Original Invoice Distribution**: LX PANTOS JAPAN INC.
# - **Onboard Date**: October 4, 2024
# - **Additional Remarks**: Reefer Container, Cosmetics, Temperature +5°C, Value: USD50,000

# ### **Key Validation Targets**:
# - **Commodity (HS Code 330420)**: Cross-check for compliance with export/import restrictions.
# - **Gross Weight & Measurement**: Ensure compliance with one decimal digit policy (e.g., 4829.0 kg and 12.0 CBM).
# - **Number of Packages**: Validate against total shipment numbers to check for any discrepancies.
# - **Shipping Documents**: Ensure proper documentation, such as BL Type (Surrender), and check if the number of original BLs is correct.
# - **Temperature & Reefer Settings**: Verify whether the temperature and other reefer settings match the required conditions for the shipped goods (cosmetics).
# - **Container Details**: Ensure the container description and cargo details are correctly recorded according to company policies (e.g., no commercial value hidden in descriptions).

# """

In [56]:
# Run the chain
# Pass the SI text in `data` as the input and print the model's answer.
question = data
response = chain.invoke(question)
print(response)

- Company Policy -
1. **Missing Detailed Packing List**: The detailed and readable Packing List needs to be uploaded in the CHERRY system. (Source [1] page [31])
2. **No Original Bills of Lading**: The documentation indicates that there are no original Bills of Lading, which is against the requirement for Original Bill of Lading (OBL) to be surrendered to release cargo. (Source [4] page [235])
3. **Temperature-Controlled Cargo Compliance**: The temperature settings must be clearly specified on booking and B/L, and special requirements may apply for certain commodities. (Source [1] page [31])
4. **Confirmation of Costs**: The Consignee or Notify or payer collect must sign a confirmation of taking over costs for all additional costs incurred, which is not indicated in the provided SI information. (Source [1] page [31])

[Source]
Source [1]: CHERRY Compliance Document Page 31, Chapter 3.3
Source [2]: CHERRY Compliance Document Page 235, Chapter 1.1
Source [3]: CHERRY Compliance Document P