In [1]:
import os
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
#import dotenv
from tqdm import tqdm
from langchain_community.vectorstores import Chroma
from langchain.schema import Document

## To run Hugging Face OpenSource models
# Needs to manually install Visual C++ Tools from: https://visualstudio.microsoft.com/visual-cpp-build-tools/
from InstructorEmbedding import INSTRUCTOR
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import trange


In [2]:
import torch
import torch.nn as nn

# Check if CUDA is available
print(f"CUDA Available: {torch.cuda.is_available()}")

# Print CUDA device name
if torch.cuda.is_available():
    print(f"Device Name: {torch.cuda.get_device_name(0)}")

CUDA Available: True
Device Name: NVIDIA GeForce RTX 3050


In [3]:
# Ensure GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### 2.- Chunking
Is the process of breaking down a large input text into smaller pieces.
This ensures that the text fits the input size of the embedding model and improves retrieval efficiency.

In [4]:
#from llama_index.core import SimpleDirectoryReader
#WARNING!: When using this, any "page_content" furthewr instructon won't work.
## load data
#loader = SimpleDirectoryReader(
#            input_dir = 'Demanda',
#            required_exts=[".pdf"],
#            recursive=True
#        )
#docs = loader.load_data()


#loader = PyPDFLoader(r"Casa\Codigo_Civil_del_Estado_de_San_Luis_Potosi_16_May_2024_compressed.pdf")
#documents = loader.load()
#print(len(documents))

# Define the directory containing the PDF files
# Step 1: Load your CSV file
df = pd.read_csv('HS Code catalogue.csv')

# Split data and metadata
texts = df['description'].tolist()  # This is the text data that will be embedded
metadata = df['code'].tolist()  # This is the metadata that will be stored alongside the embeddings


# Define the persistent directory containing the VectorDB
persistent_dir= "VectorDB/HS_codes/model_with_HS_Code_catalogue"

# WARNING! :Only runs with this version
###### !pip install sentence-transformers==2.2.2  ######
#Define the sentence-transformer model:

#For English
#sentence-transformers/LaBSE
#embed_model = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = "sentence-transformers/all-mpnet-base-v2"

#For Spanish 
#projecte-aina/aguila-7b
#embed_model = "hiiamsid/sentence_similarity_spanish_es"

#Other sentence-transformer settings
model_kwargs = {'device': 'cuda:0'}  # specify GPU device
encode_kwargs = {'normalize_embeddings': True}

### 3.- Embedding

A technique for representing text data as numerical vectors, which can be input into machine learning models. The embedding model is responsible for converting text into these vectors.

In [5]:
#Only runs with this version
#!pip install sentence-transformers==2.2.2

hf_embed_model = HuggingFaceInstructEmbeddings(
    model_name=embed_model,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


load INSTRUCTOR_Transformer
max_seq_length  512


### 4.- Vector DB

A collection of pre-computed vector representations of text data for fast retrieval and similarity search, with capabilities like CRUD operations, metadata filtering, and horizontal scaling.

In [6]:
chroma_db = Chroma(
    collection_name="csv_collection",  # Name for the Chroma collection
    embedding_function=hf_embed_model.embed_query,  # Function for query embeddings
    persist_directory=persistent_dir
)

documents = []
for i, text in enumerate(texts):
    document = Document(page_content=text, metadata={"source": metadata[i]})
    documents.append(document)

# Debug to check metadata + text
print(documents[0])
print(documents[1])    
print(documents[2])   

page_content='Horses: live, pure-bred breeding animals' metadata={'source': 10121}
page_content='Horses: live, other than pure-bred breeding animals' metadata={'source': 10129}
page_content='Asses: live' metadata={'source': 10130}


In [7]:
vector_db = None
with tqdm(total=len(documents), desc="Creating embeddings...") as pbar:
    for d in documents:
        if vector_db:
            vector_db.add_documents([d])
        else:
            #When no GPU is available
            #vector_db = Chroma.from_documents([d],embed_model, persist_directory=persistent_dir )
            
            #To enable embeddings running on GPU: embedding and ingesting at the same time
            vector_db = Chroma.from_documents([d],hf_embed_model, persist_directory=persistent_dir)
        pbar.update(1)    

Creating embeddings...: 100%|██████████| 5384/5384 [02:03<00:00, 43.50it/s]


### Simple test over Vector DB

This response would contain the answer to that question, this needs to be sent to the LLM to paraphrase and extract only the needed info

In [8]:
question = "Live Pig? "
docs = vector_db.similarity_search(question)
print(docs)

[Document(metadata={'source': 10391}, page_content='Swine: live, other than pure-bred breeding animals, weighing less than 50kg'), Document(metadata={'source': 10392}, page_content='Swine: live, other than pure-bred breeding animals, weighing 50kg or more'), Document(metadata={'source': 10310}, page_content='Swine: live, pure-bred breeding animals'), Document(metadata={'source': 10410}, page_content='Sheep: live')]


### Q&A Chain

This will run peristantly, consulting already stored vectors

### Using a LLM Chain to provide context and system prompts on every query

In [13]:
#from langchain_chroma import Chroma
from langchain_community.vectorstores import Chroma

#For some reason, "context" cant be used as input variable, it should be named as "summaries"
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_ollama import OllamaLLM
from langchain.prompts import PromptTemplate

##### Embedding model (sentence-transformer)
model_name = embed_model
model_kwargs = {'device': 'cuda:0'}  # specify GPU device
encode_kwargs = {'normalize_embeddings': True}
hf_embed_model = HuggingFaceInstructEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

##### LLM model
# loading the Llama3 model from local
llm_model = OllamaLLM(model="llama3.1",
                temperature=0.1,
                num_thread=8,
                )
# loading the vectorstore
vectorstore = Chroma(persist_directory=persistent_dir, embedding_function=hf_embed_model)
# casting  the vectorstore as the retriever, taking the best 3 similarities
retriever = vectorstore.as_retriever(search_kwargs={"k":3})

# formating the docs
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

template = """Always answer in English. 
If you can infer an specific code (based on a decent match) you must respond only with this kind fo format "XXXX.XX" and answer it at the beggining with: 
"The suggested HS code is: " 

In the case you can't find a matching code for the description, respond with:
'Your inquiry is very subjective, to provide an accurate HS Code I need you to be more specific, please include size, weight, origin, meaning of use, etc. 
So far, I can narrow it to this chapters/parts : 'provide 3 possible codes".

context:
{summaries}

Question:
{question}
"""

# Define the LLM chain (using the Llama3.1 model)
llm_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm_model,
    chain_type='stuff',
    retriever=retriever,
    return_source_documents=True,  # To get both the answer and source docs
    chain_type_kwargs={
            "prompt": PromptTemplate(
                template=template,
                #For some reason, "context" cant be used as input variable, it should be named as "summaries"
                input_variables=["question", "summaries"],
            ),
        },
)

context = """As a logistics shipping arrival inspector, your primary responsibility is to inspect incoming shipments and accurately classify goods 
using the Harmonized System (HS) code based on the descriptions provided in the shipping manifests. You will thoroughly review the manifest details, 
including product type, material composition, function, and intended use, to determine the correct HS code. 

Your task is to:
Carefully read and analyze the product descriptions from the manifest.
Identify key characteristics of the goods, such as 
type (e.g., electronics, textiles, machinery), 
material (e.g., plastic, metal, organic), 
and usage (e.g., household, industrial, medical).
Use your knowledge of the HS code classification system to assign the most appropriate HS code for each product based on its description.
Ensure compliance with international trade regulations by selecting precise codes to avoid delays or penalties.
Remember to be thorough and accurate in your classification, as this impacts customs processing, tariffs, and legal requirements."""


#-----Benchmark query------
#Response should be: "0106.19" 
#query = "Live Dog"
query = "A Live Dog of breed Schnauzer from Germany"
#query = "Live Pig"
#query = "Live Buffalo"
#query = "What HS Code belongs to a wrench?"
#query = "What HS Code belongs to a live dog?"
#query = "What HS Code belongs to:'CERAMIC TABLEWARE  2688 BOXES PO 5 39548'?"
#query = "Covers for cellphone screen"

# Execute the chain with the query
#For some reason, "context" cant be used as input variable, it should be named as "summaries"
result = llm_chain({"question": query, "summaries": context})

# Process the results
#print(result.keys())
print("The response of the LLM is: ", result["answer"])

#print("The documents content used for this response are:")
#for i in range(len(result["source_documents"])):
#    print(result["source_documents"][i].page_content)
#    print(result["source_documents"][i].metadata)
     
#results = vectorstore.similarity_search_with_score(query=query, k=3)
## Print similarity results
#for doc, score in results:
#    print(f"Document content: {doc.page_content}, Code: {doc.metadata},Similarity Score: {score}")



load INSTRUCTOR_Transformer
max_seq_length  512
The response of the LLM is:  The suggested HS code is: 0109.10
