In [1]:
import os
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv
from tqdm import tqdm
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain_openai  import OpenAIEmbeddings

## To run Hugging Face OpenSource models
# Needs to manually install Visual C++ Tools from: https://visualstudio.microsoft.com/visual-cpp-build-tools/
from InstructorEmbedding import INSTRUCTOR
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
import warnings, re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Suppress all warnings
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import trange


### Set up embedding model to use with GPU

In [2]:
import torch
import torch.nn as nn

# Check if CUDA is available
print(f"CUDA Available: {torch.cuda.is_available()}")

# Print CUDA device name
if torch.cuda.is_available():
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CUDA Available: True
Device Name: NVIDIA GeForce RTX 3050


### Embedding model definition
A technique for representing text data as numerical vectors, which can be input into machine learning models. The embedding model is responsible for converting text into these vectors.

In [3]:
# Load environment variables from the .env file
load_dotenv()
# Initialize OpenAI client with the API key from environment variables
open_ai_embed_model = OpenAIEmbeddings(openai_api_key= os.environ["OPENAI_API_KEY"],
             model="text-embedding-3-large", 
             max_retries=1000,
             request_timeout=8,
             retry_min_seconds=4,
             show_progress_bar=True,
             )

### OpenAI - extended dictionary

This HS Code dicitonary was scrapped from 2 different sites to increase redundancy and accuracy.

PDFs scrapped from the web - https://www.wcoomd.org/en/topics/nomenclature/instrument-and-tools/hs-nomenclature-2022-edition/hs-nomenclature-2022-edition.aspx

Definitions on WebSite - https://www.dripcapital.com/hts-code/

In [4]:
# Define the persistent directory containing the VectorDB
script_dir = os.getcwd()
persistent_dir = os.path.abspath(os.path.join(script_dir, '..', 'index', 'OpenAI-short-50'))

# Function to extract metadata (HS code) and content
def extract_metadata_and_content(line):
    #match = re.match(r"(\d{4}\.\d{2})\s*(.*)", line)
    
    #Search for any number with 2 decimals like 01.01
    #Bypass chaptpers
    #match = re.match(r"(\d+\.\d+)\s*(.*)", line)
    match = re.match(r"(\d+\.\d+(?:\.\d{2})?)\s*(.*)", line)
    if match:
        metadata = match.group(1)  # The number part as metadata
        content = match.group(2)   # The rest of the line as content
        content = content
        return metadata, content
    return None, line  # In case no match, return the line as-is

# Step 1: Read the .txt file
#file_path = '..\data\hs_code_dictionary_extended.txt'  # Path to your .txt file
with open('..\data\hs_code_dictionary.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Step 2: Split the text into lines first
lines = text.split("\n")

# Initialize the RecursiveCharacterTextSplitter
chunk_size = 50  # Define your chunk size
chunk_overlap = 0  # Set to 0 if you don't want overlapping chunks

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Step 3: Process each line, extract metadata, split content, and create Document objects
documents = []
for line in lines:
    metadata, content = extract_metadata_and_content(line)
    
    if metadata:  # If we found valid metadata
        # Split the content into chunks
        chunks = text_splitter.split_text(content)
        
        # Create a Document object for each chunk, with metadata containing the HS code
        # Si el texto en la descripcion es demasiado largo, lo cortamos en chunks para obtener
        # palabras clave pero con el contexto suficiente
        for chunk in chunks:
            document = Document(page_content=chunk, metadata={"source": metadata})
            #print(metadata, chunk)
            documents.append(document)

# Step 4: Embed the documents into the vector database
vector_db = None
with tqdm(total=len(documents), desc="Creando embeddings...") as pbar:
    for d in documents:
        if vector_db:
            vector_db.add_documents([d])
        else:
            # When no GPU is available, initialize vector_db
            #vector_db = Chroma.from_documents([d], embed_model, persist_directory=persistent_dir)
            
            # To enable embeddings running on GPU, ingest documents and create embeddings
            vector_db = Chroma.from_documents([d], open_ai_embed_model, persist_directory=persistent_dir)
        pbar.update(1)
        #print([d])

# The 'documents' list now contains Document objects with metadata and chunks, indexed into the vector database


100%|██████████| 1/1 [00:00<00:00,  2.02it/s]07 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00,  3.78it/s]07 [00:02<9:35:57,  2.07s/it]
100%|██████████| 1/1 [00:00<00:00,  1.49it/s]07 [00:02<4:43:06,  1.02s/it]
100%|██████████| 1/1 [00:00<00:00,  2.72it/s]07 [00:03<4:01:14,  1.15it/s]
100%|██████████| 1/1 [00:00<00:00,  3.62it/s]07 [00:03<3:07:55,  1.48it/s]
100%|██████████| 1/1 [00:00<00:00,  3.54it/s]07 [00:03<2:28:40,  1.87it/s]
100%|██████████| 1/1 [00:00<00:00,  3.63it/s]07 [00:03<2:05:57,  2.21it/s]
100%|██████████| 1/1 [00:00<00:00,  3.87it/s]07 [00:04<1:50:45,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,  3.26it/s]07 [00:04<1:39:10,  2.81it/s]
100%|██████████| 1/1 [00:00<00:00,  2.22it/s]07 [00:04<1:35:41,  2.91it/s]
100%|██████████| 1/1 [00:00<00:00,  2.12it/s]707 [00:05<1:45:32,  2.64it/s]
100%|██████████| 1/1 [00:00<00:00,  4.24it/s]707 [00:05<1:54:24,  2.43it/s]
100%|██████████| 1/1 [00:00<00:00,  4.19it/s]707 [00:06<1:40:17,  2.77it/s]
100%|██████████| 1/1 [00:00<00:0