In [1]:
import os
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv
from tqdm import tqdm
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain_openai  import OpenAIEmbeddings

## To run Hugging Face OpenSource models
# Needs to manually install Visual C++ Tools from: https://visualstudio.microsoft.com/visual-cpp-build-tools/
from InstructorEmbedding import INSTRUCTOR
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
import warnings, re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Suppress all warnings
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import trange


### Set up embedding model to use with GPU

In [2]:
import torch
import torch.nn as nn

# Check if CUDA is available
print(f"CUDA Available: {torch.cuda.is_available()}")

# Print CUDA device name
if torch.cuda.is_available():
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CUDA Available: True
Device Name: NVIDIA GeForce RTX 3050


### Embedding model definition
A technique for representing text data as numerical vectors, which can be input into machine learning models. The embedding model is responsible for converting text into these vectors.

In [3]:
# Load environment variables from the .env file
load_dotenv()
# Initialize OpenAI client with the API key from environment variables
open_ai_embed_model = OpenAIEmbeddings(openai_api_key= os.environ["OPENAI_API_KEY"],
             model="text-embedding-3-large", 
             max_retries=1000,
             request_timeout=8,
             retry_min_seconds=4,
             show_progress_bar=True,
             )

### OpenAI - extended dictionary

This HS Code dicitonary was scrapped from 2 different sites to increase redundancy and accuracy.

PDFs scrapped from the web - https://www.wcoomd.org/en/topics/nomenclature/instrument-and-tools/hs-nomenclature-2022-edition/hs-nomenclature-2022-edition.aspx

Definitions on WebSite - https://www.dripcapital.com/hts-code/

In [4]:
# Define the persistent directory containing the VectorDB
script_dir =  os.getcwd()
persistent_dir = os.path.abspath(os.path.join(script_dir,'..' ,'index', 'OpenAI-table'))

# Importante Leer columna de codigo como string, sino se eliminan los zeros a la izquierda.
df = pd.read_csv('..\data\WebScrap_CSVs\hts_codes_WebScrapped.csv', encoding='utf-8', dtype={'HTS code': str})

# Split data and metadata
texts = df['Description'].tolist()  # This is the text data that will be embedded
metadata = df['HTS code'].tolist()  # This is the metadata that will be stored alongside the embeddings

documents = []
for i, text in enumerate(texts):
    document = Document(page_content=text, metadata={"source": metadata[i]})
    documents.append(document)

# Debug to check metadata + text
print(documents[0])
print(documents[1])    
print("Amount of documents is: " , len(documents))   

vector_db = None
with tqdm(total=len(documents), desc="Creando embeddings...") as pbar:
    for d in documents:
        if vector_db:
            vector_db.add_documents([d])
        else:
            #When no GPU is available
            #vector_db = Chroma.from_documents([d],embed_model, persist_directory=persistent_dir )
            
            #To enable embeddings running on GPU: embedding and ingesting at the same time
            vector_db = Chroma.from_documents([d],open_ai_embed_model, persist_directory=persistent_dir)
        pbar.update(1)    

page_content='Pure-bred breeding horses' metadata={'source': '0101.21'}
page_content='Live horses (excluding pure-bred for breeding)' metadata={'source': '0101.29'}
Amount of documents is:  5541


100%|██████████| 1/1 [00:00<00:00,  2.17it/s]1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00,  2.20it/s]1 [00:01<2:50:37,  1.85s/it]
100%|██████████| 1/1 [00:00<00:00,  4.23it/s]1 [00:02<1:35:04,  1.03s/it]
100%|██████████| 1/1 [00:00<00:00,  3.10it/s]1 [00:02<1:02:18,  1.48it/s]
100%|██████████| 1/1 [00:00<00:00,  1.96it/s]1 [00:02<49:44,  1.86it/s]  
100%|██████████| 1/1 [00:00<00:00,  3.91it/s]1 [00:03<49:03,  1.88it/s]
100%|██████████| 1/1 [00:00<00:00,  3.95it/s]1 [00:03<40:25,  2.28it/s]
100%|██████████| 1/1 [00:00<00:00,  2.48it/s]1 [00:03<35:14,  2.62it/s]
100%|██████████| 1/1 [00:00<00:00,  2.69it/s]1 [00:04<36:03,  2.56it/s]
100%|██████████| 1/1 [00:00<00:00,  4.49it/s]1 [00:04<35:34,  2.59it/s]
100%|██████████| 1/1 [00:00<00:00,  2.17it/s]41 [00:04<31:11,  2.96it/s]
100%|██████████| 1/1 [00:00<00:00,  2.35it/s]41 [00:05<34:48,  2.65it/s]
100%|██████████| 1/1 [00:00<00:00,  4.45it/s]41 [00:05<36:21,  2.53it/s]
100%|██████████| 1/1 [00:00<00:00,  4.00it/s]41 [00:06<31:51,