In [1]:
import os
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv
from tqdm import tqdm
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain_openai  import OpenAIEmbeddings

## To run Hugging Face OpenSource models
# Needs to manually install Visual C++ Tools from: https://visualstudio.microsoft.com/visual-cpp-build-tools/
from InstructorEmbedding import INSTRUCTOR
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
import warnings, re

# Suppress all warnings
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import trange


### Set up embedding model to use with GPU

In [2]:
import torch
import torch.nn as nn

# Check if CUDA is available
print(f"CUDA Available: {torch.cuda.is_available()}")

# Print CUDA device name
if torch.cuda.is_available():
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CUDA Available: True
Device Name: NVIDIA GeForce RTX 3050


### Embedding model definition
A technique for representing text data as numerical vectors, which can be input into machine learning models. The embedding model is responsible for converting text into these vectors.

In [3]:
# WARNING! :Only runs with this version
###### !pip install sentence-transformers==2.2.2  ######
#Define the sentence-transformer model:

#For English
#sentence-transformers/LaBSE
#embed_model = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = "sentence-transformers/all-mpnet-base-v2"

#For Spanish 
#projecte-aina/aguila-7b
#embed_model = "hiiamsid/sentence_similarity_spanish_es"

#Other sentence-transformer settings
model_kwargs = {'device': 'cuda:0'}  # specify GPU device
encode_kwargs = {'normalize_embeddings': True}

hf_embed_model = HuggingFaceInstructEmbeddings(
    model_name=embed_model,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Load environment variables from the .env file
load_dotenv()
# Initialize OpenAI client with the API key from environment variables
open_ai_embed_model = OpenAIEmbeddings(openai_api_key= os.environ["OPENAI_API_KEY"],
             model="text-embedding-3-large", 
             max_retries=1000,
             request_timeout=8,
             retry_min_seconds=4,
             show_progress_bar=True,
             )

load INSTRUCTOR_Transformer
max_seq_length  512


### Test A: Create index without chunking method

We read a tabular CSV, each row will be a complete chunk

In [None]:
# Define the persistent directory containing the VectorDB
script_dir =  os.getcwd()
persistent_dir = os.path.abspath(os.path.join(script_dir,'..' ,'index', 'TestA'))

# Importante Leer columna de codigo como string, sino se eliminan los zeros a la izquierda.
df = pd.read_csv('..\data\WebScrap_CSVs\hts_codes_WebScrapped.csv', encoding='utf-8', dtype={'HTS code': str})

# Split data and metadata
texts = df['Description'].tolist()  # This is the text data that will be embedded
metadata = df['HTS code'].tolist()  # This is the metadata that will be stored alongside the embeddings

chroma_db = Chroma(
    collection_name="csv_collection",  # Name for the Chroma collection
    embedding_function=hf_embed_model.embed_query,  # Function for query embeddings
    persist_directory=persistent_dir
)

documents = []
for i, text in enumerate(texts):
    document = Document(page_content=text, metadata={"source": metadata[i]})
    documents.append(document)

# Debug to check metadata + text
print(documents[0])
print(documents[1])    
print("Amount of documents is: " , len(documents))   

vector_db = None
with tqdm(total=len(documents), desc="Creando embeddings...") as pbar:
    for d in documents:
        if vector_db:
            vector_db.add_documents([d])
        else:
            #When no GPU is available
            #vector_db = Chroma.from_documents([d],embed_model, persist_directory=persistent_dir )
            
            #To enable embeddings running on GPU: embedding and ingesting at the same time
            vector_db = Chroma.from_documents([d],hf_embed_model, persist_directory=persistent_dir)
        pbar.update(1)    

### Test B Create index with RecursiveCharacterTextSplitter methodology
We read a flat file (.txt) and data will be read as parapgraphs.

Is the process of breaking down a large input text into smaller pieces.
This ensures that the text fits the input size of the embedding model and improves retrieval efficiency.

Chunk size is one of the key hyperparameters on any LLM project

![image.png](attachment:image.png)

In [None]:
# Define the persistent directory containing the VectorDB
script_dir = os.getcwd()
persistent_dir = os.path.abspath(os.path.join(script_dir, '..', 'index', 'TestB'))

# Step 1: Read the .txt file
file_path = '..\data\hs_code_dictionary.txt'  # Path to your .txt file
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Function to extract metadata (HS code) and content
def extract_metadata_and_content(line):
    match = re.match(r"(\d{4}\.\d{2})\s*(.*)", line)
    if match:
        metadata = match.group(1)  # The number part as metadata
        content = match.group(2)   # The rest of the line as content
        return metadata, content
    return None, line  # In case no match, return the line as-is

# Step 2: Split the text into lines first
lines = text.split("\n")

# Initialize the RecursiveCharacterTextSplitter
chunk_size = 200  # Define your chunk size
chunk_overlap = 20  # Set to 0 if you don't want overlapping chunks

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Step 3: Process each line, extract metadata, split content, and create Document objects
documents = []
for line in lines:
    metadata, content = extract_metadata_and_content(line)
    
    if metadata:  # If we found valid metadata
        # Split the content into chunks
        chunks = text_splitter.split_text(content)
        
        # Create a Document object for each chunk, with metadata containing the HS code
        for chunk in chunks:
            document = Document(page_content=chunk, metadata={"hs_code": metadata, "source": file_path})
            documents.append(document)

# Step 4: Embed the documents into the vector database
vector_db = None
with tqdm(total=len(documents), desc="Creando embeddings...") as pbar:
    for d in documents:
        if vector_db:
            vector_db.add_documents([d])
        else:
            # When no GPU is available, initialize vector_db
            #vector_db = Chroma.from_documents([d], embed_model, persist_directory=persistent_dir)
            
            # To enable embeddings running on GPU, ingest documents and create embeddings
            vector_db = Chroma.from_documents([d], hf_embed_model, persist_directory=persistent_dir)
        pbar.update(1)

# The 'documents' list now contains Document objects with metadata and chunks, indexed into the vector database


### TestC - Same as TestB, but chunksize of 50 (very short) and no overlap

In [None]:
# Define the persistent directory containing the VectorDB
script_dir = os.getcwd()
persistent_dir = os.path.abspath(os.path.join(script_dir, '..', 'index', 'TestC'))

# Step 1: Read the .txt file
file_path = '..\data\hs_code_dictionary.txt'  # Path to your .txt file
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Function to extract metadata (HS code) and content
def extract_metadata_and_content(line):
    match = re.match(r"(\d{4}\.\d{2})\s*(.*)", line)
    if match:
        metadata = match.group(1)  # The number part as metadata
        content = match.group(2)   # The rest of the line as content
        return metadata, content
    return None, line  # In case no match, return the line as-is

# Step 2: Split the text into lines first
lines = text.split("\n")

# Initialize the RecursiveCharacterTextSplitter
chunk_size = 50  # Define your chunk size
chunk_overlap = 0  # Set to 0 if you don't want overlapping chunks

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Step 3: Process each line, extract metadata, split content, and create Document objects
documents = []
for line in lines:
    metadata, content = extract_metadata_and_content(line)
    
    if metadata:  # If we found valid metadata
        # Split the content into chunks
        chunks = text_splitter.split_text(content)
        
        # Create a Document object for each chunk, with metadata containing the HS code
        for chunk in chunks:
            document = Document(page_content=chunk, metadata={"hs_code": metadata, "source": file_path})
            documents.append(document)

# Step 4: Embed the documents into the vector database
vector_db = None
with tqdm(total=len(documents), desc="Creando embeddings...") as pbar:
    for d in documents:
        if vector_db:
            vector_db.add_documents([d])
        else:
            # When no GPU is available, initialize vector_db
            #vector_db = Chroma.from_documents([d], embed_model, persist_directory=persistent_dir)
            
            # To enable embeddings running on GPU, ingest documents and create embeddings
            vector_db = Chroma.from_documents([d], hf_embed_model, persist_directory=persistent_dir)
        pbar.update(1)

# The 'documents' list now contains Document objects with metadata and chunks, indexed into the vector database


### TestD - OpenAI Embeddings

One row per chunk: Each HTS code and its corresponding description should form a single chunk. This ensures that the retrieval component of RAG can focus on fetching the most relevant HTS code and description without unnecessary confusion.

For example, a chunk would be:

Chunk 1: "0101.21 Pure-bred breeding horses"
Chunk 2: "0101.29 Live horses (excluding pure-bred for breeding)"
Token-wise: Since each HTS code and description is relatively short (around 10-20 words), each chunk should contain fewer than 50 tokens, which is well within the limits of most transformer models.

In [4]:
# Define the persistent directory containing the VectorDB
script_dir =  os.getcwd()
persistent_dir = os.path.abspath(os.path.join(script_dir,'..' ,'index', 'TestD'))

# Importante Leer columna de codigo como string, sino se eliminan los zeros a la izquierda.
df = pd.read_csv('..\data\WebScrap_CSVs\hts_codes_WebScrapped.csv', encoding='utf-8', dtype={'HTS code': str})

# Split data and metadata
texts = df['Description'].tolist()  # This is the text data that will be embedded
metadata = df['HTS code'].tolist()  # This is the metadata that will be stored alongside the embeddings

chroma_db = Chroma(
    collection_name="csv_collection",  # Name for the Chroma collection
    embedding_function=hf_embed_model.embed_query,  # Function for query embeddings
    persist_directory=persistent_dir
)

documents = []
for i, text in enumerate(texts):
    document = Document(page_content=text, metadata={"source":metadata[i]})
    documents.append(document)

# Debug to check metadata + text
print(documents[0])
print(documents[1])    
print("Amount of documents is: " , len(documents))   

vector_db = None
with tqdm(total=len(documents), desc="Creando embeddings...") as pbar:
    for d in documents:
        if vector_db:
            vector_db.add_documents([d])
        else:
            #When no GPU is available
            #vector_db = Chroma.from_documents([d],embed_model, persist_directory=persistent_dir )
            
            #To enable embeddings running on GPU: embedding and ingesting at the same time
            vector_db = Chroma.from_documents([d],open_ai_embed_model, persist_directory=persistent_dir)
        pbar.update(1)    

page_content='Pure-bred breeding horses' metadata={'source': '0101.21'}
page_content='Live horses (excluding pure-bred for breeding)' metadata={'source': '0101.29'}
Amount of documents is:  5541


100%|██████████| 1/1 [00:00<00:00,  1.79it/s]1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00,  3.77it/s]1 [00:00<1:07:57,  1.36it/s]
100%|██████████| 1/1 [00:00<00:00,  2.47it/s]1 [00:01<43:11,  2.14it/s]  
100%|██████████| 1/1 [00:00<00:00,  3.33it/s]1 [00:01<40:55,  2.26it/s]
100%|██████████| 1/1 [00:00<00:00,  3.94it/s]1 [00:01<36:15,  2.54it/s]
100%|██████████| 1/1 [00:00<00:00,  1.42it/s]1 [00:02<31:51,  2.90it/s]
100%|██████████| 1/1 [00:00<00:00,  3.16it/s]1 [00:02<43:28,  2.12it/s]
100%|██████████| 1/1 [00:00<00:00,  3.95it/s]1 [00:03<39:07,  2.36it/s]
100%|██████████| 1/1 [00:00<00:00,  5.31it/s]1 [00:03<34:28,  2.67it/s]
100%|██████████| 1/1 [00:00<00:00,  7.06it/s]1 [00:03<29:23,  3.14it/s]
100%|██████████| 1/1 [00:00<00:00,  4.23it/s]41 [00:03<24:34,  3.75it/s]
100%|██████████| 1/1 [00:00<00:00,  3.73it/s]41 [00:03<24:01,  3.84it/s]
100%|██████████| 1/1 [00:00<00:00,  3.80it/s]41 [00:04<24:30,  3.76it/s]
100%|██████████| 1/1 [00:00<00:00,  3.89it/s]41 [00:04<24:43,  3.