In [5]:
import os
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv
from tqdm import tqdm
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain_openai  import OpenAIEmbeddings

## To run Hugging Face OpenSource models
# Needs to manually install Visual C++ Tools from: https://visualstudio.microsoft.com/visual-cpp-build-tools/
from InstructorEmbedding import INSTRUCTOR
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
import warnings, re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer

# Suppress all warnings
warnings.filterwarnings("ignore")

### Set up embedding model to use with GPU

In [6]:
import torch
import torch.nn as nn

# Check if CUDA is available
print(f"CUDA Available: {torch.cuda.is_available()}")

# Print CUDA device name
if torch.cuda.is_available():
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CUDA Available: True
Device Name: NVIDIA GeForce RTX 3050


### Embedding model definition
A technique for representing text data as numerical vectors, which can be input into machine learning models. The embedding model is responsible for converting text into these vectors.

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", load_in_4bit=True)
#model = AutoModelForCausalLM.from_pretrained("intfloat/e5-mistral-7b-instruct", load_in_4bit=True)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Downloading shards:  50%|█████     | 1/2 [25:23<25:23, 1523.97s/it]Error while downloading from https://cdn-lfs.hf.co/repos/ea/00/ea00943d992c7851ad9f4f4bd094a0397fb5087e0f7cba4ef003018963ea07e3/5734c77cf2ff6482713d474ee9c8791cd712b59dd36b05d096af5a4cb41f3f02?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model-00002-of-00002.safetensors%3B+filename%3D%22model-00002-of-00002.safetensors%22%3B&Expires=1730184903&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMDE4NDkwM319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy9lYS8wMC9lYTAwOTQzZDk5MmM3ODUxYWQ5ZjRmNGJkMDk0YTAzOTdmYjUwODdlMGY3Y2JhNGVmMDAzMDE4OTYzZWEwN2UzLzU3MzRjNzdjZjJmZjY0ODI3MTNkNDc0ZWU5Yzg3OTFjZDcxMmI1OW

In [13]:
#from langchain_community.embeddings import SentenceTransformerEmbeddings
#
#model_embeddings = SentenceTransformerEmbeddings(model_name="mistralai/Mistral-7B-Instruct-v0.1", model_kwargs={"trust_remote_code":True}) 


In [14]:
# WARNING! :Only runs with this version
###### !pip install sentence-transformers==2.2.2  ######
#Define the sentence-transformer model:

#For English
#sentence-transformers/LaBSE
#embed_model = "sentence-transformers/all-MiniLM-L6-v2"
#embed_model = "sentence-transformers/all-mpnet-base-v2"
#embed_model = "intfloat/e5-mistral-7b-instruct"
#embed_model = "Jaume/gemma-2b-embeddings"


#For Spanish 
#projecte-aina/aguila-7b
#embed_model = "hiiamsid/sentence_similarity_spanish_es"

#Other sentence-transformer settings
model_kwargs = {'device': 'cuda:0'}  # specify GPU device
encode_kwargs = {'normalize_embeddings': True}


# Initialize the embedding model
try:
    hf_embed_model = SentenceTransformer(model, device=model_kwargs['device'])
    print("Embedding model loaded successfully!")
except Exception as e:
    print(f"Error loading embedding model: {e}")
    
#hf_embed_model = HuggingFaceInstructEmbeddings(
#    model_name=embed_model,
#    model_kwargs=model_kwargs,
#    encode_kwargs=encode_kwargs
#)

# Load environment variables from the .env file
#load_dotenv()
# Initialize OpenAI client with the API key from environment variables
#open_ai_embed_model = OpenAIEmbeddings(openai_api_key= os.environ["OPENAI_API_KEY"],
#             model="text-embedding-3-large", 
#             max_retries=1000,
#             request_timeout=8,
#             retry_min_seconds=4,
#             show_progress_bar=True,
#             )

Error loading embedding model: stat: path should be string, bytes, os.PathLike or integer, not MistralForCausalLM


### TestF extended dictionary

This HS Code dicitonary was scrapped from 2 different sites to increase redundancy and accuracy.

PDFs scrapped from the web - https://www.wcoomd.org/en/topics/nomenclature/instrument-and-tools/hs-nomenclature-2022-edition/hs-nomenclature-2022-edition.aspx

Definitions on WebSite - https://www.dripcapital.com/hts-code/

In [10]:
# Define the persistent directory containing the VectorDB
script_dir =  os.getcwd()
persistent_dir = os.path.abspath(os.path.join(script_dir,'..' ,'index', 'gemma-2b-table'))

# Importante Leer columna de codigo como string, sino se eliminan los zeros a la izquierda.
df = pd.read_csv('..\data\WebScrap_CSVs\hts_codes_WebScrapped.csv', encoding='utf-8', dtype={'HTS code': str})

# Split data and metadata
texts = df['Description'].tolist()  # This is the text data that will be embedded
metadata = df['HTS code'].tolist()  # This is the metadata that will be stored alongside the embeddings

documents = []
for i, text in enumerate(texts):
    document = Document(page_content=text, metadata={"source": metadata[i]})
    documents.append(document)

# Debug to check metadata + text
print(documents[0])
print(documents[1])    
print("Amount of documents is: " , len(documents))   

vector_db = None
with tqdm(total=len(documents), desc="Creando embeddings...") as pbar:
    for d in documents:
        if vector_db:
            vector_db.add_documents([d])
        else:
            #When no GPU is available
            #vector_db = Chroma.from_documents([d],embed_model, persist_directory=persistent_dir )
            
            #To enable embeddings running on GPU: embedding and ingesting at the same time
            vector_db = Chroma.from_documents([d],model, persist_directory=persistent_dir)
        pbar.update(1)    

page_content='Pure-bred breeding horses' metadata={'source': '0101.21'}
page_content='Live horses (excluding pure-bred for breeding)' metadata={'source': '0101.29'}
Amount of documents is:  5541


Creando embeddings...:   0%|          | 0/5541 [00:00<?, ?it/s]


AttributeError: 'MistralForCausalLM' object has no attribute 'embed_documents'