In [1]:
import pandas as pd

# Load the .txt file into a DataFrame
data = pd.read_csv('..\data\hs_code_dictionary_extended.txt', delimiter='\t', dtype=str)

# Extract descriptions and HS codes
descriptions = data['Description'].tolist()
hs_codes = data['HTS code'].tolist()


In [2]:
from sentence_transformers import SentenceTransformer
import chromadb

# Load the sentence transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Create embeddings for each description
description_embeddings = model.encode(descriptions, convert_to_tensor=True)

# Convert to list format for Chroma
description_embeddings_list = description_embeddings.cpu().detach().numpy().tolist()

# Initialize Chroma
client = chromadb.Client()

# Create or connect to a collection
collection = client.create_collection("hs_codes_collection")

# Add data to Chroma (HS Code descriptions + corresponding HS Codes as metadata)
for idx, (desc, code) in enumerate(zip(descriptions, hs_codes)):
    collection.add(
        embeddings=[description_embeddings_list[idx]],
        metadatas=[{"HTS code": code, "Description": desc}],
        ids=[str(idx)]  # Use index as ID
    )


def search_hs_code_chroma(query, top_k=3):
    # Encode the query into an embedding
    query_embedding = model.encode([query], convert_to_tensor=True).cpu().detach().numpy().tolist()

    # Query Chroma for the top K results
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=top_k
    )

    # Extract relevant data from the search results
    retrieved_data = []
    for result in results['documents'][0]:
        desc = result['Description']
        hs_code = result['HS Code']
        retrieved_data.append((desc, hs_code))

    return retrieved_data

# Example query
query = "fresh apples"
results = search_hs_code_chroma(query)

for description, hs_code in results:
    print(f"Description: {description}, HS Code: {hs_code}")


  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [None]:
from transformers import pipeline

# Load a pre-trained text generation model
generator = pipeline('text-generation', model='gpt2')

def generate_response_chroma(query):
    # Perform semantic search to retrieve the most relevant HS Code descriptions
    retrieved_info = search_hs_code_chroma(query)
    
    # Format retrieved results
    retrieved_text = "\n".join([f"{desc} (HS Code: {hs_code})" for desc, hs_code in retrieved_info])
    
    # Combine query with retrieved information
    input_text = f"Query: {query}\nRelevant HS Codes and Descriptions:\n{retrieved_text}\nResponse:"
    
    # Generate a response
    response = generator(input_text, max_length=100, num_return_sequences=1)[0]['generated_text']
    
    return response

# Example query
query = "I want to import fresh apples"
response = generate_response_chroma(query)

print(response)


In [3]:
from transformers import pipeline

# Load a pre-trained text generation model
generator = pipeline('text-generation', model='gpt2')

def generate_response_chroma(query):
    # Perform semantic search to retrieve the most relevant HS Code descriptions
    retrieved_info = search_hs_code_chroma(query)
    
    # Format retrieved results
    retrieved_text = "\n".join([f"{desc} (HS Code: {hs_code})" for desc, hs_code in retrieved_info])
    
    # Combine query with retrieved information
    input_text = f"Query: {query}\nRelevant HS Codes and Descriptions:\n{retrieved_text}\nResponse:"
    
    # Generate a response
    response = generator(input_text, max_length=100, num_return_sequences=1)[0]['generated_text']
    
    return response

# Example query
query = "I want to import fresh apples"
response = generate_response_chroma(query)

print(response)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


TypeError: 'NoneType' object is not subscriptable