In [2]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_huggingface import HuggingFaceEmbeddings
from enum import Enum

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class TaxonomyChatType(Enum):
    SMALL_TALK = "small_talk"
    TAXONOMY_FROM_SPECIES = "taxonomy_from_species"
    TAXONOMY_FROM_FEATURES = "taxonomy_from_features"

In [4]:
routes = {
    TaxonomyChatType.SMALL_TALK: [
        "Hello there", 
        "Hi", 
        "How are you today?", 
        "Who are you?", 
        "Thanks for the help", 
        "Good morning", 
        "What's up?", 
        "How's it going?", 
        "Nice to meet you", 
        "Tell me a joke", 
        "What's your favorite color?", 
        "Do you like animals?", 
        "What's your favorite marine animal?"
    ],
    
    TaxonomyChatType.TAXONOMY_FROM_SPECIES: [
        "Tell me about this species", 
        "Identify this worm", 
        "What is the scientific name of the clownfish?", 
        "Which species has a blue ring on its body?", 
        "Give me information on the blue whale", 
        "What do you know about the hammerhead shark?", 
        "Can you identify the tiger shark?", 
        "What species is known for its bioluminescence?", 
        "Tell me about the great white shark", 
        "Describe the common dolphin", 
        "What is the size of a sea turtle?", 
        "Which fish is commonly found in coral reefs?"
    ],
    
    TaxonomyChatType.TAXONOMY_FROM_FEATURES: [
        "Which fish has 8 spines?", 
        "What species has a long, thin body and sharp fins?", 
        "Identify the fish with a red body and yellow tail", 
        "Which species has a triangular dorsal fin?", 
        "Describe a fish with a bulbous body and small pectoral fins", 
        "Which animal has bioluminescent cells in its body?", 
        "What marine animal has a shell and tentacles?", 
        "What species has large blue spots on its body?", 
        "Can you identify a fish with a flat body and sharp teeth?",  
        "Describe a creature with two long, curved antennae", 
        "What species is characterized by bioluminescent cells and a transparent body?",
        "Find the animal with a spiral shell and two long antennae",
        "Guess the species of scale worm with an oval body, 30mm long, with 15 pairs of elytra and bifurcate neurosetae?",
        "Identify a creature that is 40-50mm long, has a depressed body, smooth prostomium, and lives in the Northern Atlantic.",
        "I am looking for a worm with slender median antenna, swollen ommatophores, and no fringe of hair on the setae.",
        "Which polychaete has notosetae on clytrigerous feet and distinct harpoon-setae?"
        "Guess the species of scale worm with an oval, depressed body 30 40 mm long with 32 34 setigers, small rounded prostomium with slender median antenna from a stout ceratophore, a pair of swollen ommatophores each bearing two eye spots, 15 pairs of smooth elytra, notosetae on clytrigerous feet with long harpoon-setae, sabre-setae and a few capillaries, smaller notosetae on cirrigerous feet lacking harpoon-setae, bifurcate, Mediterranean, Red Sea, Indian Ocean, to Japan?"
    ]
}


In [9]:
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},
)


In [10]:
route_vectors = {
    intent: np.array(embeddings.embed_documents(examples))
    for intent, examples in routes.items()
}

In [11]:
def get_routing_intent(query):
    query_vec = np.array(embeddings.embed_query(query)).reshape(1, -1)
    best_intent = "taxonomy_from_species" # Default fallback
    highest_score = 0
    
    for intent, vectors in route_vectors.items():
        # Calculate similarity between query and all examples in this route
        scores = cosine_similarity(query_vec, vectors)
        max_score = np.max(scores)
        if max_score > highest_score:
            highest_score = max_score
            best_intent = intent
            
    # If the match is very weak (under 0.4), treat it as a general query
    return best_intent if highest_score > 0.4 else "taxonomy_from_species"

In [16]:
question = "Guess the species of scale worm with an oval, depressed body 30 40 mm long with 32 34 setigers, small rounded prostomium with slender median antenna from a stout ceratophore, a pair of swollen ommatophores each bearing two eye spots, 15 pairs of smooth elytra, notosetae on clytrigerous feet with long harpoon-setae, sabre-setae and a few capillaries, smaller notosetae on cirrigerous feet lacking harpoon-setae, bifurcate neurosetae with 1 4 accessory teeth and no fringe of hair, distributed from Northern Atlantic (Scotland to Senegal), Mediterranean, Red Sea, Indian Ocean, to Japan?"

intent = get_routing_intent(question)
print(intent)

taxonomy_from_features


In [12]:
import os
os.path.exists(r"../../backend/vectorstore")

True

In [13]:
import numpy as np

ROUTING_VECTORS_PATH = r"/home/abk/abk/projects/Major-project-basic-ui/backend/vectorstore/chat_type_detection_embed.npz"

route_vectors = {
    intent.value: np.array(embeddings.embed_documents(examples))
    for intent, examples in routes.items()
}

np.savez_compressed(ROUTING_VECTORS_PATH, **route_vectors)


In [23]:
#CHECKING
def TESTING():
    with np.load(ROUTING_VECTORS_PATH) as data:
        return {TaxonomyChatType(intent): data[intent] for intent in data.files}

test_vectors =  TESTING()

def get_routing_intent_TESTING(query):
    query_vec = np.array(embeddings.embed_query(query)).reshape(1, -1)
    best_intent = "taxonomy_from_species" # Default fallback
    highest_score = 0
    
    for intent, vectors in test_vectors.items():
        # Calculate similarity between query and all examples in this route
        scores = cosine_similarity(query_vec, vectors)
        max_score = np.max(scores)
        if max_score > highest_score:
            highest_score = max_score
            best_intent = intent
            
    # If the match is very weak (under 0.4), treat it as a general query
    return best_intent if highest_score > 0.4 else "taxonomy_from_species"

In [24]:
print(get_routing_intent_TESTING(question))

TaxonomyChatType.TAXONOMY_FROM_FEATURES
