# Overview

The Semantic Router algorithm converts all the training texts to embeddings using an off-the-shelf embedding model such as Sentence Transformers. During inference, it converts the input text to an embedding and finds its K nearest neighbors among the training-data embeddings to classify the input text.

In [None]:
!pip install -q -U transformers==4.39.3
!pip install -q -U datasets==2.18.0
!pip install -q -U semantic-router==0.0.44

In [None]:
import os
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Read credentials from the Kaggle secrets client and log in to the Hugging Face Hub.
user_secrets = UserSecretsClient()
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

# Expose the run configuration as environment variables so other cells can
# read it back with os.getenv().
os.environ.update(
    {
        "WANDB_API_KEY": user_secrets.get_secret("WANDB_API_KEY"),
        "MODEL": "sentence-transformers/paraphrase-mpnet-base-v2",
        "DATASET": "SetFit/amazon_massive_intent_en-US",
        "FITMODEL": "semantic-router-mpnet-v2-amazon-mi",
    }
)

# Loading dataset

In [None]:
from datasets import Dataset, load_dataset

# Load the dataset whose identifier is stored in the DATASET environment variable.
ds = load_dataset(os.environ.get("DATASET"))

# Pre-processing dataset

In [None]:
import pandas as pd

# Turn the dataset's "test" split into a DataFrame; the train/eval/test
# subsets built in this cell are all taken from it.
df = pd.DataFrame(data=ds["test"])

def select_random_rows(
    group: pd.DataFrame, n: int = 10, random_state: int = 42
) -> pd.DataFrame:
    """Return a reproducible random sample of rows from ``group``.

    Meant to be passed to ``DataFrame.groupby(...).apply(...)``, which calls
    it once per group so that the same number of rows is drawn from each one.

    Args:
        group: DataFrame holding the rows of a single group.
        n: Number of rows to draw. Defaults to 10.
        random_state: Seed forwarded to ``DataFrame.sample`` so repeated
            runs select the same rows. Defaults to 42.

    Returns:
        The sampled rows of ``group``, keeping their original index labels.

    Raises:
        ValueError: If ``group`` has fewer than ``n`` rows, because
            ``DataFrame.sample`` draws without replacement by default.
    """
    return group.sample(n=n, random_state=random_state)


# Keep only the classes that have more than 30 rows.
label_counts = df["label_text"].value_counts().loc[lambda counts: counts > 30]

# Restrict df to the rows that belong to those classes.
df = df.loc[df["label_text"].isin(label_counts.index)].reset_index(drop=True)

# Sanity check: every retained class must still be present after filtering.
assert set(df["label_text"].value_counts().index) == set(
    label_counts.index
), "Some labels were lost"

# Training split: random rows drawn per class by select_random_rows.
train_df = df.groupby("label_text", group_keys=False).apply(select_random_rows)

# Evaluation split: sample each class again, this time from the rows that
# were not taken for training.
eval_df = (
    df.drop(index=train_df.index)
    .groupby("label_text", group_keys=False)
    .apply(select_random_rows)
)

# Test split: everything left once the training and evaluation rows are removed.
test_df = df.drop(index=[*train_df.index, *eval_df.index])

# Keep only the text/label columns and renumber each split from zero.
cols_to_keep = ["text", "label_text"]
train_df, eval_df, test_df = (
    split[cols_to_keep].reset_index(drop=True)
    for split in (train_df, eval_df, test_df)
)

# Persist the three splits as pickle files.
train_df.to_pickle("train_df.pkl")
eval_df.to_pickle("eval_df.pkl")
test_df.to_pickle("test_df.pkl")

# Wrap each split as a `datasets.Dataset`.
train_ds, eval_ds, test_ds = map(Dataset.from_pandas, (train_df, eval_df, test_df))

print(train_df.shape, eval_df.shape, test_df.shape)
train_df.head()

# Training

In [None]:
from semantic_router import Route
from semantic_router.encoders import HuggingFaceEncoder
from semantic_router.layer import RouteLayer

# Step 1: Build one route per class label, using that label's training texts
# as the route's example utterances.
routes = [
    Route(
        name=topic,
        utterances=train_df.loc[train_df["label_text"] == topic, "text"].tolist(),
    )
    for topic in train_df["label_text"].unique()
]

# Step 2: Setup the Embedding model
# NOTE(review): the MODEL environment variable set earlier in this notebook
# names "sentence-transformers/paraphrase-mpnet-base-v2", but a different
# model is hard-coded here — confirm which encoder is intended.
encoder = HuggingFaceEncoder(name="sentence-transformers/all-MiniLM-L6-v2")

# Step 3: Create the route layer to embed all utterances and assign their class
rl = RouteLayer(encoder=encoder, routes=routes)

# Inference

In [None]:
from sklearn.metrics import classification_report
from tqdm import tqdm

# Step 1: Run predictions to calculate class level metrics.
# The route name can be falsy (presumably when no route matches the text);
# store "" in that case so the predictions column contains only strings.
predictions = [rl(text).name or "" for text in tqdm(test_df["text"].values)]

test_df["semantic_router_predictions"] = predictions
print("\nSemantic Router Class Level Metrics:")
print(
    classification_report(test_df["label_text"], test_df["semantic_router_predictions"])
)

# Acknowledgements

* https://medium.com/towards-artificial-intelligence/few-shot-nlp-intent-classification-d29bf85548aa
* https://github.com/aurelio-labs/semantic-router/tree/main