# Approximate Nearest Neighbors for Text Classification
Author: [Collin Zoeller](https://www.linkedin.com/in/collinzoeller)
<br> Carnegie Mellon University

This notebook demonstrates how to use the ANNOY library for fast approximate nearest neighbor search to classify text data. The goal is to classify user-generated text data into pre-defined categories using a pre-trained transformer model. The ANNOY library is used to build an approximate nearest neighbor index for the target classes, and then to classify new observations based on their nearest neighbors in the embedding space.


Imports

In [None]:
!pip install -q sentence-transformers
!pip install -q annoy
!pip install -q kagglehub

In [1]:
import random
import time
import pandas as pd
import numpy as np
import os
import shutil
from glob import glob
from collections import Counter
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import kagglehub


  from .autonotebook import tqdm as notebook_tqdm


## Data

Use 100k samples of previously scraped data to train and evaluate the model. Produce synthetic data by adding noise to the scraped data.


REDDIT DATA
- Data retrieved from: https://www.reddit.com/r/datasets/comments/w340kj/dataset_of_job_descriptions_for_your_pleasure/
- Data hosted at: https://drive.google.com/drive/folders/1XxNuhiei5taFR6gziofYAx0oWfGeV7y9

KAGGLE DATA
- SOURCE: https://www.kaggle.com/datasets/jatinchawda/job-titles-and-description


In [None]:
def create_training_data(num_obs=None):
    """Load the scraped Kaggle and Reddit job titles as one cleaned array.

    Reads ``data/kaggle_clean_data.parquet`` (column ``job_title``) and
    ``data/reddit_data.parquet`` (column ``title``), stacks the titles and
    passes them through ``clean_data``.

    :param num_obs: number of titles to draw at random, without replacement,
        from the cleaned titles. ``None`` (or 0) keeps every title.
    :return: 1-D numpy array of cleaned title strings.
    :raises ValueError: if ``num_obs`` exceeds the number of cleaned titles.
    """
    kaggle = pd.read_parquet('data/kaggle_clean_data.parquet')
    kaggle = kaggle.rename(columns={"job_title": "title"})
    reddit = pd.read_parquet('data/reddit_data.parquet')

    # clean_data expects a flat array of strings, not a DataFrame, so hand it
    # only the title column with missing titles dropped.
    titles = pd.concat([kaggle, reddit], ignore_index=True)['title'].dropna()
    titles = clean_data(titles.astype(str).to_numpy())

    if num_obs:
        # clean_data returns a numpy array (no .sample()), so sample with numpy.
        titles = np.random.choice(titles, num_obs, replace=False)

    return titles


def clean_data(data: np.array):
    """
    Reduce strings to their ASCII characters and drop the empty ones.

    :param data: flat array of strings to clean.
    :return: numpy array of the cleaned strings; entries that are empty
        (originally, or after non-ASCII characters are stripped) are removed.
    """
    frame = pd.DataFrame(data, columns=['X'])

    # Round-trip through ASCII bytes, silently discarding anything non-ASCII.
    ascii_only = frame['X'].str.encode('ascii', 'ignore').str.decode('ascii')
    frame['X'] = ascii_only

    # Keep only the rows that still contain text.
    non_empty = frame.loc[frame['X'] != "", 'X']

    return non_empty.to_numpy()


def noisify(truevals: np.array):
    """Add random noise to an array of strings.

    Each perturbation below is switched on or off by a single random draw per
    call, so an enabled perturbation is applied to every string in
    ``truevals``. The random details (which suffix, which position, which
    separator, ...) are drawn separately for each string.

    :param truevals: flat array of clean strings.
    :return: numpy array of the perturbed strings, same length and order.
    """

    def drop_random_char(text):
        # Delete one character at a random position. An empty string has
        # nothing to delete and is returned unchanged.
        if not text:
            return text
        pos = random.randint(0, len(text) - 1)
        return text[:pos] + text[pos + 1:]

    def truncate(text):
        # Keep a random-length, non-empty prefix; empty strings pass through.
        if not text:
            return text
        return text[:random.randint(1, len(text))]

    df = pd.DataFrame(truevals, columns=['X'])
    if random.random() < 0.3:
        df['X'] = df['X'].str.upper()
    if random.random() > 0.5:
        df['X'] = df['X'].str.lower()
    if random.random() < 0.5:
        # Append a seniority word.
        df['X'] = df['X'].apply(lambda x: x + " " + random.choice(["Senior", "Junior", "Lead"]))
    if random.random() < 0.3:
        # Simulate a typo by removing a single character.
        df['X'] = df['X'].apply(drop_random_char)
    if random.random() < 0.1:
        df['X'] = df['X'].str[::-1]
    if random.random() < 0.2:
        # Replace spaces with another separator, or remove them.
        df['X'] = df['X'].apply(lambda x: x.replace(" ", random.choice(["_", "-", ""])))
    if random.random() < 0.2:
        # Append a number between 0 and 99.
        df['X'] = df['X'].apply(lambda x: x + str(random.randint(0, 99)))
    if random.random() < 0.2:
        df['X'] = df['X'].apply(truncate)
    return df.X.to_numpy()


def make_random_data(classes, num_obs=1000):
    """Draw random class labels and pair them with noisy copies.

    :param classes: array of class names to sample from (with replacement).
    :param num_obs: number of observations to draw.
    :return: tuple ``(true_values, noisy_data)``; the second element is the
        output of ``noisify`` applied to the first.
    """
    sampled = np.random.choice(classes, num_obs)
    return sampled, noisify(sampled)

### Download the Kaggle dataset

In [None]:
# Fetch the Kaggle job-title dataset once and keep a local parquet copy that
# contains only the job_title column.
if os.path.exists("data/kaggle_clean_data.parquet"):
    print("Kaggle data already downloaded.")
else:
    os.makedirs("data", exist_ok=True)
    path = kagglehub.dataset_download("jatinchawda/job-titles-and-description")
    print("Path to dataset files:", path)
    shutil.move(f"{path}/clean_data.parquet", "data/kaggle_clean_data.parquet")

    # Rewrite the file in place with only the title column.
    df = pd.read_parquet("data/kaggle_clean_data.parquet", columns=["job_title"])
    df.to_parquet("data/kaggle_clean_data.parquet")

### Format the Reddit dataset
This data should already be downloaded from https://drive.google.com/drive/folders/1XxNuhiei5taFR6gziofYAx0oWfGeV7y9 and saved as data/reddit_jobs.

In [None]:

# Combine the downloaded Reddit CSV files into one parquet file that holds
# only the `title` column. Skipped when the parquet file already exists.
if not os.path.exists("data/reddit_data.parquet"):

    if not os.path.exists("data/reddit_jobs"):
        raise FileNotFoundError("Download the Reddit dataset from the drive at"
                                " https://drive.google.com/drive/folders/1XxNuhiei5taFR6gziofYAx0oWfGeV7y9 ."
                                "\nSave as data/reddit_jobs.")

    # Sorted so the row order of the output does not depend on the order in
    # which the file system happens to list the directory.
    csv_files = sorted(glob("data/reddit_jobs/*.csv"))
    if not csv_files:
        # pd.concat([]) would otherwise fail with an unhelpful
        # "No objects to concatenate" error.
        raise FileNotFoundError("No CSV files found in data/reddit_jobs."
                                " Make sure the downloaded dataset was extracted there.")

    print("Formatting Reddit data...")
    df = pd.concat([pd.read_csv(file) for file in csv_files])
    df = df['title'].to_frame()
    df.to_parquet("data/reddit_data.parquet")
    print("Reddit data saved to data/reddit_data.parquet")
else:
    print("Reddit data already formatted correctly.")

## Modules

Functions and tools


In [None]:
"""
Functions and tools for the occupation classification project.
"""
def embed_batched(model, data, batch_size=1000):
    """Embed ``data`` with ``model.encode``, one batch at a time.

    The observations are split into near-equal consecutive batches of at most
    ``batch_size`` items. Each batch is encoded separately and the results
    are stacked in the original order.

    :param model: object exposing ``encode(batch, convert_to_numpy=...,
        show_progress_bar=...)`` that returns one row per input item.
    :param data: sequence of texts (numpy array, list, pandas Series, ...).
    :param batch_size: maximum number of items encoded per call.
    :return: 2-D numpy array with one embedding row per item in ``data``.
    :raises ValueError: if ``batch_size`` is not positive or ``data`` is empty.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Positional fancy indexing below needs an ndarray: a list cannot be
    # indexed with an index array, and a Series would look indices up by label.
    data = np.asarray(data)
    if len(data) == 0:
        raise ValueError("embed_batched() received no data to embed")

    num_batches = (len(data) + batch_size - 1) // batch_size  # ceil division
    batch_indices = np.array_split(np.arange(len(data)), num_batches)

    embeds = np.vstack([
        model.encode(data[indices], convert_to_numpy=True, show_progress_bar=False)
        for indices in batch_indices
    ])

    return embeds


def build_tree(embeddings, num_trees=10, metric='euclidean'):
    """Build an Annoy index over the rows of ``embeddings``.

    :param embeddings: 2-D array; row ``i`` is stored in the index as item
        ``i``, so item ids line up with positions in the label array.
    :param num_trees: number of trees passed to ``AnnoyIndex.build``.
    :param metric: Annoy distance metric name. Defaults to ``'euclidean'``,
        the value that was previously hard-coded; ``'angular'`` is the
        cosine-style alternative.
    :return: the built ``AnnoyIndex``.
    """
    t = AnnoyIndex(embeddings.shape[1], metric)
    for i, emb in enumerate(embeddings):
        t.add_item(i, emb)
    t.build(num_trees)
    print("Annoy index built with", num_trees, "trees.")
    return t


def predict_labels(tree, classes, embeddings, neighbors=1):
    """Assign each embedding the majority label among its nearest neighbours.

    :param tree: index exposing ``get_nns_by_vector(vector, n)`` that returns
        neighbour item ids.
    :param classes: sequence mapping an item id to its class label.
    :param embeddings: iterable of numpy vectors to classify.
    :param neighbors: number of neighbours requested per vector.
    :return: numpy array with one predicted label per embedding. On a tie the
        label met first in the neighbour list wins.
    """
    predictions = []
    for vector in embeddings:
        # Ids of the closest indexed items, in the order the index returns them.
        nearest_ids = tree.get_nns_by_vector(vector.tolist(), neighbors)

        # Tally the labels behind those ids and keep the most frequent one.
        votes = Counter(classes[item_id] for item_id in nearest_ids)
        winner, _ = votes.most_common(1)[0]
        predictions.append(winner)

    return np.array(predictions)


def evaluate(y_true, y_pred):
    """Print summary metrics and write a per-class report to ``report.csv``.

    Accuracy is the share of exact matches. Precision is the hit rate averaged
    over predicted labels, recall the hit rate averaged over true labels, and
    F1 the harmonic mean of those two averages.

    :param y_true: true labels.
    :param y_pred: predicted labels, aligned with ``y_true``.
    :return: None
    """
    results = pd.DataFrame({"label": y_true, "yhat": y_pred})
    results['correct'] = results['label'] == results['yhat']

    # Hit rate within each predicted label and within each true label.
    hit_rate_by_prediction = results.groupby('yhat')['correct'].mean()
    hit_rate_by_truth = results.groupby('label')['correct'].mean()

    accuracy = results['correct'].mean()
    precision = hit_rate_by_prediction.mean()
    recall = hit_rate_by_truth.mean()
    f1 = 2 * (precision * recall) / (precision + recall)

    print(f"Accuracy: {accuracy:.4f}"
          f"\nPrecision: {precision:.4f}"
          f"\nRecall: {recall:.4f}"
          f"\nF1: {f1:.4f}")

    # Per-class breakdown from scikit-learn, one row per class.
    per_class = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True)).T
    per_class.to_csv("report.csv")
    return


def visualize(x_embeddings, x_labels, class_embeddings, class_labels, save: bool = False):
    """Plot a 2-D PCA projection of the 20 most frequent assigned classes.

    PCA is fitted on the observation embeddings and then applied to the class
    embeddings so both share one projection. Observations are drawn as dots
    and class embeddings as large crosses, with one colour per class. The
    name and count of each plotted class is printed.

    :param x_embeddings: observation embeddings (stacked row-wise).
    :param x_labels: label assigned to each observation.
    :param class_embeddings: embeddings of the target classes.
    :param class_labels: name of each target class, aligned with
        ``class_embeddings``.
    :param save: when True, also write the figure to ``embeddings.png``.
    :return: None
    """
    x_embeddings = np.vstack(x_embeddings)
    x_labels = np.array(x_labels)
    class_embeddings = np.vstack(class_embeddings)
    class_labels = np.array(class_labels)

    pca = PCA(n_components=2)
    pca_embed = pca.fit_transform(x_embeddings)
    pca_class_embed = pca.transform(class_embeddings)

    # The 20 labels with the most observations, most frequent first.
    unique_classes, counts = np.unique(x_labels, return_counts=True)
    top_20_indices = np.argsort(-counts)[:20]
    unique_classes_20 = unique_classes[top_20_indices]

    colors = plt.colormaps['tab20']
    class_to_color = {cls: colors(i) for i, cls in enumerate(unique_classes_20)}

    plt.figure(figsize=(10, 10))

    # plot input embeddings
    for cls in unique_classes_20:
        cls_idx = np.where(x_labels == cls)[0]
        print(f"Class: {cls}, Count: {len(cls_idx)}")
        plt.scatter(pca_embed[cls_idx, 0], pca_embed[cls_idx, 1],
                    color=class_to_color[cls], alpha=0.6, marker='o')

    # Label classes
    for cls in unique_classes_20:
        cls_idx = np.where(class_labels == cls)[0]
        plt.scatter(pca_class_embed[cls_idx, 0], pca_class_embed[cls_idx, 1],
                    color=class_to_color[cls], label=f"{cls}", alpha=1.0, marker='x', s=250)

    plt.title("Embedding Clusters in 2D: Top 20 Occupations")
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    if save:
        # Save before show(): once the figure has been shown it may be closed,
        # and a later savefig() would write an empty canvas.
        # bbox_inches='tight' keeps the legend, which sits outside the axes,
        # inside the saved image.
        plt.savefig("embeddings.png", bbox_inches='tight')
    plt.show()
    return


def pipeline(model: str, labels: np.array, data: np.array, num_trees: int, num_neighbors: int, batch_size: int, save_fig: bool = False):
    """Run the whole classification workflow and return the predicted labels.

    Loads the named SentenceTransformer, embeds ``labels``, builds an Annoy
    index over those embeddings, embeds ``data`` in batches, assigns each
    observation a label from its nearest neighbours and plots the result.
    The elapsed time of each stage is printed.

    :param model: name of the SentenceTransformer model to load.
    :param labels: array of target class names.
    :param data: array of texts to classify.
    :param num_trees: number of trees for the Annoy index.
    :param num_neighbors: number of neighbours consulted per observation.
    :param batch_size: batch size used when embedding ``data``.
    :param save_fig: forwarded to ``visualize`` as ``save``.
    :return: array of predicted labels, one per item in ``data``.
    """
    # 1. Load pre-trained model
    encoder = SentenceTransformer(model)

    # 2. Encode target classes
    t_encode = time.time()
    print(f"Encoding {len(labels)} target label values")
    target_embeddings = encoder.encode(labels, convert_to_numpy=True, show_progress_bar=True)
    print(f"(finished in {time.time() - t_encode:.2f} seconds, avg: {(time.time() - t_encode) / len(labels):.4f} sec/label)")

    # 3. Build Annoy Index for target classes
    t_index = time.time()
    print(f"Building Annoy index with {num_trees} trees")
    index = build_tree(target_embeddings, num_trees=num_trees)
    print(f"(finished in {time.time() - t_index:.2f} seconds, avg: {(time.time() - t_index) / len(labels):.4f} sec/label)")

    # 4. Encode feature space
    t_features = time.time()
    print(f"Encoding {len(data)} feature vectors")
    feature_embeddings = embed_batched(encoder, data, batch_size=batch_size)
    print(f"(finished in {time.time() - t_features:.2f} seconds, avg: {(time.time() - t_features) / len(data):.4f} sec/label)")

    # 5. Predict labels
    t_predict = time.time()
    print("Predicting labels")
    predictions = predict_labels(index, labels, feature_embeddings, neighbors=num_neighbors)
    print(f"(finished in {time.time() - t_predict:.2f} seconds, avg: {(time.time() - t_predict) / len(data):.4f} sec/label)")

    # 6. Visualize
    visualize(feature_embeddings, predictions, target_embeddings, labels, save=save_fig)

    return predictions


## Pipeline

### 1. Load pre-trained model and Output Data

In [None]:
# Name of the pre-trained model handed to the SentenceTransformer constructor.
modelname ="all-MiniLM-L6-v2"
# Shared encoder used by the cells below for both labels and observations.
model = SentenceTransformer(modelname)

In [3]:
# Output labels: occupation titles from the Dingel and Neiman work-at-home dataset
labels = pd.read_csv('https://raw.githubusercontent.com/jdingel/DingelNeiman-workathome/master/occ_onet_scores/output/occupations_workathome.csv')
# Preview the first rows of the table.
labels.head()
# Keep only the occupation titles, as a numpy array of target class names.
labels = labels['title'].to_numpy()

Unnamed: 0,onetsoccode,title,teleworkable
0,11-1011.00,Chief Executives,1
1,11-1011.03,Chief Sustainability Officers,1
2,11-1021.00,General and Operations Managers,1
3,11-2011.00,Advertising and Promotions Managers,1
4,11-2021.00,Marketing Managers,1


### 2. Encode target classes

In [None]:
# Embed every target label once; the Annoy index is built from these vectors.
print(f"Encoding {len(labels)} target label values")
target_embeddings = model.encode(labels, convert_to_numpy=True, show_progress_bar=True)

## 3. Build Annoy Index for target classes
The hyperparameter at training is the number of random trees to build. The more trees, the more accurate the search, but the longer it takes to build the index.

In [None]:
# Number of trees in the Annoy index, passed through to build_tree.
num_trees = 500
# Item i of the index is the embedding of labels[i].
tree = build_tree(target_embeddings, num_trees=num_trees)

## 4. Encode feature space
`batch_size` determines the number of observations to embed at once, so as to avoid memory issues. While higher batch sizes are faster, they may not fit in memory.

Data here may be unlabeled (such as the Kaggle or Reddit data), but unlabeled data says little about the model's performance. Consider creating labeled data for evaluation. For demonstration purposes, we create randomized data from the output labels by adding noise.

In [None]:
# Use this block to embed the real scraped job titles.
# NOTE(review): no labels are loaded here, so `y` is not defined by this cell
# and evaluate(y, yhat) needs labels from elsewhere.
data = create_training_data(num_obs=100) # samples 100 observations from the scraped data

In [None]:
# Assuming the data is labeled and saved to the csv file
# NOTE(review): `data` stays a DataFrame after this cell, while embed_batched
# indexes `data[indices]` with integer arrays. The text column presumably needs
# to be extracted to a numpy array first; confirm its name in training_data.csv.
data = pd.read_csv("data/training_data.csv")
# True labels used later by evaluate().
y = data['label'].to_numpy()

In [None]:
# Use this block to use the synthetic data:
# y holds labels sampled at random from `labels`, data their noisy copies.
y, data = make_random_data(labels, num_obs=100000)

In [None]:

# Number of observations embedded per model.encode call.
batch_size = 1024
# NOTE(review): number_of_obs is not referenced by any other cell shown in
# this notebook; the sample size is set where `data` is created.
number_of_obs= 100000 # The number of observations to generate

# One embedding row per observation in `data`.
feature_embeddings = embed_batched(model, data, batch_size=batch_size)

## 5. Predict labels
num_neighbors is the number of nearest neighbors to consider when classifying the data, equivalent to the k in KNN.

In [None]:
# Number of nearest target classes retrieved from the index per observation.
num_neighbors = 10
# Predicted class name for each row of feature_embeddings.
yhat = predict_labels(tree, labels, feature_embeddings, neighbors=num_neighbors)

## 6. Visualize
Create a 2d plot of the top 20 most common occupations in the sample. The plot shows the distribution of the embeddings in the feature space, and how they are clustered. Each point represents an observation, and the color represents the assigned class label. The X's represent the target classes.

In [None]:
# Prints the name and count of the 20 most frequent predicted occupations and
# plots them; save=False means no image file is written.
visualize(feature_embeddings, yhat, target_embeddings, labels, save=False)

##  7. Evaluate

In [None]:
# Compare the true labels with the predictions: prints summary metrics and
# writes the per-class report to report.csv.
evaluate(y, yhat)