# Classification with Embeddings

Documentation: https://sbert.net/index.html

- LDA / topic modeling / clustering

## Imports

In [1]:
import json
from sentence_transformers import SentenceTransformer, util, CrossEncoder
import torch.nn.functional as F
from transformers import AutoModel

import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import json
import os
from sentence_transformers import SentenceTransformer
from PIL import Image

## Sentence transformer comparison

In [2]:
# Genre labels that every book description is compared against.
categories = [
    "Romance",
    "Science Fiction",
    "Dystopian",
    "Thriller",
    "Historical Fiction",
    "Drama",
    "Mystery",
    "Fantasy",
]

# Candidate pretrained model names. Only the first entry is loaded below;
# change the index passed to SentenceTransformer to try a different one.
models = [
    "all-mpnet-base-v2",
    "gtr-t5-xxl",
    "gtr-t5-xl",
    "sentence-t5-xxl",
    "gtr-t5-large",
    "all-mpnet-base-v1",
    "multi-qa-mpnet-base-dot-v1",
    "multi-qa-mpnet-base-cos-v1",
    "all-roberta-large-v1",
    "sentence-t5-xl",
    "all-distilroberta-v1",
    "all-MiniLM-L12-v1",
    "all-MiniLM-L12-v2",
    "multi-qa-distilbert-dot-v1",
    "multi-qa-distilbert-cos-v1",
    "gtr-t5-base",
    "sentence-t5-large",
    "all-MiniLM-L6-v2",
    "multi-qa-MiniLM-L6-cos-v1",
    "all-MiniLM-L6-v1",
    "paraphrase-mpnet-base-v2",
    "msmarco-bert-base-dot-v5",
    "multi-qa-MiniLM-L6-dot-v1",
    "sentence-t5-base",
    "msmarco-distilbert-base-tas-b",
    "msmarco-distilbert-dot-v5",
    "paraphrase-distilroberta-base-v2",
    "paraphrase-MiniLM-L12-v2",
    "paraphrase-multilingual-mpnet-base-v2",
    "paraphrase-TinyBERT-L6-v2",
    "paraphrase-MiniLM-L6-v2",
    "paraphrase-albert-small-v2",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "paraphrase-MiniLM-L3-v2",
    "distiluse-base-multilingual-cased-v1",
    "distiluse-base-multilingual-cased-v2",
    "average_word_embeddings_komninos",
    "average_word_embeddings_glove.6B.300d",
]

# Load the selected model and embed the genre labels once, so the later
# per-book loops only have to encode the description.
model = SentenceTransformer(models[0])
category_embeddings = model.encode(categories)

In [3]:
def category_sim_pairs(categories, similarities):
    """Pair each category with its score and rank the pairs best-first.

    Args:
        categories: Sequence of category labels.
        similarities: Indexable collection of scores, where the score at
            position ``i`` belongs to ``categories[i]``. Any entries beyond
            ``len(categories)`` are ignored.

    Returns:
        A list of ``(category, similarity)`` tuples ordered by similarity,
        highest first. Pairs with equal scores keep their input order.

    Raises:
        IndexError: If ``similarities`` has fewer entries than ``categories``.
    """
    ranked = [
        (label, similarities[position])
        for position, label in enumerate(categories)
    ]
    # The sort is stable, so ties stay in their original relative order
    # even with reverse=True.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [None]:
# Rank the genre labels for every book in books.json, using the bi-encoder
# `model` and the `category_embeddings` computed in the setup cell.
with open('books.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for entry in data:
    description, title = entry["description"], entry["title"]

    # Embed the description, then compare it with every category embedding.
    # Only one text is encoded, so row 0 holds all of the scores.
    description_embedding = model.encode(description)
    similarity_matrix = util.cos_sim(description_embedding, category_embeddings)
    similarities = similarity_matrix[0].tolist()

    # Best-matching category first.
    pairs = category_sim_pairs(categories, similarities)

    print(f"Title: {title}:")
    # Uncomment to also show the raw description:
    # print(f"  Description: {description}")
    print("Categories:")
    for category, similarity in pairs:
        print(f"{category}: {similarity:.4f}")
    print("#" * 30)

## Sentence transformer comparison with more descriptive categories and descriptions 

In [None]:
# Replace the bare genre labels with full-sentence prompts, and re-embed them
# with the currently loaded `model`.
categories = [
    "This book is a romance novel about love and relationships.",
    "This book is a science fiction novel about futuristic concepts and technology.",
    "This book is a dystopian story set in a bleak or controlled society.",
    "This book is a thriller that is suspenseful and full of tension.",
    "This book is a historical fiction novel set in a past time period.",
    "This book is a drama focusing on serious and emotional storytelling.",
    "This book is a mystery novel involving investigation and secrets.",
    "This book is a fantasy novel featuring magic, mythical creatures, or supernatural elements.",
]
category_embeddings = model.encode(categories)

with open('books.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for entry in data:
    title, description = entry["title"], entry["description"]

    # Wrap the description in a prompt phrased like the category sentences
    # before embedding it.
    formatted_description = f"This book is about: {description} The genre of this book is:"
    description_embedding = model.encode(formatted_description)

    # A single text was encoded, so row 0 holds the score for each category.
    similarity_matrix = util.cos_sim(description_embedding, category_embeddings)
    similarities = similarity_matrix[0].tolist()
    pairs = category_sim_pairs(categories, similarities)

    print(f"Title: {title}")
    # Uncomment to also show the raw description:
    # print(f" Description: {description}")
    print("Categories:")
    for category, similarity in pairs:
        print(f"{category}: {similarity:.4f}")
    print("#" * 30)


## Cross Encoder Comparison 

In [None]:
# Score every (description, category) pair directly with a cross-encoder
# instead of comparing separately computed embeddings.
# NOTE(review): this rebinds the global `model` that earlier cells use as a
# SentenceTransformer; re-run the setup cell before going back to them.
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

with open('books.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for entry in data:
    description, title = entry["description"], entry["title"]

    # One input pair per category, all sharing the same description.
    # `categories` is whatever list was assigned most recently.
    pairs = [(description, label) for label in categories]
    scores = model.predict(pairs)

    # Highest-scoring category first.
    ranked_categories = category_sim_pairs(categories, scores)

    print(f"Title: {title}")
    print("Categories:")
    for category, score in ranked_categories:
        print(f"{category}: {score:.4f}")
    print("#" * 30)

## CLIP image comparison

In [None]:
# Load the CLIP checkpoint; the same `model` is used below to encode both
# images and text descriptions.
# NOTE(review): this rebinds the global `model` used by earlier cells.
model = SentenceTransformer("clip-ViT-B-32")

image_folder = "images/"

# Collect image filenames. The check is case-insensitive and requires the
# dot, so "cover.JPG" is included and a name that merely ends in "png"
# (without being a .png file) is not.
image_extensions = (".jpg", ".jpeg", ".png")
image_files = [
    file
    for file in os.listdir(image_folder)
    if file.lower().endswith(image_extensions)
]

# Map each filename to its embedding. Opening the image in a `with` block
# closes the underlying file handle as soon as the image has been encoded,
# instead of leaking one open handle per image.
image_embeddings = {}
for img_file in image_files:
    img_path = os.path.join(image_folder, img_file)
    with Image.open(img_path) as img:
        image_embeddings[img_file] = model.encode(img)

def rank_images(description, image_embeddings):
    """Rank stored images by their similarity to a text description.

    Uses the module-level ``model`` both to encode the description and to
    score it against each stored image embedding.

    Args:
        description: Text to compare against the images.
        image_embeddings: Mapping of image key (filename) to its embedding.

    Returns:
        A tuple of ``(image key, score)`` pairs sorted by score, highest
        first. Images with equal scores keep the mapping's iteration order.
    """
    text_emb = model.encode([description])

    def score_against(img_emb):
        # One text against one image: take the single entry of the result.
        return model.similarity(text_emb, img_emb)[0][0]

    ranking = [
        (img, score_against(img_emb))
        for img, img_emb in image_embeddings.items()
    ]
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return tuple(ranking)

# For every book in books.json, list all embedded images ordered by how well
# they match the book's description.
with open('books.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for entry in data:
    title, description = entry["title"], entry["description"]

    # Best-matching image first.
    ranked_images = rank_images(description, image_embeddings)

    print(f"Title: {title}")
    print("Most Similar Images:")
    for img, score in ranked_images:
        print(f"{img}: {score:.4f}")
    print("#" * 30)


## Sentence transformer comparison with [NV-Embed-v2](https://huggingface.co/spaces/mteb/leaderboard)

(A very large model that isn't working right now.)

In [None]:
# Disabled cell: the NV-Embed-v2 experiment below is wrapped in a string
# literal, so none of it executes. The text inside loads the model, embeds
# the categories and each book description with L2 normalisation, ranks the
# categories by scaled dot product, and frees memory after every book.
# To run it, remove the surrounding triple quotes.
'''
import json
import torch
import torch.nn.functional as F
import gc
from transformers import AutoModel


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained("nvidia/NV-Embed-v2", trust_remote_code=True).to(device)

query_prefix = "Instruct: Categorize the following book description into a genre.\nQuery: "
category_prefix = ""

MAX_LENGTH = 256

def get_normalized_embeddings(texts, instruction):
    with torch.no_grad():  # Prevents memory buildup
        embeddings = model.encode(
            texts, 
            instruction=instruction, 
            max_length=MAX_LENGTH
        ).to(device)
        return F.normalize(embeddings, p=2, dim=1)


category_embeddings = get_normalized_embeddings(categories, category_prefix)
def rank_categories(description, categories, category_embeddings):
    description_embedding = get_normalized_embeddings([description], query_prefix)
    scores = (description_embedding @ category_embeddings.T) * 100
    scores = scores[0].tolist()
    return sorted(zip(categories, scores), key=lambda x: x[1], reverse=True)

with open('books.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
for entry in data:
    title = entry["title"]
    description = entry["description"]
    ranked_categories = rank_categories(description, categories, category_embeddings)
    print(f"Title: {title}")
    print("Categories:")
    for category, score in ranked_categories:
        print(f"{category}: {score:.2f}")
    print("#" * 30)

    #FREE MEMORY AFTER EACH BOOK
    del ranked_categories
    torch.cuda.empty_cache()  # Clears VRAM (for GPU)
    gc.collect()  # Clears RAM (for CPU)
'''