# Classification with Embeddings

Documentation: https://sbert.net/index.html

- LDA / topic modeling / clustering

## Imports

In [1]:
import json
from sentence_transformers import SentenceTransformer, util, CrossEncoder
import torch.nn.functional as F
from transformers import AutoModel

import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import json
import os
from sentence_transformers import SentenceTransformer
from PIL import Image

## Sentence transformer comparison

In [2]:
# Short genre labels; they are embedded once at the bottom of this cell.
categories = ["Romance","Science Fiction","Dystopian","Thriller","Historical Fiction","Drama","Mystery","Fantasy"]

# Candidate model names. Only the first entry (index 0) is passed to
# SentenceTransformer in this cell; the others are listed but not loaded here.
models = [
    "all-mpnet-base-v2",
    "gtr-t5-xxl",
    "gtr-t5-xl",
    "sentence-t5-xxl",
    "gtr-t5-large",
    "all-mpnet-base-v1",
    "multi-qa-mpnet-base-dot-v1",
    "multi-qa-mpnet-base-cos-v1",
    "all-roberta-large-v1",
    "sentence-t5-xl",
    "all-distilroberta-v1",
    "all-MiniLM-L12-v1",
    "all-MiniLM-L12-v2",
    "multi-qa-distilbert-dot-v1",
    "multi-qa-distilbert-cos-v1",
    "gtr-t5-base",
    "sentence-t5-large",
    "all-MiniLM-L6-v2",
    "multi-qa-MiniLM-L6-cos-v1",
    "all-MiniLM-L6-v1",
    "paraphrase-mpnet-base-v2",
    "msmarco-bert-base-dot-v5",
    "multi-qa-MiniLM-L6-dot-v1",
    "sentence-t5-base",
    "msmarco-distilbert-base-tas-b",
    "msmarco-distilbert-dot-v5",
    "paraphrase-distilroberta-base-v2",
    "paraphrase-MiniLM-L12-v2",
    "paraphrase-multilingual-mpnet-base-v2",
    "paraphrase-TinyBERT-L6-v2",
    "paraphrase-MiniLM-L6-v2",
    "paraphrase-albert-small-v2",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "paraphrase-MiniLM-L3-v2",
    "distiluse-base-multilingual-cased-v1",
    "distiluse-base-multilingual-cased-v2",
    "average_word_embeddings_komninos",
    "average_word_embeddings_glove.6B.300d"
]


# Load the first listed model and embed every genre label once, so later cells
# can reuse `model` and `category_embeddings` without recomputing them.
model = SentenceTransformer(models[0])
category_embeddings = model.encode(categories)

In [3]:
def category_sim_pairs(categories, similarities):
    """Pair each category with its score and rank the pairs best-first.

    Args:
        categories: Sequence of category labels.
        similarities: Indexable collection of scores where ``similarities[i]``
            is the score of ``categories[i]``. Entries past
            ``len(categories)`` are never read.

    Returns:
        A list of ``(category, similarity)`` tuples ordered by similarity from
        highest to lowest; equal scores keep their original relative order.
    """
    ranked = [(categories[i], similarities[i]) for i in range(len(categories))]
    # In-place stable sort on the score component, highest first.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [4]:
# Read the book records; each entry supplies a "description" and a "title".
with open('books.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for entry in data:
    description, title = entry["description"], entry["title"]

    # Embed the description, then read its cosine similarity to every category
    # embedding (the first row of the similarity matrix, as a plain list).
    description_embedding = model.encode(description)
    similarity_matrix = util.cos_sim(description_embedding, category_embeddings)
    similarities = similarity_matrix[0].tolist()
    pairs = category_sim_pairs(categories, similarities)

    # Report the categories from most to least similar.
    print(f"Title: {title}:")
    print("Categories:")
    for category, similarity in pairs:
        print(f"{category}: {similarity:.4f}")
    print("#" * 30)

Title: Pride and Prejudice:
Categories:
Historical Fiction: 0.4853
Romance: 0.2863
Drama: 0.2737
Science Fiction: 0.2226
Dystopian: 0.2207
Thriller: 0.2063
Fantasy: 0.1531
Mystery: 0.1091
##############################
Title: 1984:
Categories:
Dystopian: 0.4533
Historical Fiction: 0.3936
Science Fiction: 0.3577
Thriller: 0.3305
Drama: 0.2296
Romance: 0.1940
Mystery: 0.1890
Fantasy: 0.1253
##############################
Title: To Kill a Mockingbird:
Categories:
Historical Fiction: 0.4622
Dystopian: 0.3111
Thriller: 0.2244
Drama: 0.2138
Romance: 0.1927
Science Fiction: 0.1880
Fantasy: 0.1580
Mystery: 0.0943
##############################
Title: Moby Dick:
Categories:
Historical Fiction: 0.4002
Science Fiction: 0.2494
Thriller: 0.2146
Romance: 0.2027
Drama: 0.1939
Fantasy: 0.1662
Mystery: 0.1526
Dystopian: 0.1231
##############################
Title: The Great Gatsby:
Categories:
Historical Fiction: 0.3894
Romance: 0.2875
Dystopian: 0.2623
Drama: 0.2016
Fantasy: 0.1996
Science Fiction: 0.

## Sentence transformer comparison with more descriptive categories and descriptions 

In [5]:
# Full-sentence prompts, one per genre. They are bound to their own names so
# the notebook-wide `categories` / `category_embeddings` (the short genre
# labels) are not overwritten: other cells read those globals, and clobbering
# them makes results depend on which cell happened to run last.
descriptive_categories = [
    "This book is a romance novel about love and relationships.",
    "This book is a science fiction novel about futuristic concepts and technology.",
    "This book is a dystopian story set in a bleak or controlled society.",
    "This book is a thriller that is suspenseful and full of tension.",
    "This book is a historical fiction novel set in a past time period.",
    "This book is a drama focusing on serious and emotional storytelling.",
    "This book is a mystery novel involving investigation and secrets.",
    "This book is a fantasy novel featuring magic, mythical creatures, or supernatural elements."
]

# Embed every prompt once; the embeddings are reused for each book below.
descriptive_category_embeddings = model.encode(descriptive_categories)


with open('books.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for entry in data:
    title = entry["title"]
    description = entry["description"]

    # Wrap the raw description in a sentence shaped like the genre prompts
    # before embedding it.
    formatted_description = f"This book is about: {description} The genre of this book is:"
    description_embedding = model.encode(formatted_description)

    # First row of the similarity matrix: one cosine score per genre prompt.
    similarities = util.cos_sim(description_embedding, descriptive_category_embeddings)[0].tolist()
    pairs = category_sim_pairs(descriptive_categories, similarities)

    print(f"Title: {title}")
    print("Categories:")
    for category, similarity in pairs:
        print(f"{category}: {similarity:.4f}")
    print("#"*30)


Title: Pride and Prejudice
Categories:
This book is a historical fiction novel set in a past time period.: 0.5110
This book is a romance novel about love and relationships.: 0.4981
This book is a dystopian story set in a bleak or controlled society.: 0.4380
This book is a drama focusing on serious and emotional storytelling.: 0.4352
This book is a mystery novel involving investigation and secrets.: 0.3859
This book is a fantasy novel featuring magic, mythical creatures, or supernatural elements.: 0.3715
This book is a thriller that is suspenseful and full of tension.: 0.3564
This book is a science fiction novel about futuristic concepts and technology.: 0.3388
##############################
Title: 1984
Categories:
This book is a dystopian story set in a bleak or controlled society.: 0.7005
This book is a science fiction novel about futuristic concepts and technology.: 0.5871
This book is a thriller that is suspenseful and full of tension.: 0.5838
This book is a mystery novel involving 

## Cross Encoder Comparison 

In [None]:
# Keep the cross-encoder under its own name instead of rebinding `model`.
# Earlier cells call `model.encode(...)`, while this object is only used through
# `.predict(...)`, so reusing the name would break re-running those cells.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

with open('books.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# NOTE(review): `categories` is read from notebook state. The recorded output of
# this cell lists the short genre labels; confirm that is the list in scope
# when this cell runs.
for entry in data:
    description = entry["description"]
    title = entry["title"]

    # One (description, category) input per category; the scores returned by
    # predict() are indexed in the same order as `categories`.
    pairs = [(description, category) for category in categories]
    scores = cross_encoder.predict(pairs)

    ranked_categories = category_sim_pairs(categories, scores)

    print(f"Title: {title}")
    print("Categories:")
    for category, score in ranked_categories:
        print(f"{category}: {score:.4f}")
    print("#" * 30)

Title: Pride and Prejudice
Categories:
Romance: -4.8088
Mystery: -5.0798
Historical Fiction: -5.3943
Thriller: -5.4621
Fantasy: -5.8411
Drama: -6.2789
Science Fiction: -7.0336
Dystopian: -7.0575
##############################
Title: 1984
Categories:
Dystopian: -1.4093
Science Fiction: -5.0983
Historical Fiction: -5.5591
Thriller: -6.1774
Mystery: -7.1053
Fantasy: -7.5880
Drama: -7.6347
Romance: -8.1789
##############################
Title: To Kill a Mockingbird
Categories:
Thriller: -5.5741
Drama: -5.8375
Mystery: -6.1275
Historical Fiction: -6.3713
Romance: -6.4437
Fantasy: -6.5214
Science Fiction: -7.0558
Dystopian: -7.2150
##############################
Title: Moby Dick
Categories:
Mystery: -7.1695
Thriller: -7.4324
Fantasy: -7.6626
Drama: -8.3281
Romance: -8.3286
Historical Fiction: -8.5514
Science Fiction: -8.9625
Dystopian: -9.1300
##############################
Title: The Great Gatsby
Categories:
Romance: -7.9369
Mystery: -8.5277
Thriller: -8.8672
Fantasy: -8.9590
Drama: -9.0997

## CLIP image comparison

In [22]:
# CLIP model used for both the images here and the text in rank_images().
model = SentenceTransformer("clip-ViT-B-32")

image_folder = "images/"

# Collect image filenames. The extension check includes the leading dot and is
# case-insensitive, so "cover.JPG" is picked up and a name that merely ends in
# the letters "png" is not. Sorting makes the processing order (and therefore
# the order of `image_embeddings`) deterministic, since os.listdir() returns
# entries in arbitrary order.
image_extensions = (".jpg", ".jpeg", ".png")
image_files = sorted(
    file for file in os.listdir(image_folder)
    if file.lower().endswith(image_extensions)
)

# Map each filename to its embedding. The image is opened in a `with` block so
# the underlying file handle is closed as soon as the embedding is computed.
image_embeddings = {}
for img_file in image_files:
    img_path = os.path.join(image_folder, img_file)
    with Image.open(img_path) as img:
        img_emb = model.encode(img)
    image_embeddings[img_file] = img_emb

def rank_images(description, image_embeddings):
    """Rank stored image embeddings by similarity to a text description.

    Uses the module-level ``model`` both to embed the description and to
    compute the similarity between that embedding and each image embedding.

    Args:
        description: Text to compare against the images.
        image_embeddings: Mapping from an image key to its embedding.

    Returns:
        A tuple of ``(image key, score)`` pairs ordered from highest to lowest
        score; equal scores keep the mapping's iteration order.
    """
    query_embedding = model.encode([description])
    # The similarity call yields a matrix; [0][0] is its single entry for this
    # text/image pair.
    ranked = [
        (name, model.similarity(query_embedding, embedding)[0][0])
        for name, embedding in image_embeddings.items()
    ]
    ranked.sort(key=lambda item: item[1], reverse=True)
    return tuple(ranked)

# Read the book records; each entry supplies a "title" and a "description".
with open('books.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for entry in data:
    title, description = entry["title"], entry["description"]

    # Images ordered from best to worst match for this description.
    ranked_images = rank_images(description, image_embeddings)

    print(f"Title: {title}", "Most Similar Images:", sep="\n")
    for img, score in ranked_images:
        print(f"{img}: {score:.4f}")
    print("#" * 30)


Title: Pride and Prejudice
Most Similar Images:
pride_and_prejudice.jpg: 0.3069
To_Kill_a_Mockingbird.jpg: 0.2852
The_Great_Gatsby.jpeg: 0.2795
1984.jpeg: 0.2705
moby_dick.jpg: 0.2521
War_and_Peace.jpg: 0.2456
The_Fellowship_of_the_Ring.jpg: 0.2324
Fahrenheit_451.jpg: 0.2226
The_Hitchhiker's_Guide_to_the_Galaxy.jpg: 0.2203
Harry_Potter_and_the_Philosopher's_Stone.jpg: 0.2000
##############################
Title: 1984
Most Similar Images:
1984.jpeg: 0.2572
The_Fellowship_of_the_Ring.jpg: 0.2480
The_Hitchhiker's_Guide_to_the_Galaxy.jpg: 0.2455
moby_dick.jpg: 0.2401
The_Great_Gatsby.jpeg: 0.2365
Fahrenheit_451.jpg: 0.2345
pride_and_prejudice.jpg: 0.2177
To_Kill_a_Mockingbird.jpg: 0.2166
War_and_Peace.jpg: 0.2014
Harry_Potter_and_the_Philosopher's_Stone.jpg: 0.1988
##############################
Title: To Kill a Mockingbird
Most Similar Images:
To_Kill_a_Mockingbird.jpg: 0.2814
The_Great_Gatsby.jpeg: 0.2628
moby_dick.jpg: 0.2390
1984.jpeg: 0.2172
The_Hitchhiker's_Guide_to_the_Galaxy.jpg: 0

## Sentence transformer comparison with [NV-Embed-v2](https://huggingface.co/spaces/mteb/leaderboard)

(A really big model that isn't working right now.)

In [13]:
# Disabled experiment: everything below is wrapped in a triple-quoted string, so
# none of it executes. The cell only evaluates the string literal itself.
'''
import json
import torch
import torch.nn.functional as F
import gc
from transformers import AutoModel


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained("nvidia/NV-Embed-v2", trust_remote_code=True).to(device)

query_prefix = "Instruct: Categorize the following book description into a genre.\nQuery: "
category_prefix = ""

MAX_LENGTH = 256

def get_normalized_embeddings(texts, instruction):
    with torch.no_grad():  # Prevents memory buildup
        embeddings = model.encode(
            texts, 
            instruction=instruction, 
            max_length=MAX_LENGTH
        ).to(device)
        return F.normalize(embeddings, p=2, dim=1)


category_embeddings = get_normalized_embeddings(categories, category_prefix)
def rank_categories(description, categories, category_embeddings):
    description_embedding = get_normalized_embeddings([description], query_prefix)
    scores = (description_embedding @ category_embeddings.T) * 100
    scores = scores[0].tolist()
    return sorted(zip(categories, scores), key=lambda x: x[1], reverse=True)

with open('books.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
for entry in data:
    title = entry["title"]
    description = entry["description"]
    ranked_categories = rank_categories(description, categories, category_embeddings)
    print(f"Title: {title}")
    print("Categories:")
    for category, score in ranked_categories:
        print(f"{category}: {score:.2f}")
    print("#" * 30)

    #FREE MEMORY AFTER EACH BOOK
    del ranked_categories
    torch.cuda.empty_cache()  # Clears VRAM (for GPU)
    gc.collect()  # Clears RAM (for CPU)
'''

'\nimport json\nimport torch\nimport torch.nn.functional as F\nimport gc\nfrom transformers import AutoModel\n\n# Define categories\ncategories = ["Romance", "Science Fiction", "Dystopian", "Thriller", \n              "Historical Fiction", "Drama", "Mystery", "Fantasy"]\n\n# Load Model\ndevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")\nmodel = AutoModel.from_pretrained("nvidia/NV-Embed-v2", trust_remote_code=True).to(device)\n\n# Set prefix instructions\nquery_prefix = "Instruct: Categorize the following book description into a genre.\nQuery: "\ncategory_prefix = ""\n\n# Lower max_length to prevent excessive memory usage\nMAX_LENGTH = 256  # Reduced from 512 to 256\n\n# Function to encode text efficiently\ndef get_normalized_embeddings(texts, instruction):\n    with torch.no_grad():  # Prevents memory buildup\n        embeddings = model.encode(\n            texts, \n            instruction=instruction, \n            max_length=MAX_LENGTH\n        ).to(device)\n    