# Import Libraries

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, VisualRepresentation
from bertopic.backend import MultiModalBackend
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP
import hdbscan
import base64
from io import BytesIO
from IPython.display import HTML

# Process Datasets

In [9]:
# Each dataset split lives in its own sub-folder of the raw-data directory.
TRAIN_FOLDER, TEST_FOLDER, VALID_FOLDER = (
    f'Datasets/RawData/{split_name}' for split_name in ('train', 'test', 'valid')
)

In [10]:
def load_image_paths_and_annotations(image_folder: str, annotation_file: str) -> tuple:
    """
    Load image paths, their class labels and the raw annotations.

    Rows whose image file is missing on disk, or whose filename cannot be joined
    into a path (e.g. an empty CSV cell read as NaN), are reported and skipped.

    :param image_folder: Folder containing the images.
    :param annotation_file: CSV file containing the annotations; it must provide
        a 'filename' and a 'class' column.
    :return: Tuple ``(image_paths, labels, annotations)``: the paths of the images
        found on disk, the 'class' value of each of those images (same order), and
        the full annotations DataFrame as read from the CSV (skipped rows included).
    :raises KeyError: If the annotation file has rows but lacks a required column.
    """
    annotations = pd.read_csv(annotation_file)
    image_paths = []
    labels = []

    # Nothing to iterate over: keep returning empty results for an empty file.
    if annotations.empty:
        return image_paths, labels, annotations

    # Fail once with a clear message instead of erroring on every row.
    missing_columns = [col for col in ('filename', 'class') if col not in annotations.columns]
    if missing_columns:
        raise KeyError(
            f'Annotation file {annotation_file} is missing required column(s): {missing_columns}'
        )

    # Iterating the two columns directly avoids building a Series per row and
    # guarantees `img_filename` is always bound when an error is reported.
    rows = zip(annotations['filename'], annotations['class'])
    for img_filename, label in tqdm(rows, total=annotations.shape[0], desc='Loading image paths'):
        try:
            img_path = os.path.join(image_folder, img_filename)
        except TypeError as e:
            # Non-string filename (e.g. NaN from an empty cell): report and skip the row.
            print(f'Error processing image {img_filename}: {e}')
            continue

        if os.path.exists(img_path):
            image_paths.append(img_path)
            labels.append(label)
        else:
            print(f'Image file {img_filename} does not exist at {img_path}')

    return image_paths, labels, annotations

def image_base64(im: object) -> str:
    """
    Convert an image to base64.

    :param im: Either a path to an image file, or an already-loaded image object
        exposing a PIL-style ``save(file, format)`` method.
    :return: Base64 encoding of the image. For a path this is the encoding of the
        file's bytes as stored on disk (no resizing or re-encoding is performed);
        for an image object it is the encoding of its JPEG serialisation.
    """
    if isinstance(im, str):
        # Encode the file contents directly; this needs nothing beyond the
        # standard library and works for any path that can be opened.
        with open(im, 'rb') as image_file:
            return base64.b64encode(image_file.read()).decode()

    # In-memory image: serialise it to JPEG in a temporary buffer, then encode.
    with BytesIO() as buffer:
        im.save(buffer, 'jpeg')
        return base64.b64encode(buffer.getvalue()).decode()

def image_formatter(im: str) -> str:
    """
    Build an inline HTML image tag so an image can be rendered in a Jupyter notebook.
    :param im: Image to embed; forwarded unchanged to ``image_base64``.
    :return: HTML ``<img>`` tag whose source is a base64 ``image/jpeg`` data URI.
    """
    encoded = image_base64(im)
    return '<img src="data:image/jpeg;base64,' + encoded + '">'

In [11]:
# Load every split (test, validation, train) from the `_annotations.csv` stored in its folder.
test_images, test_labels, test_annotations = load_image_paths_and_annotations(
    image_folder=TEST_FOLDER,
    annotation_file=os.path.join(TEST_FOLDER, '_annotations.csv'),
)
val_images, val_labels, val_annotations = load_image_paths_and_annotations(
    image_folder=VALID_FOLDER,
    annotation_file=os.path.join(VALID_FOLDER, '_annotations.csv'),
)
train_images, train_labels, train_annotations = load_image_paths_and_annotations(
    image_folder=TRAIN_FOLDER,
    annotation_file=os.path.join(TRAIN_FOLDER, '_annotations.csv'),
)

Loading image paths: 100%|██████████| 369/369 [00:00<00:00, 1367.93it/s]


Image file cocaine-powder-lines-rolled-banknote-and-drugs-in-plastic-bag-pocket-on-black-glass-surface-background-top-view-drug-addiction-concept-2NTT1Y1_jpg.rf.bf560732a757cb000614866c7fa8fc30.jpg does not exist at Datasets/RawData/test\cocaine-powder-lines-rolled-banknote-and-drugs-in-plastic-bag-pocket-on-black-glass-surface-background-top-view-drug-addiction-concept-2NTT1Y1_jpg.rf.bf560732a757cb000614866c7fa8fc30.jpg


Loading image paths: 100%|██████████| 730/730 [00:00<00:00, 3012.16it/s]
Loading image paths:  27%|██▋       | 721/2631 [00:00<00:00, 1934.06it/s]

Image file drugs-narcotics-business-concept-cocaine-plastic-packets-gun-us-dollars-banknotes-table-white-powder-addiction-crime_771335-12925_jpg.rf.4c4ad70061bde378530068b3c0e43156.jpg does not exist at Datasets/RawData/train\drugs-narcotics-business-concept-cocaine-plastic-packets-gun-us-dollars-banknotes-table-white-powder-addiction-crime_771335-12925_jpg.rf.4c4ad70061bde378530068b3c0e43156.jpg


Loading image paths: 100%|██████████| 2631/2631 [00:01<00:00, 2227.84it/s]

Image file drugs-and-dollars-on-glass-black-table-narcotic-powder-divided-into-stripes-on-a-mirrored-table-one-hundred-dollar-tube-for-drug-use-2FMY1GY_jpg.rf.c501c958fefe6e5f2ced1d5d7ffd6231.jpg does not exist at Datasets/RawData/train\drugs-and-dollars-on-glass-black-table-narcotic-powder-divided-into-stripes-on-a-mirrored-table-one-hundred-dollar-tube-for-drug-use-2FMY1GY_jpg.rf.c501c958fefe6e5f2ced1d5d7ffd6231.jpg





In [12]:
# Fit on train + test together (train items first); the validation split is kept aside.
images = [*train_images, *test_images]
labels = [*train_labels, *test_labels]

# Prepare Model

## Pre-Compute embeddings

In [13]:
# Make sure the output folder exists *before* the long embedding run, so the
# results are not lost to a missing-directory error when saving at the end.
os.makedirs('Embeddings', exist_ok=True)

embedding_model = MultiModalBackend('clip-ViT-B-32', batch_size=32)

# Embeddings computed from the labels together with the images.
embedd = embedding_model.embed(documents=labels, images=images, verbose=True)
# Embeddings computed from the images only.
embedd_only_images = embedding_model.embed_images(images=images, verbose=True)

# Persist both arrays so later cells can reload them instead of recomputing.
np.savez_compressed('Embeddings/embedding_images.npz', embedd)
np.savez_compressed('Embeddings/embedding_only_images.npz', embedd_only_images)

100%|██████████| 94/94 [08:36<00:00,  5.50s/it]
100%|██████████| 94/94 [08:57<00:00,  5.72s/it]


In [14]:
# Reload the cached embeddings; each archive holds one unnamed array, stored under 'arr_0'.
with np.load('Embeddings/embedding_only_images.npz') as image_only_archive:
    with np.load('Embeddings/embedding_images.npz') as joint_archive:
        embedd_only_images = image_only_archive['arr_0']
        embedd = joint_archive['arr_0']

## Build BERTopic Model

In [20]:
# Text-side components: keyword representation, bag-of-words vectorizer and c-TF-IDF weighting.
kw = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Dimensionality reduction (seeded) followed by density-based clustering.
umap_model = UMAP(
    n_neighbors=150,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=50,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Embeddings image models
embedding_model = MultiModalBackend('clip-ViT-B-32', batch_size=32)

# Visual model
visual_model = VisualRepresentation(
    image_to_text_model="nlpconnect/vit-gpt2-image-captioning",
    nr_samples=20,
    nr_repr_images=500,
    image_height=800,
)

# Each key becomes an extra aspect/column of the fitted model's topic info.
representation_model = dict(Visual_Aspect=visual_model, KeyBERTInspired=kw)

# NOTE(review): BERTopic's documentation states that `n_gram_range` is not used when a
# custom `vectorizer_model` is passed, and `min_topic_size` is not used when a custom
# `hdbscan_model` is passed — confirm whether these two arguments are still wanted here.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    min_topic_size=50,
    top_n_words=5,
    n_gram_range=(1, 3),
    verbose=True,
)

# Fit on the labels and images, reusing the pre-computed image-only embeddings.
topics, probs = topic_model.fit_transform(
    documents=labels,
    images=images,
    embeddings=embedd_only_images,
)

2024-08-26 15:56:48,452 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-26 15:57:16,868 - BERTopic - Dimensionality - Completed ✓
2024-08-26 15:57:16,887 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-26 15:57:17,278 - BERTopic - Cluster - Completed ✓
2024-08-26 15:57:17,398 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 5/5 [00:01<00:00,  3.59it/s]
2024-08-26 15:57:20,164 - BERTopic - Representation - Completed ✓


# Show Results

In [21]:
# Inspect the "Visual_Aspect" representation stored on the fitted model
# (the cell output shows one PIL image per topic id).
topic_model.topic_aspects_["Visual_Aspect"]

{-1: <PIL.Image.Image image mode=RGB size=418x800>,
 0: <PIL.Image.Image image mode=RGB size=320x800>,
 1: <PIL.Image.Image image mode=RGB size=416x800>,
 2: <PIL.Image.Image image mode=RGB size=213x800>,
 3: <PIL.Image.Image image mode=RGB size=267x800>}

In [22]:
# Extract dataframe: select only the columns to display (selecting them directly
# makes a preceding drop of the other columns unnecessary).
df = topic_model.get_topic_info()[["Topic", "Count", "KeyBERTInspired", "Visual_Aspect"]]

# Visualize the images: render the table as HTML, turning each "Visual_Aspect" cell into
# an inline <img> tag (escape=False so the generated tags are not HTML-escaped).
HTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False))

Unnamed: 0,Topic,Count,KeyBERTInspired,Visual_Aspect
0,-1,151,"[drug, people, , , , , , , , ]",
1,0,2152,"[gun, , , , , , , , , ]",
2,1,342,"[drug, , , , , , , , , ]",
3,2,287,"[people, , , , , , , , , ]",
4,3,65,"[people, drug, , , , , , , , ]",


# Model Graphs

In [23]:
# Bar-chart view of the fitted topic model; as the cell's last expression, its result is rendered.
topic_model.visualize_barchart()

![image-2.png](attachment:image-2.png)

In [26]:
# Topic-level overview plot of the fitted model (default settings).
topic_model.visualize_topics()

![image-2.png](attachment:image-2.png)

In [27]:
# Heatmap view of the fitted topics (default settings).
topic_model.visualize_heatmap()

![image-2.png](attachment:image-2.png)

In [28]:
# Hierarchy view of the fitted topics (default settings).
topic_model.visualize_hierarchy()

![image-2.png](attachment:image-2.png)

In [29]:
# Project the label+image embeddings to 2-D for plotting. The seed matches the one used
# for the model's own UMAP so the layout is reproducible from run to run.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embedd)

In [30]:
# Plot the documents using the pre-computed 2-D projection; per-document hover text is disabled.
topic_model.visualize_documents(labels, reduced_embeddings=reduced_embeddings, hide_document_hover=True)

![image-2.png](attachment:image-2.png)

In [None]:
# Data-map view of the documents, computed from the full (non-reduced) label+image embeddings.
topic_model.visualize_document_datamap(labels, embeddings=embedd)

![4DataMap-2.png](attachment:4DataMap-2.png)

In [32]:
# Term-rank plot for the fitted topics, drawn with a logarithmic scale.
topic_model.visualize_term_rank(log_scale=True)

![image-2.png](attachment:image-2.png)

# Save Model

In [33]:
# Save the model (including its c-TF-IDF data) with safetensors serialization.
# BERTopic documents `save_embedding_model` as a bool or, for safetensors/pytorch
# serialization, a string pointing to a Hugging Face model — so pass the model id
# of the CLIP encoder rather than the backend object.
topic_model.save(
    "Models/topic_visual_model_safetensors",
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/clip-ViT-B-32",
)

In [None]:
# Publish the model (including its c-TF-IDF data) to the Hugging Face Hub.
# `save_embedding_model` is documented as a bool or a string pointing to a
# Hugging Face model, so pass the CLIP model id rather than the backend object.
topic_model.push_to_hf_hub(
    repo_id="D0men1c0/ISSR_Visual_Model",
    save_embedding_model="sentence-transformers/clip-ViT-B-32",
    save_ctfidf=True
)

# Predict

In [35]:
# Reload the model by its identifier (the same id used as `repo_id` when pushing) and attach
# a fresh CLIP multimodal backend. This rebinds `topic_model`, replacing the instance fitted above.
topic_model = BERTopic.load('D0men1c0/ISSR_Visual_Model', embedding_model=MultiModalBackend('clip-ViT-B-32', batch_size=32))

In [None]:
# NOTE(review): `sentence` is assigned but not used in this cell — the prediction below
# runs on the validation split instead. Confirm whether it is still needed.
sentence = ["coke"]
# Assign a topic to every validation label/image pair; the second return value is discarded.
topic, _ = topic_model.transform(val_labels, images=val_images)
# Show the topic info for the topic assigned to the first validation item.
topic_model.get_topic_info(topic[0])

In [40]:
# Display the topic id assigned to each validation item.
topic

array([ 0,  0,  3,  0,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  0,  0,  0,
        0,  1,  0,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,
        0,  0,  0,  0,  0,  1,  2,  0,  2,  0,  0,  0,  1,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  1,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,
        1,  3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,
        0,  0,  0,  0,  1,  1,  0,  2,  2,  0,  0,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  0,  0,  0,  2,  0,  0,  2,  2,  2,  2,  0,  1,  0,  2,
        1,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  2,  1,  0,  1,
        1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  3,  3,  1,  0,  1,  0,  0,
        1,  0,  1,  0,  0,  0,  0,  1,  0,  0,  2,  2,  2,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  1,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  1,  0