In [20]:
import pandas as pd
import numpy as np
import nltk
import torch
import numpy as np
import nltk
from bertopic import BERTopic
from nltk import accuracy
from sklearn.model_selection import KFold

from helper_functions import preprocess_dataset, subset_sampler
from sklearn.model_selection import train_test_split
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
from itertools import zip_longest

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Christian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Christian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
np.random.seed(1234)

genre_encoder = MultiLabelBinarizer()

df, df_unknown_genre = preprocess_dataset('data/wiki_movie_plots_deduped.csv')
df = subset_sampler(df, n=5000)

X = df['Processed_Plot'].values
y = genre_encoder.fit_transform(df['Genre'].values)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)



In [22]:
len(genre_encoder.classes_)

15

In [23]:
def get_topics(text, topic_model, min_score=0.5, max_topics=3):
    topic_ids, scores = topic_model.find_topics(text)

    # Filter topics by threshold
    filtered = [(tid, score) for tid, score in zip(topic_ids, scores) if score >= min_score]

    if not filtered:  # no topic passes threshold
        return [-1], [1]

    # Return at most max_topics
    filtered = filtered[:max_topics]

    # Separate IDs and scores
    filtered_ids, filtered_scores = zip(*filtered)
    return list(filtered_ids), list(filtered_scores)

In [24]:
n_genres = len(genre_encoder.classes_)

kmeans_clusters = np.arange(20, 101, 10)
hdbscan_min_cluster_size = np.arange(50, 301, 25)
num_folds = 5

f1_scores_kmeans = np.zeros((num_folds, len(kmeans_clusters)))
accuracy_kmeans  = np.zeros((num_folds, len(kmeans_clusters)))

f1_scores_hdbscan = np.zeros((num_folds, len(hdbscan_min_cluster_size)))
accuracy_hdbscan  = np.zeros((num_folds, len(hdbscan_min_cluster_size)))


outer_kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

for fold_idx, (inner_train_idx, validation_idx) in enumerate(outer_kf.split(X)):
    X_inner_train, X_validation = X[inner_train_idx], X[validation_idx]
    y_inner_train, y_validation = y[inner_train_idx], y[validation_idx]

    majority_class = np.bincount(np.argmax(y_inner_train, axis=1)).argmax()


    for k_idx, kmeans_k in enumerate(kmeans_clusters):

        kmeans_model = KMeans(n_clusters=kmeans_k, random_state=42)
        topic_model_kmeans = BERTopic(hdbscan_model=kmeans_model)
        X_inner_train_topics_kmeans, X_inner_train_scores_kmeans = topic_model_kmeans.fit_transform(X_inner_train)
        X_inner_train_topics_kmeans, X_inner_train_scores_kmeans = np.array(X_inner_train_topics_kmeans), np.array(X_inner_train_scores_kmeans)

        topic_genre_matrix = np.zeros((kmeans_k, n_genres), dtype=int)
        for i,j in zip(X_inner_train_topics_kmeans, y_inner_train): #
            topic_genre_matrix[X_inner_train_topics_kmeans[i] , :] += j

        max_cols = np.argmax(topic_genre_matrix, axis=1)
        max_dict = {i: col.item() for i, col in enumerate(max_cols)}

        y_pred_topics = [get_topics(text, topic_model_kmeans, max_topics=1) for text in X_validation]

        #print(y_pred_topics[0][0])
        #print(list(map(lambda x: x[0] if isinstance(x, tuple) else x, y_pred_topics)))
        #print(list(map(lambda x: x[0][0] if isinstance(x, tuple) else x, y_pred_topics)))
        

        y_pred_topics_flat = list(map(lambda x: x[0][0] if isinstance(x, tuple) else x, y_pred_topics))
        #y_pred_topics_flat = list(map(lambda x: x[0] if isinstance(x, list) else x, y_pred_topics))

        #print(y_pred_topics_flat)

        y_pred_genres = np.array(list(map(max_dict.get, y_pred_topics_flat)))

        #print(y_pred_genres)

        ### Evaluation
        #correct = [y_validation[i, pred] == 1 for (i, pred) in enumerate(y_pred_genres)]
        # Fixed list comprehension
        correct = []
        for i, pred in enumerate(y_pred_genres):
            if pred is None:
                # If prediction is missing, it's automatically incorrect
                correct.append(False)
            else:
                # Perform the actual check
                correct.append(y_validation[i, int(pred)] == 1)

        #print(correct)

        accuracy_kmeans[fold_idx,k_idx] = np.mean(correct).item()

    for h_idx, hdbscan_size in enumerate(hdbscan_min_cluster_size):

        hdbscan_model = HDBSCAN(
            min_cluster_size=hdbscan_size,
            metric='euclidean',
            cluster_selection_method='eom'
        )
        topic_model_hdbscan = BERTopic(hdbscan_model=hdbscan_model) #BERTopic(hdbscan_model=hdbscan_model)
        X_inner_train_topics_dbscan, X_inner_train_scores_dbscan = topic_model_hdbscan.fit_transform(X_inner_train)
        X_inner_train_topics_dbscan, X_inner_train_scores_dbscan = np.array(X_inner_train_topics_dbscan), np.array(X_inner_train_scores_dbscan)

        n_topics = len(np.unique(X_inner_train_topics_dbscan)) # number of unique topics

        topic_genre_matrix = np.zeros((n_topics, n_genres), dtype=int)
        for i,j in zip(X_inner_train_topics_dbscan, y_inner_train): # the topic corresponds to -1 (trash)
            topic_genre_matrix[X_inner_train_topics_dbscan[i] , :] += j # can  be down more efficiently


        max_cols = np.argmax(topic_genre_matrix, axis=1)
        max_dict = {i: col.item() for i, col in enumerate(max_cols)}

        y_pred_topics = [get_topics(text, topic_model_hdbscan, max_topics=1) for text in X_validation]
        #y_pred_topics_flat = list(map(lambda x: x[0] if isinstance(x, list) else x, y_pred_topics))
        y_pred_topics_flat = list(map(lambda x: x[0][0] if isinstance(x, tuple) else x, y_pred_topics))

        y_pred_genres = np.array(list(
            map(lambda t: max_dict[t] if t != -1 else max_dict[n_topics - 1], y_pred_topics_flat)
        ), dtype=int)

        #correct = [y_validation[i, pred] == 1 for i, pred in enumerate(y_pred_genres)]
        correct = []
        for i, pred in enumerate(y_pred_genres):
            if pred is None:
                # If prediction is missing, it's automatically incorrect
                correct.append(False)
            else:
                # Perform the actual check
                correct.append(y_validation[i, int(pred)] == 1)
        accuracy_hdbscan[fold_idx,h_idx] = np.mean(correct).item()



mean_accuracy_kmeans = accuracy_kmeans.mean(axis=0)
print("Mean accuracy of K-means per number of clusters:")
for k, acc in zip(kmeans_clusters, mean_accuracy_kmeans):
    print(f"{k:>3}  →  {acc:.4f}")

mean_accuracy_hdbscan = accuracy_hdbscan.mean(axis=0)
print("Mean accuracy of HDBscan per min cluster size:")
for k, acc in zip(hdbscan_min_cluster_size, mean_accuracy_hdbscan):
    print(f"{k:>3}  →  {acc:.4f}")



Mean accuracy of K-means per number of clusters:
 20  →  0.1932
 30  →  0.1898
 40  →  0.2008
 50  →  0.2042
 60  →  0.1938
 70  →  0.2042
 80  →  0.2060
 90  →  0.2212
100  →  0.2006
Mean accuracy of HDBscan per min cluster size:
 50  →  0.3078
 75  →  0.3344
100  →  0.2898
125  →  0.2850
150  →  0.2896
175  →  0.2628
200  →  0.2586
225  →  0.3012
250  →  0.3124
275  →  0.3166
300  →  0.3120


In [25]:
y_pred_topics

[([-1], [0.5065247]),
 ([-1], [0.58486444]),
 ([-1], [1]),
 ([-1], [0.59562176]),
 ([0], [0.5501799]),
 ([0], [0.56049424]),
 ([-1], [1]),
 ([1], [0.5370527]),
 ([1], [0.6228583]),
 ([0], [0.64433706]),
 ([-1], [0.50917006]),
 ([0], [0.6387816]),
 ([-1], [1]),
 ([0], [0.5951599]),
 ([-1], [1]),
 ([-1], [1]),
 ([0], [0.5973784]),
 ([-1], [1]),
 ([-1], [0.501987]),
 ([-1], [1]),
 ([-1], [0.55055153]),
 ([0], [0.56467474]),
 ([1], [0.5849982]),
 ([1], [0.69094616]),
 ([-1], [0.55045396]),
 ([0], [0.6370654]),
 ([-1], [0.56096]),
 ([1], [0.6920403]),
 ([0], [0.55378294]),
 ([-1], [1]),
 ([0], [0.56326026]),
 ([1], [0.6061687]),
 ([-1], [1]),
 ([-1], [1]),
 ([0], [0.5376]),
 ([-1], [1]),
 ([-1], [0.5279813]),
 ([-1], [1]),
 ([-1], [1]),
 ([-1], [1]),
 ([0], [0.61424035]),
 ([0], [0.5712607]),
 ([0], [0.6396371]),
 ([1], [0.61452186]),
 ([-1], [1]),
 ([0], [0.5214735]),
 ([0], [0.5047509]),
 ([-1], [1]),
 ([-1], [1]),
 ([-1], [1]),
 ([-1], [0.5143128]),
 ([-1], [1]),
 ([-1], [1]),
 ([-1], [1

In [26]:
topic_genre_matrix

array([[ 224,  108,  103,  917,  152,   17, 1208,   78,  222,  113,   71,
         117,  130,   14,  200],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [ 167,    5,    5,  157,   24,    3,  283,    5,   20,   12,    5,
         205,    4,    1,   65]])

In [28]:
new_matrix = topic_genre_matrix.copy()

print(new_matrix.shape, X_inner_train_topics_kmeans.shape, y_inner_train.shape)

np.add.at(new_matrix, X_inner_train_topics_kmeans, y_inner_train)
new_matrix

(3, 15) (4000,) (4000, 15)


IndexError: index 16 is out of bounds for axis 0 with size 3

In [None]:
topic_genre_matrix

In [None]:
topic_model_hdbscan.get_topic_info()

In [None]:
topic_model_kmeans.get_topic_info()

In [None]:
counts = df_train['Genre'].value_counts()
print(counts)

In [None]:
topic_model_hdbscan.visualize_topics()


In [None]:
df_test['Genre'].unique()

In [None]:
len(df_test['Genre'].unique())

In [None]:
df_train

In [None]:
df_train

In [None]:
car = {
  "brand": "Ford",
  "model": "Mustang",
  "year": 1964
}

x = car.get("price", 15000)

print(x)