In [35]:
import pandas as pd
import numpy as np
import nltk
from bertopic import BERTopic
from nltk import accuracy
from sklearn.model_selection import KFold

from helper_functions import preprocess_dataset, subset_sampler
from sklearn.model_selection import train_test_split
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
from itertools import zip_longest

# Fetch the NLTK 'stopwords' and 'punkt' resources (the saved log shows this is a
# no-op when they are already on disk).
# NOTE(review): no visible cell uses these resources directly - presumably they are
# required by helper_functions; confirm.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/Tomas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Tomas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
# Seed NumPy's global RNG so NumPy-based randomness in later cells is repeatable.
np.random.seed(1234)

# Encoder that turns each row's collection of genre labels into a multi-hot vector.
genre_encoder = MultiLabelBinarizer()

# Project helper returning two frames; only the first one is used in this notebook.
# NOTE(review): presumably the second frame holds rows without a usable genre -
# confirm in helper_functions.
df, df_unknown_genre = preprocess_dataset('data/wiki_movie_plots_deduped.csv')
# NOTE(review): presumably keeps a 5000-row subset; the sampling strategy lives in
# helper_functions - confirm.
df = subset_sampler(df, n=5000)

# X: pre-processed plot texts, y: multi-hot genre matrix aligned row by row with X.
X = df['Processed_Plot'].values
y = genre_encoder.fit_transform(df['Genre'].values)

# Hold out 20% of the rows with a fixed split seed.
# NOTE(review): the cross-validation cell further down splits X / y, not
# X_train / y_train, so this hold-out is not excluded from model selection.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)



In [60]:
# Number of distinct genre labels the binarizer learned (the width of y).
len(genre_encoder.classes_)

15

In [44]:
def get_topics(text, topic_model, min_score=0.5, max_topics=3):
    """Return the topics that ``topic_model`` considers most similar to ``text``.

    Parameters
    ----------
    text : str
        Text handed to ``topic_model.find_topics``.
    topic_model : object
        Fitted model exposing ``find_topics(text, top_n=...)`` that returns a pair
        ``(topic_ids, scores)`` (the BERTopic interface).
    min_score : float, default 0.5
        Candidates scoring below this value are discarded.
    max_topics : int, default 3
        Upper bound on the number of topics returned. Values below 1 yield the
        "no topic" fallback instead of failing while unpacking an empty result.

    Returns
    -------
    (list, list)
        Topic ids and their scores, best match first. When nothing qualifies the
        fallback ``([-1], [1])`` is returned.
    """
    no_topic = ([-1], [1])

    limit = max(int(max_topics), 0)
    if limit == 0:
        return no_topic

    # Ask for exactly as many candidates as may be returned; otherwise the model's
    # own default for top_n silently caps large max_topics values.
    topic_ids, scores = topic_model.find_topics(text, top_n=limit)

    # Keep candidates that reach the threshold, then order them best-first
    # explicitly instead of relying on the order the model returns. The sort is
    # stable, so already-sorted input keeps its order.
    passing = [(tid, score) for tid, score in zip(topic_ids, scores) if score >= min_score]
    passing.sort(key=lambda pair: pair[1], reverse=True)
    passing = passing[:limit]

    if not passing:  # no topic passes the threshold
        return no_topic

    kept_ids, kept_scores = zip(*passing)
    return list(kept_ids), list(kept_scores)

In [154]:
# Number of genre columns produced by the fitted binarizer.
n_genres = len(genre_encoder.classes_)

# Hyper-parameter grids: 20, 30, ..., 100 clusters for K-means and
# 50, 75, ..., 300 as HDBSCAN's min_cluster_size.
kmeans_clusters = np.arange(20, 101, 10)
hdbscan_min_cluster_size = np.arange(50, 301, 25)
num_folds = 5

# Result tables: one row per fold, one column per grid value.
# NOTE(review): the f1_scores_* arrays are allocated here, but no visible code
# writes to them - only the accuracy_* arrays get filled.
f1_scores_kmeans = np.zeros((num_folds, len(kmeans_clusters)))
accuracy_kmeans  = np.zeros((num_folds, len(kmeans_clusters)))

f1_scores_hdbscan = np.zeros((num_folds, len(hdbscan_min_cluster_size)))
accuracy_hdbscan  = np.zeros((num_folds, len(hdbscan_min_cluster_size)))


# Shuffled K-fold splitter with a fixed seed so the folds are repeatable.
outer_kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

def _dominant_genre_per_topic(doc_topics, doc_genres):
    """Count genres per topic and pick the most frequent genre of every topic.

    Parameters
    ----------
    doc_topics : array-like of int, shape (n_docs,)
        Topic id assigned to each training document (may contain -1).
    doc_genres : np.ndarray of int, shape (n_docs, n_genres)
        Multi-hot genre rows aligned with ``doc_topics``.

    Returns
    -------
    counts : np.ndarray of int, shape (n_distinct_topics, n_genres)
        Row ``r`` is the genre histogram of the r-th smallest topic id, so an
        outlier topic -1, when present, occupies row 0.
    topic_to_genre : dict[int, int]
        Topic id -> column index of its most frequent genre.
    """
    doc_topics = np.asarray(doc_topics).ravel()
    topic_ids, row_of_doc = np.unique(doc_topics, return_inverse=True)
    counts = np.zeros((len(topic_ids), doc_genres.shape[1]), dtype=int)
    # Index by each document's OWN topic. np.add.at accumulates correctly even
    # though the same row index occurs many times.
    np.add.at(counts, row_of_doc.ravel(), doc_genres)
    topic_to_genre = {int(t): int(g) for t, g in zip(topic_ids, counts.argmax(axis=1))}
    return counts, topic_to_genre


def _best_topic_per_text(texts, topic_model):
    """Run get_topics(max_topics=1) on every text.

    Returns the raw ``(ids, scores)`` results and the flat list of best topic ids.
    """
    raw = [get_topics(text, topic_model, max_topics=1) for text in texts]
    # get_topics returns a tuple of two lists; the best topic id is ids[0].
    return raw, [int(ids[0]) for ids, _scores in raw]


def _hit_rate(genre_rows, predicted_genres):
    """Flag, per document, whether the predicted genre is one of its true genres.

    Returns the boolean hit vector and its mean as a Python float.
    """
    hits = genre_rows[np.arange(len(predicted_genres)), predicted_genres] == 1
    return hits, float(hits.mean())


# NOTE(review): the folds are drawn from X / y (all sampled rows), not from
# X_train / y_train, so the earlier hold-out split takes part in model selection -
# confirm this is intended.
# NOTE(review): every grid value refits BERTopic from scratch (including document
# embedding), which is why this cell is slow.
for fold_idx, (inner_train_idx, validation_idx) in enumerate(outer_kf.split(X)):
    X_inner_train, X_validation = X[inner_train_idx], X[validation_idx]
    y_inner_train, y_validation = y[inner_train_idx], y[validation_idx]

    # Most frequent genre of this fold's training part. It is the fallback
    # prediction whenever a validation text is mapped to a topic id that never
    # occurred during fitting (e.g. -1 for a K-means model).
    majority_class = int(y_inner_train.sum(axis=0).argmax())

    for k_idx, kmeans_k in enumerate(kmeans_clusters):

        kmeans_model = KMeans(n_clusters=kmeans_k, random_state=42)
        topic_model_kmeans = BERTopic(hdbscan_model=kmeans_model)
        X_inner_train_topics_kmeans, X_inner_train_scores_kmeans = topic_model_kmeans.fit_transform(X_inner_train)
        X_inner_train_topics_kmeans = np.array(X_inner_train_topics_kmeans)
        X_inner_train_scores_kmeans = np.array(X_inner_train_scores_kmeans)

        # Genre histogram per topic and the resulting topic -> genre lookup.
        topic_genre_matrix, max_dict = _dominant_genre_per_topic(X_inner_train_topics_kmeans, y_inner_train)

        y_pred_topics, y_pred_topics_flat = _best_topic_per_text(X_validation, topic_model_kmeans)
        y_pred_genres = np.array(
            [max_dict.get(topic, majority_class) for topic in y_pred_topics_flat], dtype=int
        )

        ### Evaluation: a prediction counts as correct when it is among the true genres
        correct, accuracy_kmeans[fold_idx, k_idx] = _hit_rate(y_validation, y_pred_genres)

    for h_idx, hdbscan_size in enumerate(hdbscan_min_cluster_size):

        hdbscan_model = HDBSCAN(
            min_cluster_size=hdbscan_size,
            metric='euclidean',
            cluster_selection_method='eom'
        )
        topic_model_hdbscan = BERTopic(hdbscan_model=hdbscan_model)
        X_inner_train_topics_dbscan, X_inner_train_scores_dbscan = topic_model_hdbscan.fit_transform(X_inner_train)
        X_inner_train_topics_dbscan = np.array(X_inner_train_topics_dbscan)
        X_inner_train_scores_dbscan = np.array(X_inner_train_scores_dbscan)

        # The outlier topic -1 gets its own explicit entry (when it occurred in
        # training) instead of being reached through negative indexing.
        topic_genre_matrix, max_dict = _dominant_genre_per_topic(X_inner_train_topics_dbscan, y_inner_train)
        n_topics = topic_genre_matrix.shape[0]  # number of distinct topics, outliers included

        y_pred_topics, y_pred_topics_flat = _best_topic_per_text(X_validation, topic_model_hdbscan)

        # -1 maps to the dominant genre of the training outliers; if training
        # produced no outliers, fall back to the fold's majority genre.
        y_pred_genres = np.array(
            [max_dict.get(topic, majority_class) for topic in y_pred_topics_flat], dtype=int
        )

        correct, accuracy_hdbscan[fold_idx, h_idx] = _hit_rate(y_validation, y_pred_genres)



# Average every grid value's accuracy over the folds (axis 0 = folds).
mean_accuracy_kmeans = np.mean(accuracy_kmeans, axis=0)
mean_accuracy_hdbscan = np.mean(accuracy_hdbscan, axis=0)

# (heading, grid values, fold-averaged accuracies) for each clustering back-end,
# printed as one "value -> accuracy" line per grid entry.
_accuracy_tables = (
    ("Mean accuracy of K-means per number of clusters:", kmeans_clusters, mean_accuracy_kmeans),
    ("Mean accuracy of HDBscan per min cluster size:", hdbscan_min_cluster_size, mean_accuracy_hdbscan),
)
for _heading, _grid, _means in _accuracy_tables:
    print(_heading)
    for k, acc in zip(_grid, _means):
        print(f"{k:>3}  →  {acc:.4f}")



KeyboardInterrupt: 

In [155]:
# Inspect the topic predictions left behind by the last grid iteration that ran in
# the cross-validation cell.
# NOTE(review): this prints one entry per validation document; a slice such as
# y_pred_topics[:10] would keep the output short. The saved output mixes bare ints
# and lists, which the get_topics shown above (returns a tuple of two lists) does
# not produce - it looks stale.
y_pred_topics

[[2],
 [0],
 -1,
 [2],
 [-1],
 [1],
 [1],
 [0],
 -1,
 [1],
 [1],
 [-1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [0],
 [2],
 -1,
 [2],
 -1,
 [1],
 [0],
 [1],
 [2],
 -1,
 -1,
 [3],
 [0],
 [1],
 [1],
 [2],
 -1,
 -1,
 -1,
 -1,
 [0],
 [5],
 [1],
 -1,
 [-1],
 [3],
 [1],
 [0],
 [2],
 [2],
 [-1],
 -1,
 -1,
 [2],
 [-1],
 [0],
 -1,
 [3],
 [1],
 [2],
 [4],
 [-1],
 -1,
 [1],
 [1],
 [6],
 -1,
 [0],
 [0],
 [2],
 [1],
 [0],
 [0],
 -1,
 [2],
 -1,
 [1],
 -1,
 [0],
 [2],
 [2],
 [0],
 [-1],
 [3],
 [2],
 [3],
 [1],
 -1,
 [2],
 [3],
 -1,
 [-1],
 [-1],
 -1,
 [1],
 [2],
 [2],
 [0],
 -1,
 [0],
 -1,
 [1],
 -1,
 -1,
 -1,
 [0],
 -1,
 [0],
 [1],
 -1,
 [0],
 [3],
 -1,
 [1],
 [-1],
 [1],
 [1],
 -1,
 -1,
 [0],
 -1,
 [0],
 [0],
 -1,
 [-1],
 [0],
 [6],
 -1,
 [2],
 [1],
 -1,
 -1,
 -1,
 [-1],
 [0],
 [1],
 -1,
 [1],
 [1],
 [0],
 [0],
 [1],
 -1,
 [3],
 [-1],
 -1,
 [0],
 [2],
 [2],
 -1,
 -1,
 [3],
 [0],
 [-1],
 [1],
 [3],
 -1,
 [0],
 -1,
 [0],
 [0],
 -1,
 [0],
 -1,
 [0],
 [0],
 [4],
 [0],
 -1,
 -1,
 [1],
 -1,
 [1],
 -1,
 [6],
 -1,


In [104]:
# Genre-count matrix (rows: topics, columns: genres) left behind by the last grid
# iteration that ran in the cross-validation cell.
topic_genre_matrix

array([[ 6,  3,  4, ...,  2,  1,  7],
       [ 0,  0,  0, ...,  0,  0,  0],
       [23,  8,  8, ..., 12,  2, 17],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], shape=(1000, 15))

In [110]:
# Vectorised re-count: np.add.at adds every training document's genre row onto the
# matrix row selected by that document's topic id.
# It starts from a copy of the already-filled matrix, so the result is the
# existing counts plus the new ones, not a fresh count.
new_matrix = topic_genre_matrix.copy()

np.add.at(new_matrix, X_inner_train_topics_kmeans, y_inner_train)
new_matrix

array([[34, 16, 17, ..., 11,  3, 29],
       [32,  9,  8, ...,  6,  1, 23],
       [52, 17, 17, ..., 23,  3, 40],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], shape=(1000, 15))

In [106]:
# Re-display the original matrix; the saved output matches the earlier display,
# i.e. the copy-based re-count left it untouched.
topic_genre_matrix

array([[ 6,  3,  4, ...,  2,  1,  7],
       [ 0,  0,  0, ...,  0,  0,  0],
       [23,  8,  8, ..., 12,  2, 17],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], shape=(1000, 15))

In [32]:
# Per-topic summary table (id, size, name, top words, representative documents) of
# the most recently fitted HDBSCAN-backed BERTopic model; topic -1 is listed first.
topic_model_hdbscan.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1271,-1_one_back_new_tells,"[one, back, new, tells, two, home, father, hou...",[small new england town frigid winter season 1...
1,0,1726,0_love_father_life_one,"[love, father, life, one, family, film, also, ...",[chandrashekharshekhar shammi kapoor belongs a...
2,1,182,1_ship_captain_war_german,"[ship, captain, war, german, men, crew, one, b...",[1944 united states army air forces {usaaf air...
3,2,166,2_police_car_house_one,"[police, car, house, one, kill, finds, find, l...",[john michael chip hardesty james stewart narr...
4,3,152,3_tells_new_home_one,"[tells, new, home, one, night, house, miss, mi...",[1961 long island alice bloom eliza dushku ten...
5,4,127,4_li_wong_chen_china,"[li, wong, chen, china, zhang, cheng, dragon, ...",[film set foshan china sometime late 19th cent...
6,5,89,5_billy_game_one_new,"[billy, game, one, new, get, fight, money, tj,...",[rick penning sean faris high school rugby pla...
7,6,76,6_earth_space_crew_planet,"[earth, space, crew, planet, alien, ship, dr, ...",[year 2285 admiral james kirk oversees simulat...
8,7,76,7_gamera_goemon_nagiko_shirou,"[gamera, goemon, nagiko, shirou, sakura, death...",[child ishikawa goemons yōsuke eguchi entire f...
9,8,47,8_tom_jerry_bugs_cat,"[tom, jerry, bugs, cat, spike, toms, elmer, ba...",[tom mascot aboard cruise ship warned captain ...


In [5]:
# Per-topic summary table (id, size, name, top words, representative documents) of
# the most recently fitted K-means-backed BERTopic model.
topic_model_kmeans.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,242,0_tells_new_home_back,"[tells, new, home, back, school, john, one, mi...",[wearing original prom dress 1960 peggy sue bo...
1,1,237,1_love_story_married_life,"[love, story, married, life, family, comes, vi...",[madhavan elango nagarajah young man lives joi...
2,2,219,2_police_vijay_singh_shankar,"[police, vijay, singh, shankar, one, gets, com...",[four good nothing friends roy adi boman manav...
3,3,214,3_jonas_mother_father_young,"[jonas, mother, father, young, maggie, family,...",[buddy evans manages events madison square gar...
4,4,189,4_new_one_money_game,"[new, one, money, game, get, jim, tells, back,...",[jim cole wife angela along children charlie g...
5,5,180,5_love_marriage_father_family,"[love, marriage, father, family, gets, comes, ...",[chandrashekharshekhar shammi kapoor belongs a...
6,6,178,6_police_gang_car_danny,"[police, gang, car, danny, one, kill, gun, joh...",[robert douglas brian hooks prison one two fel...
7,7,174,7_film_story_love_woman,"[film, story, love, woman, life, movie, two, h...",[waking life unnamed young man living ethereal...
8,8,167,8_life_school_love_young,"[life, school, love, young, girl, father, woma...",[described film magazine david markelys dexter...
9,9,146,9_ship_captain_crew_men,"[ship, captain, crew, men, war, japanese, isla...",[waning days world war ii united states navy c...


In [6]:
# Frequency of each distinct genre combination in df_train.
# NOTE(review): df_train is not defined in any visible cell - it probably survives
# from earlier kernel state; confirm it exists after Restart & Run All.
counts = df_train['Genre'].value_counts()
print(counts)

Genre
[drama]                       1214
[comedy]                       844
[action]                       221
[horror]                       208
[thriller]                     186
                              ... 
[action, horror]                 1
[sci-fi, action, thriller]       1
[sci-fi, action, drama]          1
[comedy, mystery]                1
[crime, thriller, drama]         1
Name: count, Length: 116, dtype: int64


In [7]:
# Render BERTopic's built-in topic visualisation for the HDBSCAN-backed model.
topic_model_hdbscan.visualize_topics()


In [53]:
# Distinct values of the Genre column in df_test.
# NOTE(review): df_test is not defined in any visible cell - confirm where it
# comes from.
df_test['Genre'].unique()

array(['romance', 'drama', 'sci-fi', 'comedy', 'crime', 'fantasy',
       'action', 'mystery', 'horror', 'thriller', 'adventure',
       'animation', 'sports', 'musical', 'documentary'], dtype=object)

In [54]:
# Number of distinct Genre values in df_test.
# NOTE(review): df_test is not defined in any visible cell.
len(df_test['Genre'].unique())

15

In [25]:
# Display df_train; the saved output carries a Best_Topic column.
# NOTE(review): df_train (and whatever added Best_Topic) is not defined in any
# visible cell.
df_train

Unnamed: 0,Processed_Plot,Genre,Best_Topic
0,dr bill cortner jason evers saves patient pron...,"(sci-fi,)",0
1,early summer 1950 martin greer owns small cons...,"(romance,)",0
2,evil moon robot roman extension xj barrows ref...,"(sci-fi,)",0
3,highspeed freeway police chase seen meet brown...,"(comedy, drama)",0
4,jack ripper school period setting gaslit londo...,"(mystery,)",-1
...,...,...,...
795,pedda babu kota srinivasa rao chinna babu tani...,"(comedy,)",1
796,raja sanjay dutt love night club dancer rani m...,"(action,)",1
797,factbased adolescent melodrama joe fisk juveni...,"(romance,)",0
798,ahmad ibn fadlan court poet caliph baghdad amo...,"(action,)",-1


In [22]:
# Display df_train; the saved output here has no Best_Topic column, so it comes
# from a different state of the frame than the display above.
# NOTE(review): df_train is not defined in any visible cell.
df_train

Unnamed: 0,Processed_Plot,Genre
0,rudolf domestic little black cat little town g...,"(adventure, drama)"
1,henderson schoolteacher living wife kath baby ...,"(comedy, drama)"
2,film two acts first taking place exotic locale...,"(sci-fi,)"
3,set 1909 coal mine yorkshire england used pit ...,"(drama,)"
4,young honey bee named barry b benson jerry sei...,"(animation,)"
...,...,...
795,formal concert tom tuxedo giving piano recital...,"(animation,)"
796,raja sanjay dutt love night club dancer rani m...,"(action,)"
797,film takes place doha shekar menon business ty...,"(drama,)"
798,ahmad ibn fadlan court poet caliph baghdad amo...,"(action,)"


In [156]:
# Scratch check: looking up a key that is absent falls back to a default value.
car = dict(brand="Ford", model="Mustang", year=1964)

x = car["price"] if "price" in car else 15000

print(x)

15000
