In [71]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import names
from sklearn.cluster import MiniBatchKMeans
import joblib

In [None]:
sbert = SentenceTransformer("all-MiniLM-L6-v2")

In [73]:
df = pd.read_csv('ml_data/all_movies.csv')
df.rename(columns={'name': 'Title'}, inplace=True)
df.dropna(subset=['description'], inplace=True)
df = df.head(n=50000)
df['Plot'] = df['tagline'].fillna('') + "\n" + df['description']
docs = df["Plot"].tolist()

# embeddings = sbert.encode(docs, show_progress_bar=True)
embeddings = joblib.load("ml_data/all_movies_emb.pkl")

In [90]:
df = pd.read_csv('ml_data/train.csv')
docs = df['Plot'].tolist()

# embeddings = sbert.encode(docs, show_progress_bar=True)
embeddings = joblib.load("ml_data/train_emb.pkl")

## BERTopic

In [None]:
from bertopic import BERTopic
from umap import UMAP
import hdbscan

In [None]:
# 1) Embedding e vectorizer
sbert = SentenceTransformer("all-MiniLM-L6-v2")
vect  = CountVectorizer(stop_words="english", min_df=10, max_df=0.8)

# 2) UMAP: scala più locale (n_neighbors↓) e min_dist leggermente >0
umap = UMAP(
    n_neighbors=25,
    n_components=15,     # preserva più info semantica
    min_dist=0.1,
    metric="cosine",
    random_state=42
)

# 3) HDBSCAN super‐permissivo
hdb = hdbscan.HDBSCAN(
    min_cluster_size=20,
    min_samples=1,
    cluster_selection_method="leaf",
    cluster_selection_epsilon=0.15,  # **chiave**: rilassa i confini
    prediction_data=True
)

topic_model = BERTopic(
    embedding_model=sbert,
    vectorizer_model=vect,
    umap_model=umap,
    hdbscan_model=hdb,
    nr_topics="auto",
    calculate_probabilities=True,
    verbose=True
)

In [36]:
topics, probs = topic_model.fit_transform(docs)

2025-05-26 09:51:12,699 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

2025-05-26 09:51:52,370 - BERTopic - Embedding - Completed ✓
2025-05-26 09:51:52,371 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-26 09:53:30,620 - BERTopic - Dimensionality - Completed ✓
2025-05-26 09:53:30,625 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-26 09:57:25,650 - BERTopic - Cluster - Completed ✓
2025-05-26 09:57:25,652 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-26 09:57:27,687 - BERTopic - Representation - Completed ✓
2025-05-26 09:57:27,688 - BERTopic - Topic reduction - Reducing number of topics
2025-05-26 09:57:27,795 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-26 09:57:29,594 - BERTopic - Representation - Completed ✓
2025-05-26 09:57:29,601 - BERTopic - Topic reduction - Reduced number of topics from 237 to 36


In [37]:
topic_info = topic_model.get_topic_info()
print(topic_info[["Topic", "Count", "Name"]])

    Topic  Count                                     Name
0      -1  29986                -1_love_family_woman_film
1       0  18105                   0_film_love_school_war
2       1    599                 1_team_game_fight_career
3       2    101              2_food_france_business_deal
4       3    100           3_dance_competition_york_stage
5       4     93                4_future_human_earth_like
6       5     90                      5_race_car_road_win
7       6     72                 6_dance_fun_friends_gets
8       7     63            7_star_documentary_far_making
9       8     52             8_dr_investigate_john_series
10      9     52               9_queen_magic_king_village
11     10     51           10_modern_film_passion_leading
12     11     47            11_gang_mystery_adventure_fun
13     12     37  12_game_relationship_especially_brother
14     13     36            13_magic_assistant_act_powers
15     14     32            14_dream_american_fight_group
16     15     

## K-Means

In [91]:
K = 100
kmeans = MiniBatchKMeans(
    n_clusters=K,
    batch_size=2048,
    random_state=99
)
labels = kmeans.fit_predict(embeddings)

In [92]:
# 1. Associa i label al DataFrame
df["Topic"] = labels

# 2. Estrai la matrice conteggi per tutto il corpus
vectorizer = CountVectorizer(stop_words="english", min_df=20, max_df=0.8)
X_counts = vectorizer.fit_transform(docs)  # docs = lista di tutte le trame

# 3. Somma i counts per cluster
K = kmeans.n_clusters
term_counts_per_cluster = np.zeros((K, X_counts.shape[1]))
for i, lab in enumerate(labels):
    term_counts_per_cluster[lab] += X_counts[i].toarray().ravel()

# 4. Calcola TF di classe e IDF di classe
tf_c = term_counts_per_cluster / term_counts_per_cluster.sum(axis=1, keepdims=True)
clusters_with_term = np.count_nonzero(term_counts_per_cluster > 0, axis=0)
idf_c = np.log(K / clusters_with_term)

# 5. Costruisci la matrice c-TF-IDF
c_tf_idf = tf_c * idf_c

# 6. Estrai le top-10 keywords per cluster
terms = np.array(vectorizer.get_feature_names_out())
cluster_keywords = {
    c: terms[np.argsort(-c_tf_idf[c])[:25]].tolist()
    for c in range(K)
}

# 7. Rimuovi nomi propri con NLTK
nltk.download("names", quiet=True)
all_names = set(n.lower() for n in names.words())

filtered_cluster_keywords = {}
for c, kws in cluster_keywords.items():
    nonames = [w for w in kws if w.lower() not in all_names]
    # prendi le prime 4 parole non-nome
    filtered_cluster_keywords[c] = nonames[:5]

# 8. Ricostruisci topic_info (senza colonna Keywords)
counts = df["Topic"].value_counts().sort_index()
topic_info = pd.DataFrame({
    "Topic": counts.index,
    "Count": counts.values,
    "Name":  [", ".join(filtered_cluster_keywords[t]) for t in counts.index]
})

In [93]:
for topic in sorted(df["Topic"].unique()):
    print(f"\n=== Topic {topic}: {', '.join(filtered_cluster_keywords[topic])} ===")
    print(df[df["Topic"] == topic]['Title'].to_list()[:20])


=== Topic 0: adventure, crew, heart, woman, forces ===
["Pirates of the Caribbean: At World's End", 'Pirates of the Caribbean: On Stranger Tides', "Pirates of the Caribbean: Dead Man's Chest", 'Pirates of the Caribbean: The Curse of the Black Pearl', 'Pirates of the Caribbean: Dead Men Tell No Tales']

=== Topic 1: war, story, lives, world, future ===
['The Shape of Water', 'Shadow of Fire', 'Cloud Atlas', 'The Brutalist', '300', "Don't Be Bad", 'Napoleon', 'Life Is Beautiful', "Schindler's List", 'The Lost City of Z', 'The Legend of 1900', "Guillermo del Toro's Pinocchio", 'Babylon', 'The Zone of Interest', 'Tear Along the Dotted Line', 'The American Backyard', 'Green Border', 'Jojo Rabbit']

=== Topic 2: bond, car, left, drug, inside ===
['Blade Runner 2049', 'Blade Runner', 'Licence to Kill', 'Good Bye, Lenin!', 'Becoming Bond', "That's Life"]

=== Topic 3: forces, evil, family, agent, child ===
['The Devil All the Time', 'Fantastic Beasts and Where to Find Them', 'CODA', 'Hellboy'

In [87]:
df_test = pd.read_csv('ml_data/train.csv')
embeddings_test = joblib.load("ml_data/train_emb.pkl")

In [88]:
labels_test = kmeans.predict(embeddings_test)

df_test["Topic"] = labels_test
df_test["Topic_Name"] = [
    ", ".join(filtered_cluster_keywords[label])
    for label in labels_test
]

for topic in sorted(df_test["Topic"].unique()):
    print(f"\n=== Topic {topic}: {', '.join(filtered_cluster_keywords[topic])} ===")
    print(df_test[df_test["Topic"] == topic]['Title'].to_list())


=== Topic 0: arctic, ice, frozen, warming, prehistoric ===
['Madagascar: Escape 2 Africa', 'The Thing', 'Snowpiercer']

=== Topic 1: life, man, new, world, happiness ===
['Yes Man', 'Following', 'Collateral Beauty', '21 & Over', 'Shaun of the Dead', 'Locke', 'A Single Man']

=== Topic 5: father, man, revenge, murdered, killed ===
['The Northman', 'The Power of the Dog', 'The Godfather', 'Furious 7', "Lemony Snicket's A Series of Unfortunate Events", 'I Stand Alone']

=== Topic 6: life, world, wife, paris, new ===
['21 Grams', 'Buried', "The Guy Who Didn't Like Musicals", 'Paul', 'Dune: Part Two']

=== Topic 7: prison, inmates, jail, released, sentence ===
['Ant-Man and the Wasp', 'I Can Quit Whenever I Want 3: Ad Honorem', 'Escape from New York', 'The Shawshank Redemption', 'Oldboy']

=== Topic 8: ghost, haunted, house, ghosts, paranormal ===
['Ju-On: Origins', 'Insidious: Chapter 2', 'Paranormal Activity', 'The Haunting', 'The Murmuring', 'Scary Stories to Tell in the Dark', 'I Am th

In [86]:
df_test.Topic.nunique()

127

### Inference

In [None]:
import joblib
from sentence_transformers import SentenceTransformer
import pandas as pd

joblib.dump(kmeans, "plottopic/kmeans_model.pkl")
joblib.dump(filtered_cluster_keywords, "plottopic/cluster_keywords.pkl")

In [None]:
sbert = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
kmeans = joblib.load("plottopic/kmeans_model.pkl")
keyword_map = joblib.load("plottopic/cluster_keywords.pkl")

def plottopic(text: str) -> str:
    emb = sbert.encode([text], show_progress_bar=False)
    label = kmeans.predict(emb)[0]
    return str(label) + "_" + "_".join(keyword_map[label])

In [39]:
test = pd.read_csv("ml_data/test.csv")

In [43]:
test['Topic'] = test.Plot.apply(plottopic)
for topic in sorted(test["Topic"].unique()):
    print(f"\n=== {topic.upper()} ===")
    print(test[test["Topic"] == topic]['Title'].to_list()[:20])


=== 10_CREW_GAME_GANG_DETECTIVE_BROTHER ===
['Die Hard', '2 Fast 2 Furious', 'In Bruges']

=== 11_MURDER_REAL_FILM_CRIME_DAY ===
['The Secret Life of Walter Mitty', 'Knight of Cups', 'Boris: The Film']

=== 12_VILLAGE_POWERS_MOTHER_BOY_DAY ===
["The Emperor's New Groove", 'Ice Age: Dawn of the Dinosaurs', 'Princess Mononoke']

=== 13_MURDER_CRIME_DETECTIVE_KILLER_DEAD ===
["Ask Me If I'm Happy", 'El Conde', 'The Da Vinci Code', 'Nosferatu the Vampyre', 'Catch Me If You Can', 'GoodFellas']

=== 14_SISTER_TRUE_WOMAN_BEAUTIFUL_MOTHER ===
['Donkey Skin', 'Disenchanted', 'Cinderella', 'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe', 'Aladdin', 'Saint Maud', 'Alice in Wonderland', 'Enchanted', 'Alice Through the Looking Glass', 'The Princess and the Frog']

=== 15_RELATIONSHIP_FRIENDS_COUPLE_END_YORK ===
['Revenge', 'Love Actually', 'Easy Rider', 'The Immature', 'The Banshees of Inisherin']

=== 17_MOTHER_YEAR_DAUGHTER_OLD_FATHER ===
['Pearl', 'Echo']

=== 18_FIGHT_LEFT_FAC