In [5]:
import hdbscan
import numpy as np
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

In [2]:
# Read the training split into a DataFrame and preview its first five rows.
df = pd.read_csv("ml_data/train.csv")
df.head(n=5)

Unnamed: 0,URI,Title,Year,Plot,Director,Genre1,Genre2,Genre3,Country,RatingAvg,RatingCount,MyRating
0,https://boxd.it/3d1m,Transformers: Age of Extinction,2014,As humanity picks up the pieces after the batt...,Michael Bay,Science Fiction,Adventure,Action,USA,2.12,235666.0,1.0
1,https://boxd.it/2NF6,Madagascar 3: Europe's Most Wanted,2012,"Animal pals Alex, Marty, Melman, and Gloria ar...",Eric Darnell,Comedy,Family,Adventure,USA,3.03,296110.0,3.0
2,https://boxd.it/fx08,The First Omen,2024,When a young American woman is sent to Rome to...,Arkasha Stevenson,Horror,,,USA,3.38,297652.0,3.5
3,https://boxd.it/2abQ,Terminator Salvation,2009,"All grown up in post-apocalyptic 2018, John Co...",McG,Thriller,Action,Science Fiction,USA,2.58,146188.0,2.5
4,https://boxd.it/mpWM,I Hate Summer,2020,Three families end up in the same rented house...,Massimo Venier,Comedy,,,Italy,3.25,7717.0,3.0


In [65]:
# Step 1 - sentence-embedding model used to encode the documents.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - CountVectorizer that feeds the c-TF-IDF topic representation.
vectorizer = CountVectorizer(
    min_df=5,
    max_df=0.85,
    stop_words="english",
)

# Step 3 - UMAP tuned for a more global, compact projection.
umap_model = UMAP(
    metric="cosine",
    n_components=20,   # "light" reduced dimensionality
    n_neighbors=75,    # global scale -> outliers end up closer to the clusters
    min_dist=0.0,      # points packed very tightly in the low-dim space
    random_state=42,   # fixed seed for the reduction
)

# Step 4 - HDBSCAN, permissive on core points; "leaf" selection is chosen
# here with the intent of producing fewer outliers.
hdbscan_model = hdbscan.HDBSCAN(
    cluster_selection_method="leaf",
    min_cluster_size=20,   # clusters of at least 20 documents
    min_samples=1,         # lowest possible minimum density
    prediction_data=True,
)

# Step 5 - assemble the topic model from the components configured above.
topic_model = BERTopic(
    embedding_model=sbert_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    nr_topics="auto",
    calculate_probabilities=True,
    verbose=True,
)

In [66]:
# Fit the topic model on the plot summaries; keep both the per-document
# topic assignments and the probabilities returned alongside them.
docs = df.loc[:, "Plot"].to_list()
topics, probs = topic_model.fit_transform(documents=docs)

2025-05-25 20:04:18,007 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

2025-05-25 20:04:27,464 - BERTopic - Embedding - Completed ✓
2025-05-25 20:04:27,465 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-25 20:04:33,915 - BERTopic - Dimensionality - Completed ✓
2025-05-25 20:04:33,915 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-25 20:04:34,049 - BERTopic - Cluster - Completed ✓
2025-05-25 20:04:34,051 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-25 20:04:34,110 - BERTopic - Representation - Completed ✓
2025-05-25 20:04:34,111 - BERTopic - Topic reduction - Reducing number of topics
2025-05-25 20:04:34,115 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-25 20:04:34,166 - BERTopic - Representation - Completed ✓
2025-05-25 20:04:34,166 - BERTopic - Topic reduction - Reduced number of topics from 17 to 13


In [67]:
# Summarise the fitted topics: id, number of documents, and generated name.
topic_info = topic_model.get_topic_info()
summary_columns = ["Topic", "Count", "Name"]
print(topic_info.loc[:, summary_columns])

    Topic  Count                                Name
0      -1    764             -1_man_world_time_years
1       0    177              0_agent_man_world_case
2       1     99           1_woman_mother_home_house
3       2     64            2_planet_earth_crew_face
4       3     51            3_couple_york_road_night
5       4     47          4_quest_warrior_ring_power
6       5     45               5_story_film_rome_man
7       6     39               6_car_fight_drug_turn
8       7     31  7_relationship_school_decides_john
9       8     24       8_group_strangers_room_escape
10      9     22            9_princess_far_land_true
11     10     21     10_decision_begins_love_greater
12     11     20     11_attempts_success_years_night


In [68]:
# Attach the assigned topic and the strongest topic probability to each row.
df["Topic"] = topics
# Each element of `probs` that is a list/array is reduced to its maximum;
# anything else (e.g. a bare scalar) is recorded as None, as before.
# np.max is used instead of p.max() so that plain Python lists work too
# (lists have no .max() method).
df["Topic_Prob"] = [
    np.max(p) if isinstance(p, (list, np.ndarray)) else None for p in probs
]

# Build the topic-id -> name lookup once instead of rebuilding the topic-info
# table twice on every loop iteration.
topic_names = topic_model.get_topic_info().set_index("Topic")["Name"]

# groupby(sort=True) visits topics in ascending id order and hands over each
# topic's rows directly, so the frame is not re-filtered per topic.
for topic, members in df.groupby("Topic", sort=True):
    print(f"\nCluster {topic} — {len(members)} documenti")
    print(topic_names.loc[topic])
    # Show the ten documents with the highest probability for this topic.
    display(
        members[["Title", "Plot", "Topic_Prob"]]
        .sort_values(by="Topic_Prob", ascending=False)
        .head(10)
    )


Cluster -1 — 764 documenti
-1_man_world_time_years


Unnamed: 0,Title,Plot,Topic_Prob
650,The Suicide Squad,"Supervillains Harley Quinn, Bloodsport, Peacem...",0.35269
1045,The Cat o' Nine Tails,A newsman works with a blind puzzle-solver to ...,0.345286
976,Anatomy of a Fall,"A woman is suspected of her husband's murder, ...",0.328354
353,Licence to Kill,After capturing the notorious drug lord Franz ...,0.326357
1051,Hulk,"Bruce Banner, a genetics researcher with a tra...",0.326112
141,Despicable Me 2,Gru is recruited by the Anti-Villain League to...,0.325376
910,The Avengers,When an unexpected enemy emerges and threatens...,0.321899
1112,A History of Violence,An average family is thrust into the spotlight...,0.30948
1183,Suicide Squad,"From DC Comics comes the Suicide Squad, an ant...",0.30792
663,M,"In this classic German thriller, Hans Beckert,...",0.30096



Cluster 0 — 177 documenti
0_agent_man_world_case


Unnamed: 0,Title,Plot,Topic_Prob
40,Unknown,"A man awakens from a coma, only to discover th...",1.0
122,Deadpool 2,Wisecracking mercenary Deadpool battles the ev...,1.0
64,X2,Professor Charles Xavier and his team of genet...,1.0
70,Die Another Day,James Bond is sent to investigate the connecti...,1.0
113,Fight Club,A ticking-time-bomb insomniac and a slippery s...,1.0
102,Moonraker,After Drax Industries' Moonraker space shuttle...,1.0
106,Rear Window,A wheelchair-bound photographer spies on his n...,1.0
98,Strangers on a Train,"Having met on a train, a smooth-talking psycho...",1.0
331,The Judge,A successful lawyer returns to his hometown fo...,1.0
219,Rope,Two young men attempt to prove they committed ...,1.0



Cluster 1 — 99 documenti
1_woman_mother_home_house


Unnamed: 0,Title,Plot,Topic_Prob
45,Inside,Scarred for life after a harrowing near-death ...,1.0
84,Mamma Mia! Here We Go Again,"Five years after meeting her three fathers, So...",1.0
90,Incendies,A mother's last wishes send twins Jeanne and S...,1.0
433,Je Tu Il Elle,A woman suffers a subdued psychological breakd...,1.0
136,Coraline,Wandering her rambling old house in her boring...,1.0
206,The Grudge 2,A young woman encounters a malevolent supernat...,1.0
165,The Ward,"Kristen, a troubled young woman, is captured b...",1.0
263,Smile,"After witnessing a bizarre, traumatic incident...",1.0
225,Pan's Labyrinth,Living with her tyrannical stepfather in a new...,1.0
356,The Beyond,A young woman inherits an old hotel in Louisia...,1.0



Cluster 2 — 64 documenti
2_planet_earth_crew_face


Unnamed: 0,Title,Plot,Topic_Prob
62,Prometheus,A team of explorers discover a clue to the ori...,1.0
52,Alien³,After escaping with Newt and Hicks from the al...,1.0
97,Star Trek Into Darkness,When the crew of the Enterprise is called back...,1.0
66,Dawn of the Planet of the Apes,A group of scientists in San Francisco struggl...,1.0
646,The Thing,"In the winter of 1982, a twelve-man research t...",1.0
455,Alien Resurrection,"Two hundred years after Lt. Ripley died, a gro...",1.0
395,Star Trek,The fate of the galaxy rests in the hands of b...,1.0
670,Annihilation,"A biologist signs up for a dangerous, secret e...",1.0
478,The Cloverfield Paradox,"Orbiting above a planet on the brink of war, s...",1.0
493,WALL·E,What if mankind had to leave Earth and somebod...,1.0



Cluster 3 — 51 documenti
3_couple_york_road_night


Unnamed: 0,Title,Plot,Topic_Prob
4,I Hate Summer,Three families end up in the same rented house...,1.0
15,La Haine,After a chaotic night of rioting in a marginal...,1.0
60,Nothing Left to Do But Cry,Two 20th-century friends accidentally stumble ...,1.0
434,Last Vegas,"Aging pals Billy, Paddy, Archie, and Sam have ...",1.0
142,Past Lives,"After decades apart, childhood friends Nora an...",1.0
209,The Big Chill,Seven old college friends gather for a weekend...,1.0
414,Friends: The Reunion,The cast of Friends reunites for a once-in-a-l...,1.0
389,Three Identical Strangers,"New York, 1980. Three complete strangers accid...",1.0
1317,"The Adventures of Priscilla, Queen of the Desert",Two drag queens and a transgender woman contra...,1.0
1382,There's No Place Like Home,An extended family reunites after a long time ...,1.0



Cluster 4 — 47 documenti
4_quest_warrior_ring_power


Unnamed: 0,Title,Plot,Topic_Prob
151,Eragon,"In his homeland of Alagaesia, a farm boy happe...",1.0
114,Kung Fu Panda,"When the Valley of Peace is threatened, lazy P...",1.0
109,Aquaman,"Half-human, half-Atlantean Arthur Curry is tak...",1.0
198,Dragon Ball Super: Broly,Earth is peaceful following the Tournament of ...,1.0
164,Kingsman: The Golden Circle,When an attack on the Kingsman headquarters ta...,1.0
1236,Dragon Ball Z: Resurrection 'F',"One peaceful day on Earth, two remnants of Fri...",1.0
1140,The Lord of the Rings: The Two Towers,Frodo Baggins and the other members of the Fel...,1.0
491,How to Train Your Dragon: The Hidden World,As Hiccup fulfills his dream of creating a pea...,1.0
549,The Lord of the Rings: The Fellowship of the Ring,"Young hobbit Frodo Baggins, after inheriting a...",1.0
788,Dragon Ball DAIMA,Mysteriously transformed into mini versions of...,1.0



Cluster 5 — 45 documenti
5_story_film_rome_man


Unnamed: 0,Title,Plot,Topic_Prob
153,Fantozzi: White Collar Blues,A good-natured but unlucky Italian is constant...,1.0
221,Loro 2,"""Loro"", in two parts, is a period movie that c...",1.0
306,Rose Island,"In 1968, engineer Giorgio Rosa established the...",1.0
181,Welcome Mr. President!,In a small mountain village lives a man with a...,1.0
178,The Traitor,"Palermo, Sicily, 1980. Mafia member Tommaso Bu...",1.0
406,8½,"Guido Anselmi, a film director, finds himself ...",1.0
868,The Great Beauty,Jep Gambardella has seduced his way through th...,1.0
1098,Tear Along the Dotted Line,A cartoonist in Rome with his armadillo-for-a-...,1.0
590,Loro 1,"""Loro"", in two parts, is a period movie that c...",1.0
558,The Night Before the Exams Today,"In 2006, as World Cup fever sweeps Italy, high...",1.0



Cluster 6 — 39 documenti
6_car_fight_drug_turn


Unnamed: 0,Title,Plot,Topic_Prob
170,Rocky V,A lifetime of taking shots has ended Rocky’s c...,1.0
269,A Fish Called Wanda,While a diamond advocate attempts to steal a c...,1.0
546,Pineapple Express,A stoner and his dealer are forced to go on th...,1.0
519,Rocky III,After an intense fight with Clubber Lang and t...,1.0
486,Brawl in Cell Block 99,After working as a drug courier and getting in...,1.0
391,Rocky,An uneducated collector for a Philadelphia loa...,1.0
369,The Gentlemen,American expat Mickey Pearson has built a high...,1.0
1402,Savages,Pot growers Ben and Chon face off against the ...,1.0
1099,Rocky Balboa,"His wife is dead and his son hates him, but th...",1.0
1056,Snake Eyes,All bets are off when shady homicide cop Rick ...,1.0



Cluster 7 — 31 documenti
7_relationship_school_decides_john


Unnamed: 0,Title,Plot,Topic_Prob
33,Scott Pilgrim Takes Off,"Scott Pilgrim meets the girl of his dreams, Ra...",1.0
53,Unfaithful,Connie is a wife and mother whose 11-year marr...,1.0
140,Hannah Montana: The Movie,When Miley Stewart (aka pop-star Hannah Montan...,1.0
186,Sweetie,"The buttoned-down, superstitious Kay is attemp...",1.0
162,Dead Ringers,"Elliot, a successful gynecologist, works at th...",1.0
234,High School Musical 2,The East High Wildcats are gearing up for big ...,1.0
236,A Star Is Born,Seasoned musician Jackson Maine discovers — an...,1.0
417,Big Eyes,"In the late 1950s and early '60s, artist Walte...",1.0
267,A Walk to Remember,"When the popular, restless Landon Carter is fo...",1.0
271,The Idea of You,40-year-old single mom Solène begins an unexpe...,1.0



Cluster 8 — 24 documenti
8_group_strangers_room_escape


Unnamed: 0,Title,Plot,Topic_Prob
68,Fear Street: 1978,"In 1978, two rival groups at Camp Nightwing mu...",1.0
217,Circle,"In a massive, mysterious chamber, fifty strang...",1.0
261,Contagion,As an epidemic of a lethal airborne virus - th...,1.0
343,Dawn of the Dead,During an ever-growing epidemic of zombies tha...,1.0
368,[REC],A television reporter and cameraman follow eme...,1.0
376,Cabin Fever,A group of five college graduates rent a cabin...,1.0
397,Resident Evil: Apocalypse,"As the city is locked down under quarantine, A...",1.0
400,Night of the Living Dead,A group of strangers trapped in a farmhouse fi...,1.0
428,One Cut of the Dead,Real zombies arrive and terrorize the crew of ...,1.0
457,Demons,A group of people are trapped in a West Berlin...,1.0



Cluster 9 — 22 documenti
9_princess_far_land_true


Unnamed: 0,Title,Plot,Topic_Prob
175,Barbie,Barbie and Ken are having the time of their li...,1.0
201,The Magic Flute,The Queen of the Night enlists a handsome prin...,1.0
235,Frozen,Young princess Anna of Arendelle dreams about ...,1.0
238,Sleeping Beauty,Cursed to die by the evil fairy Maleficent whe...,1.0
379,Snow White and the Seven Dwarfs,"A beautiful girl, Snow White, takes refuge in ...",1.0
390,Beauty and the Beast,"Follow the adventures of Belle, a bright young...",1.0
394,Bambi,Bambi's tale unfolds from season to season as ...,1.0
430,Shrek,It ain't easy bein' green -- especially if you...,1.0
481,Shrek Forever After,A bored and domesticated Shrek pacts with deal...,1.0
484,Aladdin,"In the boorish city of Agrabah, kind-hearted s...",1.0



Cluster 10 — 21 documenti
10_decision_begins_love_greater


Unnamed: 0,Title,Plot,Topic_Prob
46,Humanist Vampire Seeking Consenting Suicidal P...,Sasha is a young vampire with a serious proble...,1.0
65,After We Fell,"Just as Tessa's life begins to become unglued,...",1.0
341,After We Collided,Tessa finds herself struggling with her compli...,1.0
351,The Twilight Saga: Breaking Dawn – Part 2,"After the birth of Renesmee, the Cullens gathe...",1.0
354,Black Swan,"The story of Nina, a ballerina in a New York C...",1.0
442,After Ever Happy,As a shocking truth about a couple's families ...,1.0
475,After Everything,Besieged by writer’s block and the crushing br...,1.0
498,Roman Holiday,"Overwhelmed by her suffocating schedule, touri...",1.0
536,The Outside,"Longing to fit in at work, awkward Stacey begi...",1.0
543,Repulsion,Beautiful young manicurist Carole suffers from...,1.0



Cluster 11 — 20 documenti
11_attempts_success_years_night


Unnamed: 0,Title,Plot,Topic_Prob
57,Death Sentence,Nick Hume is a mild-mannered executive with a ...,1.0
108,Soul,Joe Gardner is a middle school teacher with a ...,1.0
250,The King of Comedy,Aspiring comic Rupert Pupkin attempts to achie...,1.0
319,Network,When veteran anchorman Howard Beale is forced ...,1.0
374,The Social Network,"In 2003, Harvard undergrad and computer progra...",1.0
388,The Intern,70-year-old widower Ben Whittaker has discover...,1.0
439,Green Book,"Tony Lip, a bouncer in 1962, is hired to drive...",1.0
482,Shaun of the Dead,"Shaun lives a supremely uneventful life, which...",1.0
492,The Devil Wears Prada,Andy moves to New York to work in the fashion ...,1.0
615,17 Again,"On the brink of a midlife crisis, 30-something...",1.0
