# Import libraries and setup git root

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import fastparquet ## make sure environment has fastparquet and NOT pyarrow
import itertools


In [5]:
import os

def find_repo_root(start_path):
    """
    useful general function for finding the (first, closest) repo root so github file paths work the same on different machines 
    """
    current_path = os.path.abspath(start_path)
    
    while True:
        # Check for the existence of the .git directory or other indicators
        if os.path.isdir(os.path.join(current_path, '.git')) or \
           os.path.isfile(os.path.join(current_path, 'README.md')):
            return current_path
        
        parent_path = os.path.dirname(current_path)
        
        # Stop if we reach the root directory
        if parent_path == current_path:
            break
        
        current_path = parent_path

    return None  # Return None if not found

root = find_repo_root(os.getcwd())
root = root.replace('\\', '/')
print(root)


c:/Users/fitsl/Documents/Programming/UVM Programming Classes/PoCS/pocs_project


# Import the comments data

In [None]:
df = pd.read_parquet(f"{root}/Data/Whole_sets/merged_tropes_comments.parquet")

In [7]:
df.head(2)

Unnamed: 0,letterboxd_search,comments,reviews_extracted,review_stars,review_dates,genres,avg_rating,directors,Year,url,Title,imdb_,nu_tropes,double_first_number
0,https://letterboxd.com/search/Puerta+De+Hierro...,"[{'stars': '★★★★', 'review': 'Yo no entendí bi...","[Yo no entendí bien la película, Perón sabía q...","[8.0, 9.0, 4.0, 10.0, 6.0, 5.0, 1.0, 10.0, 8.0...","[09 Nov 2020, 17 Jul 2020, 08 Mar 2021, 21 Aug...",[],6.6,"[Víctor Laplace, Dieguillo Fernández]",2013,https://letterboxd.com/film/puerta-de-hierro-e...,Puerta De Hierro,2512204.0,"[ ArgentineMedia, BlackMagic, ChekhovsGun, Che...",6.6
1,https://letterboxd.com/search/Better+Off+Dead....,"[{'stars': '★★★★½', 'review': ""this fucking gu...",[this fucking guy took a shower with socks on ...,"[9.0, 10.0, 8.0, 8.0, 10.0, 10.0, 9.0, 8.0, 7....","[18 Dec 2021, 06 Apr 2016, 29 Feb 2020, 25 Aug...","[Romance, Comedy]",7.08,[Savage Steve Holland],1985,https://letterboxd.com/film/better-off-dead/,Better Off Dead...,88794.0,"[ AbhorrentAdmirer, AcceptableTargets, ActorAl...",7.08


# Try some some code for bert generally

In [8]:
# Load BERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Sample documents
documents = df['reviews_extracted'].iloc[0]

# Tokenize and get BERT embeddings for each document
def get_bert_embeddings(documents):
    embeddings = []
    for doc in documents:
        inputs = tokenizer(doc, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        # Take the embeddings of [CLS] token (first token)
        doc_embedding = outputs.last_hidden_state[0][0].numpy()
        embeddings.append(doc_embedding)
    return np.array(embeddings)

# Obtain embeddings
embeddings = get_bert_embeddings(documents)

# Perform K-Means clustering
num_topics = 2  # Define the number of topics
kmeans = KMeans(n_clusters=num_topics)
kmeans.fit(embeddings)

# Print the cluster assignments
for i, label in enumerate(kmeans.labels_):
    print(f"Document {i}: Cluster {label}")

# Cluster centers (topics)
print("Cluster Centers (Topics):")
print(kmeans.cluster_centers_)

Document 0: Cluster 0
Document 1: Cluster 0
Document 2: Cluster 0
Document 3: Cluster 0
Document 4: Cluster 1
Document 5: Cluster 1
Document 6: Cluster 0
Document 7: Cluster 1
Document 8: Cluster 0
Document 9: Cluster 1
Document 10: Cluster 0
Document 11: Cluster 0
Document 12: Cluster 1
Document 13: Cluster 0
Document 14: Cluster 1
Document 15: Cluster 0
Document 16: Cluster 0
Document 17: Cluster 1
Document 18: Cluster 0
Document 19: Cluster 0
Document 20: Cluster 0
Document 21: Cluster 0
Document 22: Cluster 0
Document 23: Cluster 1
Document 24: Cluster 0
Document 25: Cluster 1
Document 26: Cluster 1
Document 27: Cluster 1
Document 28: Cluster 1
Document 29: Cluster 1
Document 30: Cluster 1
Document 31: Cluster 1
Document 32: Cluster 1
Document 33: Cluster 0
Document 34: Cluster 0
Document 35: Cluster 0
Document 36: Cluster 0
Document 37: Cluster 0
Document 38: Cluster 0
Document 39: Cluster 0
Document 40: Cluster 1
Document 41: Cluster 1
Document 42: Cluster 0
Document 43: Cluster 

In [9]:
df.sort_values(by='avg_rating', ascending=False, inplace=True)
df

Unnamed: 0,letterboxd_search,comments,reviews_extracted,review_stars,review_dates,genres,avg_rating,directors,Year,url,Title,imdb_,nu_tropes,double_first_number
4905,https://letterboxd.com/search/Harakiri+1962/,"[{'stars': '★★★★★', 'review': 'I am an idiot.W...",[I am an idiot.Why is it that I still dread wa...,"[10.0, 10.0, None, 10.0, 10.0, 10.0, None, 9.0...","[01 Dec 2013, , 26 Jan 2024, 21 Jun 2021, 06 S...","[History, Drama, Action]",9.38,[Masaki Kobayashi],1962,https://letterboxd.com/film/harakiri/,Harakiri,56058.0,"[ AHandfulForAnEye, AnAesop, ArchEnemy, BestSe...",9.38
11893,https://letterboxd.com/search/Stop+Making+Sens...,"[{'stars': '★★★★★', 'review': 'SAME AS IT EVER...",[SAME AS IT EVER WASSAME AS IT EVER WASSAME AS...,"[10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10....","[27 Mar 2020, 18 Sep 2023, 06 Aug 2013, 13 Jul...","[Music, Documentary]",9.36,[Jonathan Demme],1984,https://letterboxd.com/film/stop-making-sense/,Stop Making Sense,88178.0,"[ AlbumTitleDrop, AmericanMusic, AndNowForSome...",9.36
11281,https://letterboxd.com/search/Fullmetal+Alchem...,"[{'stars': '★★★★', 'review': 'this is one fuck...","[this is one fucken long ass movie, *Fullmetal...","[8.0, 10.0, 8.0, 10.0, 10.0, 10.0, 10.0, 10.0,...","[06 Sep 2016, , 16 Feb 2022, 15 Apr 2020, 21 M...","[Comedy, Animation]",9.32,"[Takahiro Ikezoe, Kiyomitsu Sato, Hiroshi Ikeh...",2009,https://letterboxd.com/film/fullmetal-alchemis...,Fullmetal Alchemist,1355642.0,"[ AdaptationDyeJob, AdaptationPersonalityChang...",9.32
76,https://letterboxd.com/search/Twelve+Angry+Men...,"[{'stars': '★★★★★', 'review': ""That was the be...",[That was the best 1.5 hours of middle aged wh...,"[10.0, 10.0, 8.0, 9.0, 10.0, 10.0, 10.0, 10.0,...","[23 Nov 2014, 28 Mar 2018, 01 Aug 2020, 29 Jun...",[Drama],9.26,[Sidney Lumet],1957,https://letterboxd.com/film/12-angry-men/,Twelve Angry Men,50083.0,"[ AngryBlackMan, FilmsOf19951999, HiddenDepths...",9.26
10995,https://letterboxd.com/search/Twelve+Twelve+Tw...,"[{'stars': '★★★★★', 'review': ""That was the be...",[That was the best 1.5 hours of middle aged wh...,"[10.0, 10.0, 8.0, 9.0, 10.0, 10.0, 10.0, 10.0,...","[23 Nov 2014, 28 Mar 2018, 01 Aug 2020, 29 Jun...",[Drama],9.26,[Sidney Lumet],1957,https://letterboxd.com/film/12-angry-men/,Twelve Twelve Twelve,50083.0,"[ AgainstMyReligion, BMovie, BedMateReveal, Be...",9.26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12058,https://letterboxd.com/search/Axeman+2013/,"[{'stars': '★', 'review': ""And I guess they ma...",[And I guess they made a re-make of this..Hope...,"[2.0, 4.0, 3.0, 3.0, 4.0, 1.0, 1.0, 1.0, 1.0, ...","[15 Aug 2022, 19 Jan 2022, 10 Aug 2022, 14 May...",[Horror],,[Joston Theney],2013,https://letterboxd.com/film/axeman-at-cutters-...,Axeman,2357866.0,"[ AnAxeToGrind, AntagonistTitle, BMovie, BigBa...",
12060,https://letterboxd.com/search/Torn+Dark+Bullet...,[],[],[],[],[Thriller],,[],2020,https://letterboxd.com/film/torn-dark-bullets/,Torn Dark Bullets,8386654.0,"[ BewareTheNiceOnes, CanadianMovies, DiesWideO...",
12062,https://letterboxd.com/search/The+Running+Man+...,[],[],[],[],"[Science Fiction, Thriller]",,[Edgar Wright],2025,https://letterboxd.com/film/the-running-man-2025/,The Running Man,14107334.0,"[ AMatchMadeInStockholm, ATasteOfTheirOwnMedic...",
12071,https://letterboxd.com/search/Wonder+Seven+1994/,"[{'stars': '★★★★', 'review': ""This film is so ...",[This film is so fucking good! It's like a Joh...,"[8.0, 6.0, 6.0, 4.0, 6.0, 4.0, 7.0, 7.0, 9.0, ...","[22 Feb 2024, 15 Mar 2023, 25 Mar 2023, 13 Jan...",[Action],,[Tony Ching Siu-Tung],1994,https://letterboxd.com/film/wonder-seven/,Wonder Seven,110923.0,"[ ActionGirl, ActionPrologue, BadassBiker, Bar...",


# bertopic library gives more intuitive topic representations

In [10]:
# documents = df['reviews_extracted'].loc[df['Title'] == 'Stop Making Sense']
documents = df['reviews_extracted'].loc[11893]
documents
# for i, doc in enumerate(documents):c
#     print(i)
#     if not isinstance(doc, str): print(doc)

['SAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WAS',
 'sweat my ass off dancing from “slippery people” on in a big suit at a vidiots screening with the band (inc. ednah holt, lynn mabry, and steve scales) in attendance. lynn and ednah joined us and did laps around the audience. tina weymouth gave me a near empty glass of champagne and said my suit was nice. i took two gummies and drank a grapefruit radler. vidiots cofounder saw me hitting the choreo and asked “how many times have you…',
 'A polite man is driven to murder. He becomes a prophet and screams manifestos on love, war, and the increasingly alarming impact of technology and progress. Driven to insanity by his own insights into the human condition, he travels to a river in an attempt to drown himself but instead is baptized and absolved of sin. He dies, crosseyed yet painless.This is the definitive fairytal

In [37]:
import itertools
from transformers import pipeline
entity_recognition_nlp  = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")


def replace_named_entities_with_bert(text, nlp_ner):
    # Use BERT NER pipeline to extract named entities
    entities = nlp_ner(text)
    
    # Replace each recognized entity with a placeholder
    for entity in entities:
        text = text.replace(entity['word'], f"[{entity['entity']}]")
        
    return text
index = df[df['Title'] == 'Alien'].index[0]
documents = df['reviews_extracted'].iloc[index:index+1]
documents = list(itertools.chain.from_iterable(documents))
masked_documents = [replace_named_entities_with_bert(doc, entity_recognition_nlp) for doc in documents]
for mdoc, doc in zip(masked_documents,documents):
    print(mdoc, '\n', doc, '\n\n')

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Damn! This started out so interesting with a couple biologists going into the woods and finding "something".But then there's the crushing disappointment when the story instead cuts to a bunch of asshole teens/twenty-somethings. I have no idea why filmmakers always want to show a group of "friends" who apparently HATE each other in these found footage films. And the usual trope of the cameraman (who the audience can never figure out why he's filming anything that's happening) not saying anything unless it's mean or unpleasant.Didn't take long after that for me to completely lose interst. 
 Damn! This started out so interesting with a couple biologists going into the woods and finding "something".But then there's the crushing disappointment when the story instead cuts to a bunch of asshole teens/twenty-somethings. I have no idea why filmmakers always want to show a group of "friends" who apparently HATE each other in these found footage films. And the usual trope of the cameraman (who th

In [19]:
print(torch.cuda.is_available()) 

False


In [35]:
import spacy
entity_recognition_nlp = spacy.load("en_core_web_sm") 

def replace_named_entities(text, nlp):
    doc = nlp(text)
    for ent in doc.ents:
        text = text.replace(ent.text, f"[{ent.label_}]")
    return text

documents = df['reviews_extracted'].iloc[:1]
masked_documents = [replace_named_entities(doc, entity_recognition_nlp) for doc in documents]
documents = list(itertools.chain.from_iterable(documents))

for doc in masked_documents:
    print(doc)


ModuleNotFoundError: No module named 'spacy'

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
import torch



documents = df['reviews_extracted'].iloc[:500]
documents = list(itertools.chain.from_iterable(documents))

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
# embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
embedding_model = DistilBertModel.from_pretrained('distilbert-base-uncased')


# Initialize the CountVectorizer (already using stop words)
vectorizer_model = CountVectorizer(stop_words="english")

seed_topic_list = []
# seed_topic_list = [ 
#     ['movie', 'film', 'cinema'],
#     ['scary', 'spooky', 'eek', 'horror'],
#     ['laugh', 'joke' 'comedy', 'funny']
# ]


# Initialize BERTopic with the custom embedding model (GPU-enabled model)
topic_model = BERTopic(
    embedding_model=embedding_model, 
    vectorizer_model=vectorizer_model, 
    # seed_topic_list=seed_topic_list 
    )
topics, probs = topic_model.fit_transform(documents)
df = topic_model.get_topic_info()
df.head(10)

#outputs as a dataframe which is nice. Note that "-1" means outliers; not topics

cpu


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 0 and the array at index 1 has size 384

In [None]:
from itertools import chain
# the top 100 best reviewed movies we have in our trope set
documents = list(chain(*df['reviews_extracted'].iloc[:300]))
topics, probs = topic_model.fit_transform(documents)
df_topics = topic_model.get_topic_info()

In [None]:
# df_topics.to_csv(f"{root}/Data/Topic Modelling/top_100_best_rated_movies_topics.csv")

In [None]:
df_topics.head(10)