# Import libraries and setup git root

In [81]:
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.cluster import KMeans
import torch
import numpy as np
import pandas as pd
import re
import fastparquet ## make sure environment has fastparquet and NOT pyarrow
from itertools import chain

In [11]:
import os

def find_repo_root(start_path):
    """
    useful general function for finding the (first, closest) repo root so github file paths work the same on different machines 
    """
    current_path = os.path.abspath(start_path)
    
    while True:
        # Check for the existence of the .git directory or other indicators
        if os.path.isdir(os.path.join(current_path, '.git')) or \
           os.path.isfile(os.path.join(current_path, 'README.md')):
            return current_path
        
        parent_path = os.path.dirname(current_path)
        
        # Stop if we reach the root directory
        if parent_path == current_path:
            break
        
        current_path = parent_path

    return None  # Return None if not found

root = find_repo_root(os.getcwd())
root = root.replace('\\', '/')
print(root)


c:/Users/fitsl/Documents/Programming/UVM Programming Classes/PoCS/pocs_project


# Import the comments data

In [None]:
df = pd.read_parquet(f"{root}/Data/Whole_sets/merged_tropes_comments.parquet")

In [13]:
df.head(2)

Unnamed: 0,letterboxd_search,comments,reviews_extracted,review_stars,review_dates,genres,avg_rating,directors,Year,url,Title,imdb_,nu_tropes,double_first_number
0,https://letterboxd.com/search/Puerta+De+Hierro...,"[{'stars': '★★★★', 'review': 'Yo no entendí bi...","[Yo no entendí bien la película, Perón sabía q...","[8.0, 9.0, 4.0, 10.0, 6.0, 5.0, 1.0, 10.0, 8.0...","[09 Nov 2020, 17 Jul 2020, 08 Mar 2021, 21 Aug...",[],6.6,"[Víctor Laplace, Dieguillo Fernández]",2013,https://letterboxd.com/film/puerta-de-hierro-e...,Puerta De Hierro,2512204.0,"[ ArgentineMedia, BlackMagic, ChekhovsGun, Che...",6.6
1,https://letterboxd.com/search/Better+Off+Dead....,"[{'stars': '★★★★½', 'review': ""this fucking gu...",[this fucking guy took a shower with socks on ...,"[9.0, 10.0, 8.0, 8.0, 10.0, 10.0, 9.0, 8.0, 7....","[18 Dec 2021, 06 Apr 2016, 29 Feb 2020, 25 Aug...","[Romance, Comedy]",7.08,[Savage Steve Holland],1985,https://letterboxd.com/film/better-off-dead/,Better Off Dead...,88794.0,"[ AbhorrentAdmirer, AcceptableTargets, ActorAl...",7.08


# Try some some code for bert generally

In [14]:
# Load BERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Sample documents
documents = df['reviews_extracted'].iloc[0]

# Tokenize and get BERT embeddings for each document
def get_bert_embeddings(documents):
    embeddings = []
    for doc in documents:
        inputs = tokenizer(doc, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        # Take the embeddings of [CLS] token (first token)
        doc_embedding = outputs.last_hidden_state[0][0].numpy()
        embeddings.append(doc_embedding)
    return np.array(embeddings)

# Obtain embeddings
embeddings = get_bert_embeddings(documents)

# Perform K-Means clustering
num_topics = 2  # Define the number of topics
kmeans = KMeans(n_clusters=num_topics)
kmeans.fit(embeddings)

# Print the cluster assignments
for i, label in enumerate(kmeans.labels_):
    print(f"Document {i}: Cluster {label}")

# Cluster centers (topics)
print("Cluster Centers (Topics):")
print(kmeans.cluster_centers_)

  super()._check_params_vs_input(X, default_n_init=10)


Document 0: Cluster 0
Document 1: Cluster 0
Document 2: Cluster 0
Document 3: Cluster 0
Document 4: Cluster 1
Document 5: Cluster 1
Document 6: Cluster 0
Document 7: Cluster 1
Document 8: Cluster 0
Document 9: Cluster 1
Document 10: Cluster 0
Document 11: Cluster 0
Document 12: Cluster 1
Document 13: Cluster 0
Document 14: Cluster 1
Document 15: Cluster 0
Document 16: Cluster 0
Document 17: Cluster 1
Document 18: Cluster 0
Document 19: Cluster 0
Document 20: Cluster 0
Document 21: Cluster 0
Document 22: Cluster 0
Document 23: Cluster 1
Document 24: Cluster 0
Document 25: Cluster 1
Document 26: Cluster 1
Document 27: Cluster 1
Document 28: Cluster 1
Document 29: Cluster 1
Document 30: Cluster 1
Document 31: Cluster 1
Document 32: Cluster 1
Document 33: Cluster 0
Document 34: Cluster 0
Document 35: Cluster 0
Document 36: Cluster 0
Document 37: Cluster 0
Document 38: Cluster 0
Document 39: Cluster 0
Document 40: Cluster 1
Document 41: Cluster 1
Document 42: Cluster 0
Document 43: Cluster 

In [15]:
df.sort_values(by='avg_rating', ascending=False, inplace=True)
df

Unnamed: 0,letterboxd_search,comments,reviews_extracted,review_stars,review_dates,genres,avg_rating,directors,Year,url,Title,imdb_,nu_tropes,double_first_number
4905,https://letterboxd.com/search/Harakiri+1962/,"[{'stars': '★★★★★', 'review': 'I am an idiot.W...",[I am an idiot.Why is it that I still dread wa...,"[10.0, 10.0, None, 10.0, 10.0, 10.0, None, 9.0...","[01 Dec 2013, , 26 Jan 2024, 21 Jun 2021, 06 S...","[History, Drama, Action]",9.38,[Masaki Kobayashi],1962,https://letterboxd.com/film/harakiri/,Harakiri,56058.0,"[ AHandfulForAnEye, AnAesop, ArchEnemy, BestSe...",9.38
11893,https://letterboxd.com/search/Stop+Making+Sens...,"[{'stars': '★★★★★', 'review': 'SAME AS IT EVER...",[SAME AS IT EVER WASSAME AS IT EVER WASSAME AS...,"[10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10....","[27 Mar 2020, 18 Sep 2023, 06 Aug 2013, 13 Jul...","[Music, Documentary]",9.36,[Jonathan Demme],1984,https://letterboxd.com/film/stop-making-sense/,Stop Making Sense,88178.0,"[ AlbumTitleDrop, AmericanMusic, AndNowForSome...",9.36
11281,https://letterboxd.com/search/Fullmetal+Alchem...,"[{'stars': '★★★★', 'review': 'this is one fuck...","[this is one fucken long ass movie, *Fullmetal...","[8.0, 10.0, 8.0, 10.0, 10.0, 10.0, 10.0, 10.0,...","[06 Sep 2016, , 16 Feb 2022, 15 Apr 2020, 21 M...","[Comedy, Animation]",9.32,"[Takahiro Ikezoe, Kiyomitsu Sato, Hiroshi Ikeh...",2009,https://letterboxd.com/film/fullmetal-alchemis...,Fullmetal Alchemist,1355642.0,"[ AdaptationDyeJob, AdaptationPersonalityChang...",9.32
75,https://letterboxd.com/search/Twelve+Angry+Men...,"[{'stars': '★★★★★', 'review': ""That was the be...",[That was the best 1.5 hours of middle aged wh...,"[10.0, 10.0, 8.0, 9.0, 10.0, 10.0, 10.0, 10.0,...","[23 Nov 2014, 28 Mar 2018, 01 Aug 2020, 29 Jun...",[Drama],9.26,[Sidney Lumet],1957,https://letterboxd.com/film/12-angry-men/,Twelve Angry Men,50083.0,"[ ALighterShadeOfGrey, AbusiveParents, Aggress...",9.26
10995,https://letterboxd.com/search/Twelve+Twelve+Tw...,"[{'stars': '★★★★★', 'review': ""That was the be...",[That was the best 1.5 hours of middle aged wh...,"[10.0, 10.0, 8.0, 9.0, 10.0, 10.0, 10.0, 10.0,...","[23 Nov 2014, 28 Mar 2018, 01 Aug 2020, 29 Jun...",[Drama],9.26,[Sidney Lumet],1957,https://letterboxd.com/film/12-angry-men/,Twelve Twelve Twelve,50083.0,"[ AgainstMyReligion, BMovie, BedMateReveal, Be...",9.26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12058,https://letterboxd.com/search/Axeman+2013/,"[{'stars': '★', 'review': ""And I guess they ma...",[And I guess they made a re-make of this..Hope...,"[2.0, 4.0, 3.0, 3.0, 4.0, 1.0, 1.0, 1.0, 1.0, ...","[15 Aug 2022, 19 Jan 2022, 10 Aug 2022, 14 May...",[Horror],,[Joston Theney],2013,https://letterboxd.com/film/axeman-at-cutters-...,Axeman,2357866.0,"[ AnAxeToGrind, AntagonistTitle, BMovie, BigBa...",
12060,https://letterboxd.com/search/Torn+Dark+Bullet...,[],[],[],[],[Thriller],,[],2020,https://letterboxd.com/film/torn-dark-bullets/,Torn Dark Bullets,8386654.0,"[ BewareTheNiceOnes, CanadianMovies, DiesWideO...",
12062,https://letterboxd.com/search/The+Running+Man+...,[],[],[],[],"[Science Fiction, Thriller]",,[Edgar Wright],2025,https://letterboxd.com/film/the-running-man-2025/,The Running Man,14107334.0,"[ AMatchMadeInStockholm, ATasteOfTheirOwnMedic...",
12071,https://letterboxd.com/search/Wonder+Seven+1994/,"[{'stars': '★★★★', 'review': ""This film is so ...",[This film is so fucking good! It's like a Joh...,"[8.0, 6.0, 6.0, 4.0, 6.0, 4.0, 7.0, 7.0, 9.0, ...","[22 Feb 2024, 15 Mar 2023, 25 Mar 2023, 13 Jan...",[Action],,[Tony Ching Siu-Tung],1994,https://letterboxd.com/film/wonder-seven/,Wonder Seven,110923.0,"[ ActionGirl, ActionPrologue, BadassBiker, Bar...",


# BerTopic gives more inutitive topics

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sentence_transformers import SentenceTransformer


# sort to get the top reviews
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# chain from itertools to get the reviews into a bigger list
documents = list(chain(*df['reviews_extracted'].iloc[:50]))

# init bert and fit the model on the docs
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
vectorizer_model = CountVectorizer(stop_words='english', max_features=1000)
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    embedding_model=embedding_model,
    verbose=True
    )
__, __ = topic_model.fit_transform(documents)

#Get the topic information and 
df_topics = topic_model.get_topic_info()
df_topics.head()



2024-11-23 12:26:07,070 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 150/150 [00:45<00:00,  3.31it/s]
2024-11-23 12:26:53,438 - BERTopic - Embedding - Completed ✓
2024-11-23 12:26:53,439 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-23 12:26:55,087 - BERTopic - Dimensionality - Completed ✓
2024-11-23 12:26:55,088 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-23 12:26:55,293 - BERTopic - Cluster - Completed ✓
2024-11-23 12:26:55,298 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-23 12:26:55,617 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1167,-1_just_like_time_movie,"[just, like, time, movie, film, feel, im, watc...",[100-word review: Although thisReduxmakes me w...
1,0,88,0_que_um_uma_se,"[que, um, uma, se, filme, la, como, mais, por,...","[Se a Mise en Scene é, de fato, o lugar onde r..."
2,1,82,1_sand_dunes_woman_desert,"[sand, dunes, woman, desert, local, survive, e...","[Woman in the Dunesis an incredibly somber, an..."
3,2,78,2_andrei_tarkovsky_rublev_artist,"[andrei, tarkovsky, rublev, artist, art, relig...",[“Andrei Rublev” – Russian Renaissance in Sovi...
4,3,75,3_war_horror_antiwar_powerful,"[war, horror, antiwar, powerful, movie, come, ...",[“The Horror…. The Horror.”Words can’t even de...


Now turn it into a function


In [87]:
def topic_model(documents):

    # init bert and fit the model on the docs
    vectorizer_model = CountVectorizer(stop_words='english', max_features=1000)
    topic_model = BERTopic(vectorizer_model=vectorizer_model, verbose=True)
    __, __ = topic_model.fit_transform(documents)

    df_topics = topic_model.get_topic_info()
    return df_topics

In [90]:
df_topics = topic_model(list(chain(*df['reviews_extracted'].iloc[:800])))
df_topics.head()

2024-11-23 12:31:56,035 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 2396/2396 [14:23<00:00,  2.77it/s]
2024-11-23 12:46:22,708 - BERTopic - Embedding - Completed ✓
2024-11-23 12:46:22,708 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-23 12:46:42,747 - BERTopic - Dimensionality - Completed ✓
2024-11-23 12:46:42,749 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-23 12:46:49,023 - BERTopic - Cluster - Completed ✓
2024-11-23 12:46:49,045 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-23 12:46:53,278 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,28516,-1_peak_just_love_movie,"[peak, just, love, movie, im, like, good, watc...","[Taking place in Allied-occupied Germany, Ross..."
1,0,877,0_um_uma_filme_com,"[um, uma, filme, com, em, os, mais, da, na, mas]","[""O Tango de Satã"" é um filme do Béla Tarr que..."
2,1,792,1_el_en_una_la,"[el, en, una, la, pelcula, es, lo, las, los, ms]",[Una visión cruda e incómoda de la población m...
3,2,561,2_memories_memory_existence_past,"[memories, memory, existence, past, fear, deat...","[Doubt lurks behind every step, and every conc..."
4,3,542,3_youre_hold_know_want,"[youre, hold, know, want, dont, miss, live, lo...","[""You talk of showing things... I don't know, ..."


In [92]:
df_topics.head(15)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,28516,-1_peak_just_love_movie,"[peak, just, love, movie, im, like, good, watc...","[Taking place in Allied-occupied Germany, Ross..."
1,0,877,0_um_uma_filme_com,"[um, uma, filme, com, em, os, mais, da, na, mas]","[""O Tango de Satã"" é um filme do Béla Tarr que..."
2,1,792,1_el_en_una_la,"[el, en, una, la, pelcula, es, lo, las, los, ms]",[Una visión cruda e incómoda de la población m...
3,2,561,2_memories_memory_existence_past,"[memories, memory, existence, past, fear, deat...","[Doubt lurks behind every step, and every conc..."
4,3,542,3_youre_hold_know_want,"[youre, hold, know, want, dont, miss, live, lo...","[""You talk of showing things... I don't know, ..."
5,4,540,4_und_der_ich_den,"[und, der, ich, den, zu, ein, ist, das, wie, mit]",[City of Life and Death wird häufig mit Schind...
6,5,302,5_et_le_des_est,"[et, le, des, est, plus, la, mais, en, si, que]",[Beaucoup de longueurs et de répétitions avant...
7,6,255,6_wars_star_trilogy_george,"[wars, star, trilogy, george, episode, origina...",[I would love to see a whole Star Wars movie f...
8,7,250,7_crying_tears_sad_emotions,"[crying, tears, sad, emotions, started, heartb...",[I'm crying after laughing so hard. WHAT A PER...
9,8,247,8_soundtrack_musical_music_concert,"[soundtrack, musical, music, concert, song, so...","[midsommar (2019) if it was a horny musical, T..."


Works okay, but topics are largely jus specific movies; we want NER to make it more robust.

# Named Entity Recognition

In [27]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = " Straight up bologna cake. Even Gmork from The Neverending Story in the 80s was a more believable and realistic wolf."

ner_results = nlp(example)
print(ner_results)

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'B-PER', 'score': 0.9953448, 'index': 9, 'word': 'G', 'start': 32, 'end': 33}, {'entity': 'I-PER', 'score': 0.8693475, 'index': 10, 'word': '##mor', 'start': 33, 'end': 36}, {'entity': 'I-PER', 'score': 0.82671, 'index': 11, 'word': '##k', 'start': 36, 'end': 37}, {'entity': 'B-MISC', 'score': 0.9859154, 'index': 13, 'word': 'The', 'start': 43, 'end': 46}, {'entity': 'I-MISC', 'score': 0.9771259, 'index': 14, 'word': 'Never', 'start': 47, 'end': 52}, {'entity': 'I-MISC', 'score': 0.9806553, 'index': 15, 'word': '##ending', 'start': 52, 'end': 58}, {'entity': 'I-MISC', 'score': 0.9830625, 'index': 16, 'word': 'Story', 'start': 59, 'end': 64}]


In [105]:
def replace_named_entities_with_bert(text, nlp_ner):
    # Use BERT NER pipeline to extract named entities
    entities = nlp_ner(text)
    # Replace each recognized entity with a placeholder
    # Sort entities by their position (from end to start)
    for entity in sorted(entities, key=lambda x: x['start'], reverse=True):
        text = text[:entity['start']] + f"[{entity['entity']}]"+ text[entity['end']:]

    text = re.sub(r'\[B-([A-Z]+)\](\s*\[I-\1\])+', r'[\1]', text)
    text = re.sub(r'\[[BI]-(\w+)\]', r'[\1]', text)


    # text = re.sub(r'\[B-|I-](PER|ORG|LOC|MISC)\]', r'[\1]', text)
    # print(text)
    return text

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Load the pre-trained NER model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Sample dataframe structure (make sure your dataframe is set correctly)
try:
    documents = df.loc[df['Title'] == 'Alien', 'reviews_extracted']
    documents = list(chain.from_iterable(documents))
    
    # Apply entity replacement on each document
    masked_documents = [replace_named_entities_with_bert(doc, nlp) for doc in documents]

    # Print the original and masked documents
    for mdoc, doc in zip(masked_documents, documents):
        print(doc, '\n', mdoc, '\n\n')
except IndexError:
    print("No documents found with the title 'Alien'. Please check the dataframe.")


Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


I'm starting to feel like the Weyland-Yutani Corporation does not have our best interests at heart. 
 I'm starting to feel like the [ORG] does not have our best interests at heart. 


every time they bullied andy i wanted to punch a hole thru the fucking screen 
 every time they bullied andy i wanted to punch a hole thru the fucking screen 


I can just feel the people who are getting ready to complain about "fan service", but I'm a fan and I wasfucking SERVICED! 
 I can just feel the people who are getting ready to complain about "fan service", but I'm a fan and I wasfucking SERVICED! 


things not to bring into space• pregnant women• british people 
 things not to bring into space• pregnant women• british people 


Priscilla Presley goes to space, only this time, the Xenomorphs are so much meaner than Elvis. 
 [PER][PER] goes to space, only this time, the [MISC] are so much meaner than [PER]. 


Uncanny CGI recreation of the dead is arguablyscarier than any xenomorph 
 Uncanny [MISC]

Run the above on the dataframe as a whole

In [110]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

df_small = df.sort_values(by='avg_rating', ascending=False).iloc[:50].copy()
df_small['NER_reviews'] = df_small['reviews_extracted'].apply(
    lambda x: [replace_named_entities_with_bert(doc, nlp) for doc in x]
)


Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [109]:
df_small['NER_reviews'].iloc[1]

['SAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WAS',
 'sweat my ass off dancing from “slippery people” on in a big suit at a vidiots screening with the band (inc. [PER]nah holt, [PER]ynn mabry, and steve scales) in attendance. [PER]ynn and ednah joined us and did laps around the audience. [PER]a weymouth gave me a near empty glass of champagne and said my suit was nice. i took two gummies and drank a grapefruit radler. vidiots cofounder saw me hitting the choreo and asked “how many times have you…',
 'A polite man is driven to murder. He becomes a prophet and screams manifestos on love, war, and the increasingly alarming impact of technology and progress. Driven to insanity by his own insights into the human condition, he travels to a river in an attempt to drown himself but instead is baptized and absolved of sin. He dies, crosseyed yet painless.This is the defini

# Now we try NER Topic Modelling

In [None]:
df_topics = topic_model(list(chain(*df['NER_reviews'].iloc[:800])))
df_topics.head()    