# Import libraries and setup git root

In [1]:
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.cluster import KMeans
import torch
import numpy as np
import pandas as pd
import re
# import fastparquet ## make sure environment has fastparquet and NOT pyarrow
from itertools import chain

In [2]:
import os

def find_repo_root(start_path):
    """
    useful general function for finding the (first, closest) repo root so github file paths work the same on different machines 
    """
    current_path = os.path.abspath(start_path)
    
    while True:
        # Check for the existence of the .git directory or other indicators
        if os.path.isdir(os.path.join(current_path, '.git')) or \
           os.path.isfile(os.path.join(current_path, 'README.md')):
            return current_path
        
        parent_path = os.path.dirname(current_path)
        
        # Stop if we reach the root directory
        if parent_path == current_path:
            break
        
        current_path = parent_path

    return None  # Return None if not found

root = find_repo_root(os.getcwd())
root = root.replace('\\', '/')
print(root)


c:/Users/fitsl/Documents/Programming/UVM Programming Classes/PoCS/pocs_project


# Import the comments data

In [3]:
df = pd.read_csv(f"{root}/Data/2020_trope_data/Scraped_Data/NER_parsed_reviews.csv")

In [4]:
df.head(2)

Unnamed: 0,letterboxd_search,url,actors,roles,studio,reviews_extracted,review_stars,review_dates,letterboxd_directors,letterboxd_year,letterboxd_rating,letterboxd_genres,NameIMDB,IMDB_rating,IMDB_ID,letter_USD_Budget,letter_US_Gross,letter_WW_Gross,NER_cleaned_reviews
0,https://letterboxd.com/search/ABCs+of+Death+2+...,https://letterboxd.com/film/abcs-of-death-2/,"['Eric Jacobus', 'Julian Barratt', 'Ian Virgo'...","['Assassin (Segment ""Amateur"")', 'Peter Toland...","['Drafthouse Films', 'Timpson Films']","['To the creators of ""P is for P-P-P-P SCARY!""...","[6, 2, 7, 4, 9, 7, 5, 6, 4, 7, 6, 4, 6, 7, 2, ...","['14 Feb 2024', '16 Apr 2022', '17 Oct 2018', ...","['Rodney Ascher', 'Bill Plympton', 'Erik Matti...",2014,5.48,['Horror'],ABCs of Death 2,5.4,tt2926810,,,7171.0,['To the creators of P is for PPPP SCARY!Cmon ...
1,https://letterboxd.com/search/A+Beautiful+Day+...,https://letterboxd.com/film/a-beautiful-day-in...,"['Matthew Rhys', 'Tom Hanks', 'Chris Cooper', ...","['Lloyd Vogel', 'Fred Rogers', 'Jerry Vogel', ...","['TriStar Pictures', 'Tencent Pictures', 'Big ...",['therapy: expensivetom hanks as mr rogers bre...,"[6, 7, 7, 6, 7, 8, 8, 8, 8, 6, 6, 8, 8, 8, 8, ...","['13 Oct 2019', '27 Nov 2019', '22 Jan 2020', ...",['Marielle Heller'],2019,7.2,"['History', 'Drama']",A Beautiful Day in the Neighborhood,0.0,tt3224458,25000000.0,61704055.0,67925733.0,['therapy expensivetom [ACTOR] as [ROLE] break...


# Try some some code for bert generally

In [5]:
# # Load BERT model and tokenizer
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# model = DistilBertModel.from_pretrained('distilbert-base-uncased')


# # Move model to GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Sample documents
# documents = df['reviews_extracted'].iloc[0]

# # Tokenize and get BERT embeddings for each document
# def get_bert_embeddings(documents):
#     embeddings = []
#     for doc in documents:
#         inputs = tokenizer(doc, return_tensors='pt', truncation=True, padding=True, max_length=512)
#         with torch.no_grad():
#             outputs = model(**inputs)
#         # Take the embeddings of [CLS] token (first token)
#         doc_embedding = outputs.last_hidden_state[0][0].numpy()
#         embeddings.append(doc_embedding)
#     return np.array(embeddings)

# # Obtain embeddings
# embeddings = get_bert_embeddings(documents)

# # Perform K-Means clustering
# num_topics = 2  # Define the number of topics
# kmeans = KMeans(n_clusters=num_topics)
# kmeans.fit(embeddings)

# # Print the cluster assignments
# for i, label in enumerate(kmeans.labels_):
#     print(f"Document {i}: Cluster {label}")

# # Cluster centers (topics)
# print("Cluster Centers (Topics):")
# print(kmeans.cluster_centers_)

In [6]:
df.sort_values(by='letterboxd_rating', ascending=False, inplace=True)
df

Unnamed: 0,letterboxd_search,url,actors,roles,studio,reviews_extracted,review_stars,review_dates,letterboxd_directors,letterboxd_year,letterboxd_rating,letterboxd_genres,NameIMDB,IMDB_rating,IMDB_ID,letter_USD_Budget,letter_US_Gross,letter_WW_Gross,NER_cleaned_reviews
3172,https://letterboxd.com/search/Harakiri+1962/,https://letterboxd.com/film/harakiri/,"['Tatsuya Nakadai', 'Akira Ishihama', 'Shima I...","['Hanshiro Tsugumo', 'Motome Chijiiwa', 'Miho ...",['Shochiku'],['I am an idiot.Why is it that I still dread w...,"[10, 10, <NA>, 10, 10, 10, <NA>, 9, 10, 10, 9,...","['01 Dec 2013', '', '26 Jan 2024', '21 Jun 202...",['Masaki Kobayashi'],1962,9.38,"['History', 'Drama', 'Action']",Harakiri,8.7,tt0056058,,,15222.0,['I am an idiot Why is it that I still dread w...
6934,https://letterboxd.com/search/Stop+Making+Sens...,https://letterboxd.com/film/stop-making-sense/,"['David Byrne', 'Chris Frantz', 'Jerry Harriso...","['Self - Vocals, Guitar', 'Self - Drums, Vocal...","['Talking Heads Films', 'Arnold Stiefel Company']",['SAME AS IT EVER WASSAME AS IT EVER WASSAME A...,"[10, 10, 10, 10, 10, 10, 10, 10, 9, 10, 10, 10...","['27 Mar 2020', '18 Sep 2023', '06 Aug 2013', ...",['Jonathan Demme'],1984,9.36,"['Music', 'Documentary']",Stop Making Sense,8.5,tt0088178,1200000.0,10306902.0,12471520.0,['SAME AS IT EVER WASSAME AS IT EVER WASSAME A...
1579,https://letterboxd.com/search/Come+and+See+1985/,https://letterboxd.com/film/come-and-see/,"['Aleksei Kravchenko', 'Olga Mironova', 'Liubo...","['Flyora Gayshun', 'Glasha', 'Kosach', 'Rubezh...","['Belarusfilm', 'Mosfilm']",['as soon as this film ended i went online and...,"[10, 10, 10, 9, 10, <NA>, 10, 10, 10, 10, 10, ...","['10 Aug 2020', '25 Aug 2013', '09 Jan 2020', ...",['Elem Klimov'],1985,9.26,"['War', 'Drama']",Come and See,8.2,tt0091251,2500000.0,,20929648.0,['as soon as this film ended i went online and...
6413,https://letterboxd.com/search/Seven+Samurai+1954/,https://letterboxd.com/film/seven-samurai/,"['Toshirō Mifune', 'Takashi Shimura', 'Yoshio ...","['Kikuchiyo', 'Kambei Shimada', 'Gorobei Katay...",['TOHO'],"['too many sweaty ass cheeks, 5 stars', 'this ...","[10, 9, 10, 10, 8, 10, 10, 9, 10, 10, 10, 10, ...","['23 Apr 2020', '09 May 2020', '', '01 Mar 201...",['Akira Kurosawa'],1954,9.22,"['Drama', 'Action']",Seven Samurai,8.7,tt0047478,580000.0,,,"['too many sweaty ass cheeks, 5 stars', 'this ..."
8048,https://letterboxd.com/search/The+Godfather:+P...,https://letterboxd.com/film/the-godfather-part...,"['Al Pacino', 'Robert Duvall', 'Diane Keaton',...","['Don Michael Corleone', 'Tom Hagen', 'Kay Cor...","['Paramount Pictures', 'The Coppola Company', ...","[""young, totally fuckable al pacino and robert...","[10, 10, 10, <NA>, 10, 8, 9, 8, 10, 10, 8, 7, ...","['27 Oct 2019', '06 Oct 2017', '12 Jul 2019', ...",['Francis Ford Coppola'],1974,9.18,"['Crime', 'Drama']",The Godfather: Part II,9.0,tt0071562,13000000.0,47834595.0,47962897.0,"['young, totally fuckable [ACTOR] and [ACTOR] ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10850,https://letterboxd.com/search/Angel+and+Big+Jo...,https://letterboxd.com/film/angel-and-big-joe//,[],[],[],"['The devil is a very big angel, but a very li...","[5, 6, 7, 5, 6, 5, 5, 6, 6, <NA>, <NA>, 6]","['25 Jul 2023', '20 Jun 2024', '01 May 2020', ...",['Bert Salzman'],1975,,['Drama'],Angel and Big Joe,6.8,tt0072646,,,,"['The devil is a very big angel, but a very li..."
10863,https://letterboxd.com/search/Bakuryuu+Sentai+...,https://letterboxd.com/film/bakuryu-sentai-aba...,"['Kouichiro Nishi', 'Shou Tomita', 'Aiko Ito',...","['Ryouga Hakua / Aba Red', 'Yukito Sanjou / Ab...",['Toei Company'],['A bikini monster that shoots a beam that mak...,"[6, <NA>, 5, 6, 6, 6, 7, 2, 10, 5, 7, 5, 7]","['22 Nov 2022', '08 Sep 2022', '07 Mar 2018', ...",['Satoshi Morota'],2003,,"['Action', 'Science Fiction', 'Fantasy', 'Adve...",Bakuryuu Sentai Abaranger Deluxe: Abare Summer...,6.8,tt0860893,,,,['A bikini monster that shoots a beam that mak...
10869,https://letterboxd.com/search/Black+Rage+1988/,https://letterboxd.com/film/catch-the-black-su...,"['Ted Cassidy', 'Chris Robinson', 'Chauncey Lo...","['Striker', 'Sunshine', 'Levi (As Anthony Scot...",['Camelot Entertainment Group'],"['In 1859, a pair of enslaved brothers, Sunshi...","[2, 1, 1, 2, 1, 6, 1, 2, 4, 5, 1, 1, 3, 1, 1, ...","['19 Feb 2023', '08 Mar 2018', '09 Feb 2023', ...",['Chris Robinson'],1972,,"['Action', 'Drama']",Black Rage,0.0,tt0180546,,,,"['In 1859, a pair of enslaved brothers, Sunshi..."
10870,https://letterboxd.com/search/Blood+of+Beasts+...,https://letterboxd.com/film/beauty-and-the-bea...,"['Jane March', 'William Gregory Lee', 'Justin ...","['Freya', 'Sven', 'Eric', 'Beast / Agnar', 'In...","['Crimson Knight', 'Peakviewing Transatlantic']",[],[],[],['David Lister'],2005,,"['Thriller', 'Drama', 'Adventure', 'Fantasy', ...",Blood of Beasts,4.3,tt0338769,,,,[]


In [None]:
import os
import time
import numpy as np
from multiprocessing import Pool
from sentence_transformers import SentenceTransformer
from itertools import chain

documents = list(chain(*df['NER_cleaned_reviews'].iloc[:2000]))


# Function to encode a batch of documents
def process_batch(batch):
    return embedding_model.encode(batch, show_progress_bar=True)

# Function to test parallel processing with different numbers of processes
def test_parallel_processing(documents, batch_size, num_processes_list):
    # Split documents into batches
    batches = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)]
    
    # Record performance for each process count
    performance = {}

    for num_processes in num_processes_list:
        print(f"Trying {num_processes} at {time.time()}")
        start_time = time.time()

        # Run the multiprocessing pool to calculate embeddings
        with Pool(processes=num_processes) as pool:
            embeddings = pool.map(process_batch, batches)

        # Record the time taken for the current number of processes
        end_time = time.time()
        performance[num_processes] = end_time - start_time
        print(f"Time with {num_processes} processes: {end_time - start_time} seconds")

    # Find the optimal number of processes (the one with the shortest time)
    optimal_num_processes = min(performance, key=performance.get)
    print(f"Optimal number of processes: {optimal_num_processes}")

    return optimal_num_processes


# Main function to calculate embeddings
def calculate_embeddings(documents):
    # Initialize SentenceTransformer model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    global embedding_model
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

    # Set batch size for splitting documents
    batch_size = 15  # You can adjust this based on the size of your documents

    # Get number of CPU cores for reference
    num_cores = os.cpu_count()
    print(f"Number of CPU cores: {num_cores}")

    # Test different process counts (1, 2, 4, 8 for example)
    num_processes_list = [4, 8, 16]  # Adjust based on your machine's capabilities

    optimal_num_processes = test_parallel_processing(documents, batch_size, num_processes_list)

    # Split documents into batches again for the optimal number of processes
    batches = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)]

    # Use the optimal number of processes to calculate embeddings
    with Pool(processes=optimal_num_processes) as pool:
        embeddings = pool.map(process_batch, batches)

    # Concatenate embeddings from all batches into a single array
    embeddings = np.concatenate(embeddings, axis=0)

    return embeddings


# Call the main function to calculate embeddings
embeddings = calculate_embeddings(documents)
print(f"Calculated embeddings shape: {embeddings.shape}")


Using device: cpu
Number of CPU cores: 16
Trying 4 at 1732932788.2875211


In [None]:
--break

# BerTopic gives more inutitive topics

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN


# sort to get the top reviews
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# chain from itertools to get the reviews into a bigger list
documents = list(chain(*df['NER_cleaned_reviews'].iloc[:2000]))

# init bert and fit the model on the docs
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
vectorizer_model = CountVectorizer(stop_words='english', max_features=1000)
umap_model = UMAP(n_neighbors=15, n_components=5, metric='cosine', n_jobs=-1)  # n_jobs=-1 uses all cores
hdbscan_model = HDBSCAN(min_samples=10, n_jobs=-1)  # n_jobs=-1 uses all cores

# Initialize BERTopic with custom UMAP and HDBSCAN models
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True
)

# Fit the topic model
topics, probabilities = topic_model.fit_transform(documents)

#Get the topic information and 
df_topics = topic_model.get_topic_info()
df_topics.head()



Using device: cpu


2024-11-29 20:44:18,885 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1785248 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import pickle

# Save the topic model
with open(f'{root}/Data/Models/bertopic_model.pkl', 'wb') as f:
    pickle.dump(topic_model, f)

# Optionally, save the embedding model and vectorizer
with open(f'{root}/Data/Models/embedding_model.pkl', 'wb') as f:
    pickle.dump(embedding_model, f)

with open(f'{root}/Data/Models/vectorizer_model.pkl', 'wb') as f:
    pickle.dump(vectorizer_model, f)


In [None]:
-- break

Now turn it into a function


In [None]:
def topic_model(documents):

    # init bert and fit the model on the docs
    vectorizer_model = CountVectorizer(stop_words='english', max_features=1000)
    topic_model = BERTopic(vectorizer_model=vectorizer_model, verbose=True)
    __, __ = topic_model.fit_transform(documents)

    df_topics = topic_model.get_topic_info()
    return df_topics

In [None]:
df_topics = topic_model(list(chain(*df['reviews_extracted'].iloc[:800])))
df_topics.head()

2024-11-23 12:31:56,035 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 2396/2396 [14:23<00:00,  2.77it/s]
2024-11-23 12:46:22,708 - BERTopic - Embedding - Completed ✓
2024-11-23 12:46:22,708 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-23 12:46:42,747 - BERTopic - Dimensionality - Completed ✓
2024-11-23 12:46:42,749 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-23 12:46:49,023 - BERTopic - Cluster - Completed ✓
2024-11-23 12:46:49,045 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-23 12:46:53,278 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,28516,-1_peak_just_love_movie,"[peak, just, love, movie, im, like, good, watc...","[Taking place in Allied-occupied Germany, Ross..."
1,0,877,0_um_uma_filme_com,"[um, uma, filme, com, em, os, mais, da, na, mas]","[""O Tango de Satã"" é um filme do Béla Tarr que..."
2,1,792,1_el_en_una_la,"[el, en, una, la, pelcula, es, lo, las, los, ms]",[Una visión cruda e incómoda de la población m...
3,2,561,2_memories_memory_existence_past,"[memories, memory, existence, past, fear, deat...","[Doubt lurks behind every step, and every conc..."
4,3,542,3_youre_hold_know_want,"[youre, hold, know, want, dont, miss, live, lo...","[""You talk of showing things... I don't know, ..."


In [None]:
df_topics.head(15)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,28516,-1_peak_just_love_movie,"[peak, just, love, movie, im, like, good, watc...","[Taking place in Allied-occupied Germany, Ross..."
1,0,877,0_um_uma_filme_com,"[um, uma, filme, com, em, os, mais, da, na, mas]","[""O Tango de Satã"" é um filme do Béla Tarr que..."
2,1,792,1_el_en_una_la,"[el, en, una, la, pelcula, es, lo, las, los, ms]",[Una visión cruda e incómoda de la población m...
3,2,561,2_memories_memory_existence_past,"[memories, memory, existence, past, fear, deat...","[Doubt lurks behind every step, and every conc..."
4,3,542,3_youre_hold_know_want,"[youre, hold, know, want, dont, miss, live, lo...","[""You talk of showing things... I don't know, ..."
5,4,540,4_und_der_ich_den,"[und, der, ich, den, zu, ein, ist, das, wie, mit]",[City of Life and Death wird häufig mit Schind...
6,5,302,5_et_le_des_est,"[et, le, des, est, plus, la, mais, en, si, que]",[Beaucoup de longueurs et de répétitions avant...
7,6,255,6_wars_star_trilogy_george,"[wars, star, trilogy, george, episode, origina...",[I would love to see a whole Star Wars movie f...
8,7,250,7_crying_tears_sad_emotions,"[crying, tears, sad, emotions, started, heartb...",[I'm crying after laughing so hard. WHAT A PER...
9,8,247,8_soundtrack_musical_music_concert,"[soundtrack, musical, music, concert, song, so...","[midsommar (2019) if it was a horny musical, T..."


Works okay, but topics are largely jus specific movies; we want NER to make it more robust.

# Named Entity Recognition

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = " Straight up bologna cake. Even Gmork from The Neverending Story in the 80s was a more believable and realistic wolf."

ner_results = nlp(example)
print(ner_results)

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'B-PER', 'score': 0.9953448, 'index': 9, 'word': 'G', 'start': 32, 'end': 33}, {'entity': 'I-PER', 'score': 0.8693475, 'index': 10, 'word': '##mor', 'start': 33, 'end': 36}, {'entity': 'I-PER', 'score': 0.82671, 'index': 11, 'word': '##k', 'start': 36, 'end': 37}, {'entity': 'B-MISC', 'score': 0.9859154, 'index': 13, 'word': 'The', 'start': 43, 'end': 46}, {'entity': 'I-MISC', 'score': 0.9771259, 'index': 14, 'word': 'Never', 'start': 47, 'end': 52}, {'entity': 'I-MISC', 'score': 0.9806553, 'index': 15, 'word': '##ending', 'start': 52, 'end': 58}, {'entity': 'I-MISC', 'score': 0.9830625, 'index': 16, 'word': 'Story', 'start': 59, 'end': 64}]


In [None]:
def replace_named_entities_with_bert(text, nlp_ner):
    # Use BERT NER pipeline to extract named entities
    entities = nlp_ner(text)
    # Replace each recognized entity with a placeholder
    # Sort entities by their position (from end to start)
    for entity in sorted(entities, key=lambda x: x['start'], reverse=True):
        text = text[:entity['start']] + f"[{entity['entity']}]"+ text[entity['end']:]

    text = re.sub(r'\[B-([A-Z]+)\](\s*\[I-\1\])+', r'[\1]', text)
    text = re.sub(r'\[[BI]-(\w+)\]', r'[\1]', text)


    # text = re.sub(r'\[B-|I-](PER|ORG|LOC|MISC)\]', r'[\1]', text)
    # print(text)
    return text

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Load the pre-trained NER model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Sample dataframe structure (make sure your dataframe is set correctly)
try:
    documents = df.loc[df['Title'] == 'Alien', 'reviews_extracted']
    documents = list(chain.from_iterable(documents))
    
    # Apply entity replacement on each document
    masked_documents = [replace_named_entities_with_bert(doc, nlp) for doc in documents]

    # Print the original and masked documents
    for mdoc, doc in zip(masked_documents, documents):
        print(doc, '\n', mdoc, '\n\n')
except IndexError:
    print("No documents found with the title 'Alien'. Please check the dataframe.")


Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


I'm starting to feel like the Weyland-Yutani Corporation does not have our best interests at heart. 
 I'm starting to feel like the [ORG] does not have our best interests at heart. 


every time they bullied andy i wanted to punch a hole thru the fucking screen 
 every time they bullied andy i wanted to punch a hole thru the fucking screen 


I can just feel the people who are getting ready to complain about "fan service", but I'm a fan and I wasfucking SERVICED! 
 I can just feel the people who are getting ready to complain about "fan service", but I'm a fan and I wasfucking SERVICED! 


things not to bring into space• pregnant women• british people 
 things not to bring into space• pregnant women• british people 


Priscilla Presley goes to space, only this time, the Xenomorphs are so much meaner than Elvis. 
 [PER][PER] goes to space, only this time, the [MISC] are so much meaner than [PER]. 


Uncanny CGI recreation of the dead is arguablyscarier than any xenomorph 
 Uncanny [MISC]

Run the above on the dataframe as a whole

In [None]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

df_small = df.sort_values(by='avg_rating', ascending=False).iloc[:50].copy()
df_small['NER_reviews'] = df_small['reviews_extracted'].apply(
    lambda x: [replace_named_entities_with_bert(doc, nlp) for doc in x]
)


Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
df_small['NER_reviews'].iloc[1]

['SAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WASSAME AS IT EVER WAS',
 'sweat my ass off dancing from “slippery people” on in a big suit at a vidiots screening with the band (inc. [PER]nah holt, [PER]ynn mabry, and steve scales) in attendance. [PER]ynn and ednah joined us and did laps around the audience. [PER]a weymouth gave me a near empty glass of champagne and said my suit was nice. i took two gummies and drank a grapefruit radler. vidiots cofounder saw me hitting the choreo and asked “how many times have you…',
 'A polite man is driven to murder. He becomes a prophet and screams manifestos on love, war, and the increasingly alarming impact of technology and progress. Driven to insanity by his own insights into the human condition, he travels to a river in an attempt to drown himself but instead is baptized and absolved of sin. He dies, crosseyed yet painless.This is the defini

# Now we try NER Topic Modelling

In [None]:
df_topics = topic_model(list(chain(*df['NER_reviews'].iloc[:800])))
df_topics.head()    