In [17]:
import os

def find_repo_root(start_path):
    """
    Locate the nearest enclosing repository root for ``start_path``.

    Starting at the absolute form of ``start_path``, each directory is checked
    in turn while walking towards the filesystem root. A directory counts as
    the repo root when it contains a ``.git`` directory or a ``README.md``
    file. The first (closest) match is returned, so paths built from it are
    the same regardless of which machine or subfolder the code runs from.

    Args:
        start_path: Path at which the upward search begins.

    Returns:
        The absolute path of the first matching directory, or ``None`` when
        the filesystem root is reached without finding a marker.
    """
    def _has_repo_marker(directory):
        # A '.git' directory or a 'README.md' file marks the root.
        git_dir = os.path.join(directory, '.git')
        readme_file = os.path.join(directory, 'README.md')
        return os.path.isdir(git_dir) or os.path.isfile(readme_file)

    candidate = os.path.abspath(start_path)

    while not _has_repo_marker(candidate):
        parent = os.path.dirname(candidate)

        # dirname() of the filesystem root is the root itself: nowhere left to go.
        if parent == candidate:
            return None

        candidate = parent

    return candidate

# Resolve the repository root once, searching upward from the current working directory.
root = find_repo_root(os.getcwd())
if root is None:
    # find_repo_root returns None when no marker is found; fail with a clear
    # message here instead of an AttributeError on the .replace() call below.
    raise FileNotFoundError(
        f"Could not find a repository root (.git directory or README.md) above {os.getcwd()}"
    )
# Use forward slashes so paths built from `root` look the same on Windows and elsewhere.
root = root.replace('\\', '/')
print(root)


c:/Users/fitsl/Documents/Programming/UVM Programming Classes/PoCS/pocs_project


In [18]:
import pandas as pd
# Read merged_tropes_comments.parquet (located under the repo root) into a DataFrame.
df = pd.read_parquet(
    path=f"{root}/Data/Whole_sets/merged_tropes_comments.parquet",
)

In [19]:
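# One-time setup: download the spaCy model for NER (uncomment the line below to run it).
# NOTE: this fetches en_core_web_lg, but the NER cell loads en_core_web_sm -- keep the two in sync.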
# !python -m spacy download en_core_web_lg


In [24]:
import spacy

# Load the pretrained spaCy pipeline "en_core_web_sm" into the module-level
# name `nlp`; custom_ner_mask() below reads `nlp` as a global.
# NOTE(review): the setup cell above references en_core_web_lg, not
# en_core_web_sm -- confirm which model is intended.
nlp = spacy.load("en_core_web_sm")

# Hand-picked sample review texts used to eyeball the NER masking output.
# They deliberately look like messy real-world input: profanity, bullet
# characters, curly apostrophes, slashes inside a word, and one long review
# that is cut off with an ellipsis.
reviews = [
    "I'm starting to feel like the Weyland-Yutani Corporation does not have our best interests at heart.",
    "Every time they bullied Andy I wanted to punch a hole thru the fucking screen.",
    "I can just feel the people who are getting ready to complain about 'fan service', but I'm a fan and I wasfucking SERVICED!",
    "Things not to bring into space• pregnant women• british people.",
    "Priscilla Presley goes to space, only this time, the Xenomorphs are so much meaner than Elvis.",
    "Uncanny CGI recreation of the dead is arguably scarier than any xenomorph.",
    "The parts that are an ALIEN splatter movie are a real blast. The parts that are an ALIEN legacy sequel are mostly fucking stupid.",
    "From now on when I think about God I’m going to envision Cailee Spaeny with a gun.",
    "Stupidly well directed and tense for what is ultimately a pretty standard legacy sequel. Multiple sequences had me gripped and temporarily paralyzed with fear (shout out to the anti-gravity acid scene for being particularly unique) before I remembered 'Oh yeah, I don’t need to speculate about anyone’s survival since I’ve seen the original Alien.' It feels like a crime to waste Fede Álvarez, Cailee Spaeny, and David Jonsson’s immense talents on something so predictable and unambitious, but I’d be lying…",
    "I could be a good mu/th/ur."
]

def custom_ner_mask(text):
    """
    Replace each named entity recognized in ``text`` with a type placeholder.

    The text is run through the module-level spaCy pipeline ``nlp`` and every
    span in ``doc.ents`` is substituted according to its label:

    * ``PERSON``        -> ``[PER]``
    * ``ORG``           -> ``[ORG]``
    * ``GPE`` / ``LOC`` -> ``[LOC]``  (GPE includes countries and cities)
    * any other label   -> ``[MISC]``

    Substitution is done by character offsets (``start_char`` / ``end_char``)
    rather than by ``str.replace`` on the entity's surface text. Only the
    recognized span itself is masked: the same characters occurring inside
    another word (entity "Al" vs. the word "Alice") are left alone, and a
    placeholder inserted for one entity can never be rewritten by a later one.

    Args:
        text: The raw string to mask.

    Returns:
        A copy of ``text`` with every recognized entity span replaced by its
        placeholder; ``text`` unchanged if no entities are found.
    """
    label_to_placeholder = {
        "PERSON": "[PER]",
        "ORG": "[ORG]",
        "GPE": "[LOC]",
        "LOC": "[LOC]",
    }

    doc = nlp(text)

    pieces = []
    cursor = 0  # index in `text` of the first character not yet copied
    for ent in sorted(doc.ents, key=lambda span: span.start_char):
        # Entity spans are expected not to overlap; skip defensively if one does,
        # so already-masked characters are never emitted twice.
        if ent.start_char < cursor:
            continue
        pieces.append(text[cursor:ent.start_char])
        pieces.append(label_to_placeholder.get(ent.label_, "[MISC]"))
        cursor = ent.end_char
    pieces.append(text[cursor:])

    return "".join(pieces)

# Show every sample review next to its NER-masked version, each pair
# followed by a 100-character dashed separator line.
for review in reviews:
    print(f"Original Review: {review}")
    masked_review = custom_ner_mask(review)
    print(f"NER Masked Review: {masked_review}", "-" * 100, sep="\n")


Original Review: I'm starting to feel like the Weyland-Yutani Corporation does not have our best interests at heart.
NER Masked Review: I'm starting to feel like the Weyland-Yutani Corporation does not have our best interests at heart.
----------------------------------------------------------------------------------------------------
Original Review: Every time they bullied Andy I wanted to punch a hole thru the fucking screen.
NER Masked Review: Every time they bullied [PER] I wanted to punch a hole thru the fucking screen.
----------------------------------------------------------------------------------------------------
Original Review: I can just feel the people who are getting ready to complain about 'fan service', but I'm a fan and I wasfucking SERVICED!
NER Masked Review: I can just feel the people who are getting ready to complain about 'fan service', but I'm a fan and I wasfucking SERVICED!
-------------------------------------------------------------------------------------