In [None]:
# We will use spaCy for NER and its pre-trained "en_core_web_lg" model.

In [None]:
!python -m spacy download en_core_web_lg

In [14]:
import spacy

# Load the spaCy pre-trained model
nlp = spacy.load("en_core_web_lg")

In [None]:
# First, we want to make sure that we can read the sample.txt file.

In [16]:
# Open the file named "sample.txt" in read mode with UTF-8 encoding
with open("sample.txt", "r", encoding="utf-8") as file:
    # Read all lines from the file and store them in the 'articles' list
    articles = file.readlines()

# Print the first 5 articles or all articles if there are fewer than 5
for i in range(min(5, len(articles))):
    # Print the article number and the content of the article
    print(f"Article {i + 1}:\n{articles[i]}")

    # Print a separator line between articles for better readability
    print("-----\n")


Article 1:
@fansoniclove Gold the Tenrec

-----

Article 2:
Tokyo-bound Sampson sets Aust rifle record. Shooter Dane Sampson has struck career-best form as he builds towards a third Olympics, setting a national record while winning the 50m rifle event at the South Australia championships. Sampson registered a score of 462 points to claim gold in the three positions event. The performance bettered Sampson's own national record of 460.7 points, which he achieved at last month's Wingfield grand prix. The score was also notably higher than what Italy's Niccolo Campriani (458.8) and Poland's Tomasz Bartnik (460.4) produced to win gold at the 2016 Olympics and 2018 world championships respectively. "It's good to be shooting PBs at this stage. It was a world-class finals score," Sampson said, having previously competed at the 2012 and 2016 Olympics. "You are unlikely to lose many competitions with that score. "I definitely feel that I am getting better and better and I am tracking well for To

In [None]:
# The first five articles were printed successfully. Now we will simply look for the strings "gold" or "silver" and print the number of articles containing these strings. Also, we want to print the articles themselves.

In [17]:
def entity_matcher(article):
    # Process the article using spaCy
    doc = nlp(article)

    # Check if "gold" or "silver" is present in the article
    gold_present = any(token.text.lower() == "gold" for token in doc)
    silver_present = any(token.text.lower() == "silver" for token in doc)

    return gold_present, silver_present

gold_count = 0
silver_count = 0

# Lists to store articles containing "gold" and "silver"
gold_articles = []
silver_articles = []

# Iterate through each article in the 'articles' list
for article in articles:
    gold_present, silver_present = entity_matcher(article)

    # Check if "gold" is present in the article
    if gold_present:
        gold_count += 1
        gold_articles.append(article)

    # Check if "silver" is present in the article
    if silver_present:
        silver_count += 1
        silver_articles.append(article)

# Print the number of articles containing 'gold' and 'silver'
print("Number of articles containing 'gold':", gold_count)
print("Number of articles containing 'silver':", silver_count)

# Print the first 5 articles containing "gold"
print("\nFirst 5 articles containing 'gold':")
for i, article in enumerate(gold_articles[:5], start=1):
    print(f"{i}. {article}")

# Print the first 5 articles containing "silver"
print("\nFirst 5 articles containing 'silver':")
for i, article in enumerate(silver_articles[:5], start=1):
    print(f"{i}. {article}")

Number of articles containing 'gold': 602
Number of articles containing 'silver': 99

First 5 articles containing 'gold':
1. @fansoniclove Gold the Tenrec

2. Tokyo-bound Sampson sets Aust rifle record. Shooter Dane Sampson has struck career-best form as he builds towards a third Olympics, setting a national record while winning the 50m rifle event at the South Australia championships. Sampson registered a score of 462 points to claim gold in the three positions event. The performance bettered Sampson's own national record of 460.7 points, which he achieved at last month's Wingfield grand prix. The score was also notably higher than what Italy's Niccolo Campriani (458.8) and Poland's Tomasz Bartnik (460.4) produced to win gold at the 2016 Olympics and 2018 world championships respectively. "It's good to be shooting PBs at this stage. It was a world-class finals score," Sampson said, having previously competed at the 2012 and 2016 Olympics. "You are unlikely to lose many competitions wi

In [None]:
# The problem is that simply checking for the string "gold" or "silver" also counts these strings when they are mentioned in a non-financial context (e.g., gold medal). This is called false positives. spaCy recognizes an article as finance-related, when in reality it's not. It's, for example, sport-related. But this happened because of the simplistic approach we have chosen. Simply looking for the strings produces a lot of false positives.

In [None]:
# We need to do NER, not checking for the strings. We will use spaCy for NER.

In [18]:
# Define a function for matching entities in an article
def entity_matcher(article):
    # Process the article using spaCy
    doc = nlp(article)

    # Check if "gold" or "silver" is present as a recognized entity with the label "COMMODITY"
    gold_present = any(token.ent_type_ == "COMMODITY" and token.text.lower() == "gold" for token in doc)
    silver_present = any(token.ent_type_ == "COMMODITY" and token.text.lower() == "silver" for token in doc)

    return gold_present, silver_present

# Initialize counters for the number of articles containing 'gold' and 'silver' as commodities
gold_count = 0
silver_count = 0

# Loop through each article in the list of articles
for article in articles:
    # Call the entity_matcher function to check for 'gold' and 'silver' in the current article
    gold_present, silver_present = entity_matcher(article)

    # Increment the counters if 'gold' or 'silver' is present in the current article
    if gold_present:
        gold_count += 1
    if silver_present:
        silver_count += 1

# Display the results
print("Number of articles containing 'gold' as a commodity:", gold_count)
print("Number of articles containing 'silver' as a commodity:", silver_count)

Number of articles containing 'gold' as a commodity: 0
Number of articles containing 'silver' as a commodity: 0


In [None]:
# Something is wrong because spaCy didn't find any articles containing "gold" or "silver" as a commodity.

In [None]:
# Let's print all entities that spaCy finds in articles.

In [20]:
def print_entities(article):
    # Process the article using spaCy
    doc = nlp(article)

    # Print all recognized entities
    entities = [f"{ent.text} ({ent.label_})" for ent in doc.ents]
    print(f"Entities in the article: {', '.join(entities)}")

# Iterate through the first 3 articles and print entities
for i, article in enumerate(articles[:3], start=1):
    # Print header for each article
    print(f"\nEntities in Article {i}:\n")
    
    # Call the function to print entities for the current article
    print_entities(article)
    
    # Print separator between articles
    print("-----\n")


Entities in Article 1:

Entities in the article: Tenrec (GPE)
-----


Entities in Article 2:

Entities in the article: Tokyo (GPE), Sampson (PERSON), Aust (GPE), Shooter Dane Sampson (ORG), third (ORDINAL), Olympics (EVENT), 50m (QUANTITY), South Australia (GPE), Sampson (PERSON), 462 (CARDINAL), three (CARDINAL), Sampson (ORG), 460.7 (CARDINAL), last month's (DATE), Wingfield (PERSON), Italy (GPE), Niccolo Campriani (PERSON), 458.8 (CARDINAL), Poland (GPE), Tomasz Bartnik (PERSON), 460.4 (CARDINAL), 2016 (DATE), Olympics (EVENT), 2018 (DATE), Sampson (PERSON), 2012 (DATE), 2016 (DATE), Olympics (EVENT), Tokyo (GPE), Sampson (PERSON), Australia (GPE), 2021 (DATE), Olympics (EVENT), pre-Games (EVENT), July 4-19 (DATE), Tokyo (GPE)
-----


Entities in Article 3:

Entities in the article: 33 (CARDINAL), 688 (CARDINAL), MCB (ORG), a National Park (FAC), One (CARDINAL)
-----



In [None]:
# It looks like spaCy doesn't have the label "COMMODITY" or anything similar defined by default. There were multiple organizations, products, people, etc. found, but not commodities.

In [None]:
# We can run the following code to get all the labels spaCy has defined by default.

In [21]:
# Retrieve the list of named entity labels used by the NER pipeline in the spaCy processing pipeline
nlp.get_pipe('ner').labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [None]:
# We can confirm that there's no "COMMODITY" label or anything similar.

In [None]:
# Let's add the "COMMODITY" label to spaCy and test a simple text to see if spaCy now recognizes gold as an entity under the "COMMODITY" label.

In [23]:
# Import spaCy and the EntityRuler class
import spacy
from spacy.pipeline import EntityRuler

# Load spaCy model
nlp = spacy.load("en_core_web_lg")

# Create an instance of the EntityRuler
ruler = EntityRuler(nlp)

# Define patterns for the EntityRuler
patterns = [
    {"label": "COMMODITY", "pattern": "gold"}
]

# Add the EntityRuler to the spaCy pipeline
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)

# Test the sentence
text = "Good time to buy gold?"
doc = nlp(text)

# Print recognized entities
for entity in doc.ents:
    print(entity.text, entity.label_)

gold COMMODITY


In [None]:
# Success! "Gold" was recognized as an entity under the label "COMMODITY" by spaCy. Now we want to extract full articles if, in any sentence inside the article, "gold" or "silver" are found as commodities.

In [28]:
import spacy
from spacy.pipeline import EntityRuler

# Load spaCy model
nlp = spacy.load("en_core_web_lg")

# Create an instance of the EntityRuler
ruler = EntityRuler(nlp)

# Define patterns for the new entities
patterns = [
    {"label": "COMMODITY", "pattern": "gold"},
    {"label": "COMMODITY", "pattern": "silver"}
]

# Add the EntityRuler to the spaCy pipeline
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)

# Read articles from file
with open("sample.txt", "r", encoding="utf-8") as file:
    articles = file.readlines()

# Counter for the number of articles containing "COMMODITY"
commodity_article_count = 0

# Print the first 4 articles containing the entity "COMMODITY"
print("First 4 articles containing 'COMMODITY':")
for i, article in enumerate(articles, start=1):
    doc = nlp(article)
    
    # Extract entities with the label "COMMODITY"
    commodity_entities = [entity.text for entity in doc.ents if entity.label_ == "COMMODITY"]
    
    # Check if there are any COMMODITY entities in the article
    if commodity_entities:
        # Increment the count of articles containing "COMMODITY"
        commodity_article_count += 1

        # Print the actual article number and the content
        print(f"{commodity_article_count}. {article.strip()} \n")

        # Check if 4 articles with "COMMODITY" entities have been printed
        if commodity_article_count == 4:
            break

First 4 articles containing 'COMMODITY':
1. Tokyo-bound Sampson sets Aust rifle record. Shooter Dane Sampson has struck career-best form as he builds towards a third Olympics, setting a national record while winning the 50m rifle event at the South Australia championships. Sampson registered a score of 462 points to claim gold in the three positions event. The performance bettered Sampson's own national record of 460.7 points, which he achieved at last month's Wingfield grand prix. The score was also notably higher than what Italy's Niccolo Campriani (458.8) and Poland's Tomasz Bartnik (460.4) produced to win gold at the 2016 Olympics and 2018 world championships respectively. "It's good to be shooting PBs at this stage. It was a world-class finals score," Sampson said, having previously competed at the 2012 and 2016 Olympics. "You are unlikely to lose many competitions with that score. "I definitely feel that I am getting better and better and I am tracking well for Tokyo." Sampson an

In [None]:
# As we can see, there are a lot of false positives (e.g., "Sampson registered a score of 462 points to claim gold in the three positions event."). This is a sentence where "gold" is not mentioned in a financial context, but spaCy treats this sentence as if it were.

In [None]:
# Let's try to define patterns for a new entity called "COMMODITY" in more detail. We will add two more patterns using "IN". It stands for "inclusion". It is used as part of the pattern to specify that the token's lowercase text should be in a list of possible values. This pattern is looking for a lowercase token where the text is either "price", "market", or "commodity". It allows flexibility to match any of the specified words. The "IN" operator is used to define this set of possible values for the token. 

In [59]:
import spacy
from spacy.pipeline import EntityRuler

# Load spaCy model
nlp = spacy.load("en_core_web_lg")

# Create EntityRuler instance
ruler = EntityRuler(nlp)

# Define patterns for the new entities
patterns = [
    {"label": "COMMODITY", "pattern": [{"LOWER": "gold"}]},
    {"label": "COMMODITY", "pattern": [{"LOWER": "silver"}]},
    {"label": "COMMODITY", "pattern": [{"LOWER": "gold"}, {"LOWER": {"IN": ["price", "market", "commodity"]}}]}, # Added
    {"label": "COMMODITY", "pattern": [{"LOWER": "silver"}, {"LOWER": {"IN": ["price", "market", "commodity"]}}]}, # Added
]

# Add patterns to the EntityRuler
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)

# Read articles from file
with open("sample.txt", "r", encoding="utf-8") as file:
    articles = file.readlines()

# Initialize counter
article_count = 0

# Iterate through all articles
for i, article in enumerate(articles, start=1):
    doc = nlp(article)
    
    # Iterate through sentences in the article and print those containing "COMMODITY"
    for sent in doc.sents:
        sent_entities = [entity.text for entity in sent.ents if entity.label_ == "COMMODITY"]
        
        if sent_entities:
            article_count += 1
            print(f"Article {i}, Sentence {sent.start}:")
            print(sent.text)
            print("-----\n")

# Print the total number of articles found
print(f"Articles found: {article_count}")

Article 1, Sentence 0:
@fansoniclove Gold the Tenrec

-----

Article 2, Sentence 43:
Sampson registered a score of 462 points to claim gold in the three positions event.
-----

Article 2, Sentence 82:
The score was also notably higher than what Italy's Niccolo Campriani (458.8) and Poland's Tomasz Bartnik (460.4) produced to win gold at the 2016 Olympics and 2018 world championships respectively.
-----

Article 3, Sentence 73:
One of those tenements from my reckoning hosts a copper gold resource another has a copper prospect.
-----

Article 4, Sentence 6:
Hey guys what is gold's history in May?  

-----

Article 5, Sentence 0:
Good time to invest in gold?"

-----

Article 6, Sentence 0:
Gold edges lower as firmer US yields pinch appeal https://t.co/P0HPkmXvBZ

-----

Article 7, Sentence 354:
In both transactions, there was a critical decision about the heritage of the Ghanaian young person; because this gold resource, which is a Ghanaian resource was being treated in a certain way, was

In [None]:
# Until this point, we haven't handled false negatives. This happens when spaCy treats a sentence as if it's not finance-related, when in reality it is. In other words, there are several other aliases that can be used to refer to gold or silver. For example, the XAU/USD alias will almost certainly refer to the gold price per troy ounce in US dollars, and it is commonly used when talking about price events. So, let's add four more patterns to handle false negatives.

In [60]:
import spacy
from spacy.pipeline import EntityRuler

# Load spaCy model
nlp = spacy.load("en_core_web_lg")

# Create EntityRuler instance
ruler = EntityRuler(nlp)

# Define patterns for the new entities
patterns = [
    {"label": "COMMODITY", "pattern": [{"LOWER": "gold"}]},
    {"label": "COMMODITY", "pattern": [{"LOWER": "silver"}]},
    {"label": "COMMODITY", "pattern": [{"LOWER": "gold"}, {"LOWER": {"IN": ["price", "market", "commodity"]}}]},
    {"label": "COMMODITY", "pattern": [{"LOWER": "silver"}, {"LOWER": {"IN": ["price", "market", "commodity"]}}]},
    {"label": "COMMODITY", "pattern": "XAU/USD"}, # Added
    {"label": "COMMODITY", "pattern": "XAUUSD"}, # Added
    {"label": "COMMODITY", "pattern": "XAG/USD"}, # Added
    {"label": "COMMODITY", "pattern": "XAGUSD"}, # Added
]

# Add patterns to the EntityRuler
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)

# Read articles from file
with open("sample.txt", "r", encoding="utf-8") as file:
    articles = file.readlines()

# Initialize counter
article_count = 0

# Iterate through all articles
for i, article in enumerate(articles, start=1):
    doc = nlp(article)
    
    # Iterate through sentences in the article and print those containing "COMMODITY"
    for sent in doc.sents:
        sent_entities = [entity.text for entity in sent.ents if entity.label_ == "COMMODITY"]
        
        if sent_entities:
            article_count += 1
            print(f"Article {i}, Sentence {sent.start}:")
            print(sent.text)
            print("-----\n")

# Print the total number of articles found
print(f"Articles found: {article_count}")

Article 1, Sentence 0:
@fansoniclove Gold the Tenrec

-----

Article 2, Sentence 43:
Sampson registered a score of 462 points to claim gold in the three positions event.
-----

Article 2, Sentence 82:
The score was also notably higher than what Italy's Niccolo Campriani (458.8) and Poland's Tomasz Bartnik (460.4) produced to win gold at the 2016 Olympics and 2018 world championships respectively.
-----

Article 3, Sentence 73:
One of those tenements from my reckoning hosts a copper gold resource another has a copper prospect.
-----

Article 4, Sentence 6:
Hey guys what is gold's history in May?  

-----

Article 5, Sentence 0:
Good time to invest in gold?"

-----

Article 6, Sentence 0:
Gold edges lower as firmer US yields pinch appeal https://t.co/P0HPkmXvBZ

-----

Article 7, Sentence 354:
In both transactions, there was a critical decision about the heritage of the Ghanaian young person; because this gold resource, which is a Ghanaian resource was being treated in a certain way, was

In [None]:
# Looks like there were three false negatives that we didn't handle before (before: 562 articles found, now: 565 articles found). There were 3 articles mentioning "XAUUSD"/"XAU/USD" or "XAGUSD"/"XAG/USD", but not "gold" or "silver". They were left out before, but now spaCy has added them because we defined these patterns under the "COMMODITY" label.

In [None]:
# But still, looking at the sentences printed above, we still see many false positives. For example, "Sampson registered a score of 462 points to claim gold in the three positions event." is printed as if gold is mentioned in a finance-related context. It's not. Let's try to remove the first two patterns because these two seem to catch a lot of false positives. If spaCy simply looks for the entities "gold" and "silver", these two are used in a lot of contexts, not just in a financial context.

In [61]:
import spacy
from spacy.pipeline import EntityRuler

# Load spaCy model
nlp = spacy.load("en_core_web_lg")

# Create EntityRuler instance
ruler = EntityRuler(nlp)

# Define patterns for the new entities
patterns = [
    # {"label": "COMMODITY", "pattern": [{"LOWER": "gold"}]}, # Removed
    # {"label": "COMMODITY", "pattern": [{"LOWER": "silver"}]}, #Removed
    {"label": "COMMODITY", "pattern": [{"LOWER": "gold"}, {"LOWER": {"IN": ["price", "market", "commodity", "trading", "investment", "bullion", "precious metal"]}}]},
    {"label": "COMMODITY", "pattern": [{"LOWER": "silver"}, {"LOWER": {"IN": ["price", "market", "commodity", "trading", "investment", "bullion", "precious metal"]}}]},
    {"label": "COMMODITY", "pattern": "XAU/USD"},
    {"label": "COMMODITY", "pattern": "XAUUSD"},
    {"label": "COMMODITY", "pattern": "XAG/USD"},
    {"label": "COMMODITY", "pattern": "XAGUSD"},
]

# Add patterns to the EntityRuler
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)

# Read articles from file
with open("sample.txt", "r", encoding="utf-8") as file:
    articles = file.readlines()

# Initialize counter
article_count = 0

# Iterate through all articles
for i, article in enumerate(articles, start=1):
    doc = nlp(article)
    
    # Iterate through sentences in the article and print those containing "COMMODITY"
    for sent in doc.sents:
        sent_entities = [entity.text for entity in sent.ents if entity.label_ == "COMMODITY"]
        
        if sent_entities:
            article_count += 1
            print(f"Article {i}, Sentence {sent.start}:")
            print(sent.text)
            print("-----\n")

# Print the total number of articles found
print(f"Articles found: {article_count}")

Article 17, Sentence 0:
Gold Price News and Forecast: XAU/USD is trapped in daily support and resistance.
-----

Article 49, Sentence 0:
Gold price trading in neutral territory following rising inflation, U.S. personal income grows 21.1% in March… https://t.co/bvJEdowf6g

-----

Article 50, Sentence 0:
Gold Price Today April 30: Gold Rate Continues To Fall, Check City-Wise Price List https://t.co/HGRF8zgSS9

-----

Article 127, Sentence 0:
Gold | $XAUUSD | $XAU $USD  
-----

Article 142, Sentence 449:
Gold bullion volumes increased more than 100% year over year and were 982.8 thousand ounces (2019 – 483.0 thousand ounces) while silver bullion volumes were 29.5 million ounces (2019 – 22.8 million ounces).
-----

Article 180, Sentence 663:
”The strategy has seen DGO Gold shares soar from just 17c four years ago, and $1.70 last year when Mr Parncutt became an executive, to close on Friday at $3.63.They soared over $4 in September last year on the back of the coronavirus-fuelled surge in t

In [None]:
# It looks like a lot of false positives have been eliminated now. But looking at sample.txt we can see that now we have a LOT of false negatives. There are more than 23 articles in sample.txt talking about "gold" or "silver" in a financial context, but spaCy only found 23! So let's try to add more finance-related words to the patterns. The goal is for false positives not to occur but to lower false negatives.

In [62]:
import spacy
from spacy.pipeline import EntityRuler

# Load spaCy model
nlp = spacy.load("en_core_web_lg")

# Create EntityRuler instance
ruler = EntityRuler(nlp)

# Define patterns for the new entities
patterns = [
    {"label": "COMMODITY", "pattern": [{"LOWER": {"IN": ["gold"]}}, {"LOWER": {"IN": ["price", "market", "commodity", "trading", "investment", "bullion", "precious metal", "futures", "yield", "contract", "reserve currencies", "modern monetary theory"]}}]}, # Add more finance-related words to the patterns
    {"label": "COMMODITY", "pattern": [{"LOWER": {"IN": ["silver"]}}, {"LOWER": {"IN": ["price", "market", "commodity", "trading", "investment", "bullion", "precious metal", "futures", "yield", "contract", "reserve currencies", "modern monetary theory"]}}]}, # Add more finance-related words to the patterns
    {"label": "COMMODITY", "pattern": "XAU/USD"},
    {"label": "COMMODITY", "pattern": "XAUUSD"},
    {"label": "COMMODITY", "pattern": "XAG/USD"},
    {"label": "COMMODITY", "pattern": "XAGUSD"},
]

# Add patterns to the EntityRuler
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)

# Read articles from file
with open("sample.txt", "r", encoding="utf-8") as file:
    articles = file.readlines()

# Initialize counter
article_count = 0

# Iterate through all articles
for i, article in enumerate(articles, start=1):
    doc = nlp(article)
    
    # Iterate through sentences in the article and print those containing "COMMODITY"
    for sent in doc.sents:
        sent_entities = [entity.text for entity in sent.ents if entity.label_ == "COMMODITY"]
        
        if sent_entities:
            article_count += 1
            print(f"Article {i}, Sentence {sent.start}:")
            print(sent.text)
            print("-----\n")

# Print the total number of articles found
print(f"Articles found: {article_count}")

Article 17, Sentence 0:
Gold Price News and Forecast: XAU/USD is trapped in daily support and resistance.
-----

Article 49, Sentence 0:
Gold price trading in neutral territory following rising inflation, U.S. personal income grows 21.1% in March… https://t.co/bvJEdowf6g

-----

Article 50, Sentence 0:
Gold Price Today April 30: Gold Rate Continues To Fall, Check City-Wise Price List https://t.co/HGRF8zgSS9

-----

Article 108, Sentence 0:
Gold futures eke out gain on volatile day, silver near Rs 67,550/kg: What analysts say - Domestic gold prices seesa… https://t.co/w6qkrpNcOA

-----

Article 127, Sentence 0:
Gold | $XAUUSD | $XAU $USD  
-----

Article 140, Sentence 0:
Silver Futures Discussions.
-----

Article 142, Sentence 449:
Gold bullion volumes increased more than 100% year over year and were 982.8 thousand ounces (2019 – 483.0 thousand ounces) while silver bullion volumes were 29.5 million ounces (2019 – 22.8 million ounces).
-----

Article 180, Sentence 663:
”The strategy has 

In [None]:
# It looks like we were able to eliminate some false negatives (the number of articles rose from 23 to 49) while controlling for the false positives just by adding more words to the patterns.

In [None]:
# Tweaking this approach further (i.e., adding more words to the patterns) would probably yield even better results, meaning even fewer false negatives ceteris paribus (i.e., for the same amount of false positives), but I decided to stop here.