In [7]:
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler
from spacy.language import Language
import re


# Small English pipeline. NOTE: "en_core_web_sm" is not transformer-based;
# the transformer variant of this model family is "en_core_web_trf".
nlp = spacy.load("en_core_web_sm")

# Exact-phrase overrides as (label, phrase) pairs, kept in their original order.
# NOTE(review): "Asian"/"European" are nationalities and the last two GPE
# entries are organisations; they are labelled GPE here, so the extraction
# step further down files them under LOC -- confirm this is intended.
_phrase_labels = [
    ("ORG", "Air EV Lite and Seres Group"),
    ("GPE", "California"),
    ("GPE", "Asian"),
    ("GPE", "European"),
    ("GPE", "Cleveland Clinic"),
    ("GPE", "Mississippi Department of Human Services"),
    ("PERSON", "Elon Musk"),
]
patterns = [
    {"label": label, "pattern": phrase} for label, phrase in _phrase_labels
]

# Insert the rule-based matcher ahead of the statistical "ner" component
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)

# Load dataset
df = pd.read_csv("test_articles.csv")

# Function to filter out fully capitalized lines
def filter_capitalized_paragraphs(text):
    """Drop all-caps lines of more than two words and join the rest.

    The text is split on newline characters. A line is discarded when
    ``str.isupper()`` is true for it and it contains more than two
    whitespace-separated words; every other line (including empty ones)
    is kept. The surviving lines are joined with a single space.

    Args:
        text: The raw text to clean.

    Returns:
        The kept lines joined into one string.
    """
    kept = []
    for candidate in text.split("\n"):
        word_count = len(candidate.split())
        # Short all-caps lines (one or two words) are deliberately kept.
        if candidate.isupper() and word_count > 2:
            continue
        kept.append(candidate)

    return " ".join(kept)

# Apply the filter to the dataset. Missing cells are replaced with "" first;
# otherwise astype(str) would turn NaN into the literal text "nan" and feed
# that token to the NER model.
filtered_content = (
    df['Main_content']
    .fillna("")
    .astype(str)
    .apply(filter_capitalized_paragraphs)
)

# Combine the filtered text
text = " ".join(filtered_content)

# Process the text with SpaCy
# NOTE(review): spaCy rejects texts longer than nlp.max_length; if the
# combined articles grow past that limit, process them per article with
# nlp.pipe(filtered_content) instead -- confirm against the dataset size.
doc = nlp(text)

# Map the spaCy labels of interest onto the output buckets; any other
# label is ignored.
label_to_bucket = {"PERSON": "PER", "ORG": "ORG", "GPE": "LOC"}
entities = {"PER": [], "ORG": [], "LOC": []}

for ent in doc.ents:
    bucket = label_to_bucket.get(ent.label_)
    if bucket is not None:
        entities[bucket].append(ent.text)

# Output the extracted entities, followed by the number of mentions per label.
# The loop variable is deliberately not called `entities`, so the results
# dict is still intact after the report has been printed.
print("Extracted Entities:")
for label, mentions in entities.items():
    print(f"{label}:")
    for mention in mentions:
        print(f"  {mention}")
    print(len(mentions))

Extracted Entities:
PER:
  Dody Hartono
  Hartono
  Hendra Pratama
  Hendra Budi
  Joe Biden's
  Biden
  Jigar Shah
  Shah
  Darren Woods
  Jason Bordoff
  Bordoff
  Roman Kramarchuk
  Ryan Lance
  Jesse Jenkins
  Biden
  Biden
  Biden
  Thomas Wackman
  Biden
  Biden
  Biden
  Biden
  Biden
  Biden
  Biden
  Biden
  Debbie Dingell
  Dingell
  Dingell
  Randy Vetter
  Vetter
  Jeff Thorne
  Sharon Nachman
  Wastewater
  Aaron Glatt
  Wastewater
  COVID
  Bernie Sanders
  Sanders
  Jake Tapper
  Sanders
  Pence
  Sanders
  Alyssa Milano
  Twitter
  Elon Musk
  Twitter
  Twitter
  Hitler
  emoji
  Elon Musk
  Twitter
  Pete
  Lulz
  Hitler
  Alyssa Milano
  Twitter
  Donald Trump’s
  Twitter
  Trump
  Vox Dei
  Ron DeSantis
  Biden
  Bakken
  Biden
  Biden
  Stefan Tongur
  Bradley C. Wieferich
  Biden
  Phil Bryant
  Henry Laird
  Mary Margaret White
  Sullivan
  Anna Wolfe
  Brett Favre
  Shad White
  John Davis
  Davis
  Bryant
  Bryant
  White
  William Quin II
  Bryant
  White
  Bry