# In [2]:
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler

# Load the pre-trained small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Build the rule-based entity patterns: every row of patterns.csv becomes one
# dict keyed by the CSV column names (presumably "label" and "pattern", the
# keys the entity ruler expects — TODO confirm against the file).
df_pattern = pd.read_csv("patterns.csv")
patterns = df_pattern.to_dict(orient="records")
# Register the entity ruler so that it runs before the statistical "ner"
# component, then hand it the patterns loaded above.
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)

# Load the articles to process
df = pd.read_csv("test_articles_3.csv")

# Function to filter out fully capitalized lines
def filter_capitalized_paragraphs(text):
    """Remove all-caps lines from ``text`` and flatten what is left.

    The text is split on ``"\\n"``. A line is dropped only when
    ``str.isupper()`` is true for it AND it has more than two
    whitespace-separated words; short upper-case lines are kept. Every
    surviving line, empty ones included, is joined back with a single space.

    Args:
        text: The raw article text.

    Returns:
        The surviving lines joined into one space-separated string.
    """
    kept = []
    for candidate in text.split("\n"):
        word_count = len(candidate.split())
        # Guard clause: skip headline-style lines (all caps, 3+ words)
        if candidate.isupper() and word_count > 2:
            continue
        kept.append(candidate)
    return " ".join(kept)

# Missing article bodies are NaN in pandas; blank them out first so that
# astype(str) does not turn them into the literal text "nan", which would
# otherwise be run through the NER model as if it were article content.
df['filtered_maintext'] = (
    df['maintext'].fillna("").astype(str).apply(filter_capitalized_paragraphs)
)

# spaCy labels to keep, mapped to the label written to the output
label_map = {"PERSON": "PER", "ORG": "ORG"}

# Output schema, fixed up front so the CSV keeps its header row even when
# no entity is extracted at all.
output_columns = [
    "Article ID",
    "Label",
    "Pattern",
    "Publish Date",
    "Article URL",
]

# Accumulate one record (dict) per extracted entity
extracted_entities = []

# nlp.pipe processes the texts in batches and yields Docs in input order,
# so each Doc can be zipped back against its article's metadata.
article_metadata = zip(df['id'], df['date_publish'], df['news_url'])
docs = nlp.pipe(df['filtered_maintext'])

for (article_id, publish_date, url), doc in zip(article_metadata, docs):
    for ent in doc.ents:
        label = label_map.get(ent.label_)
        if label is None:
            # Not one of the relevant entity types
            continue
        extracted_entities.append({
            "Article ID": article_id,
            "Label": label,
            "Pattern": ent.text,
            "Publish Date": publish_date,
            "Article URL": url,
        })

# Convert the extracted entities to a DataFrame with a stable column order
output_df = pd.DataFrame(extracted_entities, columns=output_columns)

# Save the DataFrame to a CSV file
output_file_path = "extracted_entities_3.csv"
output_df.to_csv(output_file_path, index=False)

