In [4]:
import pandas as pd
import spacy
import spacy_entity_linker

# Prerequisite (run once in a shell): python -m spacy download en_core_web_sm
# NOTE(review): this hint previously named en_core_web_lg, but the code below
# loads the small model (en_core_web_sm) — confirm which model is intended.
nlp = spacy.load("en_core_web_sm")

# Load the news-article dataset. Later code reads the columns
# 'id', 'maintext', 'date_publish' and 'news_url' from this frame.
df = pd.read_csv("emission_rule_news_steven.csv")

# Function to filter out fully capitalized lines
def filter_capitalized_paragraphs(text):
    """Remove all-caps lines of more than two words and flatten the text.

    The input is split on newline characters. A line is discarded when
    ``str.isupper()`` is true for it *and* it contains more than two
    whitespace-separated words; every other line (including empty ones and
    short all-caps lines such as acronyms) is kept unchanged.

    Args:
        text: The raw text, with lines separated by newline characters.

    Returns:
        The surviving lines joined by single spaces into one string.
    """
    kept = []
    for candidate in text.split("\n"):
        word_count = len(candidate.split())
        # Only long, fully upper-case lines are treated as noise.
        if candidate.isupper() and word_count > 2:
            continue
        kept.append(candidate)
    # Newlines are replaced by spaces so the result is a single line.
    return " ".join(kept)

# Strip all-caps lines from every article before running NER.
# Missing article bodies are replaced with "" first; astype(str) alone would
# turn them into the literal text "nan" and feed that to the NER model.
df['filtered_maintext'] = (
    df['maintext'].fillna("").astype(str).apply(filter_capitalized_paragraphs)
)

# One record (dict) per ORG mention; turned into a DataFrame after the loop.
extracted_entities = []

# Created once and reused for every mention: constructing it takes no
# per-article input, so rebuilding it inside the loop was redundant work.
classifier = spacy_entity_linker.EntityClassifier.EntityClassifier()

for index, row in df.iterrows():
    article_id = row['id']  # Get the Article ID
    article_text = row['filtered_maintext']  # Get the filtered text
    publish_date = row['date_publish']
    url = row['news_url']

    # Run the spaCy pipeline (including NER) on the article
    doc = nlp(article_text)

    for ent in doc.ents:
        # Only organisation mentions are linked and recorded.
        if ent.label_ in {"ORG"}:
            # build a term candidate (a simple span)
            termCandidate = spacy_entity_linker.TermCandidate.TermCandidate(ent)
            # get all the candidates for the term
            entityCandidates = termCandidate.get_entity_candidates()
            if len(entityCandidates) > 0:
                # select the best candidate
                entity = classifier(entityCandidates)
            else:
                # No candidate found: the mention is still recorded, unlinked.
                entity = None

            # NOTE(review): "Entity" holds whatever the classifier returned
            # (or None), not a plain string; the CSV will contain its string
            # conversion — confirm that is the desired serialisation.
            extracted_entities.append({
                "Article ID": article_id,  # Add the Article ID
                "Label": ent.label_,
                "Pattern": ent.text,
                "Entity": entity,
                "Publish Date": publish_date,
                "Article URL": url
            })

output_df = pd.DataFrame(extracted_entities)

# Save the DataFrame to a CSV file
output_file_path = "emission_extracted_entities_org_linker.csv"
output_df.to_csv(output_file_path, index=False)

In [5]:
# Display the extracted-entity table (one row per ORG mention found above).
output_df

Unnamed: 0,Article ID,Label,Pattern,Entity,Publish Date,Article URL
0,32,ORG,U.S. Postal Service,,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...
1,32,ORG,USPS,,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...
2,32,ORG,SEIS,Seismic Experiment for Interior Structure,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...
3,32,ORG,Next Generation Delivery Vehicles,,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...
4,32,ORG,SEIS,Seismic Experiment for Interior Structure,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...
...,...,...,...,...,...,...
59804,91445,ORG,the White House,White House,2023-09-15 12:50:33,https://www.oann.com/newsroom/house-passes-bil...
59805,91445,ORG,Senate,United States Senate,2023-09-15 12:50:33,https://www.oann.com/newsroom/house-passes-bil...
59806,91445,ORG,Administration,receivership,2023-09-15 12:50:33,https://www.oann.com/newsroom/house-passes-bil...
59807,91445,ORG,EPA,United States Environmental Protection Agency,2023-09-15 12:50:33,https://www.oann.com/newsroom/house-passes-bil...


In [6]:
# Display the source articles, now including the derived 'filtered_maintext' column.
df

Unnamed: 0,id,news_url,date_publish,authors,maintext,preprocessed_maintext,extracted_domain,normalized_domain,domain,bias_score,bias_category,topic_number,topic_keywords,num_topics,filtered_maintext
0,32,https://www.commondreams.org/opinion/usps-elec...,2023-07-26 10:42:02,"annie-norman, www.facebook.com",For as much as we’ve heard Postmaster General ...,much hear Postmaster General Louis DeJoy talk ...,www.commondreams.org,commondreams.org,commondreams.org,-0.6526,Most Left,9,"0.040*""vehicle"" + 0.025*""electric"" + 0.022*""em...",35,For as much as we’ve heard Postmaster General ...
1,42,https://www.wmur.com/article/the-epas-ambitiou...,2023-08-06 17:09:00,Tom Krisher,The EPA says the industry could meet the limit...,EPA say industry could meet limit new vehicle ...,www.wmur.com,wmur.com,wmur.com,0.1866,Central,9,"0.040*""vehicle"" + 0.025*""electric"" + 0.022*""em...",35,The EPA says the industry could meet the limit...
2,77,https://whyy.org/articles/delaware-electric-ve...,2023-08-03 22:13:47,Cris Barrish,This story is part of the WHYY News Climate De...,story part WHYY News Climate Desk bring news s...,whyy.org,whyy.org,whyy.org,-0.6174,Most Left,9,"0.040*""vehicle"" + 0.025*""electric"" + 0.022*""em...",35,This story is part of the WHYY News Climate De...
3,78,https://whyy.org/articles/delaware-legislature...,2023-05-03 01:33:54,Johnny Perez-Gonzalez,This story is part of the WHYY News Climate De...,story part WHYY News Climate Desk bring news s...,whyy.org,whyy.org,whyy.org,-0.6174,Most Left,9,"0.040*""vehicle"" + 0.025*""electric"" + 0.022*""em...",35,This story is part of the WHYY News Climate De...
4,96,https://www.nbcdfw.com/news/national-internati...,2023-08-06 13:28:37,Tom Krisher,The U.S. government’s most ambitious plan ever...,U S government ambitious plan ever slash plane...,www.nbcdfw.com,nbcdfw.com,nbcdfw.com,0.3781,Lean Right,9,"0.040*""vehicle"" + 0.025*""electric"" + 0.022*""em...",35,The U.S. government’s most ambitious plan ever...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2921,91221,https://www.clickondetroit.com/business/2023/0...,2023-09-14 04:03:24,"Matthew Daly, Associated Press",WASHINGTON – Efforts by the Biden administrati...,WASHINGTON effort Biden administration limit p...,www.clickondetroit.com,clickondetroit.com,clickondetroit.com,0.0795,Central,9,"0.040*""vehicle"" + 0.025*""electric"" + 0.022*""em...",35,WASHINGTON – Efforts by the Biden administrati...
2922,91237,https://www.clickondetroit.com/business/2023/1...,2023-11-25 13:42:34,"Alexa St. John, Associated Press",The negative impact on the climate from passen...,negative impact climate passenger vehicle cons...,www.clickondetroit.com,clickondetroit.com,clickondetroit.com,0.0795,Central,9,"0.040*""vehicle"" + 0.025*""electric"" + 0.022*""em...",35,The negative impact on the climate from passen...
2923,91346,https://www.zerohedge.com/technology/visualizi...,2023-11-22 20:30:00,Tyler Durden,Electric vehicles are a fast growing segment i...,electric vehicle fast grow segment U S much ma...,www.zerohedge.com,zerohedge.com,zerohedge.com,0.6771,Most Right,9,"0.040*""vehicle"" + 0.025*""electric"" + 0.022*""em...",35,Electric vehicles are a fast growing segment i...
2924,91349,https://www.zerohedge.com/political/biden-admi...,2023-11-24 08:35:00,Tyler Durden,Authored by Brent Bennett & Andrea Hitt via Re...,author Brent Bennett Andrea Hitt via RealClear...,www.zerohedge.com,zerohedge.com,zerohedge.com,0.6771,Most Right,9,"0.040*""vehicle"" + 0.025*""electric"" + 0.022*""em...",35,Authored by Brent Bennett & Andrea Hitt via Re...
