**Members:** Ethan Wong, Timmy Ren, Mason Shu, Medha Nalamada, Carson Mullen, Bethel Kim

**Morning Cohort**: 11 AM - 1 PM

**Current Tasks Remaining:**

1. Perform Active/Passive/Topic Voice Analysis --> Timmy
2. Use ChatGPT to generate sentiment for actors in a given category (-100 to 100) --> Ethan
3. Implement Bias Score Metric --> Timmy

# Install and Load Libraries

In [47]:
#!pip install eventregistry
!python -m spacy download en_core_web_lg
#!python -m spacy download en_core_web_sm
#from eventregistry import *
import pandas as pd
import json
import re
import spacy # type: ignore

Collecting en-core-web-lg==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


# Scraping Articles with Event Registry API

In [2]:
# Load the API key from the JSON file
with open("config.json", "r") as file:
    config = json.load(file)
api_key = config["api_key"]

# Initialize EventRegistry with the API key
er = EventRegistry(apiKey=api_key, allowUseOfArchive=False)

# Define topics to search for
#topics = [
    #"Donald Trump", "Kamala Harris", "Israel", "Palestine", "Palestinians", "Hamas", "FEMA", "Abortion",
    #"Inflation", "Unemployment", "Economy", "Dockworkers", "ILA Port", "Immigration"
#]

topics = ["Election", "Gaza", "FEMA", "Abortion", "Inflation", "Unemployment", "Dockworkers", "Immigration"]

# Define sources to search for and get their URIs
source_names = ["NPR", "MSNBC", "AP News", "FOX", "Forbes"]
source_uris = {source: er.getNewsSourceUri(source) for source in source_names}

# List to store the names of all generated DataFrames
dataframe_names = []

# Loop through each topic
for topic in topics:
    # Get the URI for the concept
    concept_uri = er.getConceptUri(topic)
    
    # List to hold all articles' data for the current topic (across all sources)
    articles_data = []
    
    # Loop through each source individually
    for source_name, source_uri in source_uris.items():
        # Define the query for each topic and source
        q = QueryArticlesIter(
            conceptUri=concept_uri,
            sourceUri=source_uri,
            sourceLocationUri=er.getLocationUri("United States"),  # Only US sources
        )

        # Fetch and accumulate up to 500 articles for the current topic from this source
        for art in q.execQuery(er, sortBy="date", maxItems=500):
            articles_data.append({
                "title": art.get("title"),
                "source": art.get("source", {}).get("title"),
                "author": art.get("author"),
                "url": art.get("url"),
                "publishedAt": art.get("dateTime"),
                "content": art.get("body")
            })

    # Create a single DataFrame for the current topic with articles from all sources
    articles_df = pd.DataFrame(articles_data)
    
    # Save the DataFrame to a CSV file
    file_name = f"{topic.replace(' ', '_')}_articles.csv"
    articles_df.to_csv(file_name, index=False)

    # Dynamically set the DataFrame name based on the topic, replacing spaces with underscores
    df_name = f"{topic.replace(' ', '_')}_df"
    globals()[df_name] = articles_df

    # Append the DataFrame name to the list
    dataframe_names.append(df_name)

# Print the list of all generated DataFrame names
print("Generated DataFrames:", dataframe_names)

Generated DataFrames: ['Election_df', 'Gaza_df', 'FEMA_df', 'Abortion_df', 'Inflation_df', 'Unemployment_df', 'Dockworkers_df', 'Immigration_df']


# Cleaning Dataframes and Running Named Entity Recognition

In [48]:
# Reading in CSVs dynamically

topics = ["Election", "Gaza", "FEMA", "Abortion", "Inflation", "Unemployment", "Dockworkers", "Immigration"]

# Dictionary to hold the DataFrames after reading them from CSV
dataframes = {}

# Loop to read each CSV and store the DataFrame in the dictionary
for topic in topics:
    # Replace spaces with underscores to match your file naming convention
    file_name = f"{topic.replace(' ', '_')}_articles.csv"
    try:
        dataframes[topic.replace(' ', '_')] = pd.read_csv(file_name)
    except FileNotFoundError as e:
        print(f"Error: {e}")  # If a file is not found

In [49]:
# Clean the dataframes
def count_sentences(text):
    sentences = re.split(r'[.!?]+', text)
    return len([s for s in sentences if s.strip()])

def clean_df(df):
    df = df.drop_duplicates().copy()
    df.loc[:, 'content'] = df['content'].str.replace(
        "By entering your email and pushing continue, you are agreeing to Fox News\' Terms of Use and Privacy Policy, which includes our Notice of Financial Incentive.\n\n", 
        "", 
        regex=False
    )
    df.loc[:, 'num_sentences'] = df['content'].apply(count_sentences)
    return df

# Apply clean_df to each DataFrame in the dictionary
cleaned_dataframes = {key: clean_df(df) for key, df in dataframes.items()}

In [9]:
# Running name entity recognition on the data
nlp = spacy.load("en_core_web_lg")

# Function to extract named entities from a document
def extract_entities_from_docs(docs):
    entity_lists = []
    for doc in docs:
        entities = [ent.text for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE", "LOC"] and ent.text.isalpha() and len(ent.text) > 2]
        entity_lists.append(entities)
    return entity_lists

# Loop through all DataFrames
for topic, df in cleaned_dataframes.items():
    print(f"Processing topic: {topic}")
    
    # Filter out rows with missing data in 'title' and 'content'
    df = df.dropna(subset=['title', 'content'])

    # nlp.pipe is btch processing
    title_docs = nlp.pipe(df['title'], disable=["textcat"])
    content_docs = nlp.pipe(df['content'], disable=["textcat"])
    
    df['Title_Entities'] = extract_entities_from_docs(title_docs)
    df['Content_Entities'] = extract_entities_from_docs(content_docs)

    # Combine all entity lists into one for counting
    all_entities = df['Title_Entities'].sum() + df['Content_Entities'].sum()
    
    # Calculate the value counts
    entity_counts = pd.Series(all_entities).value_counts()
    
    # Output the top 10 most frequent entities
    print(f"Top entities for {topic}:\n", entity_counts.head(10))

    # Store the counts for further analysis
    cleaned_dataframes[topic]['Entity_Counts'] = entity_counts

Processing topic: Election
Top entities for Election:
 Trump           7078
Harris          4119
Biden           1937
Vance            963
GOP              826
Georgia          787
Congress         659
Walz             654
Pennsylvania     638
Israel           618
Name: count, dtype: int64
Processing topic: Gaza
Top entities for Gaza:
 Israel       3739
Hezbollah    1708
Gaza         1339
Lebanon      1300
Hamas        1143
Iran          875
Biden         510
Harris        459
Trump         411
Beirut        288
Name: count, dtype: int64
Processing topic: FEMA
Top entities for FEMA:
 Trump         1237
Harris        1180
Georgia        357
Biden          355
California     294
Combs          293
Texas          282
Israel         274
America        253
Florida        234
Name: count, dtype: int64
Processing topic: Abortion
Top entities for Abortion:
 Trump       2572
Harris      1889
Vance        610
Biden        537
Walz         387
GOP          315
Georgia      295
Senate       258
Ro

Based on the data, we determined that these were the relevant actors for each topic:

* **Election:** Trump, Harris, Biden
* **Gaza:** Israel, Hamas, Palestinians
* **FEMA:** Trump, Harris
* **Abortion**: Trump, Harris, Women
* **Unemployment**: Fed, Trump, Harris 
* **Dockworkers**: Dockworkers
* **Immigration**: Trump, Harris, Biden, Haitian

In order to process the articles more efficiently for subsequent steps, we decided to remove any sentences that do not contain the above words. Additionally, we will do the following for specific topics:

* **Gaza:** Find and Replace Civilian
* **Abortion**: Check for instances of "Woman" as well
* **Dockworkers**: Find and replace Union, International Longshoremen's Association; Find and replace United States Maritime Alliance, USMX
* **Immigration**: Find and replace Springfield

Our reasoning is that now, the articles will be smaller in size while still retaining all of the articles, and that passive/active voice is not getting affected by non-actor sentences.

In [14]:
# Find and replace for each of the sentences
# For article in a given topic, remove any sentences that do not contain these words. 
# Idea is that articles are smaller and easier to process while still retaining the articles 
# Passive/Active voice is not getting affected by non-actor sentences

In [83]:
# Define the keywords and find-and-replace mappings for each topic
topic_keywords = {
    "Election": {
        "keywords": ["Trump", "Harris", "Biden"],
    },
    "Gaza": {
        "keywords": ["Israel", "Hamas", "Palestinians"],
        "find_replace": {"Palestinians": "civilian"}
    },
    "FEMA": {
        "keywords": ["Trump", "Harris"],
    },
    "Abortion": {
        "keywords": ["Trump", "Harris", "Women", "Woman"],
    },
    "Unemployment": {
        "keywords": ["Fed", "Trump", "Harris"],
    },
    "Dockworkers": {
        "keywords": ["Dockworkers", "Employer"],
        "find_replace": {
            "union": "Dockworkers",
            "international longshoremen's association": "Dockworkers",
            "United States Maritime Alliance": "Employer",
            "USMX": "Employer"
        }
    },
    "Immigration": {
        "keywords": ["Trump", "Harris", "Biden", "Haitian"],
        "find_replace": {"Springfield": "Haitian"}
    }
}

# Function to perform find-and-replace operations
def apply_find_replace(text, find_replace_dict):
    # Build a mapping from lowercased keys to their replacements
    lower_find_replace = {k.lower(): v for k, v in find_replace_dict.items()}
    # Escape special characters in keys for regex
    escaped_keys = [re.escape(k) for k in find_replace_dict.keys()]
    pattern = re.compile("|".join(escaped_keys), re.IGNORECASE)
    
    def replace_match(m):
        matched_text = m.group(0)
        # Lookup the replacement using the lowercase matched text
        replacement = lower_find_replace.get(matched_text.lower(), matched_text)
        return replacement
    
    return pattern.sub(replace_match, text)

# Function to filter sentences based on keywords and ensure a keyword threshold is met
def filter_sentences(text, keywords, threshold=1):
    sentences = re.split(r'(?<=[.!?])\s+', text)  # Split text into sentences
    keyword_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, keywords)) + r')\b', re.IGNORECASE)
    
    filtered_sentences = []
    keyword_count = 0
    
    for sent in sentences:
        if keyword_pattern.search(sent):
            filtered_sentences.append(sent)
            keyword_count += len(keyword_pattern.findall(sent))  # Count occurrences of keywords
    
    # Only return the filtered sentences if the keyword count meets or exceeds the threshold
    if keyword_count >= threshold:
        return ' '.join(filtered_sentences)
    else:
        return ""  # Return empty if the threshold isn't met


In [114]:
# Apply the filtering to each DataFrame
threshold = 2  # Set the threshold for how many keywords need to be found to retain the content

for topic, df in cleaned_dataframes.items():
    print(f"Processing topic: {topic}")
    df = df.copy()
    
    # Get keywords and find-and-replace mappings for the topic
    keywords = topic_keywords.get(topic, {}).get('keywords', [])
    find_replace = topic_keywords.get(topic, {}).get('find_replace', {})
    
    # Apply find-and-replace and sentence filtering to 'content' column
    filtered_contents = []
    for content in df['content']:
        # Skip if content is NaN
        if pd.isnull(content):
            filtered_contents.append(content)
            continue
        
        # Apply find-and-replace operations
        if find_replace:
            content = apply_find_replace(content, find_replace)
        
        # Filter sentences based on keywords and threshold
        filtered_content = filter_sentences(content, keywords, threshold)
        filtered_contents.append(filtered_content)
    
    # Update the DataFrame with the filtered content in the "content" column
    df['content'] = filtered_contents
    cleaned_dataframes[topic] = df

    # Save the updated DataFrame to a new CSV file (optional)
    # df.to_csv(f"{topic.replace(' ', '_')}_filtered_articles.csv", index=False)

Processing topic: Election
Processing topic: Gaza
Processing topic: FEMA
Processing topic: Abortion
Processing topic: Inflation
Processing topic: Unemployment
Processing topic: Dockworkers
Processing topic: Immigration


In [115]:
# Create a second version of cleaned_dataframes where articles with empty 'content' are removed
cleaned_dataframes_filtered = {}
for topic, df in cleaned_dataframes.items():
    # Remove rows where 'content' is empty or contains only whitespace
    df_filtered = df[df['content'].str.strip().astype(bool)].copy()
    cleaned_dataframes_filtered[topic] = df_filtered

    # Save the filtered DataFrame to a new CSV file (second version)
    # df_filtered.to_csv(f"{topic.replace(' ', '_')}_filtered_articles_no_empty.csv", index=False)

# Print the number of articles before and after filtering
for topic in topic_keywords.keys():
    original_count = len(cleaned_dataframes[topic])
    filtered_count = len(cleaned_dataframes_filtered[topic])
    print(f"Topic: {topic}, Original Articles: {original_count}, After Filtering: {filtered_count}")


Topic: Election, Original Articles: 1967, After Filtering: 1074
Topic: Gaza, Original Articles: 481, After Filtering: 345
Topic: FEMA, Original Articles: 1429, After Filtering: 235
Topic: Abortion, Original Articles: 539, After Filtering: 385
Topic: Unemployment, Original Articles: 183, After Filtering: 70
Topic: Dockworkers, Original Articles: 2500, After Filtering: 10
Topic: Immigration, Original Articles: 937, After Filtering: 586


In [116]:
# Display the first few rows of the filtered DataFrame for a topic
topic = "Dockworkers"
cleaned_dataframes[topic][['title', 'content', 'filtered_content']].head()

Unnamed: 0,title,content,filtered_content
0,"'Wait Wait' for October 12, 2024: With Not My ...",,
1,What the Harris campaign is doing to try to wi...,,
2,"Hurricane Evacuation Saves Lives, Mass Gatheri...",,
3,Some Democrats are still hesitant to vote for ...,,
4,"Harris releases medical report, drawing anothe...",,


In [119]:
cleaned_dataframes_filtered['Dockworkers']['content']

31      The company agreed in recent talks with the Do...
234     Harris and Trump remain in a close battle, as ...
446     BALTIMORE -- Dockworkers dockworkers along Eas...
617     Dockworkers strike on a picket line outside of...
769     Alex Wagner sat down with Dockworkers workers ...
1283    MOGADISHU, Somalia (AP) -- Somalia says Egypt ...
1525    Accordingly, the Confederacy planned arguably ...
1534    as the double-stacked train was stopped and aw...
1543    20 vote for a new president and on a referendu...
2366    AFL-CIO President Liz Shuler said the partners...
Name: content, dtype: object

In [96]:
cleaned_dataframes_filtered

{'Election':                                                   title  source  author  \
 0     What the Harris campaign is doing to try to wi...     NPR     NaN   
 1     Some Democrats are still hesitant to vote for ...     NPR     NaN   
 2     Harris releases medical report, drawing anothe...     NPR     NaN   
 3     6 facts about false noncitizen voting claims a...     NPR     NaN   
 4     The Justice Department says Virginia is illega...     NPR     NaN   
 ...                                                 ...     ...     ...   
 1962  Taylor Swift Endorses Kamala Harris: The Billi...  Forbes     NaN   
 1963  Harris And Trump's Biggest Celebrity Endorseme...  Forbes     NaN   
 1964  Harris Slams Trump For Killing Border Bill In ...  Forbes     NaN   
 1965  Trump Falsely Claims Inflation 'Worst In Our N...  Forbes     NaN   
 1966  Questions For The Kamala Harris And Donald Tru...  Forbes     NaN   
 
                                                     url           publish