**Members:** Ethan Wong, Timmy Ren, Mason Shu, Medha Nalamada, Carson Mullen, Bethel Kim

**Morning Cohort**: 11 AM - 1 PM

# Install and Load Libraries

In [48]:
#!pip install eventregistry
!python -m spacy download en_core_web_lg
#!python -m spacy download en_core_web_sm
from eventregistry import *
import pandas as pd
import json
import re
import spacy # type: ignore
from spacy.matcher import Matcher
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     - ------------------------------------ 14.7/400.7 MB 83.8 MB/s eta 0:00:05
     --- ---------------------------------- 32.5/400.7 MB 82.5 MB/s eta 0:00:05
     ---- --------------------------------- 48.8/400.7 MB 81.6 MB/s eta 0:00:05
     ----- -------------------------------- 57.4/400.7 MB 70.3 MB/s eta 0:00:05
     ------ ------------------------------- 70.3/400.7 MB 68.9 MB/s eta 0:00:05
     ------- ------------------------------ 83.1/400.7 MB 68.0 MB/s eta 0:00:05
     -------- ----------------------------- 89.1/400.7 MB 62.5 MB/s eta 0:00:05
     --------- ---------------------------- 96.5/400.7 MB 58.6 MB/s eta 0:00:06
     --------- --------------------------- 107.0/400.7 MB 57.9 MB/s eta 0:00:06
     ---------- --------------

# Scraping Articles with Event Registry API

In [50]:
# Load the API key from the JSON file so the key is not hardcoded in the notebook
with open("config.json", "r") as file:
    config = json.load(file)
api_key = config["api_key"]

# Initialize EventRegistry with the API key
er = EventRegistry(apiKey=api_key, allowUseOfArchive=False)

# Define topics to search for
topics = ["Election"]

# Define sources to search for and get their URIs
source_names = ["NPR", "MSNBC", "AP News", "FOX", "Forbes", "New York Times", "Bloomberg", "USA Today", "Washington Post", "Politico", "Vox", "Oan", "Breitbart", "Wall Street Journal"]
source_uris = {source: er.getNewsSourceUri(source) for source in source_names}

# The location filter is identical for every query, so resolve it once
# instead of once per (topic, source) pair.
us_location_uri = er.getLocationUri("United States")  # Only US sources

# List to store the names of all generated DataFrames
dataframe_names = []

# Loop through each topic
for topic in topics:
    # Get the URI for the concept
    concept_uri = er.getConceptUri(topic)

    # List to hold all articles' data for the current topic (across all sources)
    articles_data = []

    # Loop through each source individually
    for source_name, source_uri in source_uris.items():
        # A name Event Registry could not resolve has no URI. Querying with
        # sourceUri=None would presumably drop the source filter entirely, so
        # skip it and say so instead of silently mixing in other outlets.
        if source_uri is None:
            print(f"Skipping source with no matching URI: {source_name}")
            continue

        # Define the query for each topic and source
        q = QueryArticlesIter(
            conceptUri=concept_uri,
            sourceUri=source_uri,
            sourceLocationUri=us_location_uri,
        )

        # Fetch and accumulate up to 1000 articles for the current topic from this source
        for art in q.execQuery(er, sortBy="date", maxItems=1000):
            articles_data.append({
                "title": art.get("title"),
                "source": art.get("source", {}).get("title"),
                # NOTE(review): Event Registry articles appear to expose an
                # "authors" list rather than "author"; confirm, otherwise this
                # column is always empty.
                "author": art.get("author"),
                "url": art.get("url"),
                "publishedAt": art.get("dateTime"),
                "content": art.get("body")
            })

    # Create a single DataFrame for the current topic with articles from all sources
    articles_df = pd.DataFrame(articles_data)

    # Save the DataFrame to a CSV file
    file_name = f"{topic.replace(' ', '_')}_articles.csv"
    articles_df.to_csv(file_name, index=False)

    # Dynamically set the DataFrame name based on the topic, replacing spaces with underscores.
    # Later cells refer to these globals by name (e.g. Election_df).
    df_name = f"{topic.replace(' ', '_')}_df"
    globals()[df_name] = articles_df

    # Append the DataFrame name to the list
    dataframe_names.append(df_name)

# Print the list of all generated DataFrame names
print("Generated DataFrames:", dataframe_names)

Generated DataFrames: ['Election_df']


In [51]:
Election_df['source'].value_counts()

source
Fox News                    905
The New York Times          823
AP NEWS                     779
Washington Post             726
POLITICO                    710
USA Today                   507
Breitbart                   505
Forbes                      442
Bloomberg Business          410
MSNBC.com                   259
NPR                         220
One America News Network     88
The Wall Street Journal      45
Name: count, dtype: int64

# Cleaning Dataframes and Running Named Entity Recognition

In [52]:
# Reload the scraped articles from disk, one CSV per topic

topics = ["Election"]

# Maps each topic (spaces replaced by underscores) to the DataFrame read from its CSV
dataframes = {}

for topic in topics:
    # The key and the file name share the same underscore-normalised topic name
    topic_key = topic.replace(' ', '_')
    file_name = f"{topic_key}_articles.csv"
    try:
        loaded = pd.read_csv(file_name)
    except FileNotFoundError as e:
        # A missing file is reported and the topic is simply left out
        print(f"Error: {e}")
        continue
    dataframes[topic_key] = loaded

In [53]:
# Clean the dataframes
def count_sentences(text):
    """Count the non-empty sentences in ``text``.

    The text is split on runs of ``.``, ``!`` and ``?``; pieces that are
    empty or whitespace-only are not counted.

    Args:
        text: The article text. Anything that is not a ``str`` (for example
            the NaN pandas uses for missing content, or ``None``) counts as
            having no sentences instead of raising ``TypeError``.

    Returns:
        int: Number of non-blank pieces between sentence terminators.
    """
    if not isinstance(text, str):
        return 0
    return sum(1 for piece in re.split(r'[.!?]+', text) if piece.strip())

def clean_df(df):
    """Return a de-duplicated copy of ``df`` with cleaned content and a sentence count.

    Steps, applied to a copy so the input frame is left untouched:
      1. drop fully duplicated rows;
      2. strip the Fox News e-mail sign-up disclaimer from 'content';
      3. add 'num_sentences', computed with ``count_sentences``.
    """
    fox_disclaimer = "By entering your email and pushing continue, you are agreeing to Fox News\' Terms of Use and Privacy Policy, which includes our Notice of Financial Incentive.\n\n"

    cleaned = df.drop_duplicates().copy()

    # Literal (non-regex) removal of the boilerplate paragraph
    content_without_disclaimer = cleaned['content'].str.replace(fox_disclaimer, "", regex=False)
    cleaned.loc[:, 'content'] = content_without_disclaimer

    sentence_counts = cleaned['content'].apply(count_sentences)
    cleaned.loc[:, 'num_sentences'] = sentence_counts
    return cleaned

# Run clean_df over every loaded DataFrame, keeping the same topic keys
cleaned_dataframes = dict(
    (key, clean_df(frame)) for key, frame in dataframes.items()
)

In [54]:
# Running named entity recognition on the data.
# The spaCy "en_core_web_lg" pipeline is loaded once here; the same `nlp`
# object is reused by the later cells (batch parsing and the Matcher vocab).
nlp = spacy.load("en_core_web_lg")

# Function to extract named entities from a sequence of parsed documents
def extract_entities_from_docs(docs):
    """Collect the entity texts of interest from each document in ``docs``.

    An entity is kept when its label is PERSON, ORG, GPE or LOC, its text is
    purely alphabetic (so multi-word or punctuated entities are excluded) and
    it is longer than two characters.

    Args:
        docs: Iterable of objects exposing ``.ents``, each entity having
            ``.text`` and ``.label_``.

    Returns:
        list[list[str]]: One list of entity texts per document, in order.
    """
    wanted_labels = ["PERSON", "ORG", "GPE", "LOC"]

    def keep(ent):
        return ent.label_ in wanted_labels and ent.text.isalpha() and len(ent.text) > 2

    return [[ent.text for ent in doc.ents if keep(ent)] for doc in docs]

# Loop through all DataFrames.
# Full entity counts are kept per topic in this dict. Assigning the counts as
# a DataFrame column does not work: the Series is indexed by entity name, the
# DataFrame by row number, so index alignment yields a column of NaN.
entity_counts_by_topic = {}

for topic, df in cleaned_dataframes.items():
    print(f"Processing topic: {topic}")

    # Filter out rows with missing data in 'title' and 'content'.
    # .copy() makes this an independent frame, so adding the entity columns
    # below cannot trigger SettingWithCopyWarning or touch the cleaned data.
    df = df.dropna(subset=['title', 'content']).copy()

    # nlp.pipe is batch processing
    title_docs = nlp.pipe(df['title'], disable=["textcat"])
    content_docs = nlp.pipe(df['content'], disable=["textcat"])

    df['Title_Entities'] = extract_entities_from_docs(title_docs)
    df['Content_Entities'] = extract_entities_from_docs(content_docs)

    # Combine all entity lists into one for counting. A single flattening pass
    # keeps the original order (all title entities, then all content entities)
    # without the quadratic cost of summing Python lists.
    all_entities = [
        entity
        for column in ('Title_Entities', 'Content_Entities')
        for entities in df[column]
        for entity in entities
    ]

    # Calculate the value counts (dtype=object keeps an empty result well-defined)
    entity_counts = pd.Series(all_entities, dtype=object).value_counts()

    # Output the top 10 most frequent entities
    print(f"Top entities for {topic}:\n", entity_counts.head(10))

    # Store the counts for further analysis
    entity_counts_by_topic[topic] = entity_counts

Processing topic: Election
Top entities for Election:
 Trump           25979
Harris          14472
Biden            7055
Vance            3639
Senate           2773
Israel           2675
House            2608
Pennsylvania     2595
GOP              2472
Florida          2348
Name: count, dtype: int64


Based on the data, we determined that these were the relevant actors for each topic:

* **Election:** Trump, Harris, Biden

To process the articles more efficiently in the subsequent steps, we remove every sentence that does not mention at least one of the actors above. Our reasoning is that each article becomes much shorter while the articles that discuss these actors are still retained, and that the passive/active voice analysis is not affected, because it only considers sentences that mention an actor.

In [55]:
# Find and replace for each of the sentences
# For article in a given topic, remove any sentences that do not contain these words. 
# Idea is that articles are smaller and easier to process while still retaining the articles 
# Passive/Active voice is not getting affected by non-actor sentences

In [56]:
# Per-topic configuration for the sentence-filtering step.
# Each topic maps to a dict that may contain:
#   "keywords"     - the actor names used to decide which sentences are kept
#   "find_replace" - optional mapping applied to the text before filtering;
#                    the filtering cell reads it with a default of {} and
#                    no topic defines one here
topic_keywords = {
    "Election": {
        "keywords": ["Trump", "Harris", "Biden"],
    },
}

# Function to perform find-and-replace operations
def apply_find_replace(text, find_replace_dict):
    """Replace every occurrence of each key of ``find_replace_dict`` in ``text``.

    Matching is case-insensitive and keys are treated as literal text, not
    regular expressions. Replacement values are inserted verbatim.

    Args:
        text: The string to rewrite.
        find_replace_dict: Mapping of search string -> replacement string.
            Empty keys are ignored; an empty mapping returns ``text`` as is.

    Returns:
        str: ``text`` with all replacements applied in a single pass.
    """
    # Drop empty keys: an empty alternative would match at every position
    keys = [k for k in find_replace_dict if k]
    if not keys:
        return text

    # Build a mapping from lowercased keys to their replacements
    lower_find_replace = {k.lower(): find_replace_dict[k] for k in keys}

    # Regex alternation takes the first alternative that matches, so try
    # longer keys first; otherwise "Trump" would pre-empt "Trump Jr.".
    keys.sort(key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(k) for k in keys), re.IGNORECASE)

    def replace_match(m):
        matched_text = m.group(0)
        # Lookup the replacement using the lowercase matched text
        return lower_find_replace.get(matched_text.lower(), matched_text)

    return pattern.sub(replace_match, text)

# Function to filter sentences based on keywords and ensure a keyword threshold is met
def filter_sentences(text, keywords, threshold=1):
    """Keep only the sentences of ``text`` that mention a keyword.

    Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace.
    Keywords are matched as whole words, case-insensitively.

    Args:
        text: The article text.
        keywords: Iterable of keyword strings. Blank keywords are ignored.
            With no usable keyword no sentence can qualify, so the result is
            ``""``. Without this guard the pattern degenerates to ``\\b()\\b``,
            which matches every word boundary.
        threshold: Minimum total number of keyword occurrences (across the
            kept sentences) required to return anything.

    Returns:
        str: The kept sentences joined by single spaces, or ``""`` when the
        total keyword count is below ``threshold``.
    """
    usable_keywords = [kw for kw in keywords if kw]
    if not usable_keywords:
        return ""

    sentences = re.split(r'(?<=[.!?])\s+', text)  # Split text into sentences
    keyword_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, usable_keywords)) + r')\b', re.IGNORECASE)

    filtered_sentences = []
    keyword_count = 0

    for sent in sentences:
        # One scan per sentence gives both the keep decision and the hit count
        hits = keyword_pattern.findall(sent)
        if hits:
            filtered_sentences.append(sent)
            keyword_count += len(hits)

    # Only return the filtered sentences if the keyword count meets or exceeds the threshold
    if keyword_count >= threshold:
        return ' '.join(filtered_sentences)
    return ""  # Return empty if the threshold isn't met


In [57]:
# Reduce every article to its keyword-bearing sentences
threshold = 3  # Minimum number of keyword occurrences an article needs to keep its content

for topic, df in cleaned_dataframes.items():
    print(f"Processing topic: {topic}")

    # Per-topic settings; both default to "nothing configured"
    keywords = topic_keywords.get(topic, {}).get('keywords', [])
    find_replace = topic_keywords.get(topic, {}).get('find_replace', {})

    # Work on a copy so the frame currently stored in the dict is not modified
    df = df.copy()

    filtered_contents = []
    for content in df['content']:
        if not pd.isnull(content):
            # Optional find-and-replace runs before the sentence filter
            if find_replace:
                content = apply_find_replace(content, find_replace)
            content = filter_sentences(content, keywords, threshold)
        # Missing content is carried through unchanged
        filtered_contents.append(content)

    # Replace the "content" column with the filtered text and store the result
    df['content'] = filtered_contents
    cleaned_dataframes[topic] = df

    # Save the updated DataFrame to a new CSV file (optional)
    # df.to_csv(f"{topic.replace(' ', '_')}_filtered_articles.csv", index=False)

Processing topic: Election


In [58]:
# Create a second version of cleaned_dataframes where articles with empty 'content' are removed
cleaned_dataframes_filtered = {}
for topic, df in cleaned_dataframes.items():
    # Remove rows where 'content' is missing, empty or contains only whitespace.
    # Missing values are mapped to "" first: casting NaN to bool gives True,
    # which would let rows without any content slip through the filter.
    has_content = df['content'].fillna("").astype(str).str.strip().ne("")
    df_filtered = df[has_content].copy()
    cleaned_dataframes_filtered[topic] = df_filtered

    # Save the filtered DataFrame to a new CSV file (second version)
    # df_filtered.to_csv(f"{topic.replace(' ', '_')}_filtered_articles_no_empty.csv", index=False)

# Print the number of articles before and after filtering.
# Iterate over the topics that were actually processed; looping over
# topic_keywords would raise KeyError for a configured topic with no data.
for topic, filtered_df in cleaned_dataframes_filtered.items():
    original_count = len(cleaned_dataframes[topic])
    filtered_count = len(filtered_df)
    print(f"Topic: {topic}, Original Articles: {original_count}, After Filtering: {filtered_count}")


Topic: Election, Original Articles: 6419, After Filtering: 4212


In [59]:
cleaned_dataframes_filtered['Election']['content'][5]

'This presidential campaign, former President Donald Trump and other Republicans have been repeating the false narrative that Democrats are purposefully letting migrants into the country so they will vote. Nevertheless, Trump seized on and distorted Richman\'s estimates to fuel false claims in 2016 that millions of noncitizens had illegally voted. One false narrative this campaign season suggests that the people who arrived at the U.S.-Mexico border during the Biden administration can quickly become citizens and vote legally. Furthermore, changes to asylum protocols during the Biden administration have made it harder to pursue asylum in this country and eventually become a citizen. By focusing on baseless allegations about noncitizens voting in the upcoming election, Trump and his allies appear to be laying the groundwork for potentially contesting the election. "You can absolutely bet if Trump loses, he will claim there was widespread noncitizen voting without any evidence whatsoever,

# Performing Bias Analysis

In [60]:
df = cleaned_dataframes_filtered['Election']

In [61]:
df['source'].value_counts()

source
Fox News                    754
Washington Post             564
POLITICO                    554
The New York Times          529
USA Today                   391
Breitbart                   374
AP NEWS                     343
Forbes                      232
MSNBC.com                   222
NPR                         124
One America News Network     61
Bloomberg Business           54
The Wall Street Journal      10
Name: count, dtype: int64

In [62]:
# Initialize the VADER sentiment analyzer.
# The analyzer needs NLTK's "vader_lexicon" resource and raises LookupError
# when it is missing, so fetch it on demand to keep a fresh kernel runnable.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sia = SentimentIntensityAnalyzer()

# Assume 'df' is your DataFrame with columns: title, source, content, num_sentences

# Define the main actors
actors = ['Trump', 'Harris', 'Biden']

# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Define the passive voice pattern: consecutive tokens forming
# passive subject -> (any auxiliaries) -> passive auxiliary -> past participle
passive_rule = [
    {'DEP': 'nsubjpass'},    # Passive nominal subject
    {'DEP': 'aux', 'OP': '*'},   # Optional auxiliary verbs
    {'DEP': 'auxpass'},      # Passive auxiliary
    {'TAG': 'VBN'}           # Past participle verb
]
matcher.add('Passive', [passive_rule])

# Function to detect passive voice using Matcher
def is_passive(doc):
    """Return True when the passive-voice matcher finds at least one match in ``doc``."""
    return len(matcher(doc)) > 0

# Score every actor-mentioning sentence of one document section
def _score_sentences(sentences, source_scores, score_type):
    """Append one score per mentioned actor for each sentence in ``sentences``.

    The score of a sentence is its VADER compound sentiment (between -1 and 1)
    multiplied by -1 when the sentence is passive and by 1 otherwise. It is
    computed once per sentence and appended for every actor named in it.

    Args:
        sentences: Iterable of spaCy sentence spans.
        source_scores: ``{actor: {'title_scores': [...], 'content_scores': [...]}}``
            for the article's source; mutated in place.
        score_type: ``'title_scores'`` or ``'content_scores'``.
    """
    for sentence in sentences:
        sentence_text = sentence.text
        # NOTE(review): plain substring test, so "Harris" also matches
        # "Harrison" and matching is case-sensitive, unlike filter_sentences.
        mentioned = [actor for actor in actors if actor in sentence_text]
        if not mentioned:
            continue

        # Voice and sentiment depend only on the sentence, not on the actor
        voice_score = -1 if is_passive(sentence) else 1
        sentiment = sia.polarity_scores(sentence_text)['compound']
        score = voice_score * sentiment

        for actor in mentioned:
            source_scores[actor][score_type].append(score)


# Batch processing function to handle multiple articles at once
def process_batch(batch_df, pipe_batch_size=100):
    """Compute per-source, per-actor bias scores for one batch of articles.

    Rows whose 'title' or 'content' is missing or blank are skipped. Titles
    and contents are parsed with ``nlp.pipe`` and every sentence mentioning
    an actor contributes a score (see ``_score_sentences``).

    Args:
        batch_df: DataFrame with 'title', 'content' and 'source' columns.
        pipe_batch_size: Batch size handed to ``nlp.pipe``. Defaults to 100,
            the value of the module-level ``batch_size`` this function used
            to read, so it no longer depends on a global defined after it.

    Returns:
        dict: ``{source: {actor: {'title_scores': [...], 'content_scores': [...]}}}``
    """
    # Filter out rows where 'title' or 'content' is missing or empty
    valid_rows = batch_df.dropna(subset=['title', 'content'])
    has_title = valid_rows['title'].str.strip() != ""
    has_content = valid_rows['content'].str.strip() != ""
    valid_rows = valid_rows[has_title & has_content]

    # Plain lists keep titles, contents and sources aligned by position,
    # so no index reset is needed.
    sources = valid_rows['source'].tolist()
    docs_titles = list(nlp.pipe(valid_rows['title'].tolist(), batch_size=pipe_batch_size))
    docs_contents = list(nlp.pipe(valid_rows['content'].tolist(), batch_size=pipe_batch_size))

    actor_scores_batch = {}
    for source, title_doc, content_doc in zip(sources, docs_titles, docs_contents):
        # Initialize actor score dictionary for the source if not present
        if source not in actor_scores_batch:
            actor_scores_batch[source] = {actor: {'title_scores': [], 'content_scores': []} for actor in actors}
        source_scores = actor_scores_batch[source]

        _score_sentences(title_doc.sents, source_scores, 'title_scores')
        _score_sentences(content_doc.sents, source_scores, 'content_scores')

    return actor_scores_batch

# Walk the DataFrame in fixed-size slices and accumulate the scores per source
batch_size = 100  # Number of articles handed to process_batch at a time
actor_scores_per_source = {}

for start in range(0, len(df), batch_size):
    batch_df = df.iloc[start:start + batch_size]
    batch_actor_scores = process_batch(batch_df)

    # Fold this batch's scores into the running totals
    for source, actor_scores in batch_actor_scores.items():
        accumulated = actor_scores_per_source.get(source)
        if accumulated is None:
            # First time this source appears: adopt the batch's dict as is
            actor_scores_per_source[source] = actor_scores
            continue
        for actor in actors:
            for score_type in ('title_scores', 'content_scores'):
                accumulated[actor][score_type].extend(actor_scores[actor][score_type])

# Average the collected scores for every (source, actor) pair
def _average(values):
    """Arithmetic mean of ``values``, or 0 when the list is empty."""
    return sum(values) / len(values) if values else 0

average_actor_scores_per_source = {
    source: {
        actor: {
            'average_title_score': _average(scores['title_scores']),
            'average_content_score': _average(scores['content_scores']),
        }
        for actor, scores in actor_scores.items()
    }
    for source, actor_scores in actor_scores_per_source.items()
}

# Print, for each source, every actor's average title score followed by its content score
for source, per_actor in average_actor_scores_per_source.items():
    print(f"Source: {source}")
    for actor, averages in per_actor.items():
        for label, key in (("title", 'average_title_score'), ("content", 'average_content_score')):
            print(f"  Average bias score for {actor} ({label}): {averages[key]}")


Source: NPR
  Average bias score for Trump (title): -0.09212619047619047
  Average bias score for Trump (content): 0.036158053691275165
  Average bias score for Harris (title): 0.0028480000000000007
  Average bias score for Harris (content): 0.09278663967611335
  Average bias score for Biden (title): -0.22666666666666668
  Average bias score for Biden (content): 0.008695256916996047
Source: MSNBC.com
  Average bias score for Trump (title): -0.180035593220339
  Average bias score for Trump (content): 0.00023195816385822223
  Average bias score for Harris (title): 0.029656818181818187
  Average bias score for Harris (content): 0.10562875
  Average bias score for Biden (title): 0.19733333333333333
  Average bias score for Biden (content): 0.024384433962264147
Source: AP NEWS
  Average bias score for Trump (title): -0.1215045045045045
  Average bias score for Trump (content): 0.02871786501985002
  Average bias score for Harris (title): -0.04120897435897436
  Average bias score for Harris (