**Members:** Ethan Wong, Timmy Ren, Mason Shu, Medha Nalamada, Carson Mullen, Bethel Kim

**Morning Cohort**: 11 AM - 1 PM

**Current Tasks Remaining:**

1. Verify Relevant Actors Code --> Timmy
2. Perform LDA --> Mason
3. Perform Active/Passive/Topic Voice Analysis --> Timmy
4. Use ChatGPT to generate sentiment for actors in a given category (-100 to 100) --> Ethan
5. Create a logical evaluation score metric based on active/passive voice, sentiment, title, number of articles about certain subjects, etc. (come up with a few options) --> Bethel and Medha (Timmy and Mason will talk to Barua)

# Install and Load Libraries

In [3]:
#!pip install eventregistry
!python -m spacy download en_core_web_lg
#!python -m spacy download en_core_web_sm
from eventregistry import *
import pandas as pd
import json
import re
import spacy # type: ignore

Collecting en-core-web-lg==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------- -------------------------------- 2.4/12.8 MB 12.2 MB/s eta 0:00:01
     ---------------- ----------------------- 5.2/12.8 MB 13.3 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 22.5 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 19.6 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
✔ Download and installation successful

# Scraping Articles with Event Registry API - Ethan

In [2]:
# Scrape up to 500 articles per (topic, source) pair from Event Registry and
# write one CSV per topic, also exposing each topic's DataFrame as a global.

# Load the API key from the JSON file
with open("config.json", "r") as file:
    config = json.load(file)
api_key = config["api_key"]

# Initialize EventRegistry with the API key
er = EventRegistry(apiKey=api_key, allowUseOfArchive=False)

# Define topics to search for
# Earlier, finer-grained topic list kept for reference:
#topics = [
    #"Donald Trump", "Kamala Harris", "Israel", "Palestine", "Palestinians", "Hamas", "FEMA", "Abortion",
    #"Inflation", "Unemployment", "Economy", "Dockworkers", "ILA Port", "Immigration"
#]

topics = ["Election", "Gaza", "FEMA", "Abortion", "Inflation", "Unemployment", "Dockworkers", "Immigration"]

# Define sources to search for and get their URIs
source_names = ["NPR", "MSNBC", "AP News", "FOX", "Forbes"]
source_uris = {source: er.getNewsSourceUri(source) for source in source_names}

# The location filter is identical for every query, so resolve it once here
# instead of repeating the same lookup for each topic/source pair.
us_location_uri = er.getLocationUri("United States")  # Only US sources

# List to store the names of all generated DataFrames
dataframe_names = []

# Loop through each topic
for topic in topics:
    # Get the URI for the concept
    concept_uri = er.getConceptUri(topic)
    if concept_uri is None:
        # Without a concept URI the query would no longer be restricted to
        # this topic, so skip it rather than save unrelated articles.
        print(f"Warning: no concept URI found for topic '{topic}'; skipping")
        continue

    # List to hold all articles' data for the current topic (across all sources)
    articles_data = []

    # Loop through each source individually
    for source_name, source_uri in source_uris.items():
        if source_uri is None:
            # Same reasoning as above: a missing source URI would drop the
            # source filter from the query.
            print(f"Warning: no source URI found for '{source_name}'; skipping")
            continue

        # Define the query for each topic and source
        q = QueryArticlesIter(
            conceptUri=concept_uri,
            sourceUri=source_uri,
            sourceLocationUri=us_location_uri,
        )

        # Fetch and accumulate up to 500 articles for the current topic from this source
        for art in q.execQuery(er, sortBy="date", maxItems=500):
            articles_data.append({
                "title": art.get("title"),
                "source": art.get("source", {}).get("title"),
                # NOTE(review): Event Registry article records may expose
                # "authors" (a list) rather than "author" -- confirm this key
                # is actually populated in the saved CSVs.
                "author": art.get("author"),
                "url": art.get("url"),
                "publishedAt": art.get("dateTime"),
                "content": art.get("body")
            })

    # Create a single DataFrame for the current topic with articles from all sources
    articles_df = pd.DataFrame(articles_data)

    # Save the DataFrame to a CSV file
    file_name = f"{topic.replace(' ', '_')}_articles.csv"
    articles_df.to_csv(file_name, index=False)

    # Dynamically set the DataFrame name based on the topic, replacing spaces with underscores
    df_name = f"{topic.replace(' ', '_')}_df"
    globals()[df_name] = articles_df

    # Append the DataFrame name to the list
    dataframe_names.append(df_name)

# Print the list of all generated DataFrame names
print("Generated DataFrames:", dataframe_names)

Generated DataFrames: ['Election_df', 'Gaza_df', 'FEMA_df', 'Abortion_df', 'Inflation_df', 'Unemployment_df', 'Dockworkers_df', 'Immigration_df']


# Cleaning Dataframes and Running Named Entity Recognition - Mason and Carson

In [4]:
# Load the per-topic article CSVs from disk into a dict of DataFrames

topics = ["Election", "Gaza", "FEMA", "Abortion", "Inflation", "Unemployment", "Dockworkers", "Immigration"]

# Maps the underscore-normalised topic name to the DataFrame read from its CSV
dataframes = {}

for topic in topics:
    # File names use underscores wherever the topic has spaces
    topic_key = topic.replace(' ', '_')
    file_name = f"{topic_key}_articles.csv"
    try:
        loaded = pd.read_csv(file_name)
    except FileNotFoundError as e:
        # A missing CSV is reported and that topic is simply left out
        print(f"Error: {e}")
    else:
        dataframes[topic_key] = loaded

In [5]:
# Clean the dataframes
def count_sentences(text):
    """Return the number of non-empty sentences in *text*.

    Sentences are the pieces left after splitting on runs of '.', '!' or '?';
    pieces that are empty or whitespace-only are not counted.

    Non-string input (e.g. the NaN/None pandas uses for a missing article
    body) counts as 0 sentences instead of raising inside ``re.split``.
    """
    if not isinstance(text, str):
        return 0
    sentences = re.split(r'[.!?]+', text)
    return sum(1 for s in sentences if s.strip())

def clean_df(df):
    """Return a de-duplicated copy of *df* with cleaned content and a sentence count.

    The copy has exact duplicate rows removed, the Fox News email sign-up
    disclaimer stripped from the 'content' column, and a new 'num_sentences'
    column computed from the cleaned content with count_sentences.
    The input DataFrame is not modified.
    """
    fox_disclaimer = "By entering your email and pushing continue, you are agreeing to Fox News\' Terms of Use and Privacy Policy, which includes our Notice of Financial Incentive.\n\n"

    deduped = df.drop_duplicates().copy()

    # Literal (non-regex) removal of the boilerplate paragraph
    without_disclaimer = deduped['content'].str.replace(fox_disclaimer, "", regex=False)
    deduped.loc[:, 'content'] = without_disclaimer

    sentence_totals = deduped['content'].apply(count_sentences)
    deduped.loc[:, 'num_sentences'] = sentence_totals
    return deduped

# Build the cleaned counterpart of every loaded DataFrame, keyed by the same topic name
cleaned_dataframes = dict(
    (topic_name, clean_df(raw_frame))
    for topic_name, raw_frame in dataframes.items()
)

In [None]:
# Mason

# Finding relevant actors with noun extraction

# Load the large English spaCy pipeline into the module-level `nlp`,
# which extract_nouns below calls on each text
nlp = spacy.load('en_core_web_lg')

# Function to extract nouns and proper nouns
def extract_nouns(text):
    """Return the noun and proper-noun tokens found in *text*.

    A token is kept when spaCy tags it NOUN or PROPN, it is purely
    alphabetic, and it is longer than two characters. Missing values
    (anything ``pd.isnull`` accepts) produce an empty list.
    """
    if pd.isnull(text):  # Handle missing values
        return []

    wanted_pos = ("NOUN", "PROPN")
    kept = []
    for tok in nlp(text):
        if tok.pos_ not in wanted_pos:
            continue
        if not tok.is_alpha:
            continue
        if len(tok.text) <= 2:
            continue
        kept.append(tok.text)
    return kept

# Loop through all DataFrames: extract nouns per article, then tally them per topic.

# Per-topic noun frequencies are kept in this dict rather than in a DataFrame
# column: the counts are indexed by noun, while each DataFrame is indexed by
# article row, so assigning them as a column aligns on the index and leaves
# the column entirely NaN.
noun_counts_by_topic = {}

for topic, df in cleaned_dataframes.items():
    print(f"Processing topic: {topic}")

    # Apply the noun extraction to both the 'title' and 'content' columns
    df['Title_Nouns'] = df['title'].apply(extract_nouns)
    df['Content_Nouns'] = df['content'].apply(extract_nouns)

    # Flatten the per-article noun lists (titles first, then contents) into
    # one list. extend() is linear, whereas Series.sum() over lists
    # re-concatenates a growing list for every row and yields 0 when the
    # frame is empty.
    all_nouns = []
    for column in ('Title_Nouns', 'Content_Nouns'):
        for nouns in df[column]:
            all_nouns.extend(nouns)

    # Calculate the value counts (dtype=object keeps an empty topic well-defined)
    noun_counts = pd.Series(all_nouns, dtype=object).value_counts()

    # Output the top 10 most frequent nouns
    print(f"Top nouns for {topic}:\n", noun_counts.head(10))

    # Store the counts for further analysis
    noun_counts_by_topic[topic] = noun_counts

In [8]:
# Mason 

# NER, to find the major parties in each topic

# Load the large English spaCy pipeline into the module-level `nlp`,
# which extract_entities_batch reads when it calls nlp.pipe
nlp = spacy.load("en_core_web_lg")

# Function to extract all named entities from a list of texts
def extract_entities_batch(texts):
    """Run named-entity recognition over *texts* in batches.

    Args:
        texts: Iterable of texts to process. Missing values (None or a float
            NaN, as pandas uses for an absent title/content) are processed
            as empty text so they produce an empty entity list and keep the
            output aligned with the input, instead of being passed to
            ``nlp.pipe`` as non-text input.

    Returns:
        A list with one entry per input text, in input order. Each entry is
        a list of ``(entity_text, entity_label)`` tuples.
    """
    # `value != value` is true only for NaN
    safe_texts = [
        "" if text is None or (isinstance(text, float) and text != text) else text
        for text in texts
    ]
    docs = nlp.pipe(safe_texts, batch_size=50)  # Adjust batch_size as needed
    return [[(ent.text, ent.label_) for ent in doc.ents] for doc in docs]

# Loop through all DataFrames: run NER per article, then tally entity texts per topic.

# Per-topic entity frequencies are kept in this dict rather than in a
# DataFrame column: the counts are indexed by entity text, while each
# DataFrame is indexed by article row, so assigning them as a column aligns
# on the index and leaves the column entirely NaN.
entity_counts_by_topic = {}

for topic, df in cleaned_dataframes.items():
    print(f"Processing topic: {topic}")

    # Apply the NER to the 'title' and 'content' columns using batch processing
    df['Title_Entities'] = extract_entities_batch(df['title'].tolist())
    df['Content_Entities'] = extract_entities_batch(df['content'].tolist())

    # Collect just the entity text (titles first, then contents). extend() is
    # linear, whereas Series.sum() over lists re-concatenates a growing list
    # for every row and yields 0 when the frame is empty.
    entity_texts = []
    for column in ('Title_Entities', 'Content_Entities'):
        for entities in df[column]:
            entity_texts.extend(ent[0] for ent in entities)

    # Calculate the value counts (dtype=object keeps an empty topic well-defined)
    entity_counts = pd.Series(entity_texts, dtype=object).value_counts()

    # Output the top 10 most frequent entities
    print(f"Top entities for {topic}:\n", entity_counts.head(10))

    # Store the entity counts
    entity_counts_by_topic[topic] = entity_counts

Processing topic: Election
Top entities for Election:
 Trump           7611
Harris          4119
Biden           1937
U.S.            1871
Republican      1760
first           1541
Democrats       1338
one             1266
Republicans     1254
Donald Trump    1236
Name: count, dtype: int64
Processing topic: Gaza
Top entities for Gaza:
 Israel       3739
Hezbollah    1731
Israeli      1514
Gaza         1339
Lebanon      1300
Hamas        1143
Iran          875
U.S.          820
Biden         510
Harris        459
Name: count, dtype: int64
Processing topic: FEMA
Top entities for FEMA:
 first               1718
Trump               1368
one                 1245
Harris              1180
two                 1130
three                621
U.S.                 611
AP                   591
Fox News Digital     534
second               393
Name: count, dtype: int64
Processing topic: Abortion
Top entities for Abortion:
 Trump           2790
Harris          1889
Republican       638
Vance          

In [9]:
# Timmy

# Use the large English spaCy pipeline for the module-level `nlp`;
# the small model is left commented out as an alternative
#nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_lg")

# Function to extract named entities from a document
def extract_entities_from_docs(docs):
    """Return, for each doc in *docs*, the texts of its relevant named entities.

    An entity is kept when its label is PERSON, ORG, GPE or LOC, its text is
    purely alphabetic (so multi-word or punctuated names are dropped), and
    its text is longer than two characters. The result has one list per doc,
    in iteration order.
    """
    actor_labels = ("PERSON", "ORG", "GPE", "LOC")

    def _is_actor(ent):
        # Same three checks, evaluated in the same order
        return ent.label_ in actor_labels and ent.text.isalpha() and len(ent.text) > 2

    return [[ent.text for ent in doc.ents if _is_actor(ent)] for doc in docs]

# Loop through all DataFrames: run filtered NER on complete rows, then tally per topic.

# Per-topic entity frequencies are kept in this dict rather than in a
# DataFrame column: the counts are indexed by entity text, while each
# DataFrame is indexed by article row, so assigning them as a column aligns
# on the index and leaves the column entirely NaN.
entity_counts_by_topic = {}

for topic, df in cleaned_dataframes.items():
    print(f"Processing topic: {topic}")

    # Filter out rows with missing data in 'title' and 'content'.
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not write to a slice of the original.
    # NOTE(review): from here on `df` is this filtered copy; the entity
    # columns added below are not written back to cleaned_dataframes[topic].
    df = df.dropna(subset=['title', 'content']).copy()

    # nlp.pipe is batch processing
    title_docs = nlp.pipe(df['title'], disable=["textcat"])
    content_docs = nlp.pipe(df['content'], disable=["textcat"])

    df['Title_Entities'] = extract_entities_from_docs(title_docs)
    df['Content_Entities'] = extract_entities_from_docs(content_docs)

    # Flatten the per-article entity lists (titles first, then contents).
    # extend() is linear, whereas Series.sum() over lists re-concatenates a
    # growing list for every row and yields 0 when the frame is empty.
    all_entities = []
    for column in ('Title_Entities', 'Content_Entities'):
        for entities in df[column]:
            all_entities.extend(entities)

    # Calculate the value counts (dtype=object keeps an empty topic well-defined)
    entity_counts = pd.Series(all_entities, dtype=object).value_counts()

    # Output the top 10 most frequent entities
    print(f"Top entities for {topic}:\n", entity_counts.head(10))

    # Store the counts for further analysis
    entity_counts_by_topic[topic] = entity_counts

Processing topic: Election
Top entities for Election:
 Trump           7078
Harris          4119
Biden           1937
Vance            963
GOP              826
Georgia          787
Congress         659
Walz             654
Pennsylvania     638
Israel           618
Name: count, dtype: int64
Processing topic: Gaza
Top entities for Gaza:
 Israel       3739
Hezbollah    1708
Gaza         1339
Lebanon      1300
Hamas        1143
Iran          875
Biden         510
Harris        459
Trump         411
Beirut        288
Name: count, dtype: int64
Processing topic: FEMA
Top entities for FEMA:
 Trump         1237
Harris        1180
Georgia        357
Biden          355
California     294
Combs          293
Texas          282
Israel         274
America        253
Florida        234
Name: count, dtype: int64
Processing topic: Abortion
Top entities for Abortion:
 Trump       2572
Harris      1889
Vance        610
Biden        537
Walz         387
GOP          315
Georgia      295
Senate       258
Ro

In [10]:
# Timmy

# Use the large English spaCy pipeline for the module-level `nlp`;
# the small model is left commented out as an alternative
#nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_lg")

# Function to extract named entities from a document
def extract_entities_from_docs(docs):
    """Collect the relevant named-entity texts for every doc in *docs*.

    Returns one list per doc, in iteration order. Only entities labelled
    PERSON, ORG, GPE or LOC whose text is purely alphabetic and longer than
    two characters are included.
    """
    relevant_labels = ("PERSON", "ORG", "GPE", "LOC")
    per_doc = []
    for doc in docs:
        names = []
        for ent in doc.ents:
            if ent.label_ not in relevant_labels:
                continue
            if not ent.text.isalpha():
                continue
            if len(ent.text) <= 2:
                continue
            names.append(ent.text)
        per_doc.append(names)
    return per_doc

# Loop through all DataFrames
for topic, df in cleaned_dataframes.items():
    print(f"Processing topic: {topic}")
    
    # Filter out rows with missing data in 'title' and 'content'
    df = df.dropna(subset=['title', 'content'])

    # Batch processing using nlp.pipe
    title_docs = nlp.pipe(df['title'], batch_size=50, disable=["textcat"])
    content_docs = nlp.pipe(df['content'], batch_size=50, disable=["textcat"])
    
    df['Title_Entities'] = extract_entities_from_docs(title_docs)
    df['Content_Entities'] = extract_entities_from_docs(content_docs)

    # Combine all entity lists into one for counting
    all_entities = df['Title_Entities'].sum() + df['Content_Entities'].sum()
    
    # Calculate the value counts
    entity_counts = pd.Series(all_entities).value_counts()
    
    # Output the top 10 most frequent entities
    print(f"Top entities for {topic}:\n", entity_counts.head(10))

    # Store the counts for further analysis
    cleaned_dataframes[topic]['Entity_Counts'] = entity_counts

Processing topic: Election
Top entities for Election:
 Trump           7078
Harris          4119
Biden           1937
Vance            963
GOP              826
Georgia          787
Congress         659
Walz             654
Pennsylvania     638
Israel           618
Name: count, dtype: int64
Processing topic: Gaza
Top entities for Gaza:
 Israel       3739
Hezbollah    1708
Gaza         1339
Lebanon      1300
Hamas        1143
Iran          875
Biden         510
Harris        459
Trump         411
Beirut        288
Name: count, dtype: int64
Processing topic: FEMA
Top entities for FEMA:
 Trump         1237
Harris        1180
Georgia        357
Biden          355
California     294
Combs          293
Texas          282
Israel         274
America        253
Florida        234
Name: count, dtype: int64
Processing topic: Abortion
Top entities for Abortion:
 Trump       2572
Harris      1889
Vance        610
Biden        537
Walz         387
GOP          315
Georgia      295
Senate       258
Ro