**Members:** Ethan Wong, Timmy Ren, Mason Shu, Medha Nalamada, Carson Mullen, Bethel Kim

**Morning Cohort**: 11 AM - 1 PM

**Current Tasks Remaining:**

1. Verify Relevant Actors Code --> Timmy
2. Perform LDA --> Mason
3. Perform Active/Passive/Topic Voice Analysis --> Timmy
4. Use ChatGPT to generate sentiment for actors in a given category (-100 to 100) --> Ethan
5. Create a logical evaluation score metric based on active/passive voice, sentiment, title, number of articles about certain subjects, etc. (come up with a few options) --> Bethel and Medha (Timmy and Mason will talk to Barua)

# Install and Load Libraries

In [11]:
#!pip install eventregistry
!python -m spacy download en_core_web_lg
from eventregistry import *
import pandas as pd
import json
import re
import spacy # type: ignore

# Scraping Articles with Event Registry API - Ethan

In [None]:
# Scrape articles for every topic from a fixed set of US news sources and
# write one CSV per topic (``<Topic>_articles.csv``). Each topic's DataFrame is
# also published as a module-level variable named ``<Topic>_df``.

# Load the API key from the JSON file
with open("config.json", "r", encoding="utf-8") as file:
    config = json.load(file)
api_key = config["api_key"]

# Initialize EventRegistry with the API key
er = EventRegistry(apiKey=api_key, allowUseOfArchive=False)

# Define topics to search for
topics = [
    "Donald Trump", "Kamala Harris", "Israel", "Palestine", "Palestinians", "Hamas", "FEMA", "Abortion",
    "Inflation", "Unemployment", "Economy", "Dockworkers", "ILA Port", "Immigration"
]

# Define sources to search for and get their URIs
source_names = ["NPR", "MSNBC", "AP News", "FOX", "Forbes"]
source_uris = {source: er.getNewsSourceUri(source) for source in source_names}

# The URI lookups return None when nothing matches (per the eventregistry SDK).
# Passing None as sourceUri would drop the source filter and pull articles from
# every outlet, so unresolved sources are reported once and skipped.
unresolved_sources = [name for name, uri in source_uris.items() if uri is None]
if unresolved_sources:
    print("Warning: no source URI found for:", unresolved_sources)

# Loop-invariant: resolve the location once instead of once per topic/source pair.
us_location_uri = er.getLocationUri("United States")  # Only US sources

# Fixed column order so a topic with no articles still yields a CSV with a
# header row that pd.read_csv can parse.
article_columns = ["title", "source", "author", "url", "publishedAt", "content"]

# List to store the names of all generated DataFrames
dataframe_names = []

# Loop through each topic
for topic in topics:
    # Get the URI for the concept
    concept_uri = er.getConceptUri(topic)

    # List to hold all articles' data for the current topic (across all sources)
    articles_data = []

    if concept_uri is None:
        # Without a concept filter the query would return unrelated articles.
        print(f"Warning: no concept URI found for topic '{topic}'; no articles fetched.")
    else:
        # Loop through each source individually
        for source_name, source_uri in source_uris.items():
            if source_uri is None:
                continue

            # Define the query for each topic and source
            q = QueryArticlesIter(
                conceptUri=concept_uri,
                sourceUri=source_uri,
                sourceLocationUri=us_location_uri,
            )

            # Fetch and accumulate up to 500 articles for the current topic from this source
            for art in q.execQuery(er, sortBy="date", maxItems=500):
                # Event Registry's article model exposes an "authors" list of
                # objects with a "name"; a singular "author" key is kept as the
                # first choice for backward compatibility.
                author_names = [
                    author.get("name")
                    for author in (art.get("authors") or [])
                    if author.get("name")
                ]
                articles_data.append({
                    "title": art.get("title"),
                    # "source" may be present but null, hence the `or {}`.
                    "source": (art.get("source") or {}).get("title"),
                    "author": art.get("author") or (", ".join(author_names) or None),
                    "url": art.get("url"),
                    "publishedAt": art.get("dateTime"),
                    "content": art.get("body")
                })

    # Create a single DataFrame for the current topic with articles from all sources
    articles_df = pd.DataFrame(articles_data, columns=article_columns)

    # Save the DataFrame to a CSV file
    file_name = f"{topic.replace(' ', '_')}_articles.csv"
    articles_df.to_csv(file_name, index=False)

    # Dynamically set the DataFrame name based on the topic, replacing spaces with underscores
    df_name = f"{topic.replace(' ', '_')}_df"
    globals()[df_name] = articles_df

    # Append the DataFrame name to the list
    dataframe_names.append(df_name)

# Print the list of all generated DataFrame names
print("Generated DataFrames:", dataframe_names)

# Combine Related Topics into Larger DataFrames and Clean the DataFrames - Mason and Carson

In [18]:
# Combine related dataframes
#
# Reads the per-topic CSVs, merges related topics into larger DataFrames, and
# leaves `dataframes` holding only the merged groups plus the topics that were
# not merged. A CSV that is missing or empty is reported and skipped; groups
# are merged from whichever members were loaded.

# Read in csv's
# List of topics
topics = [
    "Donald Trump", "Kamala Harris", "Israel", "Palestine", "Palestinians", "Hamas", "FEMA", "Abortion",
    "Inflation", "Unemployment", "Economy", "Dockworkers", "ILA Port", "Immigration"
]

# Dictionary to hold the DataFrames after reading them from CSV
dataframes = {}

# Loop to read each CSV and store the DataFrame in the dictionary
for topic in topics:
    # Replace spaces with underscores to match your file naming convention
    file_name = f"{topic.replace(' ', '_')}_articles.csv"
    try:
        dataframes[topic.replace(' ', '_')] = pd.read_csv(file_name)
    except (FileNotFoundError, pd.errors.EmptyDataError) as e:
        # Best effort: report the problem and carry on with the other topics.
        print(f"Error: {e}")

# Merged DataFrame key -> keys of the topic DataFrames it is built from
merge_groups = {
    "Donald_Trump_Kamala_Harris": ["Donald_Trump", "Kamala_Harris"],
    "Israel_Palestine_Palestinians_Hamas": ["Israel", "Palestine", "Palestinians", "Hamas"],
    "Inflation_Unemployment_Economy": ["Inflation", "Unemployment", "Economy"],
    "Dockworkers_ILA_Port": ["Dockworkers", "ILA_Port"],
}

for merged_key, member_keys in merge_groups.items():
    available = [dataframes[member] for member in member_keys if member in dataframes]
    missing = [member for member in member_keys if member not in dataframes]
    if missing:
        print(f"Warning: {merged_key} is missing {missing}; merging the topics that were loaded.")
    if available:
        # ignore_index=True gives the merged frame a fresh 0..n-1 index; without
        # it every member contributes its own 0..k-1 labels and the result has
        # duplicate index labels.
        dataframes[merged_key] = pd.concat(available, ignore_index=True)

# Keep the merged DataFrames available under their original variable names
# (None when no member of the group could be loaded).
Donald_Trump_Kamala_Harris_df = dataframes.get("Donald_Trump_Kamala_Harris")
Israel_Palestine_Palestinians_Hamas_df = dataframes.get("Israel_Palestine_Palestinians_Hamas")
Inflation_Unemployment_Economy_df = dataframes.get("Inflation_Unemployment_Economy")
Dockworkers_ILA_Port_df = dataframes.get("Dockworkers_ILA_Port")

# Remove topics that became merged
del_keys = ["Donald_Trump", "Kamala_Harris", "Israel", "Palestine", "Palestinians", "Hamas",
            "Inflation", "Unemployment", "Economy", "Dockworkers", "ILA_Port"]
for key in del_keys:
    # pop() instead of del: a topic whose CSV was missing is simply not there.
    dataframes.pop(key, None)

In [19]:
# Clean the dataframes
def count_sentences(text):
    """Return the number of non-empty sentence fragments in ``text``.

    The text is split on runs of ``.``, ``!`` and ``?``; fragments that are
    empty or whitespace-only are not counted. Anything that is not a string
    (e.g. the NaN that pandas uses for a missing article body, or None)
    counts as zero sentences instead of raising ``TypeError``.
    """
    if not isinstance(text, str):
        return 0
    fragments = re.split(r'[.!?]+', text)
    return sum(1 for fragment in fragments if fragment.strip())

def clean_df(df):
    """Return a cleaned copy of an articles DataFrame.

    Works on a copy with fully duplicated rows removed, strips the Fox News
    e-mail sign-up notice from the 'content' column, and adds a
    'num_sentences' column computed with ``count_sentences``. The input
    DataFrame is not modified.
    """
    # Boilerplate matched literally (not as a regex) in article bodies.
    fox_signup_notice = "By entering your email and pushing continue, you are agreeing to Fox News\' Terms of Use and Privacy Policy, which includes our Notice of Financial Incentive.\n\n"

    cleaned = df.drop_duplicates().copy()

    content_without_notice = cleaned['content'].str.replace(fox_signup_notice, "", regex=False)
    cleaned.loc[:, 'content'] = content_without_notice

    # Counted after the notice is removed so it does not inflate the total.
    sentence_totals = cleaned['content'].apply(count_sentences)
    cleaned.loc[:, 'num_sentences'] = sentence_totals

    return cleaned

# Build a parallel dictionary (same keys as `dataframes`) whose values are the
# cleaned copies returned by clean_df; the originals are left untouched.
cleaned_dataframes = dict(
    (name, clean_df(frame)) for name, frame in dataframes.items()
)

In [20]:
# Finding relevant actors with noun extraction
# Load the spaCy pipeline once; extract_nouns reuses it for every text.
nlp = spacy.load('en_core_web_lg')

def extract_nouns(text):
    """Return the nouns and proper nouns found in ``text``, in document order.

    A token is kept when spaCy tags it NOUN or PROPN, it is purely alphabetic,
    and it is longer than two characters. Missing values (as detected by
    ``pd.isnull``) yield an empty list.
    """
    nouns = []
    if pd.isnull(text):  # Handle missing values
        return nouns

    for token in nlp(text):
        if token.pos_ not in ("NOUN", "PROPN"):
            continue
        if not token.is_alpha:
            continue
        if len(token.text) <= 2:
            continue
        nouns.append(token.text)
    return nouns

# Loop through all DataFrames
from itertools import chain

# Per-topic noun frequencies (a Series indexed by noun), keyed like
# cleaned_dataframes. The counts live here rather than in a DataFrame column:
# assigning a noun-indexed Series to a column aligns it against the row index,
# which shares no labels with it, so the column would contain only NaN.
noun_counts_by_topic = {}

for topic, df in cleaned_dataframes.items():
    print(f"Processing topic: {topic}")

    # Apply the noun extraction to both the 'title' and 'content' columns
    df['Title_Nouns'] = df['title'].apply(extract_nouns)
    df['Content_Nouns'] = df['content'].apply(extract_nouns)

    # Flatten the per-row noun lists (titles first, then contents) into a
    # single list. chain is linear in the number of nouns, whereas summing
    # lists re-copies the accumulated list for every row and yields 0 for an
    # empty DataFrame.
    all_nouns = list(chain.from_iterable(chain(df['Title_Nouns'], df['Content_Nouns'])))

    # Calculate the value counts (explicit dtype keeps an empty list well-defined)
    noun_counts = pd.Series(all_nouns, dtype="object").value_counts()

    # Output the top 10 most frequent nouns
    print(f"Top nouns for {topic}:\n", noun_counts.head(10))

    # Store the counts for further analysis
    noun_counts_by_topic[topic] = noun_counts

Processing topic: FEMA
Top nouns for FEMA:
 people    2470
women     2359
woman     2216
year      2035
time      1933
Trump     1880
years     1591
Harris    1590
life      1225
state     1135
Name: count, dtype: int64
Processing topic: Abortion
Top nouns for Abortion:
 Trump        3574
Harris       2608
abortion     2331
state        1280
debate       1141
voters       1095
President     965
people        905
Vance         824
women         810
Name: count, dtype: int64
Processing topic: Immigration
Top nouns for Immigration:
 Trump          5218
Harris         3717
people         2147
border         1598
President      1576
Biden          1491
immigration    1360
debate         1265
year           1257
voters         1176
Name: count, dtype: int64
Processing topic: Donald_Trump_Kamala_Harris
Top nouns for Donald_Trump_Kamala_Harris:
 Trump        16034
Harris        9368
President     4762
election      3946
Biden         3915
people        3854
campaign      3254
president     322