In [1]:
#!pip install eventregistry
from eventregistry import *
import pandas as pd
import json

In [13]:
# Collect news articles per topic from a fixed set of outlets via Event Registry,
# write one CSV per topic, and expose each topic's DataFrame under a
# "<Topic_With_Underscores>_df" notebook-global name.

# Load the API key from the JSON file
with open("config.json", "r", encoding="utf-8") as file:
    config = json.load(file)
api_key = config["api_key"]

# Initialize EventRegistry with the API key
er = EventRegistry(apiKey=api_key, allowUseOfArchive=False)

# Query parameters, adjust as needed
DATE_START = "2024-10-01"
DATE_END = "2024-10-08"
MAX_ITEMS_PER_QUERY = 500  # cap per (topic, source) query

# Column order of every generated DataFrame / CSV. Passing it explicitly keeps
# the header present even when a topic yields no articles, so the CSV can
# always be read back with pd.read_csv.
ARTICLE_COLUMNS = ["title", "source", "author", "url", "publishedAt", "content"]

# Define topics to search for
topics = [
    "Donald Trump", "Kamala Harris", "Israel", "Palestine", "Palestinians", "Hamas", "FEMA", "Abortion",
    "Inflation", "Unemployment", "Economy", "Dockworkers", "ILA Port"
]

# Define sources to search for and get their URIs
source_names = ["NPR", "MSNBC", "AP News", "FOX", "Forbes"]
source_uris = {source: er.getNewsSourceUri(source) for source in source_names}

# A source that could not be resolved must not be queried: a None sourceUri
# would remove the source filter and pull articles from every US outlet.
unresolved_sources = [name for name, uri in source_uris.items() if uri is None]
if unresolved_sources:
    print("Warning: no source URI found for (skipped):", unresolved_sources)

# Loop-invariant lookup, resolved once instead of once per topic/source pair
us_location_uri = er.getLocationUri("United States")  # Only US sources


def _author_names(art):
    """Return the article's author names as one comma-separated string.

    Event Registry reports authors as a list of dicts under "authors"; the
    names are joined so the "author" column holds a plain string. Falls back
    to a singular "author" key if present, and to None when there is no
    author information at all.
    """
    authors = art.get("authors")
    if authors:
        names = [a.get("name") for a in authors if isinstance(a, dict) and a.get("name")]
        if names:
            return ", ".join(names)
    return art.get("author")


# List to store the names of all generated DataFrames
dataframe_names = []

# Same DataFrames keyed by the original topic string (safer than global names)
dataframes_by_topic = {}

# Loop through each topic
for topic in topics:
    # Get the URI for the concept
    concept_uri = er.getConceptUri(topic)

    # List to hold all articles' data for the current topic (across all sources)
    articles_data = []

    if concept_uri is None:
        # Without a concept URI the query would not be restricted to the topic
        # at all, so skip querying and emit an empty (header-only) result.
        print(f"Warning: no concept URI found for topic {topic!r}; no articles fetched")
    else:
        # Loop through each source individually
        for source_name, source_uri in source_uris.items():
            if source_uri is None:
                continue

            # Define the query for each topic and source
            q = QueryArticlesIter(
                conceptUri=concept_uri,
                sourceUri=source_uri,
                sourceLocationUri=us_location_uri,
                dateStart=DATE_START,
                dateEnd=DATE_END
            )

            # Fetch and accumulate up to MAX_ITEMS_PER_QUERY articles for the
            # current topic from this source
            for art in q.execQuery(er, sortBy="date", maxItems=MAX_ITEMS_PER_QUERY):
                articles_data.append({
                    "title": art.get("title"),
                    # "source" may be present but null; guard before .get
                    "source": (art.get("source") or {}).get("title"),
                    "author": _author_names(art),
                    "url": art.get("url"),
                    "publishedAt": art.get("dateTime"),
                    "content": art.get("body")
                })

    # Create a single DataFrame for the current topic with articles from all sources
    articles_df = pd.DataFrame(articles_data, columns=ARTICLE_COLUMNS)

    # Save the DataFrame to a CSV file
    topic_slug = topic.replace(' ', '_')
    file_name = f"{topic_slug}_articles.csv"
    articles_df.to_csv(file_name, index=False)

    # Dynamically set the DataFrame name based on the topic, replacing spaces with underscores.
    # Kept for the cells below that reference names such as Donald_Trump_df.
    df_name = f"{topic_slug}_df"
    globals()[df_name] = articles_df
    dataframes_by_topic[topic] = articles_df

    # Append the DataFrame name to the list
    dataframe_names.append(df_name)

# Print the list of all generated DataFrame names
print("Generated DataFrames:", dataframe_names)

Generated DataFrames: ['Donald_Trump_df', 'Kamala_Harris_df', 'Israel_df', 'Palestine_df', 'Palestinians_df', 'Hamas_df', 'FEMA_df', 'Abortion_df', 'Inflation_df', 'Unemployment_df', 'Economy_df', 'Dockworkers_df', 'ILA_Port_df']


In [14]:
# Example: Access the "Donald Trump" DataFrame
# The name is created at runtime via globals()[...] in the collection cell,
# so static checkers cannot see it; hence the type-ignore marker.
Donald_Trump_df.head() # type: ignore

Unnamed: 0,title,source,author,url,publishedAt,content
0,Trump secretly sent Putin COVID-19 tests durin...,NPR,,https://www.npr.org/2024/10/08/nx-s1-5146501/t...,2024-10-08T23:07:08Z,Russian President Vladimir Putin and then Pres...
1,"As Florida braces for Milton's impact, FEMA ch...",NPR,,https://www.npr.org/2024/10/08/nx-s1-5144720/f...,2024-10-08T22:46:58Z,"<iframe src=""https://www.npr.org/player/embed/..."
2,Some Indian American Democrats see themselves ...,NPR,,https://www.npr.org/2024/10/08/nx-s1-5131223/s...,2024-10-08T21:57:10Z,"In the warm Georgia heat, Kannan Udayarajan is..."
3,Two Jewish Democrats reflect on Congressional ...,NPR,,https://www.npr.org/2024/10/08/nx-s1-5125793/j...,2024-10-08T20:41:44Z,"Reps. Jared Moskowitz, Kathy Manning, Josh Got..."
4,"Call Her Daddy, The View and 60 Minutes: Kamal...",NPR,,https://www.npr.org/2024/10/08/1210938239/podc...,2024-10-08T20:14:17Z,Podcast: Kamala Harris On Call Her Daddy podca...


In [17]:
# Reload the CSV written for the "Donald Trump" topic and count how many
# articles came from each outlet (the `source` column holds the outlet title).
df = pd.read_csv('Donald_Trump_articles.csv')
df['source'].value_counts()

source
Fox News     305
AP NEWS      156
Forbes       127
MSNBC.com    106
NPR           61
Name: count, dtype: int64

In [18]:
# Reload the CSV written for the "Abortion" topic and count how many
# articles came from each outlet (the `source` column holds the outlet title).
df1 = pd.read_csv('Abortion_articles.csv')
df1['source'].value_counts()

source
Fox News     64
AP NEWS      31
MSNBC.com    22
Forbes       20
NPR          12
Name: count, dtype: int64

In [None]:
# 1. Merge relevant dataframes - find relevant actors for each --> Mason
# 2. Data cleaning / LDA --> Carson
# 3. Active/passive/topic voice code --> Timmy
# 4. ChatGPT sentiment for actors in a given category (-100 to 100) --> Ethan/Timmy
# 5. Come up with a logical evaluation score metric based on active/passive voice, sentiment, title, number of articles about certain subjects, etc. --> Bethel/Medha (come up with a few options)