In [20]:
#!pip install eventregistry
# !python -m spacy download en_core_web_lg
# from eventregistry import *
# import spacy # type: ignore
import json

import pandas as pd
import spacy  # type: ignore
from eventregistry import EventRegistry, QueryArticlesIter


In [22]:
# Load the API key from the JSON file.
# The encoding is pinned because JSON is UTF-8 by specification, while the
# default encoding of open() depends on the platform/locale.
with open("config.json", "r", encoding="utf-8") as file:
    config = json.load(file)

# Fail with an actionable message (still a KeyError) when the key is absent
if "api_key" not in config:
    raise KeyError('config.json must define an "api_key" entry')
api_key = config["api_key"]

# Initialize EventRegistry with the API key
# NOTE(review): allowUseOfArchive=False presumably restricts queries to the
# recent (non-archive) article index - confirm against the SDK documentation.
er = EventRegistry(apiKey=api_key, allowUseOfArchive=False)

# Topics to collect articles for (each one is resolved with er.getConceptUri
# by the topic loop)
topics = [
    "Donald Trump",
    "Kamala Harris",
    "Israel",
    "Palestine",
    "Palestinians",
    "Hamas",
    "FEMA",
    "Abortion",
    "Inflation",
    "Unemployment",
    "Economy",
    "Dockworkers",
    "ILA Port",
    "Immigration",
]

# News outlets to query, mapped (in list order) to the URI that
# er.getNewsSourceUri returns for each name
source_names = ["NPR", "MSNBC", "AP News", "FOX", "Forbes"]
source_uris = dict(zip(source_names, map(er.getNewsSourceUri, source_names)))

# Collects the name of every per-topic DataFrame that gets generated
dataframe_names = []

# Number of most-recent articles requested per (topic, source) pair
MAX_ARTICLES_PER_SOURCE = 500

# Column order of every per-topic DataFrame/CSV. Passing it explicitly keeps the
# header row even when a topic returns no articles, so the CSV can be read back.
ARTICLE_COLUMNS = ["title", "source", "author", "url", "publishedAt", "content"]

# Loop-invariant lookup: resolve the location URI once instead of once per
# (topic, source) query
us_location_uri = er.getLocationUri("United States")  # Only US sources


def _article_record(art):
    """Flatten one article dict returned by the query into a row of ARTICLE_COLUMNS."""
    author = art.get("author")
    if not author:
        # NOTE(review): the saved data shows the "author" key is always empty;
        # Event Registry appears to return a list of {"name": ...} dicts under
        # "authors" instead - confirm against the article data model.
        names = [a.get("name") for a in (art.get("authors") or []) if isinstance(a, dict)]
        author = ", ".join(name for name in names if name) or None
    return {
        "title": art.get("title"),
        # `or {}` also covers an explicit None value for "source"
        "source": (art.get("source") or {}).get("title"),
        "author": author,
        "url": art.get("url"),
        "publishedAt": art.get("dateTime"),
        "content": art.get("body"),
    }


# Loop through each topic
for topic in topics:
    topic_slug = topic.replace(' ', '_')

    # Get the URI for the concept. An unresolved concept comes back as None, and
    # passing None would drop the topic filter entirely, so fall back to a
    # keyword search on the topic text instead.
    concept_uri = er.getConceptUri(topic)
    if concept_uri is None:
        print(f"Warning: no concept URI found for '{topic}'; using keyword search instead")
        topic_filter = {"keywords": topic}
    else:
        topic_filter = {"conceptUri": concept_uri}

    # List to hold all articles' data for the current topic (across all sources)
    articles_data = []

    # Loop through each source individually
    for source_name, source_uri in source_uris.items():
        if source_uri is None:
            # Without a source URI the query would not be restricted to this outlet
            print(f"Warning: no source URI found for '{source_name}'; skipping it")
            continue

        # Define the query for each topic and source
        q = QueryArticlesIter(
            sourceUri=source_uri,
            sourceLocationUri=us_location_uri,
            **topic_filter,
        )

        # Fetch and accumulate the newest articles for the current topic from this source
        for art in q.execQuery(er, sortBy="date", maxItems=MAX_ARTICLES_PER_SOURCE):
            articles_data.append(_article_record(art))

    # Create a single DataFrame for the current topic with articles from all sources
    articles_df = pd.DataFrame(articles_data, columns=ARTICLE_COLUMNS)

    # Save the DataFrame to a CSV file
    file_name = f"{topic_slug}_articles.csv"
    articles_df.to_csv(file_name, index=False)

    # Expose the DataFrame under a topic-derived global name (e.g. Donald_Trump_df);
    # kept because other cells access the frames by these names
    df_name = f"{topic_slug}_df"
    globals()[df_name] = articles_df

    # Append the DataFrame name to the list
    dataframe_names.append(df_name)

# Print the list of all generated DataFrame names
print("Generated DataFrames:", dataframe_names)

Generated DataFrames: ['Donald_Trump_df', 'Kamala_Harris_df', 'Israel_df', 'Palestine_df', 'Palestinians_df', 'Hamas_df', 'FEMA_df', 'Abortion_df', 'Inflation_df', 'Unemployment_df', 'Economy_df', 'Dockworkers_df', 'ILA_Port_df', 'Immigration_df']


In [23]:
# Example: preview the first five rows of the "Donald Trump" DataFrame
Donald_Trump_df.head(n=5)  # type: ignore

Unnamed: 0,title,source,author,url,publishedAt,content
0,The politics of natural disasters : The NPR Po...,NPR,,https://www.npr.org/2024/10/10/1210938546/poli...,2024-10-10T21:01:48Z,The politics of natural disasters : The NPR Po...
1,"How The Internet, Social Media, And Podcasts A...",NPR,,https://www.npr.org/2024/10/10/1210938545/1a-1...,2024-10-10T21:01:39Z,Jemele Hill (L) and Senator Kamala Harris spea...
2,"Ethel Kennedy, social activist and widow of Ro...",NPR,,https://www.npr.org/2024/10/10/nx-s1-5148861/e...,2024-10-10T15:53:45Z,"BOSTON, Mass. -- Ethel Kennedy, the widow of S..."
3,What matters to key swing state voters in Wisc...,NPR,,https://www.npr.org/2024/10/10/nx-s1-5143006/w...,2024-10-10T13:41:06Z,The Milwaukee Running Group -- OMG meets up on...
4,Politicians say health plans should cover IVF....,NPR,,https://www.npr.org/2024/10/10/nx-s1-5147815/i...,2024-10-10T13:11:09Z,One round of in vitro fertilization or IVF can...


In [24]:
# Article count per source in the saved "Donald Trump" CSV
df = pd.read_csv("Donald_Trump_articles.csv")
df.loc[:, "source"].value_counts()

source
AP NEWS      500
Forbes       500
Fox News     500
MSNBC.com    397
NPR          243
Name: count, dtype: int64

In [25]:
# Article count per source in the saved "Abortion" CSV
df1 = pd.read_csv("Abortion_articles.csv")
df1.loc[:, "source"].value_counts()

source
Fox News     203
AP NEWS      157
Forbes        70
MSNBC.com     60
NPR           43
Name: count, dtype: int64

In [None]:
# 1. Merge relevant dataframes - find relevant actors for each --> Mason
# 2. Data Cleaning/LDA --> Carson
# 3. Active/passive/topic voice code --> Timmy
# 4. ChatGPT sentiment for actors in a given category (-100 to 100) --> Ethan/Timmy
# 5. Come up with a logical evaluation score metric based on active/passive voice, sentiment, title, number of articles about certain subjects, etc. --> Bethel/Medha (come up with a few options)

In [12]:
# Combine related dataframes

# Topics whose per-topic CSV files are read back in
topics = [
    "Donald Trump", "Kamala Harris", "Israel", "Palestine", "Palestinians", "Hamas", "FEMA", "Abortion",
    "Inflation", "Unemployment", "Economy", "Dockworkers", "ILA Port", "Immigration"
]

# Per-topic DataFrames, keyed by the topic with spaces replaced by underscores
dataframes = {}

# Read each CSV and store the DataFrame in the dictionary
for topic in topics:
    # Replace spaces with underscores to match the file naming convention
    topic_key = topic.replace(' ', '_')
    file_name = f"{topic_key}_articles.csv"
    try:
        dataframes[topic_key] = pd.read_csv(file_name)
    except FileNotFoundError as e:
        print(f"Error: {e}")  # If a file is not found


def _combine_topics(*topic_keys):
    """Stack the loaded DataFrames for ``topic_keys`` into one frame.

    The result gets a fresh 0..n-1 index, and rows that are identical in every
    column are kept once, so an article fetched under several related topics
    is not double-counted.

    Raises:
        KeyError: if any requested topic was not loaded (e.g. its CSV was missing).
    """
    missing = [key for key in topic_keys if key not in dataframes]
    if missing:
        raise KeyError(f"No articles loaded for topic(s): {', '.join(missing)}")
    combined = pd.concat([dataframes[key] for key in topic_keys], ignore_index=True)
    return combined.drop_duplicates(ignore_index=True)


# Merge Donald Trump and Kamala Harris
Donald_Trump_Kamala_Harris_df = _combine_topics("Donald_Trump", "Kamala_Harris")

# Merge Israel, Palestine, Palestinians, and Hamas
Israel_Palestine_Palestinians_Hamas_df = _combine_topics("Israel", "Palestine", "Palestinians", "Hamas")

# Merge Inflation, Unemployment, and Economy
Inflation_Unemployment_Economy_df = _combine_topics("Inflation", "Unemployment", "Economy")

# Merge Dockworkers and ILA Port
Dockworkers_ILA_Port_df = _combine_topics("Dockworkers", "ILA_Port")

print(Donald_Trump_Kamala_Harris_df.head())
print(Israel_Palestine_Palestinians_Hamas_df.head())
print(Inflation_Unemployment_Economy_df.head())
print(Dockworkers_ILA_Port_df.head())

                                               title source  author  \
0  The politics of natural disasters : The NPR Po...    NPR     NaN   
1  How The Internet, Social Media, And Podcasts A...    NPR     NaN   
2  Ethel Kennedy, social activist and widow of Ro...    NPR     NaN   
3  What matters to key swing state voters in Wisc...    NPR     NaN   
4  Politicians say health plans should cover IVF....    NPR     NaN   

                                                 url           publishedAt  \
0  https://www.npr.org/2024/10/10/1210938546/poli...  2024-10-10T21:01:48Z   
1  https://www.npr.org/2024/10/10/1210938545/1a-1...  2024-10-10T21:01:39Z   
2  https://www.npr.org/2024/10/10/nx-s1-5148861/e...  2024-10-10T15:53:45Z   
3  https://www.npr.org/2024/10/10/nx-s1-5143006/w...  2024-10-10T13:41:06Z   
4  https://www.npr.org/2024/10/10/nx-s1-5147815/i...  2024-10-10T13:11:09Z   

                                             content  
0  The politics of natural disasters : The NPR Po

In [None]:
# Name of the spaCy pipeline package backing the shared `nlp` object
SPACY_MODEL_NAME = 'en_core_web_lg'

# Load the pipeline once so every later call reuses the same instance
nlp = spacy.load(SPACY_MODEL_NAME)

# Function to extract nouns and proper nouns
def extract_nouns(text, nlp_model=None):
    """Return the nouns and proper nouns found in ``text``.

    Args:
        text: The text to analyse. Missing values (``None``/``NaN``) yield ``[]``.
        nlp_model: Optional callable that turns ``text`` into an iterable of
            tokens exposing ``text``, ``pos_`` and ``is_alpha``. Defaults to the
            module-level ``nlp`` pipeline, which is only looked up when needed.

    Returns:
        list[str]: Token texts, in document order, whose part of speech is
        ``NOUN`` or ``PROPN``, that are purely alphabetic and longer than two
        characters.
    """
    if pd.isnull(text):  # Handle missing values
        return []
    pipeline = nlp if nlp_model is None else nlp_model
    doc = pipeline(text)
    return [
        token.text
        for token in doc
        if token.pos_ in ("NOUN", "PROPN") and token.is_alpha and len(token.text) > 2
    ]

# Noun frequency table (noun -> count) per topic, keyed like `dataframes`.
# The counts are kept here rather than in a DataFrame column: they are indexed
# by noun, so assigning them to a row-indexed frame aligns on the index and
# yields a column of NaN.
noun_counts_by_topic = {}

# Loop through all DataFrames
for topic, df in dataframes.items():
    print(f"Processing topic: {topic}")

    # Apply the noun extraction to both the 'title' and 'content' columns
    df['Title_Nouns'] = df['title'].apply(extract_nouns)
    df['Content_Nouns'] = df['content'].apply(extract_nouns)

    # Flatten the per-row noun lists (all titles first, then all contents) in a
    # single pass; summing lists re-copies the accumulator for every row and
    # returns 0 instead of a list when the frame is empty
    all_nouns = [
        noun
        for column in ('Title_Nouns', 'Content_Nouns')
        for row_nouns in df[column]
        for noun in row_nouns
    ]

    # Calculate the value counts (explicit dtype keeps an empty list well-defined)
    noun_counts = pd.Series(all_nouns, dtype="object").value_counts()

    # Output the top 10 most frequent nouns
    print(f"Top nouns for {topic}:\n", noun_counts.head(10))

    # Store the counts for further analysis
    noun_counts_by_topic[topic] = noun_counts

