In [None]:
!pip3 install pandas
%reset
import time
import json
import requests as r
import pandas as pd
from io import StringIO
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from config import API_KEY

In [None]:
#1A and 1B.
# Download "Health" section articles from the NYT Article Search API into articles_df.

# Query parameters
#query = '' # scans through articles for this keyword
fq = 'section_name:("Health")' # Filters for articles under this section
begin_date = '20180101'
end_date = '20231231'

# The API returns 10 results per page (indices 0-9), so several pages are requested
# and combined into one dataframe.
N_PAGES = 5

# The API allows 500 requests per day and 5 requests per minute. Up to 5 pages fit in
# one minute's allowance; for anything larger, wait 12 seconds between requests.
# Note: the wait makes the cell take a long moment before generating output.
REQUEST_DELAY_SECONDS = 12 if N_PAGES > 5 else 0

SEARCH_URL = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

# Collect one frame per page and concatenate once at the end
# (cheaper than growing a dataframe inside the loop).
page_frames = []

for pages in range(N_PAGES):
    # Pause between requests (not before the first one) when a delay is configured
    if pages and REQUEST_DELAY_SECONDS:
        time.sleep(REQUEST_DELAY_SECONDS)

    # Let requests build and percent-encode the query string; the timeout keeps a
    # stalled connection from hanging the notebook indefinitely.
    response = r.get(
        SEARCH_URL,
        params={
            'fq': fq,
            'page': pages,
            'begin_date': begin_date,
            'end_date': end_date,
            'api-key': API_KEY,
        },
        timeout=30,
    )

    try:
        response_json = response.json()
    except ValueError:
        # Body was not JSON (e.g. an HTML error page)
        response_json = {}

    # Error payloads (e.g. when the rate limit is hit) lack response -> docs;
    # report the page instead of failing with a bare KeyError.
    docs = None
    if isinstance(response_json, dict):
        docs = (response_json.get('response') or {}).get('docs')

    if not docs:
        print(f"Page {pages} returned no articles (HTTP status {response.status_code}).")
        continue

    pages_df = pd.json_normalize(docs)
    page_frames.append(pages_df)

# pd.concat raises on an empty list, so fall back to an empty frame when every page failed
articles_df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

articles_df

In [None]:
#2A.
#keywords_series = articles_df['keywords'].explode()
#keywords_count = keywords_series.value_counts()
#keywords_count = keywords_count.reset_index()

#keywords_count
# Not used: the 'keywords' column includes extra information besides the keyword text that is not wanted in the counts.

In [None]:
#2A.
# Pull the keyword text out of each article's 'keywords' entry, store it in a new
# 'keyword_values' column, and count how often each keyword occurs overall.

def _keyword_values(keywords_list):
    """Return the 'value' of every keyword dict in one article's keywords entry.

    The entry is assumed to be a list of dicts (that is how the loop below has always
    read it). Anything that is not a list - e.g. NaN for an article without a
    'keywords' field - yields an empty list instead of raising.
    """
    if not isinstance(keywords_list, list):
        return []
    return [
        keyword_dict['value']
        for keyword_dict in keywords_list
        if isinstance(keyword_dict, dict) and 'value' in keyword_dict
    ]


# One list of keyword strings per article row
keyword_values = [_keyword_values(keywords_list) for keywords_list in articles_df['keywords']]

# Assign the list of keyword values back to the DataFrame in a new column
articles_df['keyword_values'] = keyword_values

# Exploding 'keyword_values' creates one row per keyword string
# (articles with no keywords become NaN, which value_counts ignores)
exploded_keywords = articles_df['keyword_values'].explode()

# Count occurrences of each keyword. The column names are set explicitly because
# value_counts().reset_index() names them differently before and after pandas 2.0,
# and the bar chart further down needs 'keyword_values' and 'count'.
keywords_count = (
    exploded_keywords
    .value_counts()
    .rename_axis('keyword_values')
    .reset_index(name='count')
)

keywords_count.head(10)

In [None]:
#2B.
# Count how often each keyword appears per publication year.

# The .dt accessor only works on datetime values, so parse 'pub_date' first
articles_df['pub_date'] = pd.to_datetime(articles_df['pub_date'])

# Publication year in its own column
articles_df['year'] = articles_df['pub_date'].dt.year

# One row per (article year, keyword): keep the two relevant columns, then explode the keyword lists
keywords_by_year_df = articles_df[['year', 'keyword_values']].explode('keyword_values')

# groupby(...).size() gives the number of rows in each (year, keyword) group;
# the result is then ordered by keyword first and year second
keywords_frequency_over_time = (
    keywords_by_year_df
    .groupby(['year', 'keyword_values'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['keyword_values', 'year'])
)

keywords_frequency_over_time

In [None]:
#2B.
# Show the yearly counts of a single keyword chosen by the user.

# Strip surrounding whitespace so an accidental space does not break the match
user_input = input("Which keyword would you like to analyze").strip()

keyword_column = keywords_frequency_over_time['keyword_values']

# Exact match first ...
keyword_trend = keywords_frequency_over_time[keyword_column == user_input]

# ... and only if that finds nothing, retry ignoring upper/lower case
if keyword_trend.empty:
    keyword_trend = keywords_frequency_over_time[
        keyword_column.astype(str).str.casefold() == user_input.casefold()
    ]

# Say so explicitly rather than silently showing an empty table
if keyword_trend.empty:
    print(f"No keyword matching {user_input!r} was found.")

keyword_trend

# For example: Coronavirus (2019-nCoV) had a count of 17 in 2021, when the pandemic may have been at its peak, but it has gone down since
# Note: only the articles fetched above are analyzed (5 pages of 10 results), not the full date range

In [None]:
#3A.
# Horizontal bar chart of the ten most frequent keywords.

sns.set_style("whitegrid")

# keywords_count comes from value_counts(), so its first ten rows are the most frequent keywords
top_keywords = keywords_count.head(10)

# Figure is 10 inches wide and 6 inches tall
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_keywords, x='count', y='keyword_values', errorbar=None, ax=ax)

ax.set_title('Keyword Frequencies')
ax.set_xlabel('Frequency')
ax.set_ylabel('Keywords')

plt.show()

In [None]:
#3B
# Weekly number of articles published in 2018.

# 'pub_date' must hold datetime values for the .dt accessor and for resampling
articles_df['pub_date'] = pd.to_datetime(articles_df['pub_date'])

# Keep only the rows published in 2018
is_2018 = articles_df['pub_date'].dt.year == 2018
articles_2018 = articles_df[is_2018]

# Count articles per weekly bin. With the 'W' alias each bin ends on a Sunday and is
# labelled with that end date.
# https://stackoverflow.com/questions/14530556/resample-time-series-in-pandas-to-a-weekly-interval
# https://towardsdatascience.com/resample-function-of-pandas-79b17ec82a78
articles_per_week_2018 = (
    articles_2018
    .resample('W', on='pub_date')
    .size()
    .reset_index(name='count')
)

# Line chart of the weekly counts
fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(data=articles_per_week_2018, x='pub_date', y='count', marker='o', ax=ax)
ax.set_xlabel('Week')
ax.set_ylabel('Number of Articles')
ax.set_title('Weekly Number of Articles Published in 2018')

# One major tick per month, labelled with abbreviated month and year
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))

# Tilt the date labels so they stay readable
plt.xticks(rotation=45)

# Re-fit the layout so the rotated labels are not clipped
plt.tight_layout()

plt.show()


In [None]:
#3
# The plot is overly crowded when all years share one axis, so draw one chart per year.

# Years to plot; these match the begin_date / end_date used for the API query
FIRST_YEAR, LAST_YEAR = 2018, 2023


def plot_weekly_article_counts(articles, year):
    """Draw a line chart of the weekly number of articles published in `year`.

    Parameters
    ----------
    articles : DataFrame with a datetime 'pub_date' column.
    year : int, calendar year to plot.

    Nothing is drawn when `articles` has no rows for that year; a short message
    is printed instead, so an empty chart is not mistaken for real data.
    """
    articles_year = articles[articles['pub_date'].dt.year == year]

    if articles_year.empty:
        print(f'No articles found for {year}; skipping plot.')
        return

    # Weekly bins ('W'), one count per bin
    articles_per_week = articles_year.resample('W', on='pub_date').size().reset_index(name='count')

    fig, ax = plt.subplots(figsize=(14, 6))
    sns.lineplot(data=articles_per_week, x='pub_date', y='count', marker='o', ax=ax)
    ax.set_xlabel('Week')
    ax.set_ylabel('Number of Articles')
    ax.set_title(f'Weekly Number of Articles Published in {year}')

    # One major tick per month, labelled with abbreviated month and year
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))

    plt.xticks(rotation=45) # Rotating dates for better readability

    plt.tight_layout() # Adjust layout to make room for the rotated date labels

    plt.show()

    # Release the figure so figures do not accumulate across the loop
    plt.close(fig)


# Make sure 'pub_date' is datetime-typed even if the earlier cells were not run (idempotent)
articles_df['pub_date'] = pd.to_datetime(articles_df['pub_date'])

# The style only needs to be set once, not on every iteration
sns.set_style("whitegrid")

for year in range(FIRST_YEAR, LAST_YEAR + 1):
    plot_weekly_article_counts(articles_df, year)