Another small script to visualize publications by month (used to decide which months would be good to analyze)

In [3]:
import pandas as pd
from dateutil import parser
from datetime import datetime # for converting and using dates
import pytz

# Read the geocoded articles and keep only the rows that have a neighborhood
articles = pd.read_csv("../../data/processed/gbh_geocoded_output.csv").dropna(subset=["Neighborhood"])

# Both Eastern abbreviations resolve to the same pytz time zone
tzinfos = dict.fromkeys(('EST', 'EDT'), pytz.timezone('US/Eastern'))

# Function to convert date strings to datetime objects
def convert_to_datetime(date_str):
    """Parse a date string into a datetime object.

    Time zone abbreviations found in the string are resolved through the
    module-level ``tzinfos`` mapping.

    Args:
        date_str: Text of a date/time in any format ``dateutil.parser.parse``
            accepts.

    Returns:
        The parsed ``datetime``. It is timezone-aware when the string carried
        zone information the parser could resolve, otherwise naive.

    Raises:
        Any exception raised by ``parser.parse`` for unparseable input is
        propagated unchanged.
    """
    parsed = parser.parse(date_str, tzinfos=tzinfos)

    # dateutil attaches the mapped zone with datetime.replace(tzinfo=...). For a
    # pytz zone that selects the zone's earliest historical offset (LMT) rather
    # than the offset in force on the parsed date, shifting the instant by a few
    # minutes to almost an hour. Re-attach pytz zones with localize(), which
    # picks the correct offset (ambiguous fall-back times resolve to standard
    # time, pytz's default).
    zone = parsed.tzinfo
    if hasattr(zone, 'localize'):
        parsed = zone.localize(parsed.replace(tzinfo=None))
    return parsed

# Parse every 'Publication Date' string into a datetime, then express the
# (timezone-aware) result in US/Eastern
articles['Publication Date'] = (
    articles['Publication Date']
    .apply(convert_to_datetime)
    .dt.tz_convert(pytz.timezone('US/Eastern'))
)



In [4]:
# Define the date range (example)
# NOTE(review): both bounds are midnight US/Eastern, so articles published
# later on the end date itself fall outside the range -- confirm this is intended.
eastern = pytz.timezone('US/Eastern')
start_date = eastern.localize(datetime(2002, 12, 18))
end_date = eastern.localize(datetime(2024, 12, 18))

# Filter the DataFrame for the date range (both bounds inclusive).
# .copy() makes the result an independent DataFrame, so later cells can add
# columns to it without pandas raising SettingWithCopyWarning.
in_range = (articles['Publication Date'] >= start_date) & (articles['Publication Date'] <= end_date)
filtered_articles = articles[in_range].copy()

print(f"Using articles from {start_date.date()} to {end_date.date()}")
print(f"Number of articles: {len(filtered_articles)}")

Using articles from 2002-12-18 to 2024-12-18
Number of articles: 925


In [5]:
# Extract year and month from the 'Publication Date' column.
# - tz_localize(None) drops the timezone while keeping the local wall-clock
#   time, so to_period() does not warn about discarding timezone information.
# - assign() returns a new DataFrame instead of writing a column into what may
#   be a slice of another frame, which avoids SettingWithCopyWarning.
year_month = filtered_articles['Publication Date'].dt.tz_localize(None).dt.to_period('M')
filtered_articles = filtered_articles.assign(**{'Year-Month': year_month})

# Count articles by year-month (chronological order)
articles_by_month = filtered_articles['Year-Month'].value_counts().sort_index()

# Get top 10 months with the most articles
top_10_months = articles_by_month.nlargest(10)

# Convert PeriodIndex to string for better readability in the table
top_10_months.index = top_10_months.index.astype(str)

# Create a DataFrame to display the top 10 months
top_10_months_df = top_10_months.reset_index()
top_10_months_df.columns = ['Year-Month', 'Number of Articles']

print("Top 10 months with most articles published:")
print(top_10_months_df)

Top 10 months with most articles published:
  Year-Month  Number of Articles
0    2023-03                  98
1    2023-05                  79
2    2024-03                  79
3    2023-07                  78
4    2023-04                  71
5    2024-04                  71
6    2024-02                  61
7    2021-01                  51
8    2023-10                  51
9    2023-11                  44


  filtered_articles['Year-Month'] = filtered_articles['Publication Date'].dt.to_period('M')
