In [26]:

import matplotlib.pyplot as plt
import pandas as pd

from bq.queries import run_query

In [None]:
# GDelt project
# Pull a random sample of GDELT v2 event rows (code, mention count, tone,
# Goldstein scale, date) into a DataFrame.
# NOTE(review): `order by rand()` makes every run return a different sample, so
# downstream figures are not reproducible run-to-run — confirm this is acceptable.
events = pd.DataFrame(
	run_query(
		"""
		select eventcode, nummentions, avgtone, goldsteinscale, sqldate
		from `gdelt-bq.gdeltv2.events` 
		order by rand()
		limit 500000
		""",
		debug=True
	)
)

# Remove rows with date older than 1970.
# The comparison treats sqldate as a number (presumably YYYYMMDD — verify against
# the table schema). `.copy()` detaches the filtered frame from the original so
# that later column assignments on `events` do not raise SettingWithCopyWarning.
events = events[events['sqldate'] > 19700000].copy()

events.head(n=20)

In [47]:
# Mapping file layout: { event_code: event_name }
# With orient='index' the JSON keys land in the index; column 0 is renamed to
# 'description' and the index is turned into an 'eventcode' column.
event_codes = (
	pd.read_json('../data/cameo_mapping.json', orient='index')
	.rename(columns={0: 'description'})
	.reset_index()
	.rename(columns={'index': 'eventcode'})
)

# Cast the code column to str on both frames so they share a dtype.
event_codes['eventcode'] = event_codes['eventcode'].astype(str)
events['eventcode'] = events['eventcode'].astype(str)

In [None]:
# Preview the first 20 rows of the code -> description lookup
event_codes.head(20)

In [None]:
# Distribution of the nummentions column of `events`, split into 30 bins
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(events['nummentions'], bins=30, color='blue', alpha=0.7)
ax.set(
	xlabel='NumMentions',
	ylabel='Frequency',
	title='Histogram of NumMentions',
)
plt.show()

In [None]:
# Distribution of the avgtone column of `events`, split into 30 bins
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(events['avgtone'], bins=30, color='blue', alpha=0.7)
ax.set(
	xlabel='AvgTone',
	ylabel='Frequency',
	title='Histogram of avgtone',
)
plt.show()

In [None]:
# Distribution of the goldsteinscale column of `events`, split into 30 bins
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(events['goldsteinscale'], bins=30, color='blue', alpha=0.7)
ax.set(
	xlabel='GoldsteinScale',
	ylabel='Frequency',
	title='Histogram of GoldsteinScale',
)
plt.show()

In [None]:
# Total GoldsteinScale per event code, largest total first
events_grouped = (
	events
	.groupby('eventcode')['goldsteinscale']
	.sum()
	.reset_index()
	.sort_values(by='goldsteinscale', ascending=False)
)
events_grouped

In [None]:
# Strip leading zeros from the event codes before joining on them.
# NOTE(review): presumably event_codes holds the codes without leading zeros —
# verify against cameo_mapping.json.
events['eventcode'] = events['eventcode'].astype(str).str.lstrip('0')

# Rebuild the per-code totals from the normalized codes. The aggregate was
# computed before the zeros were stripped, so merging it directly would leave
# every zero-prefixed code without a match and drop it in the filter below.
# Recomputing from `events` also keeps this cell safe to re-run.
events_grouped = (
	events.groupby('eventcode')['goldsteinscale']
	.sum()
	.reset_index()
	.sort_values('goldsteinscale', ascending=False)
	# Merge with event_codes to get the description
	.merge(event_codes, on='eventcode', how='left')
)
# Filter out codes without a description
events_grouped = events_grouped[events_grouped['description'].notnull()]
events_grouped

In [None]:
# Ten event codes with the largest summed GoldsteinScale (returned largest first)
top_events = events_grouped.nlargest(10, 'goldsteinscale')

# Horizontal bar chart, one bar per event description
fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.barh(top_events['description'], top_events['goldsteinscale'], color='skyblue')
ax.set_xlabel('Cumulative GoldsteinScale')
ax.set_title('Top 10 Events Occurence by Cumulative GoldsteinScale')
ax.invert_yaxis()  # first (largest) bar ends up at the top
# Vertical gridlines: faint dash-dot for minor ticks, solid for major ticks
ax.grid(axis='x', linestyle='-.', alpha=0.3, which='minor')
ax.grid(axis='x', linestyle='-', alpha=0.5, which='major')
ax.minorticks_on()
# Give the first bar (the largest total) a distinct colour
bars[0].set_facecolor('salmon')
plt.show()

In [None]:
# One line per event code: yearly share of the ten most frequent event codes.
# The top 10 is chosen by row count in `events`, not by GoldsteinScale.
top_events = events['eventcode'].value_counts().nlargest(10).index

# Keep only rows for those codes; copy so the new 'year' column is written to
# an independent frame.
filtered_events = events[events['eventcode'].isin(top_events)].copy()

# The year is taken as the first four characters of sqldate's string form
filtered_events['year'] = filtered_events['sqldate'].astype(str).str[:4].astype(int)

# Rows = year, columns = event code, values = number of rows (0 when absent)
event_counts = filtered_events.groupby(['year', 'eventcode']).size().unstack(fill_value=0)

# Per-year total over the plotted codes; computed once instead of per loop pass
yearly_totals = event_counts.sum(axis=1)

# code -> description lookup (first description wins when a code repeats)
descriptions = event_codes.drop_duplicates('eventcode').set_index('eventcode')['description']

plt.figure(figsize=(12, 8))
for event_code in top_events:
	# Fall back to the raw code when the mapping has no entry for it
	label = descriptions.get(event_code, event_code)
	# Plot the share rather than the raw count. The denominator only covers
	# the ten plotted codes, not every event of that year.
	fraction = event_counts[event_code] / yearly_totals
	plt.plot(event_counts.index, fraction, label=label)

plt.xlabel('Year')
plt.ylabel('Share of Top 10 Events')
plt.title('Top 10 Event Codes Over the Years')
plt.legend(title='Event Code')
plt.show()