In [1]:
data_path = "/home/jovyan/shared/David/gdelt_study_data/20250106/part-00000-44b31ecc-d942-4aaf-a6cd-172c5ad7b04d-c000.csv"
# shared/David/gdelt_study_data/20250106/part-00000-44b31ecc-d942-4aaf-a6cd-172c5ad7b04d-c000.csv

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import sys
from pyspark.sql import SparkSession
import pandas as pd
pd.set_option('display.max_rows', None) 


#parameter for master can be changed depending on the environment
spark = SparkSession.builder \
 .master("local") \
 .appName("GDELT-study") \
 .getOrCreate()

In [3]:
df = spark.read.csv(data_path, header=True)
df.show(3,vertical=True,truncate=False )

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------
 Cameo                 | MAKE PUBLIC STATEMENT                                                                                                                  
 Cameo_full            | Acknowledge or claim responsibility                                                                                                    
 GLOBALEVENTID         | 1218993708                                                                                                                             
 SQLDATE               | 20241230                                                                                                                               
 Actor1Name            | ISLAMIC                                                                                                                                
 Actor1Geo_FullName    | Jabalia, 

In [4]:
# assume `df` is your filtered Spark DataFrame
df = (df
      # Parse SQLDATE (YYYYMMDD string) into true DateType
      .withColumn("date", to_date(col("SQLDATE"), "yyyyMMdd"))
      # Cast numeric fields
      .withColumn("AvgTone", col("AvgTone").cast("double"))
      .withColumn("NumMentions", col("NumMentions").cast("int"))
      .withColumn("GoldsteinScale", col("GoldsteinScale").cast("double"))
     )

NameError: name 'to_date' is not defined

In [None]:
df.show(3,vertical=True,truncate=False )

In [None]:
# Show them in the console, sorted
df.select("SQLDATE") \
  .distinct() \
  .orderBy("SQLDATE") \
  .show(truncate=False)


# 1. Event-Type Time Series

In [5]:
# 1) Count events per day × EventBaseCode
ts_spark = (df
    .groupBy("date", "EventBaseCode")
    .agg(count("*").alias("cnt")))

# 2) Pivot to wide format, fill missing with zero
ts_wide = (ts_spark
    .groupBy("date")
    .pivot("EventBaseCode")
    .agg(_sum("cnt"))
    .na.fill(0)
    .orderBy("date"))

# 3) Collect to Pandas for plotting
ts_pd = ts_wide.toPandas().set_index("date")

# 4) Plot
plt.figure(figsize=(12,6))
for code in ts_pd.columns:
    plt.plot(ts_pd.index, ts_pd[code], label=code)
plt.legend(title="BaseCode")
plt.title("Daily Events by CAMEO BaseCode")
plt.ylabel("Count")
plt.xlabel("Date")
plt.tight_layout()
plt.show()


NameError: name 'count' is not defined

In [None]:
#2. Geospatial Heatmap (Country-Level)

In [None]:
!pip install geodatasets


In [None]:
geo_pd = df.select("ActionGeo_CountryCode").groupby("ActionGeo_CountryCode").count().toPandas()
geo_pd = geo_pd.rename(columns={"count": "cnt"})


In [None]:
import pycountry

def alpha2_to_alpha3(code):
    try:
        return pycountry.countries.get(alpha_2=code).alpha_3
    except:
        return None

geo_pd["ActionGeo_CountryCode"] = geo_pd["ActionGeo_CountryCode"].apply(alpha2_to_alpha3)


In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt

# Path to shapefile
shapefile_path = "naturalearth_lowres/ne_110m_admin_0_countries.shp"

# Read shapefile
world = gpd.read_file(shapefile_path)

# Merge
gdf = world.merge(geo_pd, left_on="SOV_A3", right_on="ActionGeo_CountryCode", how="right")

# Plot
ax = gdf.plot(column="cnt", cmap="OrRd", legend=True, figsize=(12,8), edgecolor="black")
ax.set_title("Event Volume by Country", fontsize=15)
ax.set_axis_off()
plt.show()


In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
world.boundary.plot(ax=ax, linewidth=1, color="black")
gdf.plot(column="cnt", cmap="OrRd", legend=True, ax=ax, edgecolor="black")
ax.set_facecolor('lightblue')
ax.set_title("Event Volume by Country", fontsize=18)
ax.set_axis_off()
plt.show()


In [None]:
# 3. Actor-Network Analysis

In [None]:
# build edge list in Spark
edges_spark = (df
  .groupBy("Actor1Name","Actor2Name")
  .agg(count("*").alias("weight"))
  .filter(col("Actor1Name").isNotNull() & col("Actor2Name").isNotNull())
)

# collect to pandas for NetworkX
edges_pd = edges_spark.toPandas()
edges_pd.head(5)

import networkx as nx
G = nx.from_pandas_edgelist(edges_pd, "Actor1Name", "Actor2Name", edge_attr="weight", create_using=nx.DiGraph())

pos = nx.spring_layout(G, k=0.5, seed=42)
plt.figure(figsize=(12,12))
nx.draw_networkx_nodes(G, pos,
                       node_size=[200 + 20*deg for _,deg in G.degree()])
nx.draw_networkx_edges(G, pos,
                       width=[d["weight"]/edges_pd.weight.max()*5 for u,v,d in G.edges(data=True)],
                       arrowstyle="->", arrowsize=8)
nx.draw_networkx_labels(G, pos, font_size=8)
plt.title("Actor Interaction Network")
plt.axis("off")
plt.show()


In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# 1) Compute degree centrality
deg = nx.degree_centrality(G)
# 2) Select top 20 actors
top20 = sorted(deg, key=deg.get, reverse=True)[:20]
# 3) Build the subgraph
subG = G.subgraph(top20)

# 4) Draw it
pos = nx.spring_layout(subG, seed=42)
plt.figure(figsize=(8,8))
nx.draw_networkx_nodes(subG, pos, node_size=500, node_color='skyblue')
nx.draw_networkx_edges(subG, pos, arrowsize=10, width=1)
nx.draw_networkx_labels(subG, pos, font_size=9)
plt.title("Subgraph of Top 20 Actors by Degree Centrality")
plt.axis('off')
plt.show()


In [6]:
# e.g. keep only edges with weight ≥ 10
heavy_edges = [(u,v) for u,v,d in G.edges(data=True) if d['weight']>=30]
H = G.edge_subgraph(heavy_edges).copy()

pos = nx.spring_layout(H, seed=42)
plt.figure(figsize=(8,8))
nx.draw(H, pos, with_labels=True, node_size=300, font_size=8)
plt.title("Actor Network: Only Edges ≥ 10 Events")
plt.show()


NameError: name 'G' is not defined

In [None]:
!pip install community

In [None]:
from networkx.algorithms.community import greedy_modularity_communities

# Pick a subgraph to keep it small, e.g. top 50 by degree:
deg = nx.degree_centrality(G)
top50 = sorted(deg, key=deg.get, reverse=True)[:50]
subG = G.subgraph(top50)

# Find communities
communities = list(greedy_modularity_communities(subG))

# Build a node→communityID mapping
partition = {}
for i, comm in enumerate(communities):
    for node in comm:
        partition[node] = i

# Now you can color by partition exactly as before:
pos = nx.spring_layout(subG, seed=42)
plt.figure(figsize=(8,8))
colors = [partition[n] for n in subG.nodes()]
nx.draw_networkx(subG, pos,
                 node_color=colors,
                 cmap=plt.cm.tab20,
                 node_size=300,
                 font_size=8,
                 arrows=False)
plt.title("Top 50 Actors, Colored by Community (NetworkX greedy_modularity)")
plt.axis("off")
plt.show()


In [None]:
!pip install pyvis

In [None]:
from pyvis.network import Network

net = Network(height="750px", width="100%", notebook=True)
net.from_nx(subG)   # or from any filtered subgraph
net.show("actor_network.html")


In [None]:
import pandas as pd

# edges_pd is your pandas edge list
top_pairs = edges_pd.sort_values("weight", ascending=False).head(20)
print(top_pairs)


In [None]:
# 4. Tone & Sentiment Trends

In [None]:
# compute daily average tone then rolling mean in pandas
tone_spark = (df
  .groupBy("date")
  .agg(mean("AvgTone").alias("avg_tone"))
  .orderBy("date")
)
tone_pd = tone_spark.toPandas().set_index("date")
tone_pd["rolling7"] = tone_pd["avg_tone"].rolling(7, center=True).mean()

# plot
plt.figure(figsize=(10,5))
plt.plot(tone_pd.index, tone_pd["rolling7"])
plt.title("7-Day Rolling Average Tone")
plt.ylabel("AvgTone")
plt.xlabel("Date")
plt.tight_layout()
plt.show()


In [None]:
# 5. Theme & Keyword Co-Occurrence (with GKG join)

In [None]:
# assume you loaded GKG as spark_df_gkg with columns GLOBALEVENTID, V2Themes
joined = df.join(spark_df_gkg.select("GLOBALEVENTID","V2Themes"),
                 "GLOBALEVENTID", "inner")

# split themes and explode
from pyspark.sql.functions import split, explode
themes_exploded = joined.withColumn("theme", explode(split(col("V2Themes"),";")))

# build co-occurrence matrix via DataFrame then to pandas
pairs = (themes_exploded
  .select("GLOBALEVENTID","theme")
  .distinct()
)

# create theme-event pivot
pivot = (pairs.groupBy("theme")
         .pivot("GLOBALEVENTID")
         .agg(_sum(col("theme").isNotNull().cast("int")))
         .na.fill(0))

cooc_pd = pivot.toPandas().set_index("theme")
cooc_mat = cooc_pd.dot(cooc_pd.T)

# plot top-10 heatmap
top10 = cooc_mat.sum(axis=1).nlargest(10).index
import seaborn as sns
plt.figure(figsize=(8,6))
sns.heatmap(cooc_mat.loc[top10, top10], annot=True, fmt="d", cmap="Blues")
plt.title("Theme Co-occurrence (Top 10)")
plt.show()


In [None]:
# 6. Burst Detection & Change-Point Analysis

In [None]:
!pip install ruptures

In [None]:
# get daily counts of a given base code, e.g. '18'
signal_spark = (df.filter(col("EventBaseCode")=="18")
  .groupBy("date")
  .agg(count("*").alias("cnt"))
  .orderBy("date")
)
signal_pd = signal_spark.toPandas().set_index("date")["cnt"].reindex(tone_pd.index, fill_value=0)

import ruptures as rpt
signal = signal_pd.values
algo = rpt.Pelt(model="rbf").fit(signal)
change_points = algo.predict(pen=3)

plt.figure(figsize=(10,4))
plt.plot(signal_pd.index, signal, label="Daily Violent Clashes")
for cp in change_points:
    plt.axvline(signal_pd.index[cp-1], color="r", ls="--")
plt.title("Change-Points in Violent Clashes")
plt.legend()
plt.show()


In [None]:
# 7. Comparative Coverage Intensity

In [None]:
# assume all_df is the full GDELT year DataFrame with same date col
total_spark = df.groupBy("date").agg(count("*").alias("total"))
isp_spark   = df.groupBy("date").agg(count("*").alias("isp"))

share_spark = isp_spark.join(total_spark, "date") \
                       .withColumn("share", col("isp")/col("total")) \
                       .orderBy("date")

share_pd = share_spark.toPandas().set_index("date")["share"]

plt.figure(figsize=(10,5))
plt.plot(share_pd.index, share_pd)
plt.title("Share of GDELT Coverage: Israel–Palestine")
plt.ylabel("Proportion")
plt.xlabel("Date")
plt.tight_layout()
plt.show()


In [None]:
# 8. Predictive Modeling (Prophet)

In [None]:
!pip install prophet

In [None]:
from prophet import Prophet

# daily counts of protests (base 14)
protest_spark = (df.filter(col("EventBaseCode")=="14")
  .groupBy("date")
  .agg(count("*").alias("y"))
  .withColumnRenamed("date","ds")
  .orderBy("ds")
)
prophet_pd = protest_spark.toPandas()

m = Prophet()
m.fit(prophet_pd)
future = m.make_future_dataframe(periods=30)
forecast = m.predict(future)

fig = m.plot(forecast)
fig.suptitle("Forecast of Daily Protests (CAMEO 14)")
plt.show()


In [None]:
# 9. Sentiment by Region

In [None]:
region_tone = (df
    .groupBy("ActionGeo_CountryCode")
    .agg(mean("AvgTone").alias("mean_tone"))
    .orderBy("mean_tone")
)
rt_pd = region_tone.toPandas().set_index("ActionGeo_CountryCode")["mean_tone"]

plt.figure(figsize=(8,4))
rt_pd.plot(kind="bar")
plt.title("Mean Tone by Country")
plt.ylabel("AvgTone")
plt.xlabel("Country Code")
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


In [None]:
# 10. Casualty & Damage Proxy Analysis

In [None]:
from pyspark.sql.functions import col, lower, when, to_date, count, sum as _sum
import matplotlib.pyplot as plt

# 1) Ensure you have the date column
df = df.withColumn("date", to_date(col("SQLDATE"), "yyyyMMdd"))

# 2) Define proxy flags based on keywords in Cameo_full
df_proxy = (
    df
    .withColumn("killed",  when(lower(col("Cameo_full")).rlike(r"kill|assassin|execute"), 1).otherwise(0))
    .withColumn("wounded", when(lower(col("Cameo_full")).rlike(r"wound|injur"), 1).otherwise(0))
    .withColumn("damage",  when(lower(col("Cameo_full")).rlike(r"destroy|damage|attack"), 1).otherwise(0))
)

# 3) Aggregate daily sums of each proxy
daily_proxy_spark = (
    df_proxy
      .groupBy("date")
      .agg(
        _sum("killed").alias("killed"),
        _sum("wounded").alias("wounded"),
        _sum("damage").alias("damage")
      )
      .orderBy("date")
)

# 4) Convert to Pandas for plotting
daily_proxy_pd = daily_proxy_spark.toPandas().set_index("date")

# 5) Plot time series of the three proxies
plt.figure(figsize=(12,6))
for colname in ["killed", "wounded", "damage"]:
    plt.plot(daily_proxy_pd.index, daily_proxy_pd[colname], label=colname.capitalize())
plt.title("Daily Proxy Counts for Casualties & Damage")
plt.xlabel("Date")
plt.ylabel("Count of Events with Key Term")
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# count all “kill” events (CAMEO codes ending in “91”) …
kills = df.filter(col("EventCode").endswith("91"))
# … broken out by actor-pair direction:
kills_dir = (kills
  .groupBy("Actor1Name","Actor2Name")
  .agg(count("*").alias("num_kills")))


In [None]:
kills_dir.show()

In [None]:
tone_dir = (df
  .groupBy("Actor1Name","Actor2Name")
  .agg(
    avg("AvgTone").alias("mean_tone"),
    count("*").alias("n_events")
  ))
# then filter for the two directions you care about
tone_dir.show()

In [None]:
from pyspark.sql.functions import col, avg, count, regexp_extract
import matplotlib.pyplot as plt

# 1) Kill events only
kills = df.filter(col("EventCode").endswith("91"))

# 2) Extract domain & map to region
kills = kills.withColumn(
    "domain",
    regexp_extract(col("SOURCEURL"), r"https?://(?:www\.)?([^/]+)", 1)
)
domain_region_list = [
    ("nytimes.com",   "US"),
    ("washingtonpost.com", "US"),
    ("bbc.co.uk",     "EU"),
    ("lemonde.fr",    "EU"),
    ("aljazeera.com", "Mideast"),
    ("haaretz.com",   "Mideast"),
    # …add more…
]
region_df = spark.createDataFrame(domain_region_list, ["domain","region"])
kills = kills.join(region_df, on="domain", how="left") \
             .fillna({"region":"Other"})

# 3) Aggregate per region & actor‐direction
bias = (
    kills
     .groupBy("region","Actor1Name","Actor2Name")
     .agg(
        count("*").alias("n_kills"),
        avg("AvgTone").alias("mean_tone")
     )
)

# 4) Filter the two directions
israel_v_pal = bias.filter(
     col("Actor1Name").contains("ISRAEL") &
     col("Actor2Name").contains("PALESTINIAN")
).select("region","n_kills","mean_tone")
pal_v_israel = bias.filter(
     col("Actor1Name").contains("PALESTINIAN") &
     col("Actor2Name").contains("ISRAEL")
).select("region","n_kills","mean_tone")

# 5) Bring in all possible regions so no plot is empty
all_regions = [r.region for r in region_df.select("region").distinct().collect()] + ["Other"]

ivp_pd = israel_v_pal.toPandas().set_index("region").reindex(all_regions, fill_value=0)
pvi_pd = pal_v_israel.toPandas().set_index("region").reindex(all_regions, fill_value=0)

# 6) Plot
fig, axes = plt.subplots(2, 2, figsize=(12,8))

ivp_pd["n_kills"].plot(kind="bar", ax=axes[0,0], title="Israel→Pal Kill Counts by Region")
axes[0,0].set_ylabel("Count")

pvi_pd["n_kills"].plot(kind="bar", ax=axes[0,1], title="Pal→Isr Kill Counts by Region")
axes[0,1].set_ylabel("Count")

ivp_pd["mean_tone"].plot(kind="bar", ax=axes[1,0], title="Israel→Pal AvgTone by Region")
axes[1,0].set_ylabel("AvgTone")

pvi_pd["mean_tone"].plot(kind="bar", ax=axes[1,1], title="Pal→Isr AvgTone by Region")
axes[1,1].set_ylabel("AvgTone")

plt.tight_layout()
plt.show()
