In [None]:
import pandas as pd
import numpy as np
import os
import sys

import matplotlib.pyplot as plt
import networkx as nx
from node2vec import Node2Vec
from sklearn.decomposition import PCA

sys.path.append("../")

from core.functions import normalize_actor, build_actor_interaction_graph, add_pairwise_embedding_features

In [None]:
# Directory (relative to this notebook) that holds the raw CSV export; the
# parquet copy written further down is saved into the same directory.
data_path = "../data/raw/"

In [None]:
# Load the raw ACLED CSV export into a DataFrame. The 'event_id_cnty' column
# is promoted to the index in the following cell, not here.
raw_csv_path = os.path.join(data_path, 'ACLED Data_2026-01-02.csv')
df = pd.read_csv(raw_csv_path)

In [None]:
# Use the event identifier as the row index.
# The guard keeps this cell safe to re-run: once 'event_id_cnty' has become the
# index it is no longer a column, and a second set_index would raise KeyError.
if df.index.name != 'event_id_cnty':
    df = df.set_index('event_id_cnty')

In [None]:
# Persist the indexed frame as parquet next to the raw CSV.
parquet_path = os.path.join(data_path, 'acled_ukraine_data_2026_01_02.parquet')
df.to_parquet(parquet_path)

In [None]:
# (rows, columns) of the frame after indexing by 'event_id_cnty'.
df.shape

In [None]:
# Preview the first four rows.
df.head(4)

In [None]:
# Same shape check as the earlier cell; the preview in between does not modify df.
df.shape

In [None]:
# Distinct raw values of the 'actor1' column, before normalization.
df["actor1"].unique()

In [None]:
# Distinct raw values of the 'actor2' column, before normalization.
df["actor2"].unique()

In [None]:
# Add a "<column>_root" companion for each actor column by running the project
# helper normalize_actor over every value.
for actor_col in ("actor1", "actor2"):
    df[f"{actor_col}_root"] = df[actor_col].apply(normalize_actor)

# 1 where the event has no second actor recorded (NaN), 0 otherwise.
df["actor2_missing"] = df["actor2"].isna().astype(int)


In [None]:
# Ten most frequent normalized actor1 values, with their row counts.
df["actor1_root"].value_counts().head(10)

In [None]:
# Construct a graph representing interactions between actors based on
# co-occurrences in events. This graph captures relationships and patterns
# among actors, which can be useful for network analysis and feature
# engineering in predictive modeling.
# NOTE(review): the construction logic lives in core.functions and is not
# visible here. Later cells read a "weight" attribute on every edge and join
# the per-node embeddings onto actor1_root / actor2_root, so nodes are
# presumably normalized actor names — confirm against the helper.

G = build_actor_interaction_graph(df)

## Network Embeddings with Node2Vec 

In [None]:
# Labelled drawing of the actor graph using a force-directed layout with a
# fixed seed, so node positions are the same on every run.
draw_options = {"with_labels": True, "node_size": 800, "font_size": 8}

plt.figure(figsize=(10, 10))
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos, **draw_options)
plt.show()


In [None]:
# Draw the actor graph again with edge thickness reflecting edge weight.
edges = G.edges(data=True)
# Edges lacking a "weight" attribute are treated as weight 1 instead of
# raising KeyError.
weights = [d.get("weight", 1) for _, _, d in edges]

# Line widths are in points, so raw weights (event counts can be large) would
# cover the whole figure. Rescale linearly so the heaviest edge is drawn at
# MAX_EDGE_WIDTH and light edges stay visible at roughly MIN_EDGE_WIDTH.
MIN_EDGE_WIDTH = 0.5
MAX_EDGE_WIDTH = 8.0
max_weight = max(weights, default=1) or 1  # avoid 0-division on empty/zero weights
edge_widths = [
    MIN_EDGE_WIDTH + (MAX_EDGE_WIDTH - MIN_EDGE_WIDTH) * w / max_weight
    for w in weights
]

plt.figure(figsize=(10, 10))
# weight="weight" is also spring_layout's default; it is spelled out here to
# make clear that heavier edges pull their endpoints closer together.
pos = nx.spring_layout(G, seed=42, weight="weight")
nx.draw(
    G,
    pos,
    with_labels=True,
    node_size=800,
    font_size=8,
    width=edge_widths
)
plt.show()


In [None]:
# Learn a 32-dimensional embedding per graph node from weighted random walks
# (50 walks of length 10 starting at each node).
# workers=1: a fixed seed only yields repeatable embeddings when walk
# generation and Word2Vec training run single-threaded; with several workers
# the scheduling order varies between runs, so seed=42 alone would not make
# re-runs agree.
# NOTE(review): reproducibility across interpreter launches may additionally
# require a fixed PYTHONHASHSEED — see the gensim Word2Vec documentation.
node2vec = Node2Vec(
    G,
    dimensions=32,
    walk_length=10,
    num_walks=50,
    workers=1,
    weight_key="weight",
    seed=42
)

# fit() forwards these keyword arguments to gensim's Word2Vec.
# min_count=1 keeps every node in the vocabulary, however rarely it is visited.
model = node2vec.fit(
    window=5,
    min_count=1,
    batch_words=4
)


In [None]:
# Collect the learned vector of every graph node, keyed by the original node.
# node2vec stores node names as strings in the Word2Vec vocabulary, so the
# lookup goes through str(node); the dict key stays the original node object
# so the table can still be joined against the actor columns.
embeddings = {
    node: model.wv[str(node)]
    for node in G.nodes()
}

# One row per node, one column per embedding dimension: emb_0, emb_1, ...
emb_df = pd.DataFrame.from_dict(embeddings, orient="index").add_prefix("emb_")


In [None]:
# Compare the number of distinct index values with the number of rows; the
# first figure being smaller than the row count means duplicate event ids.
unique_index_shape = df.index.unique().shape
frame_shape = df.shape
print(unique_index_shape)
print(frame_shape)

In [None]:
# Preview the first rows of the per-node embedding table.
emb_df.head()

In [None]:
# (number of graph nodes, number of embedding dimensions).
emb_df.shape

## PCA Visualization

In [None]:
# Project the node embeddings onto their first two principal components and
# plot every node as a labelled point.
actors = emb_df.index
X = emb_df.values

pca = PCA(n_components=2)
X2 = pca.fit_transform(X)

first_pc, second_pc = X2[:, 0], X2[:, 1]

plt.figure(figsize=(8, 6))
plt.scatter(first_pc, second_pc)

# Write each node's name at its projected position.
for label, x_pos, y_pos in zip(actors, first_pc, second_pc):
    plt.text(x_pos, y_pos, label, fontsize=8)

plt.title("Actor Embedding PCA")
plt.show()


## Actor 1 Embeddings Merge

In [None]:
# Attach the actor-1 embedding (columns emb_0, emb_1, ...) to every event by
# matching actor1_root against the embedding table's index. Events whose
# actor1_root has no embedding keep NaN in these columns (left join).
#
# Any emb_* columns left over from a previous run of this cell are dropped
# first; without that, re-running would create suffixed duplicates
# (emb_0_x / emb_0_y) instead of refreshing the columns.
# validate="many_to_one" makes pandas raise if the embedding index ever
# contains duplicate actors, which would silently multiply event rows.
df = df.drop(columns=emb_df.columns, errors="ignore").merge(
    emb_df,
    left_on="actor1_root",
    right_index=True,
    how="left",
    validate="many_to_one"
)


## Actor 2 Embeddings Merge

In [None]:
# Attach the actor-2 embedding to every event under "a2_"-prefixed column
# names (a2_emb_0, a2_emb_1, ...) by matching actor2_root against the
# embedding table's index. Events with no matching actor2_root keep NaN
# in these columns (left join).
#
# Any a2_emb_* columns left over from a previous run of this cell are dropped
# first, so re-running refreshes them instead of creating _x / _y duplicates.
# validate="many_to_one" makes pandas raise if the embedding index ever
# contains duplicate actors, which would silently multiply event rows.
a2_emb_df = emb_df.add_prefix("a2_")
df = df.drop(columns=a2_emb_df.columns, errors="ignore").merge(
    a2_emb_df,
    left_on="actor2_root",
    right_index=True,
    how="left",
    validate="many_to_one"
)


In [None]:
# All column names after both embedding merges.
df.columns.to_list()

In [None]:
# Indicator built earlier: 1 where the raw actor2 value is NaN, else 0.
df["actor2_missing"]

In [None]:
# One actor-1 embedding dimension; NaN where actor1_root had no match in emb_df.
df["emb_1"]

In [None]:
# Same dimension for actor 2; NaN where actor2_root had no match in emb_df.
df["a2_emb_1"]

In [None]:
# Frame dimensions after both embedding merges; with left joins on a unique
# embedding index the row count should match the pre-merge shape.
df.shape

In [None]:
# Shape of the sub-frame of columns whose names start with "emb_" (actor-1 embeddings).
df.filter(regex="^emb_").shape

In [None]:
# Shape of the sub-frame of columns whose names start with "a2_emb_" (actor-2 embeddings).
df.filter(regex="^a2_emb_").shape

In [None]:
# Add pairwise embedding features via the project helper
# add_pairwise_embedding_features (core.functions), rebinding df to its result.
# The intent is to describe the relationship between the two actors involved
# in each event, giving downstream analysis and modeling extra context.
# NOTE(review): the helper's implementation is not visible here; it presumably
# combines the actor-1 (emb_*) and actor-2 (a2_emb_*) columns merged above —
# confirm which columns it reads and which it adds.

df = add_pairwise_embedding_features(df=df)