In [2]:
import torch
import matplotlib.pyplot as plt
from openTSNE import TSNE
import pickle
from deepsnap.hetero_graph import HeteroGraph
import networkx as nx
from hetero_gnn import HeteroGNN
from train_gnn_llm import graph_tensors_to_device
from matplotlib.colors import Normalize
import numpy as np

# t-SNE embedding analysis

In [3]:
# Hyper-parameters handed to HeteroGNN below. The device entry resolves to
# CUDA when torch reports a usable GPU and to the CPU otherwise.
train_args = dict(
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    hidden_size=81,
    epochs=233,
    weight_decay=2.203762357664057e-05,
    lr=0.003873757421883433,
    attn_size=48,
    num_layers=6,
    aggr="attn",
)


# Shared openTSNE estimator used to project the event-node embeddings to 2-D.
# random_state is pinned so repeated runs use the same seed.
_tsne_options = {
    "perplexity": 30,
    "metric": "euclidean",
    "n_jobs": 8,
    "random_state": 42,
    "verbose": True,
}
tsne = TSNE(**_tsne_options)

In [4]:
def plot_tsne(node_features, node_targets, emb):
    """Draw a 2-D scatter of an embedding, coloured by target value.

    Args:
        node_features: Accepted for interface compatibility; not read by
            this function.
        node_targets: Per-point values passed to the scatter as ``c=`` and
            mapped through the ``viridis`` colormap.
        emb: Object indexable as ``emb[:, 0]`` and ``emb[:, 1]``; column 0
            is plotted on the x axis and column 1 on the y axis.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    marker_style = dict(cmap='viridis', alpha=0.8, s=5, linewidths=0)

    plt.figure(figsize=(12, 8))

    xs = emb[:, 0]
    ys = emb[:, 1]
    plt.scatter(xs, ys, c=node_targets, **marker_style)

    plt.title("t-SNE of Node Embeddings")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    # The colorbar is built from the scatter drawn above (pyplot's current image).
    plt.colorbar()
    plt.show()

In [5]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load graph pickles produced by this project.
with open("./4_concepts_similar_llm.pkl", "rb") as f:
    G = pickle.load(f)

# Create a HeteroGraph object from the networkx graph
hetero_graph = HeteroGraph(G, netlib=nx, directed=True)
# Presumably moves the graph's tensors onto the training device (helper is
# defined in train_gnn_llm) — verify against its implementation.
graph_tensors_to_device(hetero_graph)

model = HeteroGNN(
    hetero_graph,
    train_args,
    num_layers=train_args["num_layers"],
    aggr=train_args["aggr"],
    return_embedding=True,
).to(train_args["device"])

# train_args["device"] is already a torch.device, so it can be passed directly.
model.load_state_dict(torch.load("./best_model.pkl", map_location=train_args["device"]))

# Inference only: switch layers such as dropout / batch-norm to evaluation
# behaviour and skip autograd bookkeeping so the embeddings are deterministic
# and no gradient graph is kept alive.
model.eval()
with torch.no_grad():
    preds = model(hetero_graph.node_feature, hetero_graph.edge_index)

node_embeddings = preds['event'].detach().cpu().numpy()

# Train and evaluate the t-SNE model
embedding_train = tsne.fit(node_embeddings)
# Use a context manager so the pickle file is flushed and closed.
with open("./tsne_model.pkl", "wb") as tsne_file:
    pickle.dump(embedding_train, tsne_file)
# NOTE(review): the object returned by fit() appears to already hold the 2-D
# coordinates of node_embeddings; transform() re-embeds the same points as if
# they were unseen data. Confirm this second pass is intended.
embeddings_2d = embedding_train.transform(node_embeddings)

TYPE ('event', 'similar', 'event')
	 Feature 769
	 Feature 769
TYPE ('event', 'related', 'concept')
	 Feature 769
	 Feature 1
TYPE ('concept', 'related', 'event')
	 Feature 1
	 Feature 769
KEY ('event', 'similar', 'event') <class 'tuple'>
KEY NUMS ('event', 'similar', 'event') 33429 33429
MAX EDGES tensor(32647) tensor(32647) 33429 33429
KEY ('event', 'related', 'concept') <class 'tuple'>
KEY NUMS ('event', 'related', 'concept') 33429 22124
MAX EDGES tensor(32647) tensor(22123) 33429 22124
KEY ('concept', 'related', 'event') <class 'tuple'>
KEY NUMS ('concept', 'related', 'event') 22124 33429
MAX EDGES tensor(22123) tensor(32647) 22124 33429
--------------------------------------------------------------------------------
TSNE(early_exaggeration=12, n_jobs=8, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 2.93 

In [None]:
# Event-node inputs and targets taken from the loaded graph.
event_features = hetero_graph.node_feature['event']
event_targets = hetero_graph.node_target['event'].flatten()

# Remove artificially added nodes (they carry the sentinel target -1)
mask = event_targets != -1
node_features = event_features[mask]

# The graph tensors may live on the GPU (device is "cuda" when available),
# while embeddings_2d comes from openTSNE on the host. Convert the mask and
# the targets to NumPy before mixing them with NumPy arrays; indexing or
# np.log on a CUDA tensor would raise.
mask_np = torch.as_tensor(mask).detach().cpu().numpy()
targets_np = torch.as_tensor(event_targets).detach().cpu().numpy()
embeddings = embeddings_2d[mask_np]

# log transform targets
# NOTE(review): assumes every remaining target is strictly positive; a zero
# would become -inf and distort the colour scale — confirm against the data.
node_targets = np.log(targets_np[mask_np])

plot_tsne(node_features, node_targets, embeddings)

# Evaluation scores
* absolute, relative, only outliers performance
* review popular scores

# LLM vs. no-LLM
* Plot: x = minimum number of articles, y = performance