In [4]:
%load_ext autoreload
%autoreload 2

import torch
import torch_geometric
import wandb
import pandas as pd
import torch.nn.functional as F

import numpy as np
from functools import reduce
from pathlib import Path

from torch_geometric.nn.models import GCN
from models.DGI import DeepGraphInfomax
from ogb.nodeproppred import PygNodePropPredDataset
from tqdm import tqdm
from torch_geometric.utils import to_undirected, to_networkx
#from OpenGraphCon import OpenGraph
from open_dataset import load_dataset,load_amazon_datasets, load_plentoid, load_folds, create_class_folds, load_folds_class_variation
from torch_geometric.utils import one_hot, spmm
from torch_geometric.datasets import Planetoid, Amazon, Reddit2
from functools import partial
from open_dataset import load_reddit2,load_folds
from torch_geometric.utils import homophily
from sklearn.cluster import SpectralClustering
from scipy.sparse import csr_matrix
from utils import page_rank
import matplotlib.pyplot as plt
from models.OpenGCN import OpenGCN

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Sanity check: L2-normalising a constant 4-vector yields entries of 1/sqrt(4) = 0.5.
x = torch.tensor([0.1] * 4)
y = x.div(x.norm(p=2))
y.mean()

tensor(0.5000)

In [6]:
datasets = load_folds("photo", unknown_class_ratio=0.2)
for data in datasets:
    print("-----------")
    print(data.y.unique())
    print(data.y[data.labeled_mask].unique())
    print(data.y[data.unlabeled_mask].unique())

-----------
tensor([0, 1, 2, 3, 4, 5, 6, 7])
tensor([1, 2, 3, 7])
tensor([0, 1, 2, 3, 4, 5, 6, 7])
-----------
tensor([0, 1, 2, 3, 4, 5, 6, 7])
tensor([3, 5, 6, 7])
tensor([0, 1, 2, 3, 4, 5, 6, 7])
-----------
tensor([0, 1, 2, 3, 4, 5, 6, 7])
tensor([0, 4, 5, 6])
tensor([0, 1, 2, 3, 4, 5, 6, 7])
-----------
tensor([0, 1, 2, 3, 4, 5, 6, 7])
tensor([0, 1, 2, 4])
tensor([0, 1, 2, 3, 4, 5, 6, 7])


In [7]:
# Pick the compute device: the second GPU (index 1) when it exists, otherwise the
# first GPU, otherwise the CPU.
# The former one-liner `torch.device(*('cuda', 1) if ... else 'cpu')` applied `*` to
# the whole conditional expression, so on CPU-only machines the string 'cpu' was
# unpacked into three positional arguments and torch.device raised.
if torch.cuda.is_available():
    device = torch.device('cuda', 1 if torch.cuda.device_count() > 1 else 0)
else:
    device = torch.device('cpu')
datasets = load_folds("ogb-arxiv", unknown_class_ratio=0.2, fixed=False)
data = datasets[0]
data.x.shape

torch.Size([169343, 128])

In [8]:
data.y[data.all_class_val_mask].unique().shape

torch.Size([32])

In [9]:
data.y[data.labeled_mask].shape

torch.Size([69689])

In [10]:
data.train_mask.sum()

tensor(90941)

In [21]:
path = Path("test.pt")
torch.save(data, path)

In [22]:
torch.load(path)

Data(num_nodes=169343, edge_index=[2, 2315598], x=[169343, 128], node_year=[169343], y=[169343], train_mask=[169343], val_mask=[169343], test_mask=[169343], page_rank=[169343], classes=[40], known_classes=[24], val_classes=[8], test_classes=[8], unknown_classes=[16], known_class_mask=[169343], labeled_mask=[169343], val_class_mask=[169343], known_class_val_mask=[169343], unknown_class_val_mask=[169343], all_class_val_mask=[169343], test_class_mask=[169343], known_class_test_mask=[169343], unknown_class_test_mask=[169343], all_class_test_mask=[169343], unlabeled_mask=[169343])

In [23]:
print(data.known_classes)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7, 10, 11, 12, 15, 16, 19, 21, 22, 23, 24,
        27, 28, 29, 30, 34, 37])


In [24]:
model =  OpenGCN(in_channels = data.x.shape[1], 
                          hidden_channels = 128,
                          out_dim =  data.classes.size(0),
                          dropout = 0.7, 
                          num_layers = 2, 
                          known_classes = data.known_classes, 
                          unknown_classes = data.unknown_classes, 
                          device = device,
                          log_all = False)

In [25]:
import random

# Estimate the number of clusters K among the unlabeled nodes: scan candidate K
# values, score each on a held-out subset of the known classes, then run KMeans
# with the selected K on the unlabeled features.
# NOTE(review): `max_cand_k`, `split_ratio`, `num_val_cls`, `labeled_val_fun`,
# `cluster_acc`, `get_best_k`, `KMeans`, `nmi_score` and `ari_score` are neither
# defined nor imported anywhere in the visible notebook; this cell depends on
# hidden kernel state.
u_num = data.y[data.unlabeled_mask].shape[0]

print('extracting features for unlabeld data')
u_targets = data.y[data.unlabeled_mask].detach().numpy()
# NOTE(review): the model is called once here and again below for the labeled
# nodes. If the model is in training mode with dropout active, the two passes
# presumably produce different embeddings — confirm, and consider one shared pass.
u_feats = model(data.x, data.edge_index)[data.unlabeled_mask, :].detach().numpy()

# Candidate offsets 0 .. max_cand_k-1; `num_val_cls` is added to each one in the loop below.
cand_k = np.arange(max_cand_k)

l_num = data.y[data.labeled_mask].shape[0]
l_targets = data.y[data.labeled_mask].detach().numpy()
l_feats = model(data.x, data.edge_index)[data.labeled_mask, :].detach().numpy()
# This message is printed after the labeled features were already extracted.
print('extracting features for labeld data')

# Split the known classes into an "lt" subset and an "lv" subset; the lv subset is
# what the clustering accuracy is measured on in the loop below.
l_classes = data.known_classes.tolist()
num_lt_cls = int(round(len(l_classes)*split_ratio))
lt_classes = set(random.sample(l_classes, num_lt_cls)) # random subset of num_lt_cls known classes (unseeded)
lv_classes = set(l_classes) - lt_classes

print(lt_classes)
print(lv_classes)

# Gather features/targets of the lt classes, one class at a time.
# np.empty(0) defaults to float64, so lt_targets ends up as a float array.
lt_feats = np.empty((0, l_feats.shape[1]))
lt_targets = np.empty(0)
for c in lt_classes:
    lt_feats = np.vstack((lt_feats, l_feats[l_targets==c]))
    lt_targets = np.append(lt_targets, l_targets[l_targets==c])

# Same for the lv classes; here the target array is explicitly int64.
lv_feats = np.empty((0, l_feats.shape[1]))
lv_targets = np.empty(0, dtype=np.int64)
for c in lv_classes:
    lv_feats = np.vstack((lv_feats, l_feats[l_targets==c]))
    lv_targets = np.append(lv_targets, l_targets[l_targets==c])


# Per-candidate results: one score and one accuracy per K, plus one predicted
# assignment per K covering all u_num + l_num samples.
cvi_list = np.zeros(len(cand_k))
acc_list = np.zeros(len(cand_k))
cat_pred_list = np.zeros([len(cand_k),u_num+l_num])
print('estimating K ...')
for i in range(len(cand_k)):
    cvi_list[i],  cat_pred_i = labeled_val_fun(np.concatenate((lv_feats, u_feats)), lt_feats, lt_targets, cand_k[i]+num_val_cls)
    cat_pred_list[i, :] = cat_pred_i
    # Assumes the returned assignment is ordered [lt samples, lv samples, unlabeled samples] — TODO confirm against labeled_val_fun.
    acc_list[i] = cluster_acc(lv_targets, cat_pred_i[len(lt_targets): len(lt_targets)+len(lv_targets)])
    # The best K is re-selected after every candidate using only the results so far.
    best_k = get_best_k(cvi_list[:i+1], acc_list[:i+1], cat_pred_list[:i+1], l_num) 
    print('current best K {}'.format(best_k))

# Final clustering of the unlabeled features with the selected K, scored against the true labels.
kmeans = KMeans(n_clusters=best_k)
u_pred = kmeans.fit_predict(u_feats).astype(np.int32) 
acc, nmi, ari = cluster_acc(u_targets, u_pred), nmi_score(u_targets, u_pred), ari_score(u_targets, u_pred)
print('Final K {}, acc {:.4f}, nmi {:.4f}, ari {:.4f}'.format(best_k, acc, nmi, ari))
best_k

extracting features for unlabeld data
extracting features for labeld data
{1, 34, 5, 15, 27, 28}
{0, 2, 3, 4, 37, 6, 7, 10, 11, 12, 16, 19, 21, 22, 23, 24, 29, 30}
estimating K ...


KeyboardInterrupt: 

In [None]:
path = Path("pown_runs_all/ogb-arxiv.csv")
df = pd.read_csv(path)
df.sem()

In [None]:
datasets = list(load_folds_class_variation("photo", unknown_class_ratio=0.2))
datasets[0]

In [None]:
for data in datasets:
    print(data.known_classes)

In [None]:
data = load_amazon_datasets("photo", train_portion=0.6, val_portion=0.2, test_portion=0.2, seed=0)
data

In [None]:
edge_index = data.edge_index[0]
edge_index.shape
edge_index.max()
data.y[edge_index].shape
train_mask_edges = data.train_mask[edge_index]
train_mask_edges.sum()

In [None]:
classes = torch.unique(data.y)
n_classes = len(classes)
indices = torch.randperm(n_classes)
# Use the indices to shuffle the tensor
classes = classes[indices]
classes

In [None]:
def create_class_folds(data, unknown_class_ratio):
    """Randomly partition the class labels found in ``data.y`` into folds.

    Args:
        data: Object exposing a ``y`` tensor of class labels.
        unknown_class_ratio: Fraction of all classes that one fold should hold.
            Every fold has ``int(max(unknown_class_ratio * n_classes, 2))``
            classes, except possibly the last one.

    Returns:
        list[torch.Tensor]: Disjoint tensors of shuffled class ids that together
        cover every class in ``data.y``. A trailing fold holding a single class
        is merged into the fold before it. The list is empty when ``data.y``
        contains no labels.
    """
    classes = torch.unique(data.y)
    n_classes = len(classes)

    # Shuffle so that fold membership is random (uses the global torch RNG).
    classes = classes[torch.randperm(n_classes)]
    fold_length = int(max(unknown_class_ratio * n_classes, 2))

    # Consecutive slices of the shuffled classes; the last slice may be shorter.
    folds = [classes[i:i + fold_length] for i in range(0, n_classes, fold_length)]

    # Merge a single-class tail into its predecessor. The length guard avoids an
    # IndexError when there is no predecessor (zero or one class in total).
    if len(folds) > 1 and len(folds[-1]) == 1:
        folds[-2:] = [torch.cat((folds[-2], folds[-1]))]

    return folds
folds = create_class_folds(data, 0.2)
folds

In [None]:
import copy
def prepare_fold_class_variation(data, folds, train_test_index):
    """Return a copy of ``data`` annotated with a known/unknown class split.

    Folds with index ``< train_test_index`` supply the known classes, folds with
    index ``>= train_test_index`` supply the unknown test classes. No validation
    classes are assigned, so ``val_classes`` is always empty.

    Args:
        data: Graph object exposing ``y``, ``train_mask``, ``val_mask`` and
            ``test_mask``. It is deep-copied, so the input is not modified.
        folds: Sequence of 1-D tensors of class ids.
        train_test_index: Position in ``folds`` where known classes end and
            test classes begin.

    Returns:
        The annotated copy, carrying the class tensors (``classes``,
        ``known_classes``, ``val_classes``, ``test_classes``,
        ``unknown_classes``) and the node masks derived from them.
    """
    data = copy.deepcopy(data)
    data.classes = torch.unique(data.y)

    def _merge(selected):
        # Empty selections keep the label dtype. A float placeholder would make
        # torch.cat promote the real class ids to float further down.
        if not selected:
            return torch.empty(0, dtype=data.y.dtype)
        return torch.cat(selected).sort().values

    known_classes = _merge([f for i, f in enumerate(folds) if i < train_test_index])
    val_classes = torch.empty(0, dtype=data.y.dtype)
    test_classes = _merge([f for i, f in enumerate(folds) if i >= train_test_index])

    data.known_classes = known_classes
    data.val_classes = val_classes
    data.test_classes = test_classes
    data.unknown_classes = torch.cat((val_classes, test_classes))

    # Training: nodes of a known class that are also in the train split are labeled.
    known_class_mask = torch.isin(data.y, known_classes)
    data.known_class_mask = known_class_mask
    data.labeled_mask = known_class_mask & data.train_mask

    # Validation masks, restricted to the val node split.
    data.val_class_mask = torch.isin(data.y, val_classes)
    data.known_class_val_mask = known_class_mask & data.val_mask
    data.unknown_class_val_mask = data.val_class_mask & data.val_mask
    data.all_class_val_mask = (known_class_mask | data.val_class_mask) & data.val_mask

    # Test masks, restricted to the test node split.
    test_class_mask = torch.isin(data.y, test_classes)
    data.test_class_mask = test_class_mask
    data.known_class_test_mask = known_class_mask & data.test_mask
    data.unknown_class_test_mask = test_class_mask & data.test_mask
    data.all_class_test_mask = (known_class_mask | data.test_class_mask) & data.test_mask

    # Every node that is not a labeled training node counts as unlabeled.
    data.unlabeled_mask = ~data.labeled_mask

    return data

data = prepare_fold_class_variation(data, folds, train_test_index=1)
print(data.known_classes, data.unknown_classes)

In [None]:
def create_fold_data_class_variation(data, name, unknown_class_ratio):
    """Build one annotated dataset per known/unknown split point.

    The class folds are cached on disk: they are loaded from
    ``fold_indices/<name>_class_variation_class_split_<ratio>.pt`` when that
    file exists, otherwise created with ``create_class_folds`` and saved there.

    Args:
        data: Graph object passed on to ``prepare_fold_class_variation``.
        name: Dataset name, used only to build the cache file name.
        unknown_class_ratio: Passed to ``create_class_folds`` and used in the
            cache file name.

    Returns:
        list: ``len(folds) + 1`` datasets, ordered from ``train_test_index ==
        len(folds)`` (every fold known) down to ``0`` (no fold known). A list is
        returned rather than a one-shot ``reversed`` iterator, so the result can
        be indexed and iterated more than once.
    """
    path = Path("fold_indices/"+name+"_class_variation_class_split_"+str(unknown_class_ratio)+".pt")

    if path.is_file():
        folds = torch.load(path)
    else:
        folds = create_class_folds(data, unknown_class_ratio)
        # torch.save does not create missing directories.
        path.parent.mkdir(parents=True, exist_ok=True)
        torch.save(folds, path)

    datasets = [
        prepare_fold_class_variation(data, folds, test_split)
        for test_split in range(len(folds) + 1)
    ]
    datasets.reverse()
    return datasets

datasets = create_fold_data_class_variation(data, "photo", unknown_class_ratio=0.2)

In [None]:
for data in datasets:
    print(data.known_classes, data.unknown_classes)
    print(torch.sum(data.all_class_test_mask))

In [None]:
classes = classes[n_test_classes:]
n_val_classes = int(max(unknown_class_ratio * classes.shape[0], 2))
val_classes = classes[:n_val_classes]
val_classes

In [None]:
train_classes = classes[n_val_classes:]
train_classes

In [None]:
def create_folds_with_resampling(data, unknown_class_ratio, n_folds):
    """Draw ``n_folds`` independent random train/val/test class splits.

    Each fold reshuffles all classes, so the same class may be a test class in
    one fold and a train class in another.

    Args:
        data: Object exposing a ``y`` tensor of class labels.
        unknown_class_ratio: Fraction of all classes used as test classes, and
            fraction of the remaining classes used as validation classes. Both
            counts are at least 2.
        n_folds: Number of splits to draw.

    Returns:
        list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: One
        ``(train_classes, val_classes, test_classes)`` tuple per fold.
    """
    classes = data.y.unique()
    # Count the distinct labels instead of using ``max() + 1``: the latter is a
    # tensor and over-counts when label ids are not contiguous, which produced
    # permutation indices outside ``classes``.
    n_classes = classes.numel()
    n_test_classes = max(int(unknown_class_ratio * n_classes), 2)

    folds = []
    for _ in range(n_folds):
        shuffled = classes[torch.randperm(n_classes)]
        test_classes = shuffled[:n_test_classes]
        remaining = shuffled[n_test_classes:]
        n_val_classes = max(int(unknown_class_ratio * remaining.shape[0]), 2)
        val_classes = remaining[:n_val_classes]
        train_classes = remaining[n_val_classes:]
        folds.append((train_classes, val_classes, test_classes))
    return folds

folds = create_folds_with_resampling(data, 0.5, 5)
folds

In [None]:
def prepare_fold_masks_resampling(data, fold):
    """Annotate ``data`` in place with the class split and node masks of one fold.

    Args:
        data: Graph object exposing ``y``, ``train_mask``, ``val_mask`` and
            ``test_mask``. It is modified in place and also returned.
        fold: Indexable of three class-id tensors in the order
            (train classes, val classes, test classes).

    Returns:
        The same ``data`` object, carrying the sorted class tensors and the
        masks derived from them.
    """
    data.classes = data.y.unique()

    known, val, test = (fold[i].sort().values for i in range(3))
    data.known_classes = known
    data.val_classes = val
    data.test_classes = test
    data.unknown_classes = torch.cat([val, test])

    # Training: labeled nodes are train-split nodes that belong to a known class.
    in_known = torch.isin(data.y, known)
    data.known_class_mask = in_known
    data.labeled_mask = in_known & data.train_mask

    # Validation masks, each restricted to the val node split.
    in_val = torch.isin(data.y, val)
    data.val_class_mask = in_val
    data.known_class_val_mask = in_known & data.val_mask
    data.unknown_class_val_mask = in_val & data.val_mask
    data.all_class_val_mask = (in_known | in_val) & data.val_mask

    # Test masks, each restricted to the test node split.
    in_test = torch.isin(data.y, test)
    data.test_class_mask = in_test
    data.known_class_test_mask = in_known & data.test_mask
    data.unknown_class_test_mask = in_test & data.test_mask
    data.all_class_test_mask = (in_known | in_test) & data.test_mask

    # Everything that is not a labeled training node is unlabeled.
    data.unlabeled_mask = data.labeled_mask.logical_not()

    return data

prepare_fold_masks_resampling(data, folds[0])

In [None]:
def create_fold_data_resampling(data, name, unknown_class_ratio, n_folds):
    """Build one annotated dataset per resampled class split.

    The class splits are cached on disk: they are loaded from
    ``fold_indices/<name>_class_split_<ratio>_w_resampling.pt`` when that file
    exists (in which case ``n_folds`` is ignored), otherwise created with
    ``create_folds_with_resampling`` and saved there.

    Args:
        data: Graph object to annotate. It is deep-copied once per fold and is
            not modified.
        name: Dataset name, used only to build the cache file name.
        unknown_class_ratio: Passed to ``create_folds_with_resampling`` and used
            in the cache file name.
        n_folds: Number of splits to draw when no cache file exists.

    Returns:
        list: One independently annotated dataset per fold.
    """
    import copy  # local import keeps the function self-contained

    path = Path("fold_indices/"+name+"_class_split_"+str(unknown_class_ratio)+"_w_resampling"+".pt")

    if path.is_file():
        folds = torch.load(path)
    else:
        folds = create_folds_with_resampling(data, unknown_class_ratio, n_folds)
        # torch.save does not create missing directories.
        path.parent.mkdir(parents=True, exist_ok=True)
        torch.save(folds, path)

    # prepare_fold_masks_resampling annotates its argument in place. Without a
    # per-fold copy every list entry would be the same object, carrying only
    # the masks of the last fold.
    return [prepare_fold_masks_resampling(copy.deepcopy(data), fold) for fold in folds]

create_fold_data_resampling(data, "cora", 0.5, 5)

In [None]:
data = load_dataset("ogb-arxiv", unknown_class_ratio = 0.5, validation_split=True)
print(data.known_classes)
data

In [None]:
data.y[data.labeled_mask].unique()

In [None]:
# Pairwise similarity between all node feature rows, then the least similar node per row.
# NOTE(review): this builds a dense N x N matrix; with the ogb-arxiv data loaded in
# the preceding cell (169343 nodes per the earlier output) that is very large — confirm
# which dataset this was meant to run on.
features = data.x
# NOTE(review): the rows are not normalised in this cell, so this is a plain dot
# product despite the name, unless data.x is already unit-norm — TODO confirm.
cosine_dist = features @ features.t()
# Divide by 0.1 (looks like a temperature — TODO confirm).
cosine_mat = torch.div(cosine_dist, 0.1)
mat_max, _ = torch.max(cosine_mat, dim=1, keepdim=True)
# NOTE(review): mat_max has shape (N, 1). torch.diag of a 2-D tensor returns its
# diagonal, which here is the single value mat_max[0, 0], so one scalar is
# subtracted from every entry. If a per-row shift was intended, the variant
# `cosine_mat - mat_max.detach()` would do that — confirm intent.
cosine_mat = cosine_mat - torch.diag(mat_max) # previously tried: - mat_max.detach()
# Smallest entry per row and the column index where it occurs.
sims, indices = torch.min(cosine_mat, dim=1)
indices.unique()

In [None]:
# Pairwise boolean target matrix over all nodes.
n_nodes = features.shape[0]
labels = torch.zeros(n_nodes, n_nodes, dtype=torch.bool)
# NOTE(review): this sets whole columns to True for every node index that occurs in
# `indices`; if a per-row assignment was intended, it would be
# labels[torch.arange(n_nodes), indices] — confirm.
labels[:, indices] = 1
# Training nodes: entry (i, j) is True exactly when i and j carry the same label.
# Indexing with the same boolean mask twice only addresses diagonal entries, so
# the train x train sub-matrix is addressed with broadcast index tensors instead.
train_idx = data.train_mask.nonzero(as_tuple=True)[0]
train_y = data.y[train_idx]
labels[train_idx[:, None], train_idx[None, :]] = train_y[:, None] == train_y[None, :]
# Fraction of True entries, normalised by the actual matrix size rather than a
# hard-coded 2708 * 2708.
labels.sum() / labels.numel()

In [33]:
data =load_dataset("cora")
data

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], page_rank=[2708], known_classes=[6], test_classes=[1], unknown_classes=[1], classes=[7], known_class_mask=[2708], test_class_mask=[2708], unknown_class_mask=[2708], known_class_test_mask=[2708], unknown_class_test_mask=[2708], labeled_mask=[2708], unlabeled_mask=[2708])

In [37]:
(data.y[data.edge_index[0]] == data.y[data.edge_index[1]]).sum()

tensor(8550)

In [None]:
torch.unique(data.y, return_counts=True)

In [None]:
nx_graph = to_networkx(data)
nx_graph

In [None]:
from networkx import adjacency_matrix
adj_mat = adjacency_matrix(nx_graph)
adj_mat = csr_matrix(adj_mat)
adj_mat = adj_mat.toarray()
adj_mat.shape

In [None]:
sc = SpectralClustering(n_clusters=6, affinity='precomputed_nearest_neighbors' , assign_labels ="discretize")
sc.fit(adj_mat)


In [None]:
sc.labels_
np.unique(sc.labels_, return_counts=True)

In [None]:
data = load_amazon_datasets("computers")
data

In [None]:
homophily(data.edge_index, data.y, method="edge_insensitive")

In [None]:
from handle_meta_data import load_yml
from pathlib import Path

path = Path("experiments/debug.yml")

config = load_yml(path)

from collections import namedtuple
config = namedtuple('Config', config.keys())(**config)

config


In [None]:
from torch_geometric.loader.neighbor_loader import NeighborLoader
import time
data = load_dataset("ogb-arxiv", unknown_class_ratio=0.4, validation_split=True)

nl = NeighborLoader(data, num_neighbors=[32, 16], num_workers=5, batch_size=64)

c = 0
start = time.time()
for b in nl:
    c+=1

end = time.time()
end-start

In [None]:
t = torch.arange(0,101).type(torch.float)
print(t)
t.quantile(1)

In [None]:
#wandb.init(project='ownpen_con', mode="online")
wandb.init(project='ownpen_con', mode="disabled")

In [None]:
data = load_dataset("cora", unknown_class_ratio=0.2)
data

In [None]:
def euclidean_distance(x, y):
    """Euclidean (L2) distance between ``x`` and ``y``, reduced over dimension 1.

    Args:
        x: Tensor with at least two dimensions.
        y: Tensor that can be subtracted from ``x`` (same or broadcastable shape).

    Returns:
        torch.Tensor: ``sqrt(sum((x - y) ** 2, dim=1))``.
    """
    delta = x - y
    return delta.pow(2).sum(dim=1).sqrt()

In [None]:
from functools import reduce

# True when no element is common to all lists, i.e. their overall intersection is empty.
lists = [[1, 2, 3], [0, 5, 4]]
shared = reduce(lambda acc, nxt: acc & nxt, (set(item) for item in lists))
result = len(shared) == 0
result

In [None]:
# Edge weights from feature distance: the closer the endpoint features, the larger the weight.
edge_index = data.edge_index
x = data.x
src = edge_index[0, :]
dst = edge_index[1, :]
eps = 0.01  # keeps the inverse finite when both endpoints have identical features
print(edge_index.shape)

# Distance between the feature rows of the two endpoints of every edge.
edge_dist = euclidean_distance(x[src, :], x[dst, :])
print(edge_dist)
print(edge_dist.shape)

# Inverse-distance weights; the distances keep their own name instead of being overwritten.
w = 1 / (edge_dist + eps)

# Target node of the highest-weight edge, looked up from the computed argmax
# rather than from a hard-coded edge index.
strongest_edge = torch.argmax(w)
data.edge_index[1, strongest_edge]

In [None]:
data.y[0]

In [None]:
from torch_geometric.nn import LabelPropagation

# Propagate the one-hot labels of the training nodes along the weighted edges.
lp = LabelPropagation(num_layers=5, alpha=0.9)
mask = data.train_mask
out = lp(one_hot(data.y), data.edge_index, mask=mask, edge_weight=w, post_step=None)

# Scale by 5 before the softmax to sharpen the distribution.
logits = out * 5
probs = F.softmax(logits, dim=1)
# Entropy per node. log_softmax stays finite where a probability underflows to 0,
# whereas log(softmax(...)) gives -inf there and 0 * -inf turns the entropy into NaN.
log_probs = F.log_softmax(logits, dim=1)
entropies = -torch.sum(probs * log_probs, dim=1)
print(entropies.shape)
# Confidence and predicted class per node.
scores, pred = probs.max(dim=1)
probs.max(dim=1)

In [None]:
# Row 5 of the propagation output. Clone it so that later in-place edits of `p`
# do not write through the view into `out`.
p = out[5, :].clone()
F.softmax(p, dim=0).max()

In [None]:
p[2]=p[2]+0.5
F.softmax(p, dim=0).max()

In [None]:
lp(one_hot(data.y), data.edge_index, mask=data.train_mask, edge_weight=w, post_step=None)

In [None]:
from torch_geometric.utils import degree
# Number of incoming edges per node (counts how often each node is an edge target).
# The node count comes from the loaded graph instead of a hard-coded 2708.
d = degree(data.edge_index[1, :], num_nodes=data.num_nodes)

In [None]:
# Find the indices of NaN values
nan_indices = torch.nonzero(torch.isnan(entropies)).squeeze()
nan_indices

In [None]:
# Accuracy on the test nodes of known classes. Numerator and denominator now use
# the same mask; before, correct predictions were counted over all test nodes
# but divided by the number of known-class test nodes.
eval_mask = data.known_class_test_mask
correct = (pred[eval_mask] == data.y[eval_mask]).sum()
acc = int(correct) / int(eval_mask.sum())
print(f'Accuracy: {acc:.4f}')

In [None]:
tensor = torch.full((3,), 1/3)
tensor

In [None]:
from torch_geometric.datasets import Amazon
data = Amazon(root='dataset/' + "amazon_photo", name="photo")
data

In [None]:
data = Amazon(root='dataset/' + "amazon_computers", name="computers")
x = data[0].x

In [None]:
# Cosine similarity between all rows of x: normalise once, then take all dot products.
x_unit = F.normalize(x)
cosine_sim = x_unit @ x_unit.t()
# Shift every row so that its largest entry becomes 0.
mat_max, _ = cosine_sim.max(dim=1, keepdim=True)
cosine_mat = cosine_sim - mat_max.detach()
# Index of the least similar node for every row.
cosine_sim.min(dim=1).indices

In [None]:
cosine_mat.min(dim=1).indices

In [None]:
one_hot(data[0].y)

In [None]:
(cosine_sim.clone() - cosine_sim.mean(dim=1, keepdims=True)) #/ dist.std(dim=1, keepdims=True)

In [None]:
dist = cosine_dist

In [None]:
(dist.clone() - dist.mean(dim=1, keepdims=True)) / dist.std(dim=1, keepdims=True)

In [None]:
# Build a GCN encoder, wrap it in Deep Graph Infomax, wrap that in an OpenGraph
# model, and train for 100 epochs while logging validation accuracy to wandb.
# NOTE(review): `readout`, `corrupt` and `OpenGraph` are not defined in the visible
# notebook (the OpenGraph import at the top is commented out); this cell depends
# on hidden kernel state.
# NOTE(review): the nearest preceding assignment of `data` in the notebook is an
# Amazon dataset object; this cell reads data.classes / data.known_classes, so it
# presumably expects the output of load_dataset instead — confirm.
hidden_channels = 128
num_layers = 3
dropout = 0.7
# in_channels = -1 leaves the input size unspecified here — presumably inferred lazily; verify.
encoder = GCN(in_channels = -1,
              hidden_channels = hidden_channels,
              out_channels = None,
              dropout = dropout,
              num_layers = num_layers)



dgi = DeepGraphInfomax(hidden_channels = hidden_channels, 
                       encoder = encoder,
                       summary = readout,
                       corruption = corrupt)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Uncomment to force CPU execution:
#device = torch.device('cpu')

og_model = OpenGraph(dgi, hidden_channels, data.classes.size(0), data.known_classes, data.unknown_classes, device, 
                     sup_loss_weight = 1, 
                     pseudo_loss_weight = 1,
                     unsup_loss_weight = 1, 
                     ood_percentile = 1.0,
                     proto_type="mean", pseudo_label_method = "closest")
og_model = og_model.to(device)

data = data.to(device)

optimizer = torch.optim.Adam(og_model.parameters(), lr=0.01, weight_decay=5e-4)

og_model.train()
for epoch in tqdm(range(100)):
    # NOTE(review): 'epoch' and 'val_acc' are logged in two separate wandb.log calls,
    # which by default land on different wandb steps — confirm this is intended.
    wandb.log({'epoch': epoch})
    # `loss` is assigned but not used further in this cell.
    loss = og_model.train_one_epoch(optimizer, data)

    # Validation accuracy on val-split nodes of known classes.
    # NOTE(review): the model is still in train mode here and no torch.no_grad() is
    # used; whether inference() handles that internally is not visible — verify.
    pred = og_model.inference(data.x, data.edge_index).argmax(dim=1)
    correct = (pred[data.known_class_val_mask] == data.y[data.known_class_val_mask]).sum()
    val_acc = int(correct) / int(data.known_class_val_mask.sum())
    wandb.log({'val_acc': val_acc})
    
og_model.final_prototypes(data)

In [None]:
og_model.eval()
# One forward pass serves both metrics, and no autograd graph is needed for
# evaluation. This assumes inference() is deterministic in eval mode, so the
# second identical call that used to be here produced the same predictions.
with torch.no_grad():
    pred = og_model.inference(data.x, data.edge_index).argmax(dim=1)

# Accuracy on test-split nodes of known classes.
correct = (pred[data.known_class_test_mask] == data.y[data.known_class_test_mask]).sum()
acc = int(correct) / int(data.known_class_test_mask.sum())
print(f'Accuracy: {acc:.4f}')

# Accuracy on all nodes of unknown classes.
correct = (pred[data.unknown_class_mask] == data.y[data.unknown_class_mask]).sum()
acc = int(correct) / int(data.unknown_class_mask.sum())
print(f'Accuracy: {acc:.4f}')

In [None]:
# Train the Deep Graph Infomax model alone on the full graph for 500 full-batch steps.
dgi = dgi.to(device)
data = data.to(device)
# TODO: try RMSprop instead of Adam?
optimizer = torch.optim.Adam(dgi.parameters(), lr=0.01, weight_decay=5e-4)

dgi.train()
for epoch in tqdm(range(500)):
    optimizer.zero_grad()
    # forward() is called directly and returns the three values that dgi.loss() consumes.
    pos_z, neg_z, summary = dgi.forward(data.x, data.edge_index)
    loss = dgi.loss(pos_z, neg_z, summary)
    loss.backward()
    optimizer.step()

In [None]:
# Evaluate the DGI embeddings: embed all nodes once, then hand the train-split and
# test-split embeddings with their labels to dgi.test().
dgi.eval()

# NOTE(review): this forward pass runs without torch.no_grad(), so pos_z presumably
# still carries an autograd graph — verify that dgi.test() detaches its inputs.
pos_z, neg_z, summary = dgi.forward(data.x, data.edge_index)

train_z = pos_z[data.train_mask]
train_y = data.y[data.train_mask]
test_z = pos_z[data.test_mask]
test_y = data.y[data.test_mask]


dgi.test(train_z,
    train_y,
    test_z,
    test_y)