In [None]:
# Environment setup. The PyTorch stack is pinned to 2.1.0 and pulled from the
# CUDA 12.1 (cu121) wheel index; the PyG extension wheels two lines below are
# fetched from the index built for that same torch-2.1.0+cu121 combination,
# so the two must be changed together.
! pip install torch==2.1.0  torchvision==0.16.0 torchtext==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121
! pip install torch_geometric==2.4
! pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.1.0+cu121.html
# The remaining packages are installed without a version pin.
! pip install sentence-transformers
! pip install torcheval
! pip install matplotlib
! pip install pandas
! pip install tensorboard

In [None]:
from torch_geometric.data import HeteroData
import pandas as pd
import numpy as np 
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import torch 
# Load the pre-built heterogeneous graph.
# NOTE(review): torch.load unpickles arbitrary objects -- only point it at
# trusted files.
data = torch.load('/kaggle/input/twibot22-pyggraph/TwiBot22_Graph_with_degreecounts_with_y.pt')

# Cast every node type's feature matrix to float32.
for node_type in data.node_types:
    data[node_type].x = data[node_type].x.float()

# node_ids is indexed by user id below to obtain a node index
# (presumably a dict user_id -> node index; verify against the file).
node_ids = torch.load('/kaggle/input/twibot22-pyggraph/unique_nodes.pt')
test_df = pd.read_csv('/kaggle/input/twibot22-pyggraph/test.csv')
# Keep only test users that exist in the graph. The explicit .copy() makes the
# filtered frame independent of the original, so adding the 'index' column
# below does not trigger pandas' SettingWithCopyWarning.
test_df = test_df[test_df['user_id'].isin(node_ids)].copy()
test_df['index'] = test_df['user_id'].apply(lambda x: node_ids[x])


In [None]:
# Define GNN Model
from torch_geometric.nn import HGTConv, Linear

from torch.nn import functional as F

class HGT(torch.nn.Module):
    """Heterogeneous Graph Transformer whose output is computed from 'user' nodes.

    The network consists of one input projection per node type (each followed
    by an in-place ReLU), a stack of ``HGTConv`` layers, and a final linear
    head that is applied to the ``'user'`` entry of the node embeddings only.

    Args:
        hidden_channels: Output width of the per-type projections and both the
            input and output width of every ``HGTConv`` layer.
        out_channels: Output width of the final linear head.
        num_heads: Forwarded positionally to each ``HGTConv`` right after the
            metadata argument.
        num_layers: Number of ``HGTConv`` layers to stack.
        node_types: Iterable of node type names; one projection is created
            per entry.
        data_metadata: Graph metadata forwarded to every ``HGTConv``.
    """

    def __init__(self, hidden_channels, out_channels, num_heads, num_layers, node_types, data_metadata):
        super().__init__()

        # One projection per node type. The -1 input size is left for the
        # layer to resolve (presumably on the first forward pass -- confirm
        # against the torch_geometric Linear docs).
        self.lin_dict = torch.nn.ModuleDict({
            name: Linear(-1, hidden_channels) for name in node_types
        })

        # Stack of identically configured transformer convolutions.
        layers = [
            HGTConv(hidden_channels, hidden_channels, data_metadata,
                    num_heads, group='sum')
            for _ in range(num_layers)
        ]
        self.convs = torch.nn.ModuleList(layers)

        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        """Return the head's output for the 'user' node embeddings.

        Args:
            x_dict: Mapping from node type to that type's input features.
            edge_index_dict: Edge indices handed unchanged to every conv layer.
        """
        hidden = {}
        for name, features in x_dict.items():
            hidden[name] = self.lin_dict[name](features).relu_()

        for layer in self.convs:
            hidden = layer(hidden, edge_index_dict)

        return self.lin(hidden['user'])
    
    
    
# Instantiate the classifier: a single output channel, node types and
# metadata taken from the loaded graph.
model = HGT(
    hidden_channels=256,
    out_channels=1,
    num_heads=8,
    num_layers=2,
    node_types=data.node_types,
    data_metadata=data.metadata(),
)


In [None]:
# create minibatch loader
from torch_geometric.loader import HGTLoader
batch_size = 32
num_node_types = len(data.node_types)
# Sampling budget handed to HGTLoader as `num_samples`, one entry per hop:
# 20 nodes per seed for the first hop and 20 * 8 per seed for the second,
# each divided evenly by the number of node types (HGTLoader's documentation
# describes every entry as a per-node-type count).
# A third hop would be (20 * 8 * 3 * batch_size) // num_node_types.
one_hop_neighbors, two_hop_neighbors = (
    (20 * growth * batch_size) // num_node_types for growth in (1, 8)
)
num_neighbors = [one_hop_neighbors, two_hop_neighbors]



In [None]:
# train model 
from tqdm.auto import tqdm
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
from pathlib import Path



In [None]:
# get weights after training
import gc 
def init(model, state_dict_path):
    """Run warm-up forward passes through ``model`` and load saved weights.

    For every node type of the module-level ``data`` graph, one mini-batch is
    drawn with ``HGTLoader`` (using the module-level ``num_neighbors`` budget)
    and pushed through ``model``; afterwards the checkpoint weights are loaded.
    The warm-up pass presumably exists so that lazily sized layers have
    concrete shapes before ``load_state_dict`` runs -- confirm if changing it.

    The model is switched to eval mode and left that way.

    Args:
        model: Module to initialise. It is called as
            ``model(x_dict, edge_index_dict)``.
        state_dict_path: Path of a checkpoint readable by ``torch.load`` that
            stores the model weights under the ``'model_state_dict'`` key.
    """
    # Read the checkpoint from disk once instead of once per node type; only
    # the model weights are kept.
    # NOTE(review): torch.load unpickles arbitrary objects -- trusted files only.
    state_dict = torch.load(state_dict_path)['model_state_dict']

    model.eval()
    with torch.no_grad():
        for node_type in data.node_types:
            print(node_type)

            loader = HGTLoader(
                data,
                num_samples=num_neighbors,
                batch_size=64,
                input_nodes=node_type,
                num_workers=0,
                pin_memory=True,
                prefetch_factor=None,
            )
            # A single mini-batch per node type is enough for the warm-up.
            minibatch = next(iter(loader))

            model(minibatch.x_dict, minibatch.edge_index_dict)
            model.load_state_dict(state_dict)

            # Drop the loader right away so its memory can be reclaimed before
            # the next one is built.
            del loader, minibatch
            gc.collect()




In [None]:
# Evaluate a series of saved checkpoints on the test users and write one row
# of scores per checkpoint to /kaggle/working/scores.csv.

# The id lookup is no longer needed (test_df already carries the resolved node
# indices); free it before moving the graph and the model to the GPU.
del node_ids
gc.collect()
data = data.cuda()


model.cuda()
with open('/kaggle/working/scores.csv', 'w') as csv_file:
    csv_file.write('Samplesseen,F1,precision,recall\n')
    for path in [
        '/kaggle/input/twibot22-hgt-models/model_samplesseen6407872.pt',
        '/kaggle/input/twibot22-hgt-models/model_samplesseen6567872.pt',
        '/kaggle/input/twibot22-hgt-models/model_samplesseen6727872.pt',
        '/kaggle/input/twibot22-hgt-models/model_samplesseen6887872.pt',
        '/kaggle/input/twibot22-hgt-models/model_samplesseen7047872.pt',
        '/kaggle/input/twibot22-hgt-models/model_samplesseen7175872.pt'
    ]:
        print('Loading state dict:',path)

        init(model,path)
        # The loader is rebuilt per checkpoint and deleted at the end of the
        # iteration so it is not alive while init() builds its own loaders.
        test_loader = HGTLoader(
            data,
            num_samples=num_neighbors,
            batch_size=32,
            input_nodes=('user', test_df['index'].values.tolist()),  # in testing the model can see the training user nodes, but only test nodes are used for testing
            num_workers=4,
            pin_memory=True,
            prefetch_factor=2,
            shuffle=True
        )

        with torch.no_grad():
            model.eval()
            y_hat = []
            y = []
            # Wrap the loader (not the enumerate object) so tqdm knows the total.
            for i, minibatch in enumerate(tqdm(test_loader)):
                # The sampled subgraph also contains neighbouring users; only
                # the first `batch_size` 'user' rows are the seed (test) nodes,
                # so restrict predictions and labels to those.
                num_seeds = minibatch['user'].batch_size
                out = model(minibatch.x_dict, minibatch.edge_index_dict)[:num_seeds]
                # The model emits a single logit per user; logit > 0 means the
                # positive class.
                yhat_b = (out > 0).reshape(-1).long().cpu().numpy()
                y_b = minibatch['user'].y[:num_seeds].reshape(-1).long().cpu().numpy()
                # zero_division=0 silences the warning for batches that contain
                # no positive label or prediction.
                print(i, f1_score(y_b, yhat_b, zero_division=0), end='\r')
                y_hat.append(yhat_b)
                y.append(y_b)

            y_hat = np.concatenate(y_hat)
            y = np.concatenate(y)
            print('')
            # scikit-learn metrics take (y_true, y_pred). Passing them the other
            # way round leaves F1 unchanged but swaps precision and recall.
            f1 = f1_score(y, y_hat)
            samplesseen = int(path.replace('/kaggle/input/twibot22-hgt-models/model_samplesseen','').replace('.pt',''))
            print('final',f1, 'samplesseen',samplesseen)
            print('')
            precision = precision_score(y, y_hat)
            recall = recall_score(y, y_hat)
            csv_file.write(f'{samplesseen},{f1},{precision},{recall}\n')
            # Flush so finished rows survive an interruption of a later checkpoint.
            csv_file.flush()

        del test_loader