In [None]:
!pip install -r requirements.txt

In [1]:
import time
import os
import pandas as pd
import numpy as np

from dataloader import GraphTextDataset, GraphDataset, TextDataset

import torch
from torch import optim
from torch_geometric.data import DataLoader
from torch.utils.data import DataLoader as TorchDataLoader
from transformers import AutoTokenizer
from torchmetrics.functional import pairwise_cosine_similarity


from alignment import AlignmentModel,Discriminator, gradient_penalty

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import label_ranking_average_precision_score

from tqdm import tqdm

In [2]:
def hard_triplet_loss(graph_embeddings, text_embeddings, margin = 0.3):
    """Triplet loss that uses the hardest in-batch negative in both directions.

    The pairwise cosine similarity between every text and every graph of the
    batch is computed (rows: texts, columns: graphs). Entry ``(i, i)`` is the
    similarity of the matching pair; the largest off-diagonal entry of each
    row / column is used as the hardest negative.

    Args:
        graph_embeddings: graph representations of the batch.
        text_embeddings: text representations of the batch, in the same order.
        margin: margin added to ``negative - positive`` before clamping at 0.

    Returns:
        Scalar tensor: mean over the batch of the row-wise plus column-wise
        hinge terms.
    """
    sim = pairwise_cosine_similarity(text_embeddings, graph_embeddings)

    # Similarity of each matching (text, graph) pair.
    matched = sim.diag()

    # Cosine similarity is never below -1, so -2 on the diagonal guarantees
    # that the max below never selects the positive pair itself.
    sim = sim.fill_diagonal_(-2)

    hardest_per_row = sim.max(dim=1).values
    hardest_per_col = sim.max(dim=0).values

    row_term = torch.clamp(hardest_per_row - matched + margin, min=0)
    col_term = torch.clamp(hardest_per_col - matched + margin, min=0)

    return (row_term + col_term).mean()

### **LOADING DATASET**

In [4]:
# Mini-batch size used by the loaders created in this notebook.
batch_size = 32

# Tokenizer of the pretrained checkpoint named below.
model_name = 'allenai/scibert_scivocab_uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Indexing with `[()]` unwraps the object stored in the .npy file.
# NOTE: allow_pickle=True unpickles the file content - only load trusted files.
gt = np.load("./data/token_embedding_dict.npy", allow_pickle=True)[()]

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Arguments shared by both dataset splits.
dataset_kwargs = dict(root='./data/', gt=gt, tokenizer=tokenizer)
val_dataset = GraphTextDataset(split='val', **dataset_kwargs)
train_dataset = GraphTextDataset(split='train', **dataset_kwargs)

# Only the training loader shuffles; validation keeps the dataset order.
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)



### **LOADING FINETUNED MODELS**

In [5]:
def load_alignment_model(conv_type, checkpoint_path):
    """Build an AlignmentModel, restore its fine-tuned weights and set it to eval mode.

    Args:
        conv_type: value passed as ``type`` to ``AlignmentModel``.
        checkpoint_path: path of a checkpoint whose ``'model_state_dict'`` entry
            holds the weights to restore.

    Returns:
        The model, moved to ``device`` and switched to evaluation mode.
    """
    model = AlignmentModel(in_channels=300, out_channels=300, graph_attention_head=6, type=conv_type)
    model.to(device)

    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
    # machine (``device`` falls back to the CPU when CUDA is unavailable).
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

    model.eval()
    return model


# Checkpoint files of the fine-tuned models, one per graph encoder type.
path_to_model_transformer = 'TransformerConv_model_pretrained_text.pt'
path_to_model_gps = 'GPS_model_pretrained_text.pt'
path_to_model_gin = 'GIN_model_pretrained_text.pt'
path_to_model_egc = 'EGC_model_pretrained_text.pt'
path_to_model_Antisymmetric = 'Antisymmetric_model_pretrained_text.pt'

model_transformers = load_alignment_model('TransformerConv', path_to_model_transformer)
model_GPS = load_alignment_model('GPS', path_to_model_gps)
model_GIN = load_alignment_model('GIN', path_to_model_gin)
model_EGC = load_alignment_model('EGC', path_to_model_egc)
model_Antisymmetric = load_alignment_model('Antisymmetric', path_to_model_Antisymmetric)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing Ber

AlignmentModel(
  (text_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

### **PREDICTIONS ON THE VALIDATION SET**

In [6]:
# Per-model buffers collecting the graph and text embeddings of the validation split.
graph_transformers, text_transformers = [], []
graph_gps, text_gps = [], []
graph_gin, text_gin = [], []
graph_egc, text_egc = [], []
graph_Antisymmetric, text_Antisymmetric = [], []

# (model, graph buffer, text buffer) for every model to evaluate.
val_targets = [
    (model_transformers, graph_transformers, text_transformers),
    (model_GPS, graph_gps, text_gps),
    (model_GIN, graph_gin, text_gin),
    (model_EGC, graph_egc, text_egc),
    (model_Antisymmetric, graph_Antisymmetric, text_Antisymmetric),
]

# Inference only: disabling autograd avoids keeping a computation graph
# for every forward pass.
with torch.no_grad():
    for batch in tqdm(val_loader):
        # Move the batch to the device once instead of once per model.
        batch = batch.to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Graph embeddings of all models first, then text embeddings.
        for model, graph_out, _ in val_targets:
            graph_out.extend(model.forward_graph(batch).tolist())

        for model, _, text_out in val_targets:
            text_out.extend(model.forward_text(input_ids, attention_mask).tolist())



100%|██████████| 104/104 [00:36<00:00,  2.84it/s]


### **EVALUATION ON THE VALIDATION SET**

In [7]:
# Cosine similarity between every text (rows) and every graph (columns)
# embedded by the TransformerConv model.
similarity_transformers = cosine_similarity(text_transformers, graph_transformers)

# The identity matrix marks pair (i, i) as the only relevant graph for text i.
y_true = np.identity(similarity_transformers.shape[0])

score_transformers = label_ranking_average_precision_score(y_true=y_true, y_score=similarity_transformers)
score_transformers

0.8343528594524177

In [8]:
# Cosine similarity between every text (rows) and every graph (columns)
# embedded by the GPS model.
similarity_gps = cosine_similarity(text_gps, graph_gps)

# The identity matrix marks pair (i, i) as the only relevant graph for text i.
y_true = np.identity(similarity_gps.shape[0])

score_gps = label_ranking_average_precision_score(y_true=y_true, y_score=similarity_gps)
score_gps

0.8533235998279229

In [9]:
# Cosine similarity between every text (rows) and every graph (columns)
# embedded by the GIN model.
similarity_gin = cosine_similarity(text_gin, graph_gin)

# The identity matrix marks pair (i, i) as the only relevant graph for text i.
y_true = np.identity(similarity_gin.shape[0])

score_gin = label_ranking_average_precision_score(y_true=y_true, y_score=similarity_gin)
score_gin

0.8189375551353847

In [10]:
# Cosine similarity between every text (rows) and every graph (columns)
# embedded by the EGC model.
similarity_egc = cosine_similarity(text_egc, graph_egc)

# The identity matrix marks pair (i, i) as the only relevant graph for text i.
y_true = np.identity(similarity_egc.shape[0])

score_egc = label_ranking_average_precision_score(y_true=y_true, y_score=similarity_egc)
score_egc

0.8352084282545114

In [11]:
# Cosine similarity between every text (rows) and every graph (columns)
# embedded by the Antisymmetric model.
similarity_Antisymmetric = cosine_similarity(text_Antisymmetric, graph_Antisymmetric)

# The identity matrix marks pair (i, i) as the only relevant graph for text i.
y_true = np.identity(similarity_Antisymmetric.shape[0])

score_Antisymmetric = label_ranking_average_precision_score(y_true=y_true, y_score=similarity_Antisymmetric)
score_Antisymmetric

0.8438302913068332

In [12]:
# Score of the unweighted ensemble: the five validation similarity matrices are
# added element-wise (same left-to-right order as a chained `+`) before ranking.
label_ranking_average_precision_score(
    y_true,
    sum((
        similarity_gps,
        similarity_transformers,
        similarity_egc,
        similarity_gin,
        similarity_Antisymmetric,
    )),
)

0.8921195846260427

In [None]:
# Grid of candidate weights, shared by both loops. alpha weights GPS, beta weights
# TransformerConv, and GIN receives the remainder (1 - alpha - beta).
weight_grid = np.arange(0.2,0.5,1/30)

for alpha in weight_grid:
    for beta in weight_grid:
        # One line per (alpha, beta) pair: the weights and the resulting score in percent.
        print(f'{alpha:.2f}, {beta:.2f} : {label_ranking_average_precision_score(y_true, alpha*similarity_gps + beta*similarity_transformers + (1-alpha-beta)*similarity_gin)*100:.3f}')

In [13]:
# Validation similarity matrices paired with the CSV file each one is written to.
val_exports = [
    (similarity_transformers, 'similarity_transformers_val.csv'),
    (similarity_gps, 'similarity_gps_val.csv'),
    (similarity_gin, 'similarity_gin_val.csv'),
    (similarity_egc, 'similarity_egc_val.csv'),
    (similarity_Antisymmetric, 'similarity_antisymmetric_val.csv'),
]

for matrix, csv_name in val_exports:
    # index=False: only the matrix values (plus the column header) are written.
    pd.DataFrame(matrix).to_csv(csv_name, index=False)



### **PREDICTIONS ON TEST SET**

In [18]:
# Test split: graphs and text descriptions come from two separate datasets.
test_cids_dataset = GraphDataset(root='./data/', gt=gt, split='test_cids')
test_text_dataset = TextDataset(file_path='./data/test_text.txt', tokenizer=tokenizer)

idx_to_cid = test_cids_dataset.get_idx_to_cid()

# Fresh per-model buffers; these names previously held the validation embeddings.
graph_transformers, text_transformers = [], []
graph_gps, text_gps = [], []
graph_gin, text_gin = [], []
graph_egc, text_egc = [], []
graph_Antisymmetric, text_Antisymmetric = [], []

In [19]:
# Start from empty buffers so that re-running this cell does not append a second
# copy of the embeddings to the lists.
graph_transformers, text_transformers = [], []
graph_gps, text_gps = [], []
graph_gin, text_gin = [], []
graph_egc, text_egc = [], []
graph_Antisymmetric, text_Antisymmetric = [], []

# (model, graph buffer, text buffer) for every model used in the ensemble.
test_targets = [
    (model_transformers, graph_transformers, text_transformers),
    (model_GPS, graph_gps, text_gps),
    (model_GIN, graph_gin, text_gin),
    (model_EGC, graph_egc, text_egc),
    (model_Antisymmetric, graph_Antisymmetric, text_Antisymmetric),
]

# shuffle=False keeps both loaders in dataset order.
test_graph_loader = DataLoader(test_cids_dataset, batch_size=batch_size, shuffle=False)
test_text_loader = TorchDataLoader(test_text_dataset, batch_size=batch_size, shuffle=False)

# Inference only: disabling autograd avoids keeping a computation graph
# for every forward pass.
with torch.no_grad():
    for batch in test_graph_loader:
        # Move the graph batch to the device once instead of once per model.
        batch = batch.to(device)
        for model, graph_out, _ in test_targets:
            graph_out.extend(model.forward_graph(batch).tolist())

    for batch in test_text_loader:
        # Transfer the token tensors once and reuse them for every model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        for model, _, text_out in test_targets:
            text_out.extend(model.forward_text(input_ids, attention_mask).tolist())





In [21]:
# Text-to-graph cosine similarity of the test embeddings, one matrix per model.
# The first four names previously held the validation matrices.
(
    similarity_transformers,
    similarity_gps,
    similarity_gin,
    similarity_egc,
    similarity_antisymmetric,
) = (
    cosine_similarity(text_emb, graph_emb)
    for text_emb, graph_emb in (
        (text_transformers, graph_transformers),
        (text_gps, graph_gps),
        (text_gin, graph_gin),
        (text_egc, graph_egc),
        (text_Antisymmetric, graph_Antisymmetric),
    )
)

# Unweighted ensemble: element-wise sum, accumulated in the listed order.
similarity = sum((
    similarity_transformers,
    similarity_gps,
    similarity_gin,
    similarity_antisymmetric,
    similarity_egc,
))

In [22]:
# Persist the per-model test similarity matrices (values only, no index column).
pd.DataFrame(similarity_transformers).to_csv('similarity_transformers_test.csv', index=False)
pd.DataFrame(similarity_gps).to_csv('similarity_gps_test.csv', index=False)
pd.DataFrame(similarity_gin).to_csv('similarity_gin_test.csv', index=False)
pd.DataFrame(similarity_egc).to_csv('similarity_egc_test.csv', index=False)
# The test matrix is `similarity_antisymmetric` (lowercase). `similarity_Antisymmetric`
# is the validation matrix and must not be written to the test file.
pd.DataFrame(similarity_antisymmetric).to_csv('similarity_antisymmetric_test.csv', index=False)



In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Submission table: one row per row of the ensembled similarity matrix, with
# the row number in a leading 'ID' column followed by the similarity values.
solution = pd.DataFrame(similarity)
solution.insert(0, 'ID', solution.index)
solution.to_csv('submission_gin+gps+egc+antisymmetric+transformerconv_text_pretrained_08921.csv', index=False)

In [None]:
# Rank (1 = most similar) of every entry within its row of the similarity matrix.
# A single argsort only yields the sorting permutation (position -> column index);
# applying argsort a second time turns it into per-element ranks, which is what is
# needed for an element-wise combination with the similarity matrices.
order_transformers = np.argsort(np.argsort(-similarity_transformers, axis = 1), axis = 1) + 1
order_gps = np.argsort(np.argsort(-similarity_gps, axis = 1), axis = 1) + 1

# Coefficient that equals 1 when both models rank the entry first and
# decreases as the ranks grow.
order_coeff = 2/(order_transformers + order_gps)

In [None]:
# Weight derived from `order_transformers` (computed in the preceding cell):
# log(1 + 1/order), i.e. larger for smaller order values.
rank_weight = np.log(1 + 1/order_transformers)

# Sign and magnitude of the similarity are handled separately, so the weight
# scales the magnitude while the sign of the similarity is kept.
scores = np.sign(similarity_transformers) * rank_weight * np.abs(similarity_transformers)

In [None]:
# NOTE(review): `coefs` is not defined anywhere in the visible notebook, so this cell
# raises NameError on a fresh run - presumably `scores` or `order_coeff` was meant; confirm.
# NOTE(review): `y_true` was built from the validation similarity matrices, while the
# `similarity_*` arrays were recomputed on the test split afterwards - check that the
# scored matrix belongs to the validation split before relying on this number.
label_ranking_average_precision_score(y_true, coefs)