In [1]:
import pandas as pd
import json
import torch
import torch.nn.functional as F
from loguru import logger
import sys
import os
from tqdm import tqdm

import numpy as np
from torch_geometric.data import Data

from core.LLM.LLMEncoder import LLMEncoder
from core.ToTorch.DataBuilder import QADataBuilder, QAMaskBuilder
from config.config import (
    TRIPLES_PATH,
    ENTITIES_LABELS_PATH,
    PROPERTIES_LABELS_PATH,
    GRAPH_EMBEDDINGS_PATH,
    QUESTIONS_ANSWERS_PATH,
    QUESTIONS_EMBEDDINGS_PATH,
    QUESTIONS_CONCEPTS_ANSWERS_PATH,
    GRAPH_EMBEDDINGS_WITH_COMMENT_PATH,
    EXPERIMENT_RESULTS_PATH,

    TRIPLES_PATH_OLD,
    ENTITIES_LABELS_PATH_OLD,
    PROPERTIES_LABELS_PATH_OLD,
    GRAPH_EMBEDDINGS_PATH_OLD,
    QUESTIONS_CONCEPTS_ANSWERS_PATH,
    GRAPH_EMBEDDINGS_PATH_OLD, QA_TRAINING_FILE_PATH,
    QA_TESTING_FILE_PATH
)

from core.NeuralNet.GNN import GCN,RGCN

from core.experiments.utils import QAEvaluationMetrcis, load_model
import matplotlib.pyplot as plt
from collections import defaultdict
NUM_EPOCHS=1

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Example graph with 15 nodes but just 5 edges where many nodes from node 4 to node 14 are not connected with anything.
x = torch.rand(15,5)
edge_index = torch.tensor([[0, 1, 12, 12, 13], [1, 2, 13, 10, 11]], dtype=torch.long)
edge_type = torch.tensor([0, 1, 2, 0, 3], dtype=torch.long)

# Assuming node_type_ids is [0, 1, 2, 3]
node_type_ids = torch.tensor([0, 1, 2, 3], dtype=torch.long)

data = Data(x=x,edge_index=edge_index, edge_type=edge_type)
print('edge_index:',edge_index)
#x
#r_model = RGCN(num_node_features=5,dim_hidden_layer=2,num_relations=4,num_bases=None,num_layers=2,num_classes=2)

In [None]:
q_mask = [False,False,True,True,True]

masked_edge_index = edge_index[:,q_mask]
print('masked_edge_index:', masked_edge_index)
masked_edge_type = edge_type[q_mask]
print('masked_edge_type:', masked_edge_type)
q_nodes = torch.tensor([10,13,11,12])
x[q_nodes]
q_nodes_to_index = {sub_node : index for index, sub_node in enumerate (q_nodes.tolist())}
print('mapping dict:',q_nodes_to_index)

mapped_masked_edge_index = torch.tensor(
    [
       [q_nodes_to_index[head.item()] for head in masked_edge_index[0]],
       [q_nodes_to_index[tail.item()] for tail in masked_edge_index[1]]
    ],dtype=torch.long
)
mapped_masked_edge_index

In [None]:

qa_data_builder = QAMaskBuilder(
    triples_path=TRIPLES_PATH_OLD,
    entities_labels_path=ENTITIES_LABELS_PATH_OLD,
    properties_labels_path=PROPERTIES_LABELS_PATH_OLD,
    embeddings_path=GRAPH_EMBEDDINGS_PATH_OLD,
    training_questions_concepts_answers_file_path = QA_TRAINING_FILE_PATH,
    testing_questions_concepts_answers_file_path = QA_TESTING_FILE_PATH,
    questions_embeddings_path = QUESTIONS_EMBEDDINGS_PATH
    
)

In [None]:
data = qa_data_builder.build_data()

In [None]:
# TRAIN GNN
logger.info("Training GNN")
model = GCN(
    num_node_features=data.num_node_features*2, dim_hidden_layer=16,num_layers=1, num_classes=2
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1, weight_decay=5e-4)

In [None]:
for idx, row in qa_data_builder.training_questions_concepts_answers.iterrows():
    #print(row["question"])
    q_data = qa_data_builder.get_concepts_and_masks_for_question(
                                    question =row["question"], 
                                    concept_uri= row["concepts"], 
                                    answer_uri= row["answers"], 
                                    training=True)
    
    optimizer.zero_grad()
    out,embedding = model(q_data)
    loss = F.nll_loss(out[q_data.train_mask], q_data.y[q_data.train_mask],weight = torch.tensor([1.0,15.0]))
    loss.backward()
    optimizer.step()
    logger.debug(f"Epoch: {idx:03d}, Loss: {loss:.4f}")
   
    break;
q_data

In [None]:
q_data.y[q_data.train_mask]

## QA Training

In [71]:
# Build data
logger.remove()
logger.add(sys.stderr, level="DEBUG")

## CREATE DATA
logger.info("Creating Data object")

qa_data_builder = QAMaskBuilder(
    triples_path=TRIPLES_PATH_OLD,
    entities_labels_path=ENTITIES_LABELS_PATH_OLD,
    properties_labels_path=PROPERTIES_LABELS_PATH_OLD,
    embeddings_path=GRAPH_EMBEDDINGS_PATH_OLD,
    questions_concepts_answers_path=QUESTIONS_CONCEPTS_ANSWERS_PATH,
    questions_embeddings_path = QUESTIONS_EMBEDDINGS_PATH
    
)

[32m2023-09-04 17:15:51.521[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mCreating Data object[0m


TypeError: QAMaskBuilder.__init__() got an unexpected keyword argument 'questions_concepts_answers_path'

In [None]:
data = qa_data_builder.build_data()
data

In [None]:
# TRAIN GNN
logger.info("Training GNN")
model = GCN(
    num_node_features=data.num_node_features*2, dim_hidden_layer=16,num_layers=1, num_classes=2
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1, weight_decay=5e-4)

In [None]:
qa_subgraph_info = []
for idx, row in tqdm(qa_data_builder.question_concepts_answers.iterrows()):

    q_edge_mask, q_nodes, q_concept_mask, q_answer_mask, q_answer_and_random_nodes_mask =qa_data_builder.get_concepts_and_masks_for_question(question =row["question"], concept_uri= row["concepts"], answer_uri= row["answers"])
    q_edge_index = data.edge_index[:,q_edge_mask]
    q_edge_type = data.edge_type[q_edge_mask]
    q_training_x_mask = qa_data_builder.get_question_training_mask_for_x()
    q_y_labels = qa_data_builder.get_question_y_labels()
    qa_subgraph_info.append({"q_idx":idx,"q":row["question"],"q_pattern_id":row["pattern_id"],"q_edge_mask":q_edge_mask, "q_nodes":q_nodes, "q_concept_mask":q_concept_mask, "q_answer_mask":q_answer_mask, "q_answer_and_random_nodes_mask":q_answer_and_random_nodes_mask,"q_edge_index":q_edge_index,"q_edge_type":q_edge_type,"q_training_x_mask":q_training_x_mask,"q_y_labels":q_y_labels})


In [None]:
qa_subgraph_info[0]

In [None]:
# No of questions that have answer node within it
count_answers_in_subgraph = 0
for item in tqdm(qa_subgraph_info):
    if sum(item["q_answer_mask"])>0:
        count_answers_in_subgraph += 1
count_answers_in_subgraph

In [None]:
pattern_id_q_idx_node_count  = []
for idx, item in tqdm(enumerate(qa_subgraph_info)):
    node_count = len(item["q_nodes"])
    pattern_id = item['q_pattern_id']
    q_idx = item['q_idx']
    pattern_id_q_idx_node_count.append((pattern_id,q_idx,node_count))

In [None]:


counts = defaultdict(dict)

for pattern_id, question_index, nodes_in_question in pattern_id_q_idx_node_count:
    counts[pattern_id][question_index]= nodes_in_question

# Create subplots with 5 plots per row and 3 rows
num_patterns = len(counts)
rows = 5
cols = num_patterns // rows + (1 if num_patterns % rows > 0 else 0)
fig, axes = plt.subplots(rows, cols, figsize=(15, 15))

# Create plots for each pattern_id
for i, (pattern_id, question_counts) in enumerate(counts.items()):

    row = i // cols
    col = i % cols
    x = list(question_counts.keys())
    y = list(question_counts.values())

    ax = axes[row, col]
    ax.bar(x, y)
    ax.set_xlabel('Question Index')
    ax.set_ylabel('Subgraph Node Count')
    ax.set_title(f'Pattern: {pattern_id} -- #Qs {len(question_counts)} -- #Nodes(Min:{min(y)} Max:{max(y)})')

plt.tight_layout()
plt.show()

In [None]:
subgraph_node_count = [element[2] for element in pattern_id_q_idx_node_count ]

In [None]:
# Create subplots with 5 plots per row and 3 rows
num_patterns = len(counts)
rows = 5
cols = num_patterns // rows + (1 if num_patterns % rows > 0 else 0)
fig, axes = plt.subplots(rows, cols, figsize=(15, 15))

# Create plots for each pattern_id
for i, (pattern_id, question_counts) in enumerate(counts.items()):

    row = i // cols
    col = i % cols
    subgraph_node_count = list(question_counts.values())
    ax = axes[row, col]

    ax.hist(subgraph_node_count, bins=20, edgecolor='black')
    ax.set_xlabel('Subgraph Node Count')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Pattern: {pattern_id} -- #Qs {len(question_counts)} -- #Nodes(Min:{min(subgraph_node_count)} Max:{max(subgraph_node_count)})')

plt.tight_layout()
plt.show()



In [None]:
# Average number of nodes in each question
q_idx_node_count  = []
for idx, item in tqdm(enumerate(qa_subgraph_info)):
    node_count = len(item["q_nodes"])
    q_idx_node_count.append(node_count)
print("Average nodes in the subgraphs :",round(sum(q_idx_node_count)/idx,2))
print("Maximum nodes in the subgraph :",max(q_idx_node_count))
print("Minimum nodes in the subgraph :",min(q_idx_node_count))

In [None]:
import matplotlib.pyplot as plt
# Plot the distribution
plt.plot(q_idx_node_count[:])
plt.xlabel('Q_idx')
plt.ylabel('q_idx_node_count')
plt.title('Distribution of subgrapgh node count')
plt.show()


In [None]:
shuffled_indices = np.random.permutation(qa_data_builder.question_concepts_answers.index)
shuffled_indices

In [None]:
model.train()
limit=1
i=0
for idx in shuffled_indices:
    row = qa_data_builder.question_concepts_answers.loc[idx]
    q_embedding = qa_data_builder.questions_to_embeddings[row["question"]]
    q_x = qa_data_builder.get_x(to_concat=q_embedding)
    q_edge_mask, q_nodes, q_concept_mask, q_answer_mask, q_answer_and_random_nodes_mask =qa_data_builder.get_concepts_and_masks_for_question(question =row["question"], concept_uri= row["concepts"], answer_uri= row["answers"])
    q_edge_index = data.edge_index[:,q_edge_mask]
    q_edge_type = data.edge_type[q_edge_mask]
    q_training_x_mask = qa_data_builder.get_question_training_mask_for_x()
    q_y_labels = qa_data_builder.get_question_y_labels()
    q_data = Data(x=q_x, edge_index=q_edge_index, edge_type=q_edge_type, train_mask=q_training_x_mask, y=q_y_labels)
    print(f'Training for Q {idx} : {row["question"]}')
    for epoch in range(200):
        optimizer.zero_grad()
        out,embedding = model(q_data)
        loss = F.nll_loss(out[q_data.train_mask], q_data.y[q_data.train_mask],weight = torch.tensor([1.0,15.0]))
        loss.backward()
        optimizer.step()
        if epoch%50==0:
            logger.debug(f"Epoch: {epoch:03d}, Loss: {loss:.4f}")
    if i==limit:
        break;
    i+=1

In [None]:
limit=2
i=0
# Evaluation
res = []
for idx in shuffled_indices:
    
    if i==limit:
        break;

    row = qa_data_builder.question_concepts_answers.loc[idx]
    q_embedding = qa_data_builder.questions_to_embeddings[row["question"]]
    q_x = qa_data_builder.get_x(to_concat=q_embedding)
    q_edge_mask, q_nodes, q_concept_mask, q_answer_mask, q_answer_and_random_nodes_mask =qa_data_builder.get_concepts_and_masks_for_question(question =row["question"], concept_uri= row["concepts"], answer_uri= row["answers"])
    q_edge_index = data.edge_index[:,q_edge_mask]
    q_edge_type = data.edge_type[q_edge_mask]
    q_training_x_mask = qa_data_builder.get_question_training_mask_for_x()
    q_y_labels = qa_data_builder.get_question_y_labels()
    q_data = Data(x=q_x,edge_index=q_edge_index,edge_type=q_edge_type,train_mask =q_training_x_mask,y=q_y_labels)
    model.eval()
    print(f'Predicting for Q {idx} : {row["question"]}')
    out,_ = model(q_data)
    predicted_answer_nodes = torch.where(out.argmax(dim=1))[0]
    predicted_answer_node_probabilities = out.max(dim=1)[0][predicted_answer_nodes]
    sorted_probability_indices = torch.argsort(predicted_answer_node_probabilities, descending= True)
    count_predicted_nodes =len(predicted_answer_nodes)
    actual_answer_nodes = q_nodes[q_answer_mask].tolist()
    if count_predicted_nodes > 0:
        logger.debug(f"answers predicted")
        is_predicted_in_actual_answers = bool(set(actual_answer_nodes) & set(predicted_answer_nodes[sorted_probability_indices].tolist()))
        res.append((idx, actual_answer_nodes, predicted_answer_nodes[sorted_probability_indices].tolist(),predicted_answer_node_probabilities[sorted_probability_indices].tolist(),count_predicted_nodes,is_predicted_in_actual_answers))
    
    
    else:
        logger.debug(f"NO answers found")
        res.append((idx, actual_answer_nodes, np.nan,np.nan,0,False))
    
    i+=1

In [None]:
eval_res = pd.DataFrame.from_records(res,columns=["q_idx","actual_answer_nodes","predicted_answer_nodes","probabilities_of_answer_nodes","count_predicted_nodes","is_predicted_in_actual"])
eval_res

In [None]:
q_idx = 254
#(q_edge_mask, q_nodes, q_concept_mask, q_answer_mask, q_answer_and_random_nodes_mask,q_edge_index,q_edge_type,q_training_x_mask,q_y_labels)
print("Total nodes in the subgraph",len(set(qa_subgraph_info[q_idx]['q_nodes'].tolist())))

len(set(eval_res.loc[eval_res["q_idx"]==q_idx,"predicted_answer_nodes"].iloc[0]) & set(qa_subgraph_info[q_idx]['q_nodes'].tolist()))
#set(eval_res.loc[eval_res["q_idx"]==q_idx,"predicted_answer_nodes"].iloc[0]) & set(qa_subgraph_info[q_idx]['q_nodes'].tolist())

In [None]:
limit=1
i=0
for idx, row in eval_res.iterrows():
    if row['count_predicted_nodes'] < 30 and row['count_predicted_nodes'] >0 :
        print(f'Question: {qa_data_builder.question_concepts_answers.loc[row["q_idx"],"question"]}')
        print(f'Concepts: {qa_data_builder.question_concepts_answers.loc[row["q_idx"],"concepts"]}')
        print(f'Actual Answer: {row["actual_answer_nodes"]} -- URIs :{qa_data_builder.question_concepts_answers.loc[row["q_idx"],"answers"]}')
        if row["is_predicted_in_actual"]:
            for node in row["predicted_answer_nodes"]:
                
                print(f'Preicted Node: {node} -- URI:{qa_data_builder.index_to_entity[node]}')
        else:
            print("Actua answer is not in the predicted nodes list.")
            for node in row["predicted_answer_nodes"]:
                print(f'Preicted Node: {node} -- URI:{qa_data_builder.index_to_entity[node]}')
    
    if i==limit:
        break;
    i+=1

In [None]:
import ast

In [None]:
type('str')

In [None]:
#./core/experiments/qa/results/20230825123318/evaluation_results.csv

eval_results = pd.read_csv('C:/Users/I583773/Documents/Thesis/evaluation_results.csv')
# reads columns as lists instead of strings
list_columns = ['predicted_answer_nodes', 'probabilities_of_answer_nodes']
for col in list_columns:
    eval_results[col] = eval_results[col].apply(lambda x : ast.literal_eval(x) if type(x)==str else [])
#\20230825123318
eval_results.head()

In [None]:
eval_results.columns

In [None]:
sum(eval_results.is_predicted_in_actual)

In [None]:
mask = (eval_results['count_predicted_nodes'] ==3)

limit=1
i=0
for idx, row in eval_results[mask].iterrows():

    print(f'Question: {qa_data_builder.question_concepts_answers.loc[row["q_idx"],"question"]}')
    print(f'Concepts: {qa_data_builder.question_concepts_answers.loc[row["q_idx"],"concepts"]}')
    print(f'Actual Answer: {row["actual_answer_nodes"]} -- URIs :{qa_data_builder.question_concepts_answers.loc[row["q_idx"],"answers"]}')
    if row["is_predicted_in_actual"]:
        for node in row["predicted_answer_nodes"]:
        
            
            print(f'Preicted Node: {node} -- URI:{qa_data_builder.index_to_entity[node]}')
    else:
        print("Actua answer is not in the predicted nodes list.")
        for node in row["predicted_answer_nodes"]:
            print(f'Preicted Node: {node} -- URI:{qa_data_builder.index_to_entity[node]}')
    
    if i==limit:
        break;
    i+=1

In [None]:
class QAEvaluationMetrcis:
    def __init__(self,model_prediction_path:str):

        
        self.evaluation_results = pd.read_csv(os.path.join(model_prediction_path,'evaluation_results.csv'))
        
        # reads columns as lists instead of strings
        list_type_columns = ['actual_answer_nodes','predicted_answer_nodes', 'probabilities_of_answer_nodes']
        for col in list_type_columns:
            self.evaluation_results[col] = self.evaluation_results[col].apply(lambda x : ast.literal_eval(x) if type(x)==str else [])

        #one_answer_mask = self.evaluation_results['actual_answer_nodes'].apply(lambda x : True if len(x)==1 else False )

    def hits_at_k(self,predictions, actual, k):
        hits = 0
        for pred_nodes, actual_node in zip(predictions, actual):
            if any(node in pred_nodes[:k] for node in actual_node):
                hits += 1
        return hits / len(predictions)

    
    def reciprocal_rank(self, predictions, actual):
        ranks = []
        for pred_nodes, actual_node in zip(predictions, actual):
            if any(node in pred_nodes for node in actual_node):
                rank = pred_nodes.index(actual_node[0]) + 1 if actual_node[0] in pred_nodes else 0
                ranks.append(1 / rank if rank > 0 else 0)
        return sum(ranks) / len(predictions)
    
    def precision_at_k(self,predictions, actual, k):
        correct_predictions = 0
        total_predictions = 0
        for pred_nodes, actual_node in zip(predictions, actual):
            correct_predictions += len(set(pred_nodes[:k]) & set(actual_node))
            total_predictions += k
        return correct_predictions / total_predictions
    
    def recall_at_k(self, predictions, actual, k):
        correct_predictions = 0
        total_actual = 0
        for pred_nodes, actual_node in zip(predictions, actual):
            correct_predictions += len(set(pred_nodes[:k]) & set(actual_node))
            total_actual += len(actual_node)
        return correct_predictions / total_actual



    def run_evaluation(self):

        self.hits_1 = self.hits_at_k(self.evaluation_results['predicted_answer_nodes'], self.evaluation_results['actual_answer_nodes'], k=1)
        self.hits_3 = self.hits_at_k(self.evaluation_results['predicted_answer_nodes'], self.evaluation_results['actual_answer_nodes'], k=3)
        self.hits_5 = self.hits_at_k(self.evaluation_results['predicted_answer_nodes'], self.evaluation_results['actual_answer_nodes'], k=5)
        self.mrr = self.reciprocal_rank(self.evaluation_results['predicted_answer_nodes'], self.evaluation_results['actual_answer_nodes'])
        self.recall_1 = self.recall_at_k(self.evaluation_results['predicted_answer_nodes'], self.evaluation_results['actual_answer_nodes'], k=1)
        self.recall_3 = self.recall_at_k(self.evaluation_results['predicted_answer_nodes'], self.evaluation_results['actual_answer_nodes'], k=3)
        self.recall_5 = self.recall_at_k(self.evaluation_results['predicted_answer_nodes'], self.evaluation_results['actual_answer_nodes'], k=5)
        self.precision_1 = self.precision_at_k(self.evaluation_results['predicted_answer_nodes'], self.evaluation_results['actual_answer_nodes'], k=1)
        self.precision_3 = self.precision_at_k(self.evaluation_results['predicted_answer_nodes'], self.evaluation_results['actual_answer_nodes'], k=3)
        self.precision_5 = self.precision_at_k(self.evaluation_results['predicted_answer_nodes'], self.evaluation_results['actual_answer_nodes'], k=5)
                
        return self.hits_1, self.hits_3, self.hits_5, self.mrr, self.precision_1, self.precision_3 ,self.precision_5, self.recall_1, self.recall_3, self.recall_5

In [None]:
# evaluation metrics functionality

#./core/experiments/qa/results/20230825123318/evaluation_results.csv
#eval_results = pd.read_csv('C:/Users/I583773/Documents/Thesis/evaluation_results.csv')
path = './core/experiments/qa/results/20230825123318/'
evaluate_model = QAEvaluationMetrcis(path)

hits_1, hits_3, hits_5, MRR, precision_1, precision_3, precision_5, recall_1, recall_3, recall_5 = evaluate_model.run_evaluation()
print(f'hits@1 : {np.round(hits_1,2)} -- hits@3 : {np.round(hits_3,2)} -- hits@5 : {np.round(hits_5,2)} -- MRR : {np.round(MRR,2)}\nprecision@1 : {np.round(precision_1,2)}, --precision@3 : {np.round(precision_3,2)} --precision@5 : {np.round(precision_5,2)}\nrecall@1 : {np.round(recall_1,2)}, --recall@3 : {np.round(recall_3,2)} --recall@5 : {np.round(recall_5,2)}')

In [None]:
one_answer_mask = evaluation_results['actual_answer_nodes'].apply(lambda x : True if len(x)==1 else False )
sum(one_answer_mask)

In [None]:
hits_at_k(evaluation_results[one_answer_mask]['predicted_answer_nodes'], evaluation_results[one_answer_mask]['actual_answer_nodes'], k=2)

In [None]:
#load evaluation file
model_prediction_path = './core/experiments/qa/results/20230825123318/'
evaluation_results = pd.read_csv(os.path.join(model_prediction_path,'evaluation_results.csv'))
# reads columns as lists instead of strings
list_type_columns = ['actual_answer_nodes','predicted_answer_nodes', 'probabilities_of_answer_nodes']
for col in list_type_columns:
    evaluation_results[col] = evaluation_results[col].apply(lambda x : ast.literal_eval(x) if type(x)==str else [])

In [64]:
# read experiment metadata
exp_metadata = pd.read_csv('./core/experiments/qa/qa_experiments_masterdata.csv')
exp_metadata[['time_stamp', 'Epochs', 'Learning Rate', 'hidden_dimension',
       'num_layers', 'num_bases', 'Model', 'hits@1', 'hits@3', 'hits@5', 'mrr', 'precision@1', 'precision@3', 'precision@5', 'recall@1', 'recall@3',
       'recall@5']].head(1)
len(exp_metadata)

138

In [None]:
#exp_metadata[['precision@1', 'precision@3', 'precision@5', 'recall@1', 'recall@3','recall@5']]

In [65]:
exp_metadata['F1@1'] = 2*(exp_metadata['precision@1']*exp_metadata['recall@1'])/(exp_metadata['precision@1']+exp_metadata['recall@1'])
exp_metadata['F1@3'] = 2*(exp_metadata['precision@3']*exp_metadata['recall@3'])/(exp_metadata['precision@3']+exp_metadata['recall@3'])
exp_metadata['F1@5'] = 2*(exp_metadata['precision@5']*exp_metadata['recall@5'])/(exp_metadata['precision@5']+exp_metadata['recall@5'])

In [66]:
# GCN
gcn_mask = ( (exp_metadata['Epochs']==20) &  (exp_metadata['Model'].str.startswith('GCN')))
print('Total GCN experiments: ',sum(gcn_mask))
#exp_metadata[gcn_mask][['time_stamp', 'Epochs', 'Learning Rate', 'hidden_dimension','num_layers', 'num_bases', 'Model', 'hits@1', 'hits@3', 'hits@5', 'mrr']]

Total GCN experiments:  21


In [67]:
# R-GAT
rgat_mask = ((exp_metadata['Epochs']==20) & (exp_metadata['num_bases'].isna()) & (exp_metadata['Model'].str.startswith('RGAT')))
print('Total GCN experiments: ',sum(rgat_mask))
#exp_metadata[mask][['time_stamp', 'Epochs', 'Learning Rate', 'hidden_dimension','num_layers', 'num_bases', 'Model', 'hits@1', 'hits@3', 'hits@5', 'mrr']]

Total GCN experiments:  22


In [68]:
# R-GCN
rgcn_mask = ((exp_metadata['Epochs']==20) & (exp_metadata['num_bases'].isna()) & (exp_metadata['Model'].str.startswith('RGCN')))
print('Total R-GCN experiments: ',sum(rgcn_mask))
#exp_metadata[rgcn_mask][['time_stamp', 'Epochs', 'Learning Rate', 'hidden_dimension','num_layers', 'num_bases', 'Model', 'hits@1', 'hits@3', 'hits@5', 'mrr']].tail()

Total R-GCN experiments:  24


In [69]:
def mean_sem(exp_data, mask, col):

    sd = np.std(exp_data[mask][col])
    mean = np.mean(exp_data[mask][col])
    sem = sd/np.sqrt(len(exp_data[mask][col]))
    print(f'{col} Mean : {np.round(mean,3)} ± {np.round(sem,3)}')
    return np.round(mean,3),np.round(sem,3)


In [70]:
#GCN results
mean_sem(exp_metadata, gcn_mask, 'hits@1')
mean_sem(exp_metadata, gcn_mask, 'hits@3')
mean_sem(exp_metadata, gcn_mask, 'hits@5')
mean_sem(exp_metadata, gcn_mask, 'mrr')
mean_sem(exp_metadata, gcn_mask, 'F1@1')

hits@1 Mean : 0.225 ± 0.016
hits@3 Mean : 0.422 ± 0.019
hits@5 Mean : 0.541 ± 0.026
mrr Mean : 0.329 ± 0.017
F1@1 Mean : 0.191 ± 0.014


(0.191, 0.014)

In [62]:
#R-GCN results
mean_sem(exp_metadata, rgcn_mask, 'hits@1')
mean_sem(exp_metadata, rgcn_mask, 'hits@3')
mean_sem(exp_metadata, rgcn_mask, 'hits@5')
mean_sem(exp_metadata, rgcn_mask, 'mrr')
mean_sem(exp_metadata, rgcn_mask, 'F1@1')
mean_sem(exp_metadata, rgcn_mask, 'F1@3')
mean_sem(exp_metadata, rgcn_mask, 'F1@5')

hits@1 Mean : 0.833 ± 0.01
hits@3 Mean : 0.909 ± 0.004
hits@5 Mean : 0.923 ± 0.003
mrr Mean : 0.818 ± 0.006
F1@1 Mean : 0.709 ± 0.008
F1@3 Mean : 0.506 ± 0.003
F1@5 Mean : 0.375 ± 0.001


(0.375, 0.001)

In [63]:
#R-GAT results
mean_sem(exp_metadata, rgat_mask, 'hits@1')
mean_sem(exp_metadata, rgat_mask, 'hits@3')
mean_sem(exp_metadata, rgat_mask, 'hits@5')
mean_sem(exp_metadata, rgat_mask, 'mrr')
mean_sem(exp_metadata, rgat_mask, 'F1@1')
mean_sem(exp_metadata, rgat_mask, 'F1@3')
mean_sem(exp_metadata, rgat_mask, 'F1@5')

hits@1 Mean : 0.641 ± 0.013
hits@3 Mean : 0.85 ± 0.007
hits@5 Mean : 0.894 ± 0.005
mrr Mean : 0.716 ± 0.009
F1@1 Mean : 0.546 ± 0.011
F1@3 Mean : 0.451 ± 0.005
F1@5 Mean : 0.349 ± 0.002


(0.349, 0.002)

In [None]:
mask = ((exp_metadata['Epochs']==2) & (exp_metadata['num_bases'].isna()) & (exp_metadata['Model'].str.startswith('RGAT')))
exp_metadata[mask][['time_stamp', 'Epochs', 'Learning Rate', 'hidden_dimension',
       'num_layers', 'num_bases', 'Model', 'hits@1', 'hits@3', 'hits@5', 'mrr']]

In [None]:
#20230830082302  -- 30 bases
#20230830080104 -- 80 bases
#20230825123318  -- 193 bases
RGCNmodel_with_193_bases = load_model('./core/experiments/qa/results/20230825123318/RGCNmodel.pt')
RGCNmodel_with_80_bases = load_model('./core/experiments/qa/results/20230830080104/RGCNmodel.pt')
RGCNmodel_with_30_bases = load_model('./core/experiments/qa/results/20230830082302/RGCNmodel.pt')


In [None]:
RGCNmodel_with_193_bases.eval()
# Iterate through the parameters and identify base matrices
for param in RGCNmodel_with_193_bases.parameters():
    print(param.shape)

total_params_193_bases = sum(p.numel() for p in RGCNmodel_with_193_bases.parameters())
print(f"Total number of parameters with 193 bases matrices: {total_params_193_bases}")


In [None]:
RGCNmodel_with_80_bases.eval()


# Iterate through the parameters and identify base matrices
for param in RGCNmodel_with_80_bases.parameters():
    print(param.shape)

total_params_80_bases = sum(p.numel() for p in RGCNmodel_with_80_bases.parameters())
print(f"Total number of parameters with 80 bases matrices: {total_params_80_bases}")

In [None]:
RGCNmodel_with_30_bases.eval()

# Iterate through the parameters and identify base matrices
for param in RGCNmodel_with_30_bases.parameters():
    print(param.shape)

total_params_30_bases = sum(p.numel() for p in RGCNmodel_with_30_bases.parameters())
print(f"Total number of parameters with 30 bases matrices: {total_params_30_bases}")

In [None]:
plt.bar(['30_bases','80_bases','193_bases'],[total_params_30_bases,total_params_80_bases,total_params_193_bases])
plt.xlabel('#Bases')
plt.ylabel('Model Parameter Count')
plt.title('Bases Vs Model parameters')
plt.show()

In [None]:
RGCNmodel_with_193_bases