In [23]:
import pandas as pd
import json
import torch
import torch.nn.functional as F
from loguru import logger
import sys

import numpy as np
from torch_geometric.data import Data

from core.LLM.LLMEncoder import LLMEncoder
from core.ToTorch.DataBuilder import QADataBuilder, QAMaskBuilder
from config.config import (
    TRIPLES_PATH,
    ENTITIES_LABELS_PATH,
    PROPERTIES_LABELS_PATH,
    GRAPH_EMBEDDINGS_PATH,
    QUESTIONS_ANSWERS_PATH,
    QUESTIONS_EMBEDDINGS_PATH,
    QUESTIONS_CONCEPTS_ANSWERS_PATH,
    GRAPH_EMBEDDINGS_WITH_COMMENT_PATH,


    TRIPLES_PATH_OLD,
    ENTITIES_LABELS_PATH_OLD,
    PROPERTIES_LABELS_PATH_OLD,
    GRAPH_EMBEDDINGS_PATH_OLD,
    QUESTIONS_CONCEPTS_ANSWERS_PATH,
    GRAPH_EMBEDDINGS_PATH_OLD
)

from core.NeuralNet.GNN import GCN

from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
encoder_model = RobertaModel.from_pretrained("roberta-base")
NUM_EPOCHS=1

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
logger.remove()
logger.add(sys.stderr, level="DEBUG")

## CREATE DATA
logger.info("Creating Data object")

qa_data_builder = QADataBuilder(
    triples_path=TRIPLES_PATH,
    entities_labels_path=ENTITIES_LABELS_PATH,
    properties_labels_path=PROPERTIES_LABELS_PATH,
    embeddings_path=GRAPH_EMBEDDINGS_PATH,
    questions_answers_path=QUESTIONS_ANSWERS_PATH,
    questions_embeddings_path=QUESTIONS_EMBEDDINGS_PATH,
)

x = qa_data_builder.get_x()
train_mask, test_mask, val_mask = qa_data_builder.get_questions_masks()
NUM_EPOCHS_PER_QUESTION = int(NUM_EPOCHS / sum(train_mask))

## TRAIN MLP
logger.info("Training MLP")
model = MLP(
    num_node_features=(2 * x.shape[1]), dim_hidden_layer=16, num_classes=2
)  # we multiply x.shape by two so as to account for question embedding
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()

for q_index, question_embedding in enumerate(
    qa_data_builder.questions_embeddings_masked(train_mask)
):  # call the questions_iterator from the instance
    question, q_embedding = question_embedding
    q_x = qa_data_builder.get_x(
        to_concat=q_embedding
    )  # add question embedding to node features embedding
    q_y = qa_data_builder.get_y(question=question)
    data = Data(x=q_x, y=q_y)
    if not data.validate():
        logger.error(f"Data object is not valid for question {question}")

    for epoch in range(NUM_EPOCHS_PER_QUESTION):
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out, data.y, weight = torch.tensor([0.000001, 1-0.000001]))
        loss.backward()
        optimizer.step()
    logger.debug(
        f"Total Question: {(q_index + 1)}, Total Epochs: {NUM_EPOCHS_PER_QUESTION * (q_index + 1):3d}, Loss: {loss:.4f}"
    )

In [None]:
evaluate_qa_model(model=model, qa_data_builder=qa_data_builder, mask=train_mask)

In [None]:
def _predict_answer(model, data):
    """
    Returns the predicted answer and node index.
    """
    return model(data).max(dim=1)[0].argmax().item()

def evaluate_qa_model(model, qa_data_builder, mask):
    model.eval()
    correct_predictions = 0
    for q_index, question_embedding in enumerate(
        qa_data_builder.questions_embeddings_masked(mask)
    ):
        question, q_embedding = question_embedding
        x_q = qa_data_builder.get_x(
            to_concat=q_embedding
        )  # adding the question embedding to the node embeddings
        y_q = qa_data_builder.get_y(question=question)
        data = Data(x=x_q, edge_index=qa_data_builder.get_edge_index(), y=y_q)
        pred_node_idx = _predict_answer(model, data)
        actual_node_idx = qa_data_builder.get_node_index_for_question_answer(question)
        if pred_node_idx == actual_node_idx:
            logger.debug(f"Correctly predicted answer to question {question}.")
            correct_predictions += 1
        elif pred_node_idx != torch.tensor(0):
            logger.debug(
                f"Question: {question}. Predicted answer = {qa_data_builder.index_to_entity[pred_node_idx]}, Actual answer: {qa_data_builder.index_to_entity[actual_node_idx]}"
            )
        else:
            logger.debug(f"Could not predict any answer")
    return correct_predictions / (q_index + 1)

In [115]:
x=torch.randn(6,5)
edge_index = torch.tensor([[0, 1, 2, 3, 4, 0,5],
                           [1, 2, 3, 4, 0, 3,1]], dtype=torch.long)
edge_type = torch.tensor([0, 1, 0, 2, 1, 3,0], dtype=torch.long)
data = Data(edge_index=edge_index, edge_type=edge_type)

In [116]:
data.edge_index

tensor([[0, 1, 2, 3, 4, 0, 5],
        [1, 2, 3, 4, 0, 3, 1]])

In [117]:
data.edge_type

tensor([0, 1, 0, 2, 1, 3, 0])

In [207]:
question_uri = ["uri_1","uri_2"]
uri_to_index = {"uri_1":8,"uri_2":22,"uri_3":38}

In [208]:
question_nodes = [uri_to_index[uri] for uri in question_uri]
question_nodes

[8, 22]

In [206]:
mask = torch.isin(data.edge_index[0], torch.tensor([0]))
ohp_n = torch.unique(data.edge_index[1, mask])
torch.unique(torch.cat((ohp_n,torch.tensor([0]))))

tensor([0, 1, 3])

In [210]:
question_subgraph_concepts = torch.tensor([0, 1, 2, 3, 4])
question_mask = torch.zeros_like(question_subgraph_concepts, dtype=torch.bool)
question_training_mask = torch.zeros_like(question_subgraph_concepts, dtype=torch.bool)
answer_mask = torch.zeros_like(question_subgraph_concepts, dtype=torch.bool)

# q_mask
question_mask = question_mask | torch.isin(question_subgraph_concepts , torch.tensor([0,4]) )
print ("question_mask: ",question_mask)

question_mask:  tensor([ True, False, False, False,  True])


In [122]:
# question node
q_nodes =  0 # list of entities identified in the question
a_nodes = 4

# 1 hop neighbors for q_node
one_hop_neighbors = data.edge_index[1, data.edge_index[0] == q_node]

# add the q_node to one hop neighbors
one_hop_neighbors_with_q_node = torch.cat((one_hop_neighbors,torch.tensor(q_node).unsqueeze(0)))

In [198]:
one_hop_neighbors

tensor([1, 3])

In [None]:
#subgraph_mask -- To filter the edge_index and edge type attributes of data class for the question
#concepts 
# Below masks to be used by the list concepts.
#answer_mask -- 
#question_mask --
#question_train_mask -- 

In [123]:
subgraph_mask = torch.zeros_like(data.edge_index[0], dtype=torch.bool)
for node in one_hop_neighbors_with_q_node:
    subgraph_mask = subgraph_mask | (data.edge_index[0]==node)

question_subgraph_concepts = torch.unique(data.edge_index[:,subgraph_mask]) # all concepts in the subgraph
print("question_subgraph_concepts: ", question_subgraph_concepts)

# initialize masks: 
question_mask = torch.zeros_like(question_subgraph_concepts, dtype=torch.bool)
question_training_mask = torch.zeros_like(question_subgraph_concepts, dtype=torch.bool)
answer_mask = torch.zeros_like(question_subgraph_concepts, dtype=torch.bool)

# q_mask
question_mask = question_mask | (question_subgraph_concepts == q_nodes )
print ("question_mask: ",question_mask)

#check if answer is present in the question_subgraph_concepts

answer_mask = answer_mask | (question_subgraph_concepts == a_nodes )
print ("answer_mask: ",answer_mask)

# randomly sample n nodes including the answer node and exclude q_node from the question_subgraph_concepts

# Exclude question_mask and answer_mask items
valid_indices = torch.where(~question_mask & ~answer_mask)[0]
n=3
if n < len(valid_indices):
    random_indices  = random.sample(valid_indices.tolist(), n)
    question_training_mask[random_indices] = True
    question_training_mask = question_training_mask | answer_mask
else:
    question_training_mask[~question_mask] = True
print ("question_training_mask: ",question_training_mask)




question_subgraph_concepts:  tensor([0, 1, 2, 3, 4])
question_mask:  tensor([ True, False, False, False, False])
answer_mask:  tensor([False, False, False, False,  True])
question_training_mask:  tensor([False,  True,  True,  True,  True])


In [124]:
data.edge_index

tensor([[0, 1, 2, 3, 4, 0, 5],
        [1, 2, 3, 4, 0, 3, 1]])

In [125]:
data.edge_index[:,subgraph_mask]

tensor([[0, 1, 3, 0],
        [1, 2, 4, 3]])

In [126]:
question_subgraph_concepts[question_training_mask]

tensor([1, 2, 3, 4])

In [128]:
# assume 100 nodes of feature dim 5
data.x = torch.randn(100,5)

In [144]:
# construct the x mask and y label for that masked nodes. the dimension will be same as number of nodes in the entire graph?

q_x_mask = torch.full((data.x.size()[0],),False, dtype=torch.bool)
q_y_label = torch.zeros((data.x.size()[0],), dtype=torch.float32)

In [145]:
q_x_mask

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False])

In [146]:
q_x_mask[question_subgraph_concepts[question_training_mask]] = True
q_x_mask

tensor([False,  True,  True,  True,  True, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False])

In [147]:
q_y_label[question_subgraph_concepts[answer_mask]] = 1
q_y_label

tensor([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [148]:
q_y_label[q_x_mask]

tensor([0., 0., 0., 1.])

In [150]:
# Training loop
'''
1. create a basic data object
2. for each question+embedding+question_node+answer_node:
	a. get all the masks related to the question. # potentially using a method of the QA dataset class
	b. construct the  q_x = qa_data_builder.get_x(to_concat=q_embedding)  # add question embedding to node features embedding
	c. build the Data object for the question with filtered edge types and edge index, data = Data(x=q_x, y=q_y, 
	d. for epoch in range(NUM_EPOCHS_PER_QUESTION):
        	optimizer.zero_grad()
        	out = model(data)
        	loss = F.nll_loss(out, data.y, weight = torch.tensor([0.000001, 1-0.000001]))
        	loss.backward()
        	optimizer.step()
    		logger.debug(f"Total Question: {(q_index + 1)}, Total Epochs: {NUM_EPOCHS_PER_QUESTION * (q_index + 1):3d}, Loss: {loss:.4f}")'''

'\n1. create a basic data object\n2. for each question+embedding+question_node+answer_node:\n\ta. get all the masks related to the question. # potentially using a method of the QA dataset class\n\tb. construct the  q_x = qa_data_builder.get_x(to_concat=q_embedding)  # add question embedding to node features embedding\n\tc. build the Data object for the question with filtered edge types and edge index, data = Data(x=q_x, y=q_y, \n\td. for epoch in range(NUM_EPOCHS_PER_QUESTION):\n        \toptimizer.zero_grad()\n        \tout = model(data)\n        \tloss = F.nll_loss(out, data.y, weight = torch.tensor([0.000001, 1-0.000001]))\n        \tloss.backward()\n        \toptimizer.step()\n    \t\tlogger.debug(f"Total Question: {(q_index + 1)}, Total Epochs: {NUM_EPOCHS_PER_QUESTION * (q_index + 1):3d}, Loss: {loss:.4f}")'

In [183]:
qa_data = pd.DataFrame(columns = ["question","concepts","answers"])
qa_data

Unnamed: 0,question,concepts,answers


In [185]:
# set up a process to read each question from gold standard, get the q_entity, also a_entity in a csv format
qa_data = pd.DataFrame(columns = ["question","concepts","answers"])
with open('data\source_data\patterd_id_0_qa.json', 'r', encoding='utf-8') as fin:
    for line in fin:
        dic = json.loads(line)
        qa_data = pd.concat([qa_data,pd.DataFrame([dic])],ignore_index=True)
qa_data.head()

Unnamed: 0,question,concepts,answers
0,What is the type of Incentive and Commission M...,[http://www.signavio.com/opal/SAP/RSA/SCM/Ince...,[http://www.signavio.com/opal/SAP/RSA/SCM/Solu...
1,What is the type of Incentive and Commission M...,[http://www.signavio.com/opal/SAP/RBA/BCM/Ince...,[http://www.signavio.com/opal/SAP/RSA/SCM/Solu...
2,What is a MFS-50-10-30 Operated JV Operations ...,[http://www.signavio.com/opal/SAP/SSC/BPML/MFS...,[http://www.signavio.com/opal/SAP/SSC/BPML/Pro...
3,What is a MFS-50-10-30 Operated JV Operations ...,[http://www.signavio.com/opal/SAP/SSC/BPML/MFS...,[http://www.signavio.com/opal/SAP/SSC/BPML/Pro...
4,What's an 1EZ - Credit Memo Processing (SPFD-1...,[http://www.signavio.com/opal/SAP/EARL/SAD/1EZ...,[http://www.signavio.com/opal/SAP/EARL/SAD/Sol...


In [195]:
#qa_data.to_csv('data\source_data\questions_concepts_answers.csv',index=False)

## QA Training

In [2]:
roberta_encoder = LLMEncoder(tokenizer, encoder_model)

In [3]:
# Build data
logger.remove()
logger.add(sys.stderr, level="DEBUG")

## CREATE DATA
logger.info("Creating Data object")

qa_data_builder = QAMaskBuilder(
    triples_path=TRIPLES_PATH_OLD,
    entities_labels_path=ENTITIES_LABELS_PATH_OLD,
    properties_labels_path=PROPERTIES_LABELS_PATH_OLD,
    embeddings_path=GRAPH_EMBEDDINGS_PATH_OLD,
    questions_concepts_answers_path=QUESTIONS_CONCEPTS_ANSWERS_PATH,
    
)

[32m2023-08-16 13:14:34.320[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mCreating Data object[0m


In [106]:
data = qa_data_builder.build_data()
data

Data(x=[29025, 768], edge_index=[2, 145402], edge_type=[145402])

In [87]:
# TRAIN GNN
logger.info("Training GNN")
model = GCN(
    num_node_features=q_data.num_node_features, dim_hidden_layer=16,num_layers=2, num_classes=len(set(q_data.y.tolist()))
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()

[32m2023-08-16 17:33:08.470[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTraining GNN[0m


GCN(
  (layers): ModuleList(
    (0): GCNConv(1536, 16)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): GCNConv(16, 2)
  )
)

In [88]:
model.train()
for idx, row in qa_data_builder.question_concepts_answers.iterrows():

    q_embedding = roberta_encoder.encode_sentence(row["question"])
    q_x = qa_data_builder.get_x(to_concat=q_embedding)
    q_edge_mask, q_nodes, q_concept_mask, q_answer_mask, q_answer_and_random_nodes_mask =qa_data_builder.get_concepts_and_masks_for_question(question =row["question"], concept_uri= row["concepts"], answer_uri= row["answers"])
    q_edge_index = data.edge_index[:,q_edge_mask]
    q_edge_type = data.edge_type[q_edge_mask]
    q_training_x_mask = qa_data_builder.get_question_training_mask_for_x()
    q_y_labels = qa_data_builder.get_question_y_labels()
    q_data = Data(x=q_x,edge_index=q_edge_index,edge_type=q_edge_type,train_mask =q_training_x_mask,y=q_y_labels)
    print(f'Training for Q {idx} : {row["question"]}')
    for epoch in range(200
        optimizer.zero_grad()
        out,embedding = model(q_data)
        loss = F.nll_loss(out[q_data.train_mask], q_data.y[q_data.train_mask])
        loss.backward()
        optimizer.step()
        if epoch%1000==0:
            logger.debug(f"Epoch: {epoch:03d}, Loss: {loss:.4f}")
    break;
    

[32m2023-08-16 17:33:10.785[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m20[0m - [34m[1mEpoch: 000, Loss: 0.6973[0m


Training for Q 0 : What is the type of Incentive and Commission Management (S/4 CLD)?


[32m2023-08-16 17:34:29.544[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m20[0m - [34m[1mEpoch: 1000, Loss: 0.2068[0m


In [108]:
res = []
for idx, row in qa_data_builder.question_concepts_answers.iterrows():

    q_embedding = roberta_encoder.encode_sentence(row["question"])
    q_x = qa_data_builder.get_x(to_concat=q_embedding)
    q_edge_mask, q_nodes, q_concept_mask, q_answer_mask, q_answer_and_random_nodes_mask =qa_data_builder.get_concepts_and_masks_for_question(question =row["question"], concept_uri= row["concepts"], answer_uri= row["answers"])
    q_edge_index = data.edge_index[:,q_edge_mask]
    q_edge_type = data.edge_type[q_edge_mask]
    q_training_x_mask = qa_data_builder.get_question_training_mask_for_x()
    q_y_labels = qa_data_builder.get_question_y_labels()
    q_data = Data(x=q_x,edge_index=q_edge_index,edge_type=q_edge_type,train_mask =q_training_x_mask,y=q_y_labels)
    model.eval()
    out,_ = model(q_data)
    predicted_answer_nodes = torch.where(out.argmax(dim=1))
    count_predicted_nodes =len(predicted_answer_nodes[0])
    actual_answer_nodes = q_nodes[q_answer_mask].tolist()
    if count_predicted_nodes > 0:
        message="answers predicted"
        predicted_answer_nodes_list = predicted_answer_nodes[0].tolist()
        is_predicted_in_actual_answers = bool(set(actual_answer_nodes) & set(predicted_answer_nodes_list))
        res.append((idx, actual_answer_nodes, predicted_answer_nodes_list,count_predicted_nodes,is_predicted_in_actual_answers))
    
    
    else:
        message="NO answers found"
        res.append((idx, actual_answer_nodes, np.nan,np.nan,False))
    break;

In [109]:
eval_res = pd.DataFrame.from_records(res,columns=["q_idx","actual_answer_nodes","predicted_answer_nodes","count_predicted_nodes","is_predicted_in_actual"])
eval_res

Unnamed: 0,q_idx,actual_answer_nodes,predicted_answer_nodes,count_predicted_nodes,is_predicted_in_actual
0,0,[22581],,,False


In [105]:
data

{'Column1': [1, 2, 3], 'Column2': ['A', 'B', 'C']}

In [19]:
# How do you evaluate the model
q_edge_mask, q_nodes, q_concept_mask, q_answer_mask, q_answer_and_random_nodes_mask
q_edge_index
q_edge_type
q_training_x_mask
q_y_labels
q_data

0

In [99]:
bool(set(q_nodes[q_answer_mask].tolist()) & set([22581,22589,22555]))

True

In [73]:
row

question    What is the type of Incentive and Commission M...
concepts    [http://www.signavio.com/opal/SAP/RSA/SCM/Ince...
answers     [http://www.signavio.com/opal/SAP/RSA/SCM/Solu...
Name: 0, dtype: object

In [71]:
len(q_nodes)

1059

In [72]:
q_nodes[q_answer_mask]

tensor([22581])

In [56]:
qa_data_builder.index_to_entity[22581]

'http://www.signavio.com/opal/SAP/RSA/SCM/SolutionCapability'

In [51]:
sum(q_training_x_mask)

tensor(19)

In [30]:
boolean_array = torch.tensor([True, False, True, False, True, False, False, True])

passed_node_id = torch.where(boolean_array)[0]
passed_node_id

tensor([0, 2, 4, 7])

In [31]:
passed_node_id[3]

tensor(7)

In [84]:
model.eval()
out,_ = model(q_data)

In [85]:
out.size()

torch.Size([29025, 2])

In [86]:
sum(out.argmax(dim=1))

tensor(28122)

In [57]:
qa_data_builder.index_to_entity[23565]

'http://www.signavio.com/opal/SAP/SFSF/AL/Process'

In [60]:
out.max(dim=1)[0]

tensor([-0.6187, -0.6076, -0.6111,  ..., -0.6067, -0.6039, -0.6040],
       grad_fn=<MaxBackward0>)