# Setup: mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; later cells copy their input archives from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# ConceptNet Embedding

## node2id

In [None]:
!cp /content/drive/MyDrive/conceptnet_data.zip /content
!unzip /content/conceptnet_data.zip -d /content

In [None]:
# Fetch the commonsense-kg-completion repository; later cells import its reader/model modules.
# NOTE(review): no commit is pinned, so the checked-out code may change between runs.
!git clone https://github.com/allenai/commonsense-kg-completion.git

In [None]:
%%bash
# Install the cloned repository's dependencies from its requirements file.
cd /content/commonsense-kg-completion
pip install -r requirements.txt

In [None]:
# Switch into the repo's src/ directory; presumably this is what lets the next cell's
# `from model import ...` / `from reader import ...` imports resolve.
%cd /content/commonsense-kg-completion/src

/content/commonsense-kg-completion/src


In [None]:
from collections import Counter
import argparse
import numpy as np
import sys
import os
import json
import time
import random

import torch
import torch.nn as nn

from model import LinkPredictor
from reader import AtomicTSVReader, ConceptNetTSVReader, FB15kReader
import utils
import reader_utils
import evaluation_utils

def load_data(dataset, reader_cls, data_dir, sim_relations):
    """Read the train/valid/test splits and batch them with ``reader_utils``.

    Args:
        dataset: Dataset identifier handed to ``reader_cls``.
        reader_cls: Reader class; one instance is created per split.
        data_dir: Directory passed to each reader's ``read_network``.
        sim_relations: When truthy, ``add_sim_edges_bert()`` is called on the
            training network before batching.

    Returns:
        Tuple ``(train_data, valid_data, test_data, valid_labels,
        test_labels, train_network)``.
    """
    # One reader per split, instantiated in train / dev / test order.
    train_network, dev_network, test_network = (reader_cls(dataset) for _ in range(3))

    train_network.read_network(data_dir=data_dir, split="train")
    train_network.print_summary()

    # Mean node degree, computed from a degree -> node-count histogram.
    degree_hist = Counter(node.get_degree() for node in train_network.graph.iter_nodes())
    weighted_total = sum(degree * count for degree, count in degree_hist.items())
    node_total = sum(degree_hist.values())
    print("Average Degree: ", weighted_total / node_total)

    # The held-out splits are read with the training network passed in.
    for network, split_name in ((dev_network, "valid"), (test_network, "test")):
        network.read_network(data_dir=data_dir, split=split_name, train_network=train_network)

    word_vocab = train_network.graph.node2id

    # Optionally add sim edges to the training graph.
    if sim_relations:
        print("Adding sim edges..")
        train_network.add_sim_edges_bert()

    train_data, _ = reader_utils.prepare_batch_dgl(word_vocab, train_network, train_network)
    test_data, test_labels = reader_utils.prepare_batch_dgl(word_vocab, test_network, train_network)
    valid_data, valid_labels = reader_utils.prepare_batch_dgl(word_vocab, dev_network, train_network)

    return (
        train_data,
        valid_data,
        test_data,
        valid_labels,
        test_labels,
        train_network,
    )
# Reference CLI invocation from the repository:
# python src/run_kbc_subgraph.py --dataset conceptnet --sim_relations --bert_concat --use_bias --load_model {PATH_TO_PRETRAINED_MODEL} --eval_only --write_results
# Here the data is loaded without sim edges (sim_relations=False).
(train_data, valid_data, test_data,
 valid_labels, test_labels, train_network) = load_data(
    dataset="conceptnet",
    reader_cls=ConceptNetTSVReader,
    data_dir='/content/data/ConceptNet',
    sim_relations=False,
)

Number of edges: 99999

Graph Summary

Nodes: 78088
Edges: 100000
Relations: 34
Density: 0.000016

******************* Sample Edges *******************
ReceivesAction: hockey --> play on ice
IsA: hockey --> team sport
IsA: hockey --> violent sport
IsA: hockey --> game
IsA: hockey --> great sport
HasProperty: hockey --> violent
HasProperty: hockey --> cold
IsA: hockey --> type of game
IsA: hockey --> sport of skill and precision
IsA: hockey --> sport game
***************** ***************** *****************

Average Degree:  1.254213195369327


In [None]:
# Number of entries in the training graph's node -> id vocabulary.
len(train_network.graph.node2id)

78334

In [None]:
# Check whether a single space is a node. A membership test is used rather than
# indexing, because a missing key raises KeyError and aborts "Run All" at this cell.
' ' in train_network.graph.node2id

KeyError: ignored

## Load pretrained BERT node embeddings

In [None]:
!cp /content/drive/MyDrive/bert_model_embeddings.zip /content
!unzip /content/bert_model_embeddings.zip -d /content

In [None]:
import torch
# Load the saved ConceptNet BERT embeddings; map_location places the loaded tensors on the CPU.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints that come from a trusted source.
ce = torch.load('/content/nodes-lm-conceptnet/conceptnet_bert_embeddings.pt',map_location=torch.device('cpu'))

In [None]:
# Convert the loaded tensor to a NumPy array. The isinstance guard keeps the cell
# idempotent: on a second run `ce` is already an ndarray, which has no .numpy().
if isinstance(ce, torch.Tensor):
    ce = ce.numpy()
print(ce.shape)

(78334, 1024)


In [None]:
# !cat /content/data/ConceptNet/cn_node_names.txt | wc -l

78249


In [None]:
# with open('/content/data/ConceptNet/cn_node_names.txt') as f:
#   lines = f.readlines()

In [None]:
# if 'cystadenocarcinoma\n' in lines:
#   print('yes')

# for (i,l) in enumerate(lines):
#   if 'cystadenocarcinoma\n' == l:
#     print(i)

yes
35600


# Generate per-question knowledge embeddings

In [None]:
# Copy the question/answer JSON from Drive to local disk.
!cp /content/drive/MyDrive/data/jsons/QA.json /content

In [None]:
import numpy as np
import json
import re
import os


# Load the QA records from the local copy of QA.json.
with open('/content/QA.json', 'r', encoding='utf-8') as f:
    data = json.load(f)


# Output directory for the per-question .npy embedding files.
path = '/content/knowledge_embeddings'
# exist_ok=True replaces the check-then-create pattern: safe to re-run, no race.
os.makedirs(path, exist_ok=True)


In [None]:
# Number of QA records loaded from QA.json.
len(data)
# data[27814]

32761

In [None]:
import spacy
# Load spaCy's "en_core_web_sm" pipeline; `nlp` is applied to each question text
# below and the tokens' `lemma_` values are looked up in node2id.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Spot-check: look up the id of the node 'stain' in the training vocabulary.
train_network.graph.node2id['stain']

2172

In [None]:
# Build one knowledge-embedding matrix per question and save it as <Question_Id>.npy.
#
# Changes from the earlier version of this cell:
#   * iterates over ALL of `data` (the leftover debug slice `data[:1]` wrote one file only,
#     whereas the recorded outputs below show 32761 saved files),
#   * catches only KeyError (lemma not in the graph) instead of a bare `except`,
#   * no longer shadows the builtin `id`,
#   * questions with no matching node are saved with shape (0, embedding_dim) and the
#     embeddings' dtype rather than a float64 array of shape (0,),
#   * prints progress periodically instead of two lines per question.
node2id = train_network.graph.node2id
embedding_dim = ce.shape[1]
LOG_EVERY = 1000  # progress interval, in questions
total_questions = len(data)

max_len = 0  # largest number of matched nodes seen for any single question
for idx, d in enumerate(data, start=1):
    q_id = d['Question_Id']
    doc = nlp(d['Questions'])

    # Gather the embedding of every distinct lemma that is a node in the
    # training graph, in order of first appearance.
    kn = []
    seen_lemmas = set()
    for tk in doc:
        lemma = tk.lemma_
        if lemma in seen_lemmas:
            continue
        seen_lemmas.add(lemma)
        try:
            node_id = node2id[lemma]
        except KeyError:
            # Lemma is not a node in the training graph; skip it.
            continue
        kn.append(ce[node_id])

    # Keep a consistent 2-D shape and dtype even when nothing matched.
    if kn:
        kn = np.array(kn)
    else:
        kn = np.empty((0, embedding_dim), dtype=ce.dtype)

    max_len = max(max_len, kn.shape[0])
    np.save(os.path.join(path, str(q_id) + '.npy'), kn)

    if idx % LOG_EVERY == 0 or idx == total_questions:
        print("saved file: ", q_id, kn.shape, "progress:", idx, "/", total_questions)



Question:  What are positively charged,  thus allowing the compaction of the negatively charged DNA?
Nodes: 
what
be
charge
allow
the
dna
Shape of KG embedding: (6, 1024)


In [None]:
# Largest number of matched nodes found for a single question in the loop above.
max_len

22

In [None]:
# Count the files written to the output directory.
!ls /content/knowledge_embeddings | wc -l

32761


In [None]:
# Change to /content so the relative paths in the zip cell below resolve there.
%cd /content

/content


In [None]:
# Archive the embeddings directory. Paths are relative, so this relies on the
# current working directory being /content.
!zip -r kg_embs.zip knowledge_embeddings/

In [None]:
!unzip /content/kg_embs.zip -d /content/test

In [None]:
# Count the extracted files, to compare against the number of files that were saved.
!ls /content/test/knowledge_embeddings | wc -l

32761


In [None]:
# Copy the archive to the mounted Google Drive.
!cp /content/kg_embs.zip /content/drive/MyDrive/data