In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!mkdir data
!cp /content/drive/MyDrive/data/jsons/QA.json /content/data

In [None]:
!cp /content/drive/MyDrive/data/knowledge.txt /content/

In [None]:
!wc -l /content/knowledge.txt

32761 /content/knowledge.txt


## ConceptNet

In [None]:
!pip install conceptnet-lite

import conceptnet_lite

conceptnet_lite.connect("/path/to/conceptnet.db")

In [None]:
from conceptnet_lite import Label, edges_for

def query_knowledge(q):
  try:
    d = {}
    for e in edges_for(Label.get(text=q, language='en').concepts, same_language=True):
      # print(e.start.text, "::", e.end.text, "|", e.relation.name, "|", e.etc['weight'])
      d[(e.start.text,e.relation.name,e.end.text)] = e.etc['weight']
      # print(e.etc['weight'])
    kn = []
    for i,f in enumerate(sorted(d.items(), key=lambda item: item[1], reverse=True)):
      # if i == 5:
      #   break
      k = ' '.join(f[0])
      k = k.replace('_', ' ')
      kn.append(k)
      # only one knowledge
      break
    return kn
  except:
    return []


In [None]:
print(query_knowledge('carcinoma'))

['adenocarcinoma is a carcinoma']


## Extraction Method

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

def ectract_query(ques):
  qs = nlp(ques)
  queries = []
  for tk in qs:
    # print(tk.text,'|',tk.pos_)
    if (tk.pos_ == 'VERB' or tk.pos_ == 'NOUN' or tk.pos_ == 'ADJ') and tk.lemma_ not in queries:
      queries.append(tk.lemma_)
  return queries


In [None]:
def knowledge_retrieval(queries):
  kns = []
  for q in queries:
    kns += query_knowledge(q)
  return(kns)


# Load questions

In [None]:
import numpy as np
import json
import re
import os


with open('/content/data/QA.json', 'r', encoding='utf-8') as f:
    data = json.load(f)


path = '/content/data/knowledge_embeddings'
if not os.path.exists(path):
    os.mkdir(path)


# Retrieve knowledges

## extract all entities

In [None]:
entities = set()

for qa in data:
  ques = qa['Questions']
  queries = ectract_query(ques)
  for term in queries:
    if term not in entities:
      entities.add(term)

In [None]:
print(len(data))
print(ectract_query(data[-1]['Questions']))
print(len(entities))
print(entities)

32761
Where | ADV
is | AUX
this | DET
? | PUNCT
[]
3618
{'cord', 'colitide', 'device', 'luteum', 'pancreas', 'pancreatic', 'receptor', 'bridge', 'more', 'stratum', 'isomerism', 'condense', 'multilobed', 'vesselwall', 'treat', 'underlying', 'encroach', 'evidence', 'fluorosis', 'triphenyltetrazolium', 'complete', 'confirm', 'oxygen', 'excess', 'head', 'pronounce', 'fibrinopurulent', 'leukomalacia', 'bad', 'strike', 'part', 'adherence', 'subunit', 'related', 'postmortinjection', 'ankle', 'strange', 'birth', 'develop', 'neuronal', 'spontaneous', 'alveolar', 'appearing', 'scab', 'langhan', 'rhabdomyosarcoma', 'umbilical', 'papillitis', 'basal', 'planus', 'pidural', 'displace', 'medial', 'craniopharyngioma', 'degeneration', 'impression', 'prolymphocyte', 'carpal', 'submucous', 'pas', 'pulpal', 'haustral', 'brainstem', 'brain', 'gammopathy', 'stomatitis', 'period', 'initiator', 'toxicity', '7th', 'x', 'aneurysmal', 'disorganize', 'keratohyaline', 'week', 'emaciate', 'migrate', 'magna', 'perio

## retrieve knowledges

In [None]:
d = {}
for e in entities:
  d[e] = query_knowledge(e)

In [None]:
import json

with open('/content/entity.txt', 'w') as file:
    json.dump(d, file)

# with open('C:\temp.txt', 'r') as file:
#     new_d = json.load(file)

In [None]:
kns = knowledge_retrieval(entities)

In [None]:
print(len(kns))
print(len(d))
print(d)

3333
3618
{'cord': ['cord used for tie'], 'colitide': [], 'device': ['stethoscope is a device'], 'luteum': [], 'pancreas': ['pancreas part of human body'], 'pancreatic': ['pancreatic derived from pancreas'], 'receptor': ['alpha receptor is a receptor'], 'bridge': ['bridge related to water'], 'more': ['less antonym more'], 'stratum': ['bed is a stratum'], 'isomerism': ['isomerism is a state'], 'condense': ['pasteurize causes condense'], 'multilobed': ['multilobed derived from lobed'], 'vesselwall': [], 'treat': ['cake related to treat'], 'underlying': ['basic similar to underlying'], 'encroach': ['impinge synonym encroach'], 'evidence': ['evidential related to evidence'], 'fluorosis': ['fluorosis is a pathology'], 'triphenyltetrazolium': [], 'complete': ['finish related to complete'], 'confirm': ['confirmation related to confirm'], 'oxygen': ['air related to oxygen'], 'excess': ['exorbitance is a excess'], 'head': ['head related to neck'], 'pronounce': ['pronounceable derived from prono

In [None]:
ques = data[29408]['Questions']
print(ques)
queries = ectract_query(ques)
print(queries)
kns = []
for q in queries:
  kns += d[q]
print(kns)
kn_concat = ''
for k in kns:
  marked_text1 = k + ","
  kn_concat += marked_text1
print(kn_concat[:-1])
kn_concat = str(ques_id) + ',' + kn_concat[:-1] + '\n'
print(kn_concat)

Is hyalin mass in pituitary which is amyloid there are several slides from this case in this file 23 yowf amyloid limited to brain present?
['mass', 'pituitary', 'amyloid', 'several', 'slide', 'case', 'file', 'yowf', 'limited', 'brain', 'present']
['continent related to mass', 'hypophysis synonym pituitary', 'amyloid has context pathology', 'different similar to several', 'slide at location park', 'case related to brief', 'file at location computer', 'resources receives action limited', 'thinking has prerequisite brain', 'present capable of surprise child']
continent related to mass,hypophysis synonym pituitary,amyloid has context pathology,different similar to several,slide at location park,case related to brief,file at location computer,resources receives action limited,thinking has prerequisite brain,present capable of surprise child
32799,continent related to mass,hypophysis synonym pituitary,amyloid has context pathology,different similar to several,slide at location park,case rel

## Save knowledges

In [None]:
f = open('/content/knowledge.txt','a')

for qa in data:
  ques_id = qa['Question_Id']
  ques = qa['Questions']
  # print(ques)
  queries = ectract_query(ques)
  # print(queries)
  kns = []
  for q in queries:
    kns += d[q]
  # print(kns)
  kn_concat = ''
  for k in kns:
    marked_text1 = k + ","
    kn_concat += marked_text1
  # print(kn_concat[:-1])
  kn_concat = str(ques_id) + ',' + kn_concat[:-1] + '\n'
  # print(kn_concat)
  f.write(kn_concat)
  print("saved file: ", ques_id)
  # test = kn_concat.split(',')
  # print(test)
f.close()

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
saved file:  27786
saved file:  27787
saved file:  27788
saved file:  27789
saved file:  27790
saved file:  27791
saved file:  27792
saved file:  27793
saved file:  27794
saved file:  27795
saved file:  27796
saved file:  27797
saved file:  27798
saved file:  27799
saved file:  27800
saved file:  27801
saved file:  27802
saved file:  27803
saved file:  27804
saved file:  27805
saved file:  27806
saved file:  27807
saved file:  27808
saved file:  27809
saved file:  27810
saved file:  27811
saved file:  27812
saved file:  27813
saved file:  27814
saved file:  27815
saved file:  27816
saved file:  27817
saved file:  27818
saved file:  27819
saved file:  27820
saved file:  27821
saved file:  27822
saved file:  27823
saved file:  27824
saved file:  27825
saved file:  27826
saved file:  27827
saved file:  27828
saved file:  27829
saved file:  27830
saved file:  27831
saved file:  27832
saved file:  27833
saved file:  27834
saved file:  27835
saved fil

In [None]:
# f = open('/content/knowledge.txt','a')

# for qa in data[925:]:
#   ques_id = qa['Question_Id']
#   ques = qa['Questions']    

#   # extract verbs and nouns from question
#   queries = ectract_query(ques)
#   # retrieve knowledge from knowledge base
#   # num: len(queries) * 1
#   kns = knowledge_retrieval(queries)

#   kn_concat = '[CLS]'
#   for k in kns:
#     marked_text1 = ' ' + k + " [SEP]"
#     kn_concat += marked_text1 
#   kn_concat = kn_concat[:-6] + '\n'
#   f.write(kn_concat)
#   print("saved file: ", ques_id)

# f.close()

## save to drive

In [None]:
!cp /content/knowledge.txt /content/drive/MyDrive/data

# Read file

In [None]:
with open('/content/knowledge.txt','r') as file:
  lines = file.readlines()

print(len(lines))

32761


In [None]:
qids = []
for l in lines:
  l_split = l.strip().split(',')
  if l_split[1] == '':
    qids.append(int(l_split[0]))

for q in data:
  if q['Question_Id'] in qids:
    print(q['Question_Id'],q['Questions'])

585 What is AIDS?
1361 What is the farthest?
2308 What is glioblastoma?
2559 What is there?
2583 What is there?
2590 What is there?
2601 What is there?
2988 What is 'chancre ' on glans penis?
3319 What is there?
3325 What is there?
3410 What are there?
3418 What are there?
3432 What is there?
3441 What is there?
3533 What is there?
3540 What is there?
3604 What are there ?
3662 What is there?
3666 What is there?
3714 What are there?
3942 What is there?
3950 What is there?
3954 What is there?
4263 Where is there?
4350 What is there?
4422 Are glomeruli normocellular?
4433 What is there of the GBM?
4679 What are there?
4811 What does specimen of the uterus, cervix and adnexa show?
4942 What is there?
5057 What does the hemimaxillectomy specimen show?
5265 What is tan and haemorrhagic?
7314 Where is this?
7326 Where is this?
7338 Where is this?
7350 Where is this?
7363 Where is this?
7375 Where is this?
7390 Where is this?
7406 Where is this?
7422 Where is this?
7439 Where is this?
7449 Wh

In [None]:
for qa in data:
  if qa['Question_Id'] == 29409:
    ques = qa['Questions']
    print(ques)
    queries = ectract_query(ques)
    print(queries)
    kns = []
    for q in queries:
      kns += d[q]
    print(kns)
    kn_concat = ''
    for k in kns:
      marked_text1 = k + ","
      kn_concat += marked_text1
    print(kn_concat[:-1])
    kn_concat = str(ques_id) + ',' + kn_concat[:-1] + '\n'
    print(kn_concat)

Does di george syndrome show islet cell carcinoma?
Does | AUX
di | X
george | PROPN
syndrome | PROPN
show | PROPN
islet | PROPN
cell | PROPN
carcinoma | PROPN
? | PUNCT
[]
[]

32799,



In [None]:
!cat /content/knowledge.txt

1,charge related to credit,allow related to permit,densification synonym compaction,dna is a molecule2,histone is a simple protein,subunit synonym fractional monetary unit,charge related to credit3,histone is a simple protein,subunit synonym fractional monetary unit,charge related to credit,allow related to permit,densification synonym compaction,dna is a molecule4,liver part of body,apple related to stem,cell related to phone,egg related to oval,examine thing has prerequisite locate5,stain at location rug,immunohistochemically derived from immunohistochemical6,bilious related to bile,air intake is a duct,cell related to phone,canal related to water,stain at location rug7,bilious related to bile,air intake is a duct,cell related to phone,canal related to water,stain at location rug,immunohistochemically derived from immunohistochemical8,tell story has subevent illustrate9,principal at location school,cancellated similar to cellular,accommodation is a alteration,characterization related

In [None]:
!cat /content/drive/MyDrive/data/knowledge.txt