<a href="https://colab.research.google.com/github/meti-94/OpenQA/blob/main/Freebase_Reverb_Unification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers -q
!pip install fuzzywuzzy -q
!pip install python-Levenshtein -q

[K     |████████████████████████████████| 3.1 MB 15.6 MB/s 
[K     |████████████████████████████████| 895 kB 37.8 MB/s 
[K     |████████████████████████████████| 596 kB 40.9 MB/s 
[K     |████████████████████████████████| 59 kB 6.9 MB/s 
[K     |████████████████████████████████| 3.3 MB 33.5 MB/s 
[K     |████████████████████████████████| 50 kB 5.5 MB/s 
[?25h  Building wheel for python-Levenshtein (setup.py) ... [?25l[?25hdone


In [None]:
# Fetch the BuboQA repository, change into the checkout and run its setup script from there.
!git clone https://github.com/castorini/BuboQA.git
%cd /content/BuboQA
# NOTE(review): setup.sh presumably produces the `indexes` and `data` folders copied by the next cells — confirm.
!bash setup.sh 

In [None]:
cp -r /content/BuboQA/indexes /content/drive/MyDrive

In [None]:
cp -r /content/BuboQA/data/processed_simplequestions_dataset /content/drive/MyDrive

In [None]:
# Extract the linked ReVerb archive inside its own Drive folder (Drive must already be mounted at /content/drive).
%cd /content/drive/MyDrive/data_freebase
!unzip /content/drive/MyDrive/data_freebase/reverb_linked.zip

/content/drive/MyDrive/data_freebase
Archive:  /content/drive/MyDrive/data_freebase/reverb_linked.zip
  inflating: reverb_linked.csv       


In [1]:
import pandas as pd
import pickle
from tqdm import tqdm
from fuzzywuzzy import fuzz

In [2]:
# MID -> names lookup. Layout as described by the index author:
#   mid2name['MID'] == ['str1', 'str2', ..., 'strN']
# NOTE: unpickling can execute arbitrary code, so only load index files you trust.
with open('/content/drive/MyDrive/indexes/names_2M.pkl', mode='rb') as names_file:
    mid2name = pickle.load(names_file)

In [3]:
# Per-MID degree lookup. Layout as described by the index author:
#   degrees_2M['MID'] == [in degree, out degree]
# NOTE: unpickling can execute arbitrary code, so only load index files you trust.
with open('/content/drive/MyDrive/indexes/degrees_2M.pkl', mode='rb') as degrees_file:
    degrees_2M = pickle.load(degrees_file)

In [4]:
# Reverse lookup from a surface string to its candidates. Layout as described by the index author:
#   entity_2M['string'] == collection of ('MID', 'actual string', 'freebase type') tuples
# NOTE: unpickling can execute arbitrary code, so only load index files you trust.
with open('/content/drive/MyDrive/indexes/entity_2M.pkl', mode='rb') as entity_file:
    entity_2M = pickle.load(entity_file)

In [5]:
# MID -> relations lookup. Layout as described by the index author, e.g.:
#   reachability_2M['MID'] == {'fb:common.topic.notable_types', 'fb:people.person.gender', 'fb:people.person.profession'}
# NOTE: unpickling can execute arbitrary code, so only load index files you trust.
with open('/content/drive/MyDrive/indexes/reachability_2M.pkl', mode='rb') as reachability_file:
    reachability_2M = pickle.load(reachability_file)

In [6]:
# Load the linked ReVerb facts and normalise two columns in a single chain:
#   * freebase_ID_argument1 gets the 'fb:m.' prefix; a missing id becomes the literal
#     'fb:m.nan', which later cells compare against, so str() is applied to every value.
#   * conf is cast to float.
reverb2freebace = (
    pd.read_csv('/content/drive/MyDrive/data_freebase/reverb_linked.csv')
    .assign(
        freebase_ID_argument1=lambda frame: frame['freebase_ID_argument1'].map(
            lambda raw_id: 'fb:m.' + str(raw_id)
        ),
        conf=lambda frame: frame['conf'].astype(float),
    )
)

In [7]:
# Augment the string -> candidates index (entity_2M) with both arguments of every ReVerb fact.
# Each added candidate is a (id, lower-cased string, confidence-as-string) tuple.
matched_mids, unmatched_mids, added_entity_strings = 0, 0, 0
# Total number of candidate tuples in the index before anything is added.
string_count = sum(len(name) for name in entity_2M.values())


for index, row in tqdm(reverb2freebace.iterrows(), total=reverb2freebace.shape[0]):
  # arg1 keeps its Freebase MID when the index knows it; otherwise it falls back to the ReVerb uuid.
  if row['freebase_ID_argument1'] in mid2name:
    mid1 = row['freebase_ID_argument1']
    matched_mids += 1
  else:
    mid1 = row['argument1_uuid']
    unmatched_mids += 1
  # arg2 is always identified by its uuid, hence always counted as unmatched.
  mid2 = row['argument2_uuid']
  unmatched_mids += 1
  reverb_string1 = str(row['arg1']).lower()
  reverb_string2 = str(row['arg2']).lower()
  conf = str(row['conf'])
  for mid, reverb_string in ((mid1, reverb_string1), (mid2, reverb_string2)):
    # Explicit membership test instead of a bare `except:`, which would also hide
    # unrelated errors (and KeyboardInterrupt) raised during the lookup.
    if reverb_string not in entity_2M:
      entity_2M[reverb_string] = set()
      added_entity_strings += 1
    entity_2M[reverb_string].add((mid, reverb_string, conf))


print(f'\nTotal MIDs before augmentation: {len(mid2name)}\tUnmatched (Added) MIDs: {unmatched_mids}\t Matched MIDs: {matched_mids}')
print(f'Total Entity Strings before augmentation: {string_count}\tAdded Entity Strings: {added_entity_strings}')


100%|██████████| 407267/407267 [00:53<00:00, 7632.27it/s]


Total MIDs before augmentation: 1951909	Unmatched (Added) MIDs: 669962	 Matched MIDs: 144572
Total Entity Strings before augmentation: 17555942	Added Entity Strings: 138167





In [8]:
def get_ngram(text, max_n=3):
    """
    Return the distinct word n-grams of ``text``, longest first.

    Parameters
    ----------
    text : any
        Converted with ``str`` and split on whitespace.
    max_n : int, default 3
        Largest n-gram size (in words) to generate.

    Returns
    -------
    list of str
        N-grams of 1..max_n words joined by single spaces. Longer n-grams come first;
        n-grams of equal length keep their left-to-right order of first appearance.
    """
    tokens = str(text).split()
    seen = set()  # O(1) duplicate detection instead of scanning the result list
    ngram = []
    # Generating sizes from large to small yields the final order directly, so no sort is needed.
    for size in range(min(max_n, len(tokens)), 0, -1):
        for start in range(len(tokens) - size + 1):
            gram = " ".join(tokens[start:start + size])
            if gram not in seen:
                seen.add(gram)
                ngram.append(gram)
    return ngram

In [9]:
def get_stat_inverted_index(reverse_index):
    """
    Print the number of entries of an inverted index and its largest entry.

    Parameters
    ----------
    reverse_index : mapping or path
        Either the inverted index itself (text -> sized collection of candidates)
        or the path of a pickle file that contains such a mapping.

    Side effects
    ------------
    Rebinds the module-level ``inverted_index`` to ``defaultdict(str, <index>)`` and prints
    two lines: the number of distinct texts, and the size and text of the largest entry
    (the first one encountered wins on ties).
    """
    import os
    from collections import defaultdict

    global inverted_index
    if isinstance(reverse_index, (str, bytes, os.PathLike)):
        # NOTE: unpickling can execute arbitrary code; only pass paths of trusted files.
        with open(reverse_index, "rb") as handler:
            reverse_index = pickle.load(handler)
    inverted_index = defaultdict(str, reverse_index)
    print("Total type of text: {}".format(len(inverted_index)))
    max_len = 0
    _entry = ""
    for entry, value in inverted_index.items():
        if len(value) > max_len:
            max_len = len(value)
            _entry = entry
    print("Max Length of entry is {}, text is {}".format(max_len, _entry))

In [10]:
from collections import defaultdict

# Wrap the augmented index so that looking up an unknown text yields '' instead of raising.
inverted_index = defaultdict(str, entity_2M)
print("Total type of text: {}".format(len(inverted_index)))

# Find the largest candidate collection; only a strictly larger one replaces the current best.
max_len, _entry = 0, ""
for candidates_for_text in inverted_index.values():
  if len(candidates_for_text) <= max_len:
    continue
  max_len = len(candidates_for_text)
  _entry = list(candidates_for_text)
# Only the first 100 candidates of the largest entry are shown.
print("Max Length of entry is {}, text is {}".format(max_len, _entry[:100]))

Total type of text: 4936783
Max Length of entry is 249717, text is [('fb:m.0vcp9', 'forsyth township , michigan', 'fb:common.topic.alias'), ('fb:m.01qctqj', 'wesley brown , scott', 'fb:common.topic.alias'), ('fb:m.0cdx_x', 'douglas houghton , baron houghton of sowerby', 'fb:type.object.name'), ('fb:m.015701', 'coahuila , torreón', 'fb:common.topic.alias'), ('fb:m.01qzdpj', 'j : son lindh , björn', 'fb:common.topic.alias'), ('fb:m.04ckn4d', 'symphonie no. 2 c-moll , symphonie no. 10 fis-moll ( royal concertgebouw orchestra feat. conductor : bernard haitink ) ( disc 2 )', 'fb:common.topic.alias'), ('fb:m.0bcrlj', 'belmont , dallas', 'fb:common.topic.alias'), ('fb:m.03g15zf', "collection , volume 18 : j'ai oublié de vivre : 1977", 'fb:type.object.name'), ('fb:m.06gzp7', 'adams , george', 'fb:common.topic.alias'), ('fb:m.03hh10b', 'haysville , indiana', 'fb:common.topic.alias'), ('fb:m.0dd93_', 'kakatiya university , main campus', 'fb:common.topic.alias'), ('fb:m.04bqtpp', 'dantzler , sout

In [11]:
# `inverted_index` now carries the augmented entries, so drop the original names
# and let the garbage collector reclaim whatever is no longer referenced.
del entity_2M, degrees_2M, reachability_2M, mid2name

In [12]:
# Read the step-by-step output sheet and keep only the ReVerb facts it refers to:
# inner join of reverb2freebace.reverb_no with the sheet's Reverb_no.
test_df = pd.read_excel('/content/drive/MyDrive/data_freebase/sbs.xlsx')
questions_fact = pd.merge(
    reverb2freebace,
    test_df,
    how='inner',
    left_on='reverb_no',
    right_on='Reverb_no',
)
questions_fact.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5003 entries, 0 to 5002
Data columns (total 46 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ExID                   5003 non-null   int64  
 1   arg1                   5003 non-null   object 
 2   rel                    5003 non-null   object 
 3   arg2                   5003 non-null   object 
 4   narg1                  5003 non-null   object 
 5   nrel                   5003 non-null   object 
 6   narg2                  5003 non-null   object 
 7   csents                 5003 non-null   int64  
 8   conf                   5003 non-null   float64
 9   urls                   5003 non-null   object 
 10  reverb_no              5003 non-null   int64  
 11  argument1              1924 non-null   object 
 12  relation_phrase        1924 non-null   object 
 13  argument2              1924 non-null   object 
 14  freebase_ID_argument1  5003 non-null   object 
 15  free

In [17]:
# Freebase question records; the evaluation below reads their `Answer` and `entity` columns.
freebase = pd.read_excel(io='/content/drive/MyDrive/data_freebase/test_useful_records.xlsx')


In [19]:
# Gold id per ReVerb question: the Freebase id of argument 1 when one exists; otherwise
# ('fb:m.nan' marks a missing id) the uuid of whichever argument `answer_entity` selects.
golds = [
    row['freebase_ID_argument1']
    if row['freebase_ID_argument1'] != 'fb:m.nan'
    else row[f"argument{row['answer_entity']}_uuid"]
    for _row_label, row in questions_fact.iterrows()
]
predicteds = questions_fact['node'].astype(str).tolist()

# Append the Freebase questions after the ReVerb ones, keeping both lists aligned.
golds.extend(freebase['Answer'].tolist())
predicteds.extend(freebase['entity'].tolist())

In [20]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import sys
# hits = []
def entity_linking(data_type, predicteds, golds, HITS_TOP_ENTITIES, output):
    """
    Link every predicted mention to index candidates and report top-k accuracy.

    For each (predicted, gold) pair the mention is expanded into n-grams (longest first)
    with ``get_ngram``. Candidates are collected from the module-level ``inverted_index``
    for the longest n-gram length that yields any (stop words are skipped), then ranked
    by ``fuzz.ratio`` between the candidate string and the stripped mention.

    Parameters
    ----------
    data_type : label printed before the accuracy table.
    predicteds : iterable of mentions; every item is converted with ``str``.
    golds : iterable of gold ids, aligned with ``predicteds``.
    HITS_TOP_ENTITIES : number of ranked candidates written to ``output`` and
        considered when counting hits.
    output : path of the text file receiving one line of ranked candidates per mention.

    Returns
    -------
    (hits, candidates)
        ``hits[i]`` is the smallest cutoff in (1, 3, 5, 10, 20, 50, 100) at which the gold id
        is among the ranked candidates, or -1 if it is in none of them.
        ``candidates[i]`` holds up to 100 best ``(candidate_tuple, score)`` pairs.
    """
    cutoffs = (1, 3, 5, 10, 20, 50, 100)
    stopword = set(stopwords.words('english'))
    hit_counts = dict.fromkeys(cutoffs, 0)
    total = 0
    hits = []
    candidates = []
    # The context manager closes (and flushes) the results file even if an error occurs.
    with open(output, 'w') as fout:
        for predicted, gold_id in tqdm(zip(predicteds, golds)):
            total += 1
            # get_ngram already stringifies its input; do the same here so .strip() below is safe.
            predicted = str(predicted)
            C = []
            tokens = get_ngram(predicted)
            maxlen = len(tokens[0].split()) if tokens else 0
            for item in tokens:
                n_words = len(item.split())
                # No candidates yet: back off to the next shorter n-gram length.
                if n_words < maxlen and len(C) == 0:
                    maxlen = n_words
                # Candidates already found at a longer length: stop backing off.
                if n_words < maxlen and len(C) > 0:
                    break
                if item in stopword:
                    continue
                # .get() rather than [] so that a miss never inserts an empty entry
                # into a defaultdict-backed index.
                C.extend(inverted_index.get(item, ()))

            mention = predicted.strip()
            C_scored = [
                (mid_text_type, fuzz.ratio(mid_text_type[1], mention) / 100.0)
                for mid_text_type in set(C)
            ]
            C_scored.sort(key=lambda t: t[1], reverse=True)
            candidates.append(C_scored[:100])
            cand_mids = C_scored[:HITS_TOP_ENTITIES]
            for mid_text_type, score in cand_mids:
                fout.write(" %%%% {}\t{}\t{}\t{}".format(mid_text_type[0], mid_text_type[1], mid_text_type[2], score))
            fout.write('\n')

            midList = [x[0][0] for x in cand_mids]
            first_hit = -1
            for k in cutoffs:
                if gold_id in midList[:k]:
                    hit_counts[k] += 1
                    if first_hit == -1:
                        first_hit = k
            hits.append(first_hit)

    print(data_type)
    for k in cutoffs:
        # Guard against empty input instead of dividing by zero.
        accuracy = hit_counts[k] / total if total else 0.0
        print("Top{} Entity Linking Accuracy: {}".format(k, accuracy))
    return hits, candidates

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Evaluate all mentions, keeping the 100 best candidates per mention; ranked candidates go to ./results.
hits, candidates = entity_linking(
    data_type='test',
    predicteds=predicteds,
    golds=golds,
    HITS_TOP_ENTITIES=100,
    output="./results",
)

26622it [00:59, 445.43it/s]

test
Top1 Entity Linking Accuracy: 0.5998797986627601
Top3 Entity Linking Accuracy: 0.7462249267523101
Top5 Entity Linking Accuracy: 0.7913755540530388
Top10 Entity Linking Accuracy: 0.8332206445796709
Top20 Entity Linking Accuracy: 0.8627450980392157
Top50 Entity Linking Accuracy: 0.8866351138156412
Top100 Entity Linking Accuracy: 0.9007212080234392





In [None]:
# `hits` and `candidates` cover the ReVerb questions first and the Freebase questions after them
# (that is the order in which golds/predicteds were built), so they are longer than
# questions_fact. Keep only the leading ReVerb part; assigning the full lists would
# raise a length-mismatch ValueError.
n_reverb_questions = len(questions_fact)
questions_fact['hit'] = hits[:n_reverb_questions]
questions_fact['candidates'] = candidates[:n_reverb_questions]

In [None]:

# Export only the columns needed to inspect linking results by hand.
questions_fact.to_csv(
    '/content/debug.csv',
    columns=[
        'arg1', 'rel', 'arg2',
        'freebase_ID_argument1', 'freebase_entity_name',
        'questions', 'Answer', 'node', 'Sentecne', 'answer_entity',
        'hit', 'candidates',
    ],
)

In [3]:
# Mount Google Drive at /content/drive. Every cell that reads or writes
# /content/drive/MyDrive/... needs this mount, so run this cell before them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Write the complete frame (all columns) to /content/debug.csv, replacing any file already at that path.
questions_fact.to_csv(path_or_buf='/content/debug.csv')