In [None]:
import pickle
import json
import gzip 
import time
import gc
import math
import re
import nltk

import numpy as np
import pandas as pd

from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords

import spacy
# Load the spaCy pipeline `en_core_web_lg`; the model has to be installed already
# (the commented-out shell command below downloads it).
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook —
# confirm it is still needed, since loading it happens on every run.
#!python3 -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

### Load dictionaries wiki_url2fb_id, mid2entity and entity2mid

In [None]:
# wikipedia url -> MID
# Each usable line of the mapping file is "<freebase id> <wikipedia url> ...";
# only the first two whitespace-separated fields are read.
wiki_url2fb_id = dict()
# Wikipedia titles can be non-ASCII (e.g. Željko_Obradović), so decode explicitly
# instead of relying on the platform default (assumes the file is UTF-8 — TODO confirm).
with open('../data/freebase_id_wiki_url.en_fixed', encoding='utf-8') as f:
    # Iterate lazily: no need to hold the whole file in memory via readlines().
    for line in f:
        fields = line.split()
        if len(fields) < 2:
            # Blank or truncated line: nothing to map, skip instead of raising IndexError.
            continue
        fb_id, wiki_url = fields[0], fields[1]
        wiki_url2fb_id[wiki_url] = fb_id

In [None]:
# These dictionaries contain only the entities that occur in SimpleQuestions
# (an assumption of the paper). A more general mid2entity covering all of
# Freebase could be used instead and should give better results.
#
# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising.
# Only load these files if they come from a trusted source.

with open('../data/mid2entity_list_simple.pkl', 'rb') as f:
    mid2entity = pickle.load(f)

with open('../data/entity2mid_list_simple.pkl', 'rb') as f:
    entity2mid = pickle.load(f)

### Define methods

In [None]:
def uniquify_list(l):
    """Return the rows of ``l`` with duplicates removed, as a list of lists.

    Rows are compared by content. The first occurrence of each row is kept and
    the original order is preserved, so the result is deterministic (the
    previous set-based version returned rows in arbitrary order).

    Rows may contain nested lists/tuples (e.g. a trailing ``[id, raw_form, type]``
    entry); these are converted to tuples only for the purpose of comparison.

    Parameters
    ----------
    l : iterable of iterables
        The rows to de-duplicate.

    Returns
    -------
    list of list
        One shallow ``list`` copy per distinct row.
    """
    def _freeze(value):
        # Lists are unhashable; turn (nested) lists/tuples into tuples so the
        # row can be used as a set member.
        if isinstance(value, (list, tuple)):
            return tuple(_freeze(item) for item in value)
        return value

    seen = set()
    unique_rows = []
    for row in l:
        key = _freeze(tuple(row))
        if key not in seen:
            seen.add(key)
            unique_rows.append(list(row))
    return unique_rows

def get_positions(df_entities, fb_id):
    """Return the start and end positions of every entity annotated with ``fb_id``.

    NOTE: a second function named ``get_positions`` is defined right after this
    one in the same cell and replaces it, so this two-argument version is not
    reachable once the cell has run.

    Parameters
    ----------
    df_entities : iterable of dict
        Entity annotations; each one is read through the keys ``'id_str'``,
        ``'start_position'`` and ``'end_position'``.
    fb_id : str
        Identifier to match against ``ent['id_str']``.

    Returns
    -------
    tuple of (list, list)
        ``(start_positions, end_positions)``, in annotation order.
    """
    matching = [ent for ent in df_entities if fb_id == ent['id_str']]
    starts = [ent['start_position'] for ent in matching]
    ends = [ent['end_position'] for ent in matching]
    return starts, ends

def get_positions(df_entities):
    """Return positions and descriptors of every entity whose id is in ``mid2entity``.

    Parameters
    ----------
    df_entities : iterable of dict
        Entity annotations; each one is read through the keys ``'id_str'``,
        ``'start_position'``, ``'end_position'``, ``'raw_form'`` and ``'type'``.

    Returns
    -------
    tuple of (list, list, list)
        ``(start_positions, end_positions, mids)`` where ``mids[k]`` is
        ``[id_str, raw_form, type]`` for the k-th retained entity.
    """
    # Keep only annotations whose id is a key of the module-level mid2entity.
    known = [ent for ent in df_entities if ent['id_str'] in mid2entity]

    starts = [ent['start_position'] for ent in known]
    ends = [ent['end_position'] for ent in known]
    descriptors = [[ent['id_str'], ent['raw_form'], ent['type']] for ent in known]
    return starts, ends, descriptors

def get_positions_text(df_tokens, label, stop_words=None):
    """Locate a Wikipedia title in the token stream by plain word matching.

    Intended for documents whose Freebase id is missing in the JSON: ``label``
    is the wiki title taken from the URL, e.g.
    ``en.wikipedia.org/wiki/Željko_Obradović`` -> ``Željko_Obradović``.

    The title is split on ``'_'``, lower-cased and stripped of stop words; every
    token whose lower-cased ``'raw_form'`` equals one of the remaining words
    contributes its index.

    Parameters
    ----------
    df_tokens : iterable of dict
        Tokens of the document; each one is read through the key ``'raw_form'``.
    label : str
        Wiki title with words separated by underscores.
    stop_words : collection of str, optional
        Lower-case words to ignore in ``label``. Defaults to NLTK's English
        stop-word list (requires the NLTK ``stopwords`` corpus to be installed).
        Pass a set when calling this in a loop to avoid reloading the corpus.

    Returns
    -------
    tuple of (list, list)
        ``(start_positions, end_positions)``. Every match is a single token, so
        both lists hold the same indices (grouped by title word, in title order).
    """
    if stop_words is None:
        # The original code referenced an undefined global `stopWords`
        # (NameError on any call); fall back to the NLTK list imported by this notebook.
        stop_words = set(stopwords.words('english'))

    token_forms = [tok['raw_form'].lower() for tok in df_tokens]

    # Lower-case BEFORE the stop-word check: the stop-word list is lower-case, so a
    # capitalised title word such as "The" would otherwise slip through and then
    # match every "the" in the document.
    label_words = [
        word
        for word in (part.lower() for part in label.split('_'))
        if word not in stop_words
    ]

    positions = [
        idx
        for word in label_words
        for idx, form in enumerate(token_forms)
        if form == word
    ]
    # Return two independent lists so mutating one does not silently change the other.
    return positions, list(positions)

In [None]:
def find_sentence(df_tokens, start_pos, end_pos):
    """Return the raw forms of the tokens surrounding a mention.

    Tokens that carry a ``'section_break'`` key act as boundaries:

    * going left from ``start_pos`` the scan stops *before* the first such
      token (it is not included); the token at ``start_pos`` itself is always
      included, even if it carries the marker;
    * going right from ``end_pos`` the scan stops *at* the first such token
      (it is included), or at the end of ``df_tokens``.

    Parameters
    ----------
    df_tokens : sequence of dict
        Tokens of the document; each one is read through the key ``'raw_form'``.
    start_pos, end_pos : int
        Token indices of the first and last token of the mention.

    Returns
    -------
    list of str
        Left context up to the mention start, followed by the rest of the
        mention and the right context.
    """
    # --- left part: from the boundary up to (and including) start_pos ---------
    idx = start_pos
    left_part = [df_tokens[idx]['raw_form']]  # the anchor token is kept unconditionally
    while idx != 0:
        idx -= 1
        token = df_tokens[idx]
        if 'section_break' in token.keys():
            break
        left_part.append(token['raw_form'])
    left_part.reverse()  # collected right-to-left, restore reading order

    # --- right part: from end_pos up to (and including) the next boundary -----
    right_part = []
    for idx in range(end_pos, len(df_tokens)):
        token = df_tokens[idx]
        right_part.append(token['raw_form'])
        if 'section_break' in token.keys():
            break

    # --- stitch the two halves together without duplicating/losing tokens -----
    gap = end_pos - start_pos
    if gap == 0:
        # Single-token mention: it is already the last element of left_part.
        right_part = right_part[1:]
    elif gap > 1:
        # Tokens strictly inside the mention were visited by neither scan.
        inner = [df_tokens[k]['raw_form'] for k in range(start_pos + 1, end_pos)]
        right_part = inner + right_part

    return left_part + right_part

def find_sentences (df_tokens, start_positions, end_positions, mids):
    """Build one context entry per mention.

    Parameters
    ----------
    df_tokens : sequence of dict
        Tokens of the document, forwarded to ``find_sentence``.
    start_positions, end_positions : sequence of int
        Parallel sequences with the token span of each mention.
    mids : sequence
        Parallel sequence; ``mids[k]`` is appended as the last element of the
        k-th entry.

    Returns
    -------
    list of list
        For each mention, the list returned by ``find_sentence`` with the
        matching ``mids`` item appended. Duplicates are not removed.
    """
    return [
        find_sentence(df_tokens, start_positions[k], end_positions[k]) + [mids[k]]
        for k in range(len(start_positions))
    ]

## Run

In [None]:
# Three result dictionaries, one per way of resolving a document to a Freebase id:
#   fb_id2sentences          - the id embedded in the document id is in mid2entity
#   fb_id2sentences_wikiurl  - the id is recovered from the Wikipedia URL
#   fb_id2sentences_ent2mid  - the id(s) are recovered from the lower-cased title
fb_id2sentences = dict()
fb_id2sentences_wikiurl = dict()
fb_id2sentences_ent2mid = dict()

# Directory holding the DAWT splits; the save cell writes its pickles here too.
path_pickle = '../data/DAWT/part00/'
wiki_url_prefix = 'https://en.wikipedia.org/wiki/'


def _extract_sentences(df_entities, df_tokens):
    """Return the context entries for every known-MID entity of one document.

    ``df_entities`` is expected to be a list of entity annotations; any other
    value (presumably NaN for documents without annotations — TODO confirm)
    yields an empty list.
    """
    if not isinstance(df_entities, list):
        return []
    start_positions, end_positions, mids = get_positions(df_entities)
    return find_sentences(df_tokens, start_positions, end_positions, mids)


for i in range(1, 41):
    input_file = path_pickle + 'split_{}'.format(i)
    df = pd.read_json(input_file, lines=True)
    print('---------------------------------------------',i,'--------------------------------------------------')

    for doc_id, df_entities, df_tokens in zip(df['id'], df['entities'], df['tokens']):
        # The document id is colon-separated; field 3 is read as the Freebase id
        # and field 4 as the Wikipedia title.
        id_fields = doc_id.split(':')
        fb_id, label = id_fields[3], id_fields[4]
        wiki_url = wiki_url_prefix + label
        entity_name = ' '.join(label.lower().split('_'))

        if fb_id in mid2entity:
            fb_id2sentences[fb_id] = _extract_sentences(df_entities, df_tokens)

        elif wiki_url in wiki_url2fb_id:
            fb_id = wiki_url2fb_id[wiki_url]
            if fb_id in mid2entity:
                # The former "retry when empty" step repeated the identical call and
                # could only reproduce the same empty result, so it was dropped.
                fb_id2sentences_wikiurl[fb_id] = _extract_sentences(df_entities, df_tokens)

        elif entity_name in entity2mid:
            sentences = _extract_sentences(df_entities, df_tokens)
            # entity2mid maps a name to several MIDs; all of them share this result.
            for mid in entity2mid[entity_name]:
                fb_id2sentences_ent2mid[mid] = sentences

    # Release the split before loading the next one.
    df = None
    gc.collect()

In [None]:
# Persist each result dictionary as its own pickle file under path_pickle.
for file_name, mapping in (
    ('fb_id2sentences_new.pickle', fb_id2sentences),
    ('fb_id2sentences_wikiurl.pickle', fb_id2sentences_wikiurl),
    ('fb_id2sentences_ent2mid.pickle', fb_id2sentences_ent2mid),
):
    with open(path_pickle+file_name, 'wb') as handle:
        pickle.dump(mapping, handle)

## Note:
To create the final `fb_id2sentences_idstr_label_type` dictionary, combine the three dictionaries saved above (`fb_id2sentences`, `fb_id2sentences_wikiurl` and `fb_id2sentences_ent2mid`) into a single dictionary.