In [None]:
import numpy as np
import os
import pandas as pd
import time
import matplotlib.pyplot as plt

## Freebase 2M subset

In [None]:
# Path of the FB2M triples file (tab-separated, no header row).
data_path = '../data/SimpleQuestions_v2/freebase-subsets/freebase-FB2M.txt'

# Read the file into a three-column DataFrame and report how long the read took.
start = time.time()
df = pd.read_csv(
    data_path,
    sep="\t",
    header=None,
    names=["subject", "relation", "object"],
)
finish = time.time()
elapsed = finish - start
print("time taken: {}s".format(elapsed))

In [None]:
# Display summary statistics of the loaded triples DataFrame.
df.describe()

In [None]:
# Count distinct (subject, relation, object) rows by comparing the columns
# themselves. Joining the three strings without a separator would let different
# triples collapse into the same key (e.g. "ab"+"c" and "a"+"bc") and would
# allocate a full-size temporary string Series.
triple_cols = ['subject', 'relation', 'object']
n_unique_triples = int((~df.duplicated(subset=triple_cols)).sum())
print('Number of unique queries that can be answered: ', n_unique_triples)

### Create Inverted Index

In [None]:
import pickle
from collections import defaultdict
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared tokenizer: each token is a maximal run of word characters (\w+),
# so punctuation and whitespace never appear in tokens.
_tokenizer = RegexpTokenizer(r'\w+')

In [None]:
def create_ngrams(text):
    """Return every contiguous n-gram of a token sequence as a space-joined string.

    Args:
        text: Iterable of string tokens (for example the output of a
            tokenizer). Any iterable is accepted, including one-shot
            iterators; it is materialised into a list once.

    Returns:
        list[str]: All n-grams for n = 1 .. number of tokens, ordered by
        n-gram length and, within one length, by start position. An empty
        input yields an empty list.
    """
    tokens = list(text)
    count = len(tokens)
    # A window of `size` tokens can start at positions 0 .. count - size.
    return [
        ' '.join(tokens[start:start + size])
        for size in range(1, count + 1)
        for start in range(count - size + 1)
    ]

In [None]:
# mid2ent: dictionary of mid -> list of entity names sharing that mid.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
with open('../data/mid2ent.pkl', mode='rb') as pkl_in:
    mid2ent = pickle.load(pkl_in)

In [None]:
# Inverted index: word n-gram of an entity's first name -> list of
# (mid, names, score) tuples, one per entity whose first name contains the n-gram.
# The score is the tf-idf weight of that n-gram in the first name, computed with
# the entity's own list of names as the corpus.

inv_index = defaultdict(list)
for i, (k, v) in enumerate(mid2ent.items()):
    ngr = create_ngrams(_tokenizer.tokenize(v[0]))

    # A fitted vectorizer depends only on the n-gram length, so fit once per
    # length for this entity instead of once per n-gram.
    fitted_by_len = {}
    for gram in ngr:
        n = len(gram.split())
        if n not in fitted_by_len:
            vectorizer = TfidfVectorizer(ngram_range=(n, n), token_pattern='(?u)\\b\\w+\\b')
            X = vectorizer.fit_transform(v)
            # vocabulary_ maps term -> column index; it replaces the list search
            # over get_feature_names(), which scikit-learn removed in 1.2.
            fitted_by_len[n] = (vectorizer.vocabulary_, X)
        vocabulary, X = fitted_by_len[n]

        # TfidfVectorizer lowercases its input by default, so the lookup key
        # must be lowercased too; the index key keeps the original casing.
        # Row 0 is v[0]; indexing the sparse matrix avoids densifying it.
        score = X[0, vocabulary[gram.lower()]]
        inv_index[gram].append((k, v, score))

    # Progress report every 1000 entities.
    if i % 1000 == 0:
        print(i)

In [None]:
# Write the inverted index to disk with pickle.
with open('../data/inverted_index.pkl', mode='wb') as pkl_out:
    pickle.dump(inv_index, pkl_out)