<a href="https://colab.research.google.com/github/meti-94/OpenQA/blob/main/reverbrels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install fuzzywuzzy -q
!pip install pattern -q

[K     |████████████████████████████████| 22.2 MB 61.8 MB/s 
[K     |████████████████████████████████| 87 kB 6.2 MB/s 
[K     |████████████████████████████████| 81 kB 9.4 MB/s 
[K     |████████████████████████████████| 5.6 MB 46.5 MB/s 
[K     |████████████████████████████████| 5.6 MB 41.5 MB/s 
[K     |████████████████████████████████| 419 kB 45.8 MB/s 
[K     |████████████████████████████████| 97 kB 7.0 MB/s 
[K     |████████████████████████████████| 3.6 MB 38.5 MB/s 
[?25h  Building wheel for pattern (setup.py) ... [?25l[?25hdone
  Building wheel for mysqlclient (setup.py) ... [?25l[?25hdone
  Building wheel for python-docx (setup.py) ... [?25l[?25hdone
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone


In [7]:
def get_tf_idf_query_similarity(vectorizer, docs_tfidf, query):
    """
    Score a single query string against every document vector.

    vectorizer: TfIdfVectorizer model used to vectorize the query
    docs_tfidf: tfidf vectors for all docs
    query: query doc (a single string)
    return: flat array holding the cosine similarity between the query
            and each row of docs_tfidf
    """
    # transform() works on a batch, so the lone query is wrapped in a list.
    query_vector = vectorizer.transform([query])
    similarity_matrix = cosine_similarity(query_vector, docs_tfidf)
    # One query row vs. all docs -> collapse the matrix to one dimension.
    return similarity_matrix.flatten()

In [9]:
import pandas as pd
import networkx as nx
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,PAST
import sys


#### pattern python>=3.7 compatibility problem
def pattern_stopiteration_workaround():
    """Make one throw-away call to ``pattern.en.lexeme`` and ignore its failure.

    This is a best-effort warm-up for the ``pattern`` / Python >= 3.7
    incompatibility: the first call may raise, so it is made here and any
    ordinary error is discarded on purpose.
    NOTE(review): presumably the error is the RuntimeError that PEP 479
    produces when a generator leaks StopIteration -- confirm against the
    installed ``pattern`` version.

    Returns None. On success the lexeme list for 'gave' is printed.
    """
    try:
        print(lexeme('gave'))
    except Exception:
        # Deliberately ignored, but unlike a bare ``except`` this lets
        # KeyboardInterrupt / SystemExit propagate.
        pass


pattern_stopiteration_workaround()


class ReverbKnowledgeBase:
    """Tf-idf searchable index over a tab-separated ReVerb tuples file.

    Attributes set by ``__init__``:
        KB: DataFrame of the tuples after dropping rows with missing values
            and exact duplicates.
        is_facts: rows of ``KB`` whose ``rel`` contains ``'is '`` or ``'Is '``.
        nodes: every ``arg1`` value followed by every ``arg2`` value.
        edges: every ``rel`` value.
        nodes_vectorizer / edges_vectorizer: TfidfVectorizer fitted on
            ``nodes`` / ``edges``.
        nodes_tfidf / edges_tfidf: the corresponding tf-idf matrices.
        relations: dict mapping a ``rel`` string to a list of
            ``(argument, row_index, conf)`` tuples; each row contributes one
            tuple for ``arg1`` and one for ``arg2``.
    """

    def __init__(self, path='../data/reverb_wikipedia_tuples-1.1.txt'):
        """Load the tuples file at ``path`` and build the tf-idf indexes."""
        super().__init__()
        df = pd.read_csv(path, sep='\t', header=None)
        df.columns = ['ExID', 'arg1', 'rel', 'arg2', 'narg1', 'nrel', 'narg2', 'csents', 'conf', 'urls']
        df = df.dropna().drop_duplicates()
        self.KB = df
        self.is_facts = df[df['rel'].apply(lambda rel: 'is ' in rel or 'Is ' in rel)]
        self.nodes = df['arg1'].to_list() + df['arg2'].to_list()
        self.edges = df['rel'].to_list()
        self.nodes_vectorizer = TfidfVectorizer()
        self.edges_vectorizer = TfidfVectorizer()
        self.nodes_tfidf = self.nodes_vectorizer.fit_transform(self.nodes)
        self.edges_tfidf = self.edges_vectorizer.fit_transform(self.edges)
        self.relations = {}
        # Column-wise iteration avoids building a Series per row (iterrows).
        rows = zip(df.index, df['rel'], df['arg1'], df['arg2'], df['conf'])
        for index, rel, arg1, arg2, conf in tqdm(rows, total=df.shape[0], desc='Indexing ...'):
            self.relations.setdefault(rel, []).extend(
                ((arg1, index, conf), (arg2, index, conf))
            )

    @staticmethod
    def _top_matches(vectorizer, docs_tfidf, labels, search_phrase, cutoff):
        """Return a {label: similarity} dict of the ``cutoff`` best labels, best first."""
        similarities = get_tf_idf_query_similarity(vectorizer, docs_tfidf, search_phrase)
        # Repeated labels collapse to one entry (the last similarity wins).
        ranks = dict(zip(labels, similarities))
        best = sorted(ranks.items(), key=lambda item: item[1], reverse=True)[:cutoff]
        return dict(best)

    def tfidf_nodes_query(self, search_phrase, cutoff=50):
        """Return the ``cutoff`` argument strings most similar to ``search_phrase``.

        The result is a dict ``{argument: cosine similarity}`` ordered from
        most to least similar.
        """
        return self._top_matches(
            self.nodes_vectorizer, self.nodes_tfidf, self.nodes, search_phrase, cutoff
        )

    def tfidf_edges_query(self, search_phrase, cutoff=50):
        """Return the ``cutoff`` relation strings most similar to ``search_phrase``.

        The result is a dict ``{relation: cosine similarity}`` ordered from
        most to least similar.
        """
        return self._top_matches(
            self.edges_vectorizer, self.edges_tfidf, self.edges, search_phrase, cutoff
        )

    def _match_by_relation(self, node, edge):
        """Pair the best-matching arguments with the best-matching relations."""
        nodes = self.tfidf_nodes_query(node)
        edges = self.tfidf_edges_query(edge)

        # Group each candidate relation's facts by argument once, instead of
        # rescanning the whole fact list for every candidate argument.
        facts_by_argument = {}
        for rel in edges:
            grouped = facts_by_argument[rel] = {}
            for fact in self.relations[rel]:
                if fact[0] in nodes:
                    grouped.setdefault(fact[0], []).append(fact)

        # Candidate-major, relation-minor order keeps ties in the same
        # relative order after the stable sort in tfidf_query.
        pruned = []
        for candidate, node_score in nodes.items():
            for rel, edge_score in edges.items():
                for fact in facts_by_argument[rel].get(candidate, ()):
                    pruned.append((fact[1], fact[-1], node_score, edge_score))
        return pruned

    def _match_is_relation(self, node, edge):
        """Score every KB row that mentions a best-matching argument against ``edge``."""
        nodes = self.tfidf_nodes_query(node)
        candidates = list(nodes)
        as_arg1 = self.KB.loc[self.KB['arg1'].isin(candidates)]
        as_arg2 = self.KB.loc[self.KB['arg2'].isin(candidates)]

        # The query edge never changes inside the loops: vectorize it once.
        query_vector = self.edges_vectorizer.transform([edge])
        similarity_cache = {}

        def edge_similarity(rel):
            # Identical relation strings always score the same; compute once.
            if rel not in similarity_cache:
                rel_vector = self.edges_vectorizer.transform([rel])
                similarity_cache[rel] = cosine_similarity(rel_vector, query_vector).flatten().item()
            return similarity_cache[rel]

        pruned = []
        for candidate, node_score in nodes.items():
            # Rows where the candidate is arg1 come first, then arg2 rows.
            for column, subset in (('arg1', as_arg1), ('arg2', as_arg2)):
                matching = subset.loc[subset[column] == candidate]
                for idx, rel, conf in zip(matching.index, matching['rel'], matching['conf']):
                    pruned.append((idx, conf, node_score, edge_similarity(rel)))
        return pruned

    def tfidf_query(self, node='Bill Gates', edge='Born'):
        """Find KB rows matching an (argument, relation) pair by tf-idf similarity.

        A leading ``'did'`` followed by a verb is rewritten to the verb's past
        tense (``'did die'`` is queried as the past tense of ``'die'``).
        If the resulting relation is exactly ``'is'``, every row mentioning a
        matching argument is scored against it; otherwise only rows whose
        relation is among the best relation matches are considered.

        Returns at most 100 tuples
        ``(row_index, conf, node_similarity, edge_similarity)`` sorted by
        ``node_similarity + edge_similarity``, highest first.
        """
        words = edge.split()
        if len(words) >= 2 and words[0] == 'did':
            words[1] = conjugate(verb=words[1], tense=PAST)
            words = words[1:]
        # Joining split() output also normalizes surrounding whitespace.
        edge = ' '.join(words)

        if edge != 'is':
            pruned = self._match_by_relation(node, edge)
        else:
            pruned = self._match_is_relation(node, edge)
        pruned.sort(key=lambda match: match[2] + match[3], reverse=True)
        return pruned[:100]

if __name__ == '__main__':
    # Demo run: index the ReVerb tuples file, then look up one
    # (argument, relation) pair and print the ranked matches.
    # A small local sample such as './sample_reverb_tuples.txt' can be
    # substituted for the full file.
    tuples_path = r'/content/drive/MyDrive/data_freebase/reverb_wikipedia_tuples-1.1.txt'
    RKBG = ReverbKnowledgeBase(tuples_path)
    matches = RKBG.tfidf_query(node='abegg', edge='did die')
    print(matches)

['give', 'gives', 'giving', 'gave', 'given']


Indexing ...: 100%|██████████| 407236/407236 [00:46<00:00, 8720.69it/s]


[(17829, 0.95825, 1.0, 0.9271252473625466)]
