<a href="https://colab.research.google.com/github/meti-94/OpenQA/blob/main/reverbrels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install fuzzywuzzy -q
!pip install pattern -q
!pip install transformers -q

[K     |████████████████████████████████| 22.2 MB 2.0 MB/s 
[K     |████████████████████████████████| 87 kB 5.7 MB/s 
[K     |████████████████████████████████| 81 kB 8.6 MB/s 
[K     |████████████████████████████████| 5.6 MB 12.7 MB/s 
[K     |████████████████████████████████| 5.6 MB 12.0 MB/s 
[K     |████████████████████████████████| 419 kB 41.5 MB/s 
[K     |████████████████████████████████| 97 kB 6.0 MB/s 
[K     |████████████████████████████████| 3.6 MB 33.5 MB/s 
[?25h  Building wheel for pattern (setup.py) ... [?25l[?25hdone
  Building wheel for mysqlclient (setup.py) ... [?25l[?25hdone
  Building wheel for python-docx (setup.py) ... [?25l[?25hdone
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 3.1 MB 5.3 MB/s 
[K     |████████████████████████████████| 895 kB 46.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 47.3 MB/s 
[K     |████████████████████████████████| 59 kB 5.7 MB/s 
[K     |████████

In [5]:
import pandas as pd
import networkx as nx
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,PAST
from transformers import Trainer, TrainingArguments
from transformers import AutoConfig, AutoTokenizer, AutoModel, BertForSequenceClassification
import sys




In [3]:
def get_tf_idf_query_similarity(vectorizer, docs_tfidf, query):
    """
    vectorizer: TfIdfVectorizer model
    docs_tfidf: tfidf vectors for all docs
    query: query doc
    return: cosine similarity between query and all docs
    """
    query_tfidf = vectorizer.transform([query])
    cosineSimilarities = cosine_similarity(query_tfidf, docs_tfidf).flatten()
    return cosineSimilarities

In [None]:
#### pattern python>=3.7 compatibility problem
def pattern_stopiteration_workaround():
    try:
        print(lexeme('gave'))
    except:
        pass
pattern_stopiteration_workaround()


class ReverbKnowledgeBase:
	def __init__(self, path='../data/reverb_wikipedia_tuples-1.1.txt'):
		super().__init__()
		df = pd.read_csv(path, sep='\t', header=None)
		reverb_columns_name = ['ExID', 'arg1', 'rel', 'arg2', 'narg1', 'nrel', 'narg2', 'csents', 'conf', 'urls']
		df.columns = reverb_columns_name
		df = df.dropna()
		df = df.drop_duplicates()
		self.KB = df
		self.is_facts = self.KB[(self.KB.rel.apply(lambda rg:rg.find('is ')!=-1))|(self.KB.rel.apply(lambda rg:rg.find('Is ')!=-1))]
		self.nodes = self.KB['arg1'].to_list()+self.KB['arg2'].to_list()
		self.edges = self.KB['rel'].to_list()
		self.nodes_vectorizer = TfidfVectorizer()
		self.edges_vectorizer = TfidfVectorizer()
		self.nodes_tfidf = self.nodes_vectorizer.fit_transform(self.nodes)
		self.edges_tfidf = self.edges_vectorizer.fit_transform(self.edges)
		self.relations = {}
		for index, row in tqdm(df.iterrows(), total=df.shape[0], desc='Indexing ...'):
			if row['rel'] in self.relations:
				self.relations[row['rel']].append((row['arg1'], index, row['conf']))
				self.relations[row['rel']].append((row['arg2'], index, row['conf']))
			else:
				self.relations[row['rel']] = [(row['arg1'], index, row['conf'])]
				self.relations[row['rel']].append((row['arg2'], index, row['conf']))
		


	def tfidf_nodes_query(self, search_phrase, cutoff=50):
		similarities = get_tf_idf_query_similarity(self.nodes_vectorizer, self.nodes_tfidf, search_phrase)
		ranks = {k:v for k,v in zip(self.nodes, similarities)}
		sorted_ranks = {k: v for k, v in sorted(ranks.items(), key=lambda item:item[1], reverse=True)[:min(len(ranks), cutoff)]}

		return sorted_ranks

	def tfidf_edges_query(self, search_phrase, cutoff=50):
		similarities = get_tf_idf_query_similarity(self.edges_vectorizer, self.edges_tfidf, search_phrase)
		ranks = {k:v for k,v in zip(self.edges, similarities)}
		sorted_ranks = {k: v for k, v in sorted(ranks.items(), key=lambda item:item[1], reverse=True)[:min(len(ranks), cutoff)]}
		return sorted_ranks

	def tfidf_query(self, node='Bill Gates', edge='Born'):
		# print(edge)
		edge_list = edge.split()
		if len(edge_list)>=2 and edge_list[0]=='did':
			edge_list[1] = conjugate(verb=edge_list[1],tense=PAST)
			edge = ' '.join(edge_list[1:])
		else:
			edge = ' '.join(edge_list)
		# print(edge)
		if edge.strip()!='is':
			nodes = self.tfidf_nodes_query(node)
			edges = self.tfidf_edges_query(edge)
			pruned = []
			for node in nodes.keys():
				for edge in edges.keys():
					for item in self.relations[edge]:
						if item[0]==node:
							pruned.append((item[1], item[-1], nodes[node], edges[edge]))
			sorted_pruned = sorted(pruned, key=lambda x:x[2]+x[3], reverse=True)
			return sorted_pruned[:min(len(sorted_pruned), 100)]
		else:
			nodes = self.tfidf_nodes_query(node)
			arg1 = self.KB.loc[self.KB['arg1'].isin(nodes.keys())]
			arg2 = self.KB.loc[self.KB['arg2'].isin(nodes.keys())]
			# print(self.KB.loc[self.KB['arg2'].isin(nodes.keys())][:10])
			
			pruned = []
			for node, similarity in nodes.items():
				for idx, row in arg1.loc[arg1['arg1']==node].iterrows():
					temp1 = self.edges_vectorizer.transform([row['rel']])
					temp2 = self.edges_vectorizer.transform([edge])
					edge_similarity = cosine_similarity(temp1, temp2).flatten().item()
					pruned.append((idx, row['conf'], similarity, edge_similarity))
				for idx, row in arg2.loc[arg2['arg2']==node].iterrows():
					temp1 = self.edges_vectorizer.transform([row['rel']])
					temp2 = self.edges_vectorizer.transform([edge])
					edge_similarity = cosine_similarity(temp1, temp2).flatten().item()
					pruned.append((idx, row['conf'], similarity, edge_similarity))
			sorted_pruned = sorted(pruned, key=lambda x:x[2]+x[3], reverse=True)
			return sorted_pruned[:min(len(sorted_pruned), 100)]

if __name__=='__main__':
	RKBG = ReverbKnowledgeBase(r'/content/drive/MyDrive/data_freebase/reverb_wikipedia_tuples-1.1.txt') #	'./sample_reverb_tuples.txt'
	# print(len(RKBG.nodes_vectorizer.vocabulary_), len(RKBG.edges_vectorizer.vocabulary_))
	# print(RKBG.tfidf_query(node='fishkind', edge='grew up in'))
	print(RKBG.tfidf_query(node='abegg', edge='did die'))

['give', 'gives', 'giving', 'gave', 'given']


Indexing ...: 100%|██████████| 407236/407236 [00:46<00:00, 8720.69it/s]


[(17829, 0.95825, 1.0, 0.9271252473625466)]


In [15]:
from sklearn.preprocessing import LabelEncoder
import numpy as np 
le = LabelEncoder()
le.classes_ = np.load('/content/drive/MyDrive/data_freebase/classes.npy')
config = AutoConfig.from_pretrained("/content/drive/MyDrive/data_freebase/classifier")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/data_freebase/classifier", num_labels=len(le.classes_))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    "LABEL_1552": 1552,
    "LABEL_1553": 1553,
    "LABEL_1554": 1554,
    "LABEL_1555": 1555,
    "LABEL_1556": 1556,
    "LABEL_1557": 1557,
    "LABEL_1558": 1558,
    "LABEL_1559": 1559,
    "LABEL_156": 156,
    "LABEL_1560": 1560,
    "LABEL_1561": 1561,
    "LABEL_1562": 1562,
    "LABEL_1563": 1563,
    "LABEL_1564": 1564,
    "LABEL_1565": 1565,
    "LABEL_1566": 1566,
    "LABEL_1567": 1567,
    "LABEL_1568": 1568,
    "LABEL_1569": 1569,
    "LABEL_157": 157,
    "LABEL_1570": 1570,
    "LABEL_1571": 1571,
    "LABEL_1572": 1572,
    "LABEL_1573": 1573,
    "LABEL_1574": 1574,
    "LABEL_1575": 1575,
    "LABEL_1576": 1576,
    "LABEL_1577": 1577,
    "LABEL_1578": 1578,
    "LABEL_1579": 1579,
    "LABEL_158": 158,
    "LABEL_1580": 1580,
    "LABEL_1581": 1581,
    "LABEL_1582": 1582,
    "LABEL_1583": 1583,
    "LABEL_1584": 1584,
    "LABEL_1585": 1585,
    "LABEL_1586": 1586,
    "LABEL_1587": 1587,
    "

In [7]:
# convert raw text file to proper dataset object (based on task)
import torch
class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        # initialization
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        # slicing method X[index]
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

In [8]:
test_texts = pd.read_excel('/content/drive/MyDrive/data_freebase/article_step_by_step_output.xlsx')['Question'].to_list()
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=200)
test_dataset = ClassificationDataset(test_encodings, [1 for _ in test_texts])
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,   # batch size for evaluation
  )
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
)
pred = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 26622
  Batch size = 128


In [31]:
def softmax(x):
    f_x = np.exp(x) / np.sum(np.exp(x))
    return f_x

freebase = []
for idx, item in enumerate(pred.predictions):
  temp = softmax(item)
  indices = temp.argsort()
  fb = [(le.inverse_transform([indices[-i]])[0], temp[indices[-i]]) for i in range(10)]
  freebase.append(sorted(fb, key=lambda item:item[-1], reverse=True))

debug = pd.DataFrame({
                      'Question':test_texts,
                      'Freebase':freebase,
                     })


In [32]:
debug.Freebase[0]

[('fb:music.album.genre', 0.99334216),
 ('fb:music.artist.genre', 0.0015634565),
 ('fb:music.album.album_content_type', 0.0009308751),
 ('fb:film.film.genre', 0.0003940534),
 ('fb:music.genre.parent_genre', 0.00027316765),
 ('fb:media_common.netflix_title.netflix_genres', 0.00012869545),
 ('fb:tv.tv_program.genre', 0.00012649997),
 ('fb:film.film.music', 0.00011775878),
 ('fb:broadcast.content.genre', 0.000111410765),
 ('fb:astronomy.astronomer.astronomical_objects_discovered', 2.580119e-08)]

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
