<a href="https://colab.research.google.com/github/meti-94/OpenQA/blob/main/reverbrels0.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install fuzzywuzzy -q
!pip install pattern -q
!pip install transformers -q

[K     |████████████████████████████████| 22.2 MB 1.6 MB/s 
[K     |████████████████████████████████| 87 kB 5.7 MB/s 
[K     |████████████████████████████████| 81 kB 8.4 MB/s 
[K     |████████████████████████████████| 5.6 MB 18.5 MB/s 
[K     |████████████████████████████████| 5.6 MB 38.3 MB/s 
[K     |████████████████████████████████| 419 kB 44.1 MB/s 
[K     |████████████████████████████████| 97 kB 6.3 MB/s 
[K     |████████████████████████████████| 3.6 MB 36.3 MB/s 
[?25h  Building wheel for pattern (setup.py) ... [?25l[?25hdone
  Building wheel for mysqlclient (setup.py) ... [?25l[?25hdone
  Building wheel for python-docx (setup.py) ... [?25l[?25hdone
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 3.1 MB 5.2 MB/s 
[K     |████████████████████████████████| 59 kB 6.5 MB/s 
[K     |████████████████████████████████| 3.3 MB 36.7 MB/s 
[K     |████████████████████████████████| 895 kB 42.2 MB/s 
[K     |████████

In [3]:
import pandas as pd
import networkx as nx
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,PAST
from transformers import Trainer, TrainingArguments
from transformers import AutoConfig, AutoTokenizer, AutoModel, BertForSequenceClassification
import sys






In [4]:
def get_tf_idf_query_similarity(vectorizer, docs_tfidf, query):
    """
    vectorizer: TfIdfVectorizer model
    docs_tfidf: tfidf vectors for all docs
    query: query doc
    return: cosine similarity between query and all docs
    """
    query_tfidf = vectorizer.transform([query])
    cosineSimilarities = cosine_similarity(query_tfidf, docs_tfidf).flatten()
    return cosineSimilarities

In [5]:
#### pattern python>=3.7 compatibility problem
def pattern_stopiteration_workaround():
    try:
        print(lexeme('gave'))
    except:
        pass
pattern_stopiteration_workaround()


class ReverbKnowledgeBase:
	def __init__(self, path='../data/reverb_wikipedia_tuples-1.1.txt'):
		super().__init__()
		df = pd.read_csv(path, sep='\t', header=None)
		reverb_columns_name = ['ExID', 'arg1', 'rel', 'arg2', 'narg1', 'nrel', 'narg2', 'csents', 'conf', 'urls']
		df.columns = reverb_columns_name
		df = df.dropna()
		df = df.drop_duplicates()
		self.KB = df
		self.is_facts = self.KB[(self.KB.rel.apply(lambda rg:rg.find('is ')!=-1))|(self.KB.rel.apply(lambda rg:rg.find('Is ')!=-1))]
		self.nodes = self.KB['arg1'].to_list()+self.KB['arg2'].to_list()
		self.edges = self.KB['rel'].to_list()
		self.nodes_vectorizer = TfidfVectorizer()
		self.edges_vectorizer = TfidfVectorizer()
		self.nodes_tfidf = self.nodes_vectorizer.fit_transform(self.nodes)
		self.edges_tfidf = self.edges_vectorizer.fit_transform(self.edges)
		self.relations = {}
		for index, row in tqdm(df.iterrows(), total=df.shape[0], desc='Indexing ...'):
			if row['rel'] in self.relations:
				self.relations[row['rel']].append((row['arg1'], index, row['conf']))
				self.relations[row['rel']].append((row['arg2'], index, row['conf']))
			else:
				self.relations[row['rel']] = [(row['arg1'], index, row['conf'])]
				self.relations[row['rel']].append((row['arg2'], index, row['conf']))
		


	def tfidf_nodes_query(self, search_phrase, cutoff=50):
		similarities = get_tf_idf_query_similarity(self.nodes_vectorizer, self.nodes_tfidf, search_phrase)
		ranks = {k:v for k,v in zip(self.nodes, similarities)}
		sorted_ranks = {k: v for k, v in sorted(ranks.items(), key=lambda item:item[1], reverse=True)[:min(len(ranks), cutoff)]}

		return sorted_ranks

	def tfidf_edges_query(self, search_phrase, cutoff=50):
		similarities = get_tf_idf_query_similarity(self.edges_vectorizer, self.edges_tfidf, search_phrase)
		ranks = {k:v for k,v in zip(self.edges, similarities)}
		sorted_ranks = {k: v for k, v in sorted(ranks.items(), key=lambda item:item[1], reverse=True)[:min(len(ranks), cutoff)]}
		return sorted_ranks
		
	def tfidf_query(self, node='Bill Gates', edge='Born'):
		# print(edge)
		edge_list = edge.split()
		if len(edge_list)>=2 and edge_list[0]=='did':
			edge_list[1] = conjugate(verb=edge_list[1],tense=PAST)
			edge = ' '.join(edge_list[1:])
		else:
			edge = ' '.join(edge_list)
		# print(edge)
		# if edge.strip()!='is':
		edges = self.tfidf_edges_query(edge)
		return edges

if __name__=='__main__':
	RKBG = ReverbKnowledgeBase(r'/content/drive/MyDrive/data_freebase/reverb_wikipedia_tuples-1.1.txt') #	'./sample_reverb_tuples.txt'
	# print(len(RKBG.nodes_vectorizer.vocabulary_), len(RKBG.edges_vectorizer.vocabulary_))
	# print(RKBG.tfidf_query(node='fishkind', edge='grew up in'))
	print(RKBG.tfidf_query(node='abegg', edge='did die'))

Indexing ...: 100%|██████████| 407236/407236 [01:01<00:00, 6596.66it/s]


{'died': 1.0, 'died in': 0.9271252473625466, 'Died in': 0.9271252473625466, 'Died In': 0.9271252473625466, 'died of': 0.8760298375392068, 'Died of': 0.8760298375392068, 'died on': 0.80634142038981, 'Died on': 0.80634142038981, 'died for': 0.7965943041773649, 'died in in': 0.7776658632886875, 'died by': 0.7370834320223649, 'has died in': 0.7253688801545741, 'also died in': 0.7188263606915911, 'died at': 0.7015177318696876, 'Died at': 0.7015177318696876, 'has died of': 0.7001002972355879, 'also died of': 0.694212534436586, 'died from': 0.688677284452512, 'had died': 0.6853224464248049, 'died as': 0.6613942606179176, 'had died in': 0.6604505481461331, 'had died of': 0.6412071872534336, 'have died in': 0.6361575357905429, 'Then died': 0.6196817380066182, 'have died of': 0.6189065482665749, 'had died on': 0.6123124651162577, 'then died in': 0.6011115775184189, 'has died at': 0.6009314223178687, 'died out in': 0.5955113445095709, 'have died on': 0.592804151303739, 'died about': 0.58170145378

In [6]:
from sklearn.preprocessing import LabelEncoder
import numpy as np 
le = LabelEncoder()
le.classes_ = np.load('/content/drive/MyDrive/data_freebase/classes.npy')
config = AutoConfig.from_pretrained("/content/drive/MyDrive/data_freebase/classifier")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/data_freebase/classifier", num_labels=len(le.classes_))

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [7]:
# convert raw text file to proper dataset object (based on task)
import torch
class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        # initialization
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        # slicing method X[index]
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

In [8]:
test_texts = pd.read_excel('/content/drive/MyDrive/data_freebase/article_step_by_step_output.xlsx')['Question'].to_list()
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=200)
test_dataset = ClassificationDataset(test_encodings, [1 for _ in test_texts])
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,   # batch size for evaluation
  )
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
)
pred = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 26622
  Batch size = 128


In [9]:
def softmax(x):
    f_x = np.exp(x) / np.sum(np.exp(x))
    return f_x

freebase = []
for idx, item in enumerate(pred.predictions):
  temp = softmax(item)
  indices = temp.argsort()
  fb = [(le.inverse_transform([indices[-i]])[0], temp[indices[-i]]) for i in range(10)]
  freebase.append(sorted(fb, key=lambda item:item[-1], reverse=True))

debug = pd.DataFrame({
                      'Question':test_texts,
                      'Freebase':freebase,
                     })


In [10]:
debug.Freebase[0]

[('fb:music.album.genre', 0.99334216),
 ('fb:music.artist.genre', 0.0015634565),
 ('fb:music.album.album_content_type', 0.0009308751),
 ('fb:film.film.genre', 0.0003940534),
 ('fb:music.genre.parent_genre', 0.00027316765),
 ('fb:media_common.netflix_title.netflix_genres', 0.00012869545),
 ('fb:tv.tv_program.genre', 0.00012649997),
 ('fb:film.film.music', 0.00011775878),
 ('fb:broadcast.content.genre', 0.000111410765),
 ('fb:astronomy.astronomer.astronomical_objects_discovered', 2.580119e-08)]

In [11]:
RKBG = ReverbKnowledgeBase(r'/content/drive/MyDrive/data_freebase/reverb_wikipedia_tuples-1.1.txt') #	'./sample_reverb_tuples.txt'
test_df = pd.read_excel('/content/drive/MyDrive/data_freebase/article_step_by_step_output.xlsx')
test_df.head()
# print(RKBG.tfidf_query(node='abegg', edge='did die'))

Indexing ...: 100%|██████████| 407236/407236 [01:03<00:00, 6450.18it/s]


Unnamed: 0.1,Unnamed: 0,Question,input_token_ids,nodes_borders,edges_spans,node,edges,question
0,0,which genre of album is harder ... ..faster ?,"[101, 2029, 6907, 1997, 2201, 2003, 6211, 1012...","[[6, 13]]",[[0 0 1 1 1 1 0 0 0 0 0 0 0 1 0]],harder . . . . . faster,genre of album is ?,"['which', 'genre', 'of', 'album', 'is', 'harde..."
1,1,what format is fearless,"[101, 2054, 4289, 2003, 22518, 102]","[[4, 5]]",[[0 0 1 1 0 0]],fearless,format is,"['what', 'format', 'is', 'fearless']"
2,2,what city was alex golfis born in,"[101, 2054, 2103, 2001, 4074, 5439, 2483, 2141...","[[4, 7]]",[[0 0 1 1 0 0 0 1 1 0]],alex golfis,city was born in,"['what', 'city', 'was', 'alex', 'golfis', 'bor..."
3,3,what film is by the writer phil hay ?,"[101, 2054, 2143, 2003, 2011, 1996, 3213, 6316...","[[7, 9]]",[[0 0 1 1 1 1 1 0 0 1 0]],phil hay,film is by the writer ?,"['what', 'film', 'is', 'by', 'the', 'writer', ..."
4,4,where did roger marquis die,"[101, 2073, 2106, 5074, 13410, 3280, 102]","[[3, 5]]",[[0 0 1 0 0 1 0]],roger marquis,did die,"['where', 'did', 'roger', 'marquis', 'die']"


In [12]:
reverb = []

for idx, row in tqdm(test_df.iterrows(), total=test_df.shape[0], desc='Predicting ...'):
  reverb.append(RKBG.tfidf_query(node=row['node'], edge=row['edges']))

Predicting ...: 100%|██████████| 26622/26622 [1:47:11<00:00,  4.14it/s]


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
len(reverb)

26622

In [14]:
debug['Reverb'] = reverb

In [15]:
debug.to_excel('/content/drive/MyDrive/data_freebase/dbg.xlsx', index=False)