# BERT-Word2Vec Hybrid Keyphrase Extractor

BERT has seen extensive use in the keyphrase extraction domain and has proven to be one of the best tools for extracting present keyphrases, as seen with the [KeyBERT](https://github.com/MaartenGr/KeyBERT) model. This, however, is not enough, as it does not cover absent keyphrases: phrases that do not appear in the text but are nonetheless semantically relevant to it. This is where Word2Vec can be used to generate these keyphrases. The following model was tested using the Inspec dataset.

In [None]:
pip install keybert

In [None]:
from keybert import KeyBERT
from pathlib import Path
import glob
import os
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

In [None]:
# Mount Google Drive at /content/drive; the dataset and Word2Vec paths used
# further down point into /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Global variables
# top_n: number of keyphrases kept per document; it is also used as the number
# of Word2Vec neighbours requested for each present keyphrase.
top_n = 5
# Path of the pretrained GoogleNews Word2Vec binary; adjust it to your Drive layout.
word2vec_model_path = '/content/drive/MyDrive/word2vec/GoogleNews-vectors-negative300.bin' # provide path to GoogleNews-vectors-negative300.bin

# Present Keyphrase Extraction
The following cell utilizes the pretrained KeyBERT model to extract present keyphrases in a document.

In [None]:
class PKey_Extraction:
    """Extract present keyphrases from a folder of text documents with KeyBERT.

    Every file matching ``*txt`` in ``input_dataset`` is read, passed to
    KeyBERT, and the extracted keywords (without their relevance scores) are
    written one per line to ``./Output/PKE/<file name>``.
    """

    def __init__(self, input_dataset='../Inspec/docsutf8/'):
        """Store the dataset folder, create the output folders and load KeyBERT.

        Args:
            input_dataset: Path of the folder that holds the input documents.
        """
        self.input_dataset = input_dataset

        # Make sure the output directories exist before anything is written.
        Path("./Output/").mkdir(parents=True, exist_ok=True)
        Path("./Output/AKE").mkdir(parents=True, exist_ok=True)
        Path("./Output/PKE/").mkdir(parents=True, exist_ok=True)

        self.kw_model = KeyBERT()

    def extract_presentKeyphrases(self, max_files=500):
        """Extract keyphrases for the documents of the dataset and save them.

        The number of keyphrases per document comes from the notebook-level
        ``top_n`` variable.

        Args:
            max_files: Upper bound (non-negative) on the number of documents
                processed. Defaults to 500, the limit that was previously
                hard-coded. Pass ``None`` to process every document.

        Returns:
            A dict mapping each processed file name to its list of keywords.
        """
        # glob returns paths in arbitrary order; sorting makes the subset that
        # survives the max_files cut reproducible between runs.
        doc_paths = sorted(glob.glob(self.input_dataset + '/*txt'))
        if max_files is not None:
            doc_paths = doc_paths[:max_files]

        results = {}
        for doc_path in doc_paths:
            print(f"Processing file: {doc_path}")

            # Read the document and flatten tabs/newlines into single spaces.
            with open(doc_path, mode='r', encoding='utf-8') as in_file:
                input_doc = in_file.read()
            input_doc = input_doc.replace('\t', ' ').replace('\n', ' ')

            # Extract present keyphrases (single words, diversified with MMR).
            keywords = self.kw_model.extract_keywords(
                input_doc, keyphrase_ngram_range=(1, 1), stop_words='english',
                use_mmr=True, diversity=0.5, top_n=top_n)

            # Keep only the keyword text; the relevance score is dropped.
            phrases = [keyword[0] for keyword in keywords]

            doc_name = Path(doc_path).name
            with open('./Output/PKE/' + doc_name, 'w', encoding='utf-8') as outFile:
                outFile.write("\n".join(phrases))
            print(f"Keyphrases written to {outFile.name}")

            results[doc_name] = phrases

        return results

In [None]:
# Build the extractor over the Inspec documents; this also creates the ./Output folders.
Extractor = PKey_Extraction(input_dataset='/content/drive/MyDrive/Inspec/docsutf8') # provide path to the docsutf8 folder in Inspec

In [None]:
# Run present-keyphrase extraction; one keyword file per document is written under ./Output/PKE/.
keywords = Extractor.extract_presentKeyphrases()

# Absent Keyphrase Generation
The three functions and the code cells below are used for absent keyphrase generation. Word2Vec's most_similar() function is used to generate absent keyphrase candidates from the present keyphrases that are passed in as parameters.

In [None]:
# Load the pre-trained Word2Vec vectors (binary word2vec format) from the
# Google Drive path stored in word2vec_model_path.
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)

In [None]:
# Example: Find similar words using Word2Vec
def word2vec(keywords):
    """Collect Word2Vec neighbours for every keyword the model knows.

    Keywords missing from the model vocabulary are ignored. For the others,
    the ``top_n`` most similar entries are requested and merged into a single
    dict; a neighbour that shows up more than once keeps its latest score.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A dict mapping each neighbour word to its similarity score.
    """
    neighbours = {}
    for keyword in keywords:
        if keyword in word2vec_model.key_to_index:
            for candidate, score in word2vec_model.most_similar(keyword, topn=top_n):
                neighbours[candidate] = score
    return neighbours

In [None]:
def postprocess(words, doc):
    """Normalise candidate phrases and keep those that are absent from ``doc``.

    Each candidate is lower-cased and its underscores are replaced by spaces.
    The document is lower-cased as well before the membership test, so a
    phrase that appears in the text with different capitalisation is not
    reported as absent. Candidates that become identical after normalisation
    are returned only once.

    Args:
        words: Dict whose keys are the candidate phrases (any iterable of
            strings is accepted too).
        doc: Text of the document the candidates are checked against.

    Returns:
        List of normalised phrases that do not occur in ``doc``, in their
        original order.
    """
    doc_lower = doc.lower()
    seen = set()
    absents = []
    for word in words:
        candidate = word.lower().replace('_', ' ')
        if candidate in seen or candidate in doc_lower:
            continue
        seen.add(candidate)
        absents.append(candidate)
    return absents

In [None]:
# Sentence-embedding model used by absentRelevance; created on first use so it
# is loaded once instead of once per document.
_RELEVANCE_MODEL = None


def absentRelevance(absent_words, doc):
    """Rank absent keyphrase candidates by their similarity to the document.

    Args:
        absent_words: List of candidate phrases.
        doc: Text of the document.

    Returns:
        A dict mapping each candidate to its cosine similarity with the
        document embedding, ordered from most to least similar. An empty
        dict is returned when there are no candidates.
    """
    global _RELEVANCE_MODEL

    # Nothing to rank: skip loading the model and encoding altogether.
    if not absent_words:
        return {}

    if _RELEVANCE_MODEL is None:
        _RELEVANCE_MODEL = SentenceTransformer('all-MiniLM-L6-v2')

    doc_embedding = _RELEVANCE_MODEL.encode(doc)
    word_embedding = _RELEVANCE_MODEL.encode(absent_words)
    cosine_score = util.pytorch_cos_sim(doc_embedding, word_embedding)
    cosine_scores_list = cosine_score.flatten().tolist()

    word_score_dict = dict(zip(absent_words, cosine_scores_list))
    return dict(sorted(word_score_dict.items(), key=lambda item: item[1], reverse=True))

In [None]:
# Generate absent keyphrases for every document that already has a present-keyphrase file.
fNames1 = sorted(glob.glob('/content/Output/PKE' + '/*txt'))
for file_path in fNames1:
    filename = os.path.basename(file_path)

    # Present keyphrases extracted earlier, one per line; blank lines are dropped.
    with open(file_path, 'r', encoding='utf-8') as file:
        present_phrases = [line.strip() for line in file if line.strip()]

    # Original document text, needed to decide which candidates are absent.
    with open('/content/drive/MyDrive/Inspec/docsutf8/'+filename, 'r', encoding='utf-8') as file: # provide path to the docsutf8 folder in Inspec
        content = file.read()

    word2vec_phrases = word2vec(present_phrases)
    absents = postprocess(word2vec_phrases, content)

    # Only rank when at least one candidate survived the filter, then keep the top_n best.
    ranked = absentRelevance(absents, content) if absents else {}
    absent_phrases = list(ranked)[:top_n]

    # Save the absent keyphrases, one per line.
    absent_keywords = "\n".join(absent_phrases)
    with open('./Output/AKE/'+filename, 'w', encoding='utf-8') as outFile:
        outFile.write(absent_keywords)
    print(f"Keyphrases written to {outFile.name}")

# Evaluation
The following cells are for evaluation purposes only. They expect the absent and present keyphrases to already be saved in the Output/AKE and Output/PKE directories, respectively.

In [None]:
# Sentence-embedding model used by cosine_similarity; created on first use so
# it is loaded once instead of once per evaluated document.
_EVAL_MODEL = None


def cosine_similarity(phrases, gold):
    """Return the cosine similarities between ``phrases`` and ``gold``.

    NOTE: this definition replaces the scikit-learn function of the same name
    imported at the top of the notebook.

    Args:
        phrases: A string or a list of strings to embed.
        gold: A string or a list of strings to compare against.

    Returns:
        Flat list of similarity scores, one per (phrase, gold) pair. An
        empty list is returned when either input is empty.
    """
    global _EVAL_MODEL

    # Nothing to compare: avoid loading the model and encoding empty input.
    if len(phrases) == 0 or len(gold) == 0:
        return []

    if _EVAL_MODEL is None:
        _EVAL_MODEL = SentenceTransformer('all-MiniLM-L6-v2')

    phrases_embedding = _EVAL_MODEL.encode(phrases)
    gold_embedding = _EVAL_MODEL.encode(gold)
    cosine_score = util.pytorch_cos_sim(phrases_embedding, gold_embedding)
    return cosine_score.flatten().tolist()

def compute_evaluation(cosine_scores, threshold=0.8):
    """Compute the share of score lists that contain a sufficiently high score.

    An entry of ``cosine_scores`` counts as a match when at least one of its
    scores is strictly greater than ``threshold``.

    NOTE(review): precision and recall are derived from the same ratio
    (matches / number of entries), exactly as before, so the two returned
    values are always equal. Separate figures would need per-keyphrase
    counts, which this input does not carry.

    Args:
        cosine_scores: Iterable-of-iterables of similarity scores, one inner
            iterable per evaluated document.
        threshold: Minimum similarity (exclusive) for a match. Defaults to
            0.8, the value that was previously hard-coded.

    Returns:
        Tuple ``(precision, recall)`` rounded to 3 decimals; ``(0.0, 0.0)``
        when ``cosine_scores`` is empty.
    """
    total = len(cosine_scores)
    # No documents evaluated: report zeros instead of dividing by zero.
    if total == 0:
        return 0.0, 0.0

    num_similar = sum(
        1 for scores in cosine_scores
        if any(score > threshold for score in scores)
    )

    recall = num_similar / total
    precision = num_similar / total

    return round(precision, 3), round(recall, 3)

In [None]:
# Score the present keyphrases of every processed document against its gold keys.
fNames1 = sorted(glob.glob('/content/Output/PKE' + '/*txt'))
cos_sim = []
for file_path in fNames1:
    filename = os.path.splitext(os.path.basename(file_path))[0]

    # Extracted present keyphrases for this document (whole file as one string).
    with open(file_path, 'r', encoding='utf-8') as file:
        present = file.read()

    # Gold keyphrases for the same document.
    with open('/content/drive/MyDrive/Inspec/keys/'+filename+'.key', 'r', encoding='utf-8') as file: # provide path to the keys folder in Inspec
        gold_keys = file.read()

    # NOTE(review): both arguments are whole-file strings, so this presumably
    # yields a single score per document; pass lists of lines instead if a
    # per-keyphrase comparison is intended.
    cos_sim.append(cosine_similarity(present, gold_keys))
precision, recall = compute_evaluation(cos_sim)

In [None]:
# Score the absent keyphrases of every processed document against its gold keys.
# The AKE folder is listed directly because only the absent keyphrases are read here.
fNames1 = sorted(glob.glob('/content/Output/AKE' + '/*txt'))
cos_sim1 = []
for file_path in fNames1:
    filename = os.path.splitext(os.path.basename(file_path))[0]

    # Generated absent keyphrases for this document (whole file as one string).
    with open(file_path, 'r', encoding='utf-8') as file:
        absent = file.read()

    # Gold keyphrases for the same document.
    with open('/content/drive/MyDrive/Inspec/keys/'+filename+'.key', 'r', encoding='utf-8') as file: # provide path to the keys folder in Inspec
        gold_keys = file.read()

    # NOTE(review): both arguments are whole-file strings, so this presumably
    # yields a single score per document; pass lists of lines instead if a
    # per-keyphrase comparison is intended.
    cos_sim1.append(cosine_similarity(absent, gold_keys))
precision1, recall1 = compute_evaluation(cos_sim1)

In [None]:
# Recall score for the present keyphrases, as returned by compute_evaluation(cos_sim)
print(recall)

In [None]:
# Recall score for the absent keyphrases, as returned by compute_evaluation(cos_sim1)
print(recall1)