In [58]:
import os

import numpy as np
import pandas as pd
import re
import string
import operator
import six
from six.moves import range
import math


import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
stopwords = stopwords.words('english')
nltk.download('wordnet')

import spacy

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# TFIDF

In [18]:
nlp = spacy.load("en_core_web_sm")

def textProcessing(doc):
    '''Prepocessing of input text with 
    1. tokenisation and Lemmatisation
    2. Removing stop words 
    3. Creating and removing custom stop words.
    4. Generating required Vocabulary from input
    5. Preprocessing the input 
    '''
    Nouns = []
    Noun_set = []
    trimmed_noun_set = []
    removing_duplicates = []
    arr = []
    vocab = []
    vocab_dict = {}

    doc = nlp(doc.lower())

    for possible_nouns in doc:
        if possible_nouns.pos_ in ["NOUN","PROPN"] :
            Nouns.append([possible_nouns , [child for child in possible_nouns.children]])
       
    
    for i,j in Nouns:
        for k in j:
            Noun_set.append([k,i])

    
    for i , j in Noun_set:
        if i.pos_ in ['PROPN','NOUN','ADJ']:
            trimmed_noun_set.append([i ,j])
            
    
    for word in trimmed_noun_set:
        if word not in removing_duplicates:
            removing_duplicates.append(word)
    
    
    for i in removing_duplicates:
        strs = ''
        for j in i:
            strs += str(j)+" "
        arr.append(strs.strip())

    
    for word in Noun_set:
        string = ''
        for j in word:
            string+= str(j)+ " "
        vocab.append(string.strip())

    
    for word in vocab:
        vocab_dict[word]= 0
        
    for word in arr:
        vocab_dict[word]+= 1

    return vocab_dict , arr

def computeTF(wordDict,bow):
    '''Computing TF(Term Frequency of the vocab) '''
    tfDict = {}
    bowCount = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count/float(bowCount)
    return tfDict


def computeIDF(doclist):
    '''Computing IDF for the vocab '''
    import math 
    count = 0
    idfDict = {}
    for element in doclist:
        for j in element:
            count+=1
    N = count

    # count no of documents that contain the word w
    idfDict = dict.fromkeys(doclist[0].keys(),0)

    for doc in doclist:
        for word,val in doc.items():
            if val>0:
                idfDict[word]+= 1

    # divide N by denominator above
    for word,val in idfDict.items():
        if val == 0:
            idfDict[word] = 0.0
        else:
            idfDict[word] = math.log(N / float(val))

    return idfDict

def computeTfidf(tf,idf):
    '''Computing TF-IDF for the words in text '''
    tfidf = {}
    sorted_list = []
    for word , val in tf.items():
        tfidf[word] = val * idf[word]

    ranking_list  = sorted(tfidf.items(),reverse=True, key = lambda kv:(kv[1], kv[0]))[:10]
    for i, _ in ranking_list:
        sorted_list.append(i)

    return ','.join([str(elem) for elem in sorted_list])

In [46]:
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return ' '.join([lower(x) for x in s]).rstrip()

In [47]:
def remove_empty(a_list):
    new_list = []
    for i in a_list:
        if len(i) > 0:
            if len(i[0]) >0:
                new_list.append(normalize_answer(i))   
    return new_list

In [50]:
def get_score_full(candidates, references, maxDepth = 30):
    precision = []
    recall = []
    reference_set = set(dedup(references))
    candidates = dedup(candidates)
    referencelen = len(reference_set)
    true_positive = 0
    for i in range(maxDepth):
        if len(candidates) > i:
            kp_pred = candidates[i]     
            if kp_pred in reference_set:
                true_positive += 1
        precision.append(true_positive/float(i + 1))
        recall.append(true_positive/float(referencelen))
    return precision, recall

In [53]:
def dedup(kp_list):
    dedupset = set()
    kp_list_dedup = []
    for kp in kp_list:
        if kp in dedupset:
            continue       
        kp_list_dedup.append(kp)
        dedupset.add(kp)
    return kp_list_dedup

In [51]:
def evaluate(candidates, references, data):
    precision_scores, recall_scores, f1_scores = {1:[], 3:[], 5:[], 10:[], 30:[]}, {1:[], 3:[], 5:[], 10:[], 30:[]}, {1:[], 3:[], 5:[], 10:[], 30:[]}
    for url in range(len(data)):
        candidate = remove_empty(candidates.iloc[url])
        reference = remove_empty(references.iloc[url])
        p, r = get_score_full(candidate, reference) 
        for i in [1,3,5,10,30]:
            precision = p[i-1]
            recall = r[i-1]
            if precision + recall > 0:
                f1_scores[i].append((2 * (precision * recall)) / (precision + recall))
            else:
                f1_scores[i].append(0)
            precision_scores[i].append(precision)
            recall_scores[i].append(recall)
    print("########################\nMetrics")
    for i in precision_scores:
        print("@{}".format(i))
        print("F1:{}".format(np.mean(f1_scores[i])))
        print("P:{}".format(np.mean(precision_scores[i])))
        print("R:{}".format(np.mean(recall_scores[i])))
    print("#########################")

### KP 20K

In [3]:
!unzip '/content/drive/My Drive/Colab Notebooks/Advanced NLP-Project/Project/kp20k_new.zip'

Archive:  /content/drive/My Drive/Colab Notebooks/Advanced NLP-Project/Project/kp20k_new.zip
  inflating: kp20k_testing.json      
  inflating: kp20k_training.json     
  inflating: kp20k_validation.json   


In [4]:
import json 
test = []
for line in open('/content/kp20k_testing.json', 'r'):
    test.append(json.loads(line))

In [7]:
test_data = pd.DataFrame(test)

In [9]:
test_data['keyword'] = test_data['keyword'].str.replace(';',',')

In [22]:
test_data

Unnamed: 0,abstract,keyword,title,tfidf
0,A feedback vertex set of a graph G is a set S ...,"feedback vertex set,decycling set,2-degenerate...",A feedback vertex set of 2-degenerate graphs,
1,This article proposes techniques to predict th...,"performance,analytical modeling,pending hit,da...",Hybrid Analytical Modeling of Pending Cache Hi...,
2,Autoimmune polyendocrinopathy candidiasis ecto...,"apeced,aire,chronic mucocutaneous candidiasis,...",Autoimmune polyendocrinopathy candidiasis ecto...,
3,"In this paper, we consider an enthalpy formula...","casting,thermal,conduction,convection,finite e...",Numerical solution of a three-dimensional soli...,
4,"In this research, a new type of manufacturing ...","feature recognition,rib,aircraft structural pa...",Definition and recognition of rib features in ...,
...,...,...,...,...
19995,Energy efficiency and transmission delay are v...,"energy efficiency,delay,unreliable links,wirel...",Energy-delay tradeoff in wireless multihop net...,
19996,This paper describes the design and implementa...,"e-medical records,e-health,e-clinic,web-based,...",A Cyber Medical Center,
19997,This work describes a detailed simulation-base...,"wireless lan,quality of service,medium access ...",adapting wlan mac parameters to enhance voip c...,
19998,This paper describes a conceptually simple but...,"interior point methods,ellipsoid method,multio...",An interior point multiobjective programming a...,


In [16]:
tfidf_lst = []
for i in test_data.abstract[0:5]:
  vocab_dict , arr = textProcessing(i)
  tf = computeTF(vocab_dict,arr)
  idf = computeIDF([vocab_dict])
  tfidf = computeTfidf(tf,idf)
  tfidf_lst.append(tfidf)

In [20]:
test_data['tfidf'] = ''

In [29]:
test_data_5 = test_data[0:5]

In [None]:
%%time
for key,val in enumerate(test_data_5.abstract):
  try:
    vocab_dict , arr = textProcessing(val)
    tf = computeTF(vocab_dict,arr)
    idf = computeIDF([vocab_dict])
    tfidf = computeTfidf(tf,idf)
    print(tfidf)
    print('======')
    test_data_5['tfidf'].loc[key] = tfidf
  except ZeroDivisionError:
    pass

In [40]:
%%time
for key,val in enumerate(test_data.abstract):
  try:
    vocab_dict , arr = textProcessing(val)
    tf = computeTF(vocab_dict,arr)
    idf = computeIDF([vocab_dict])
    tfidf = computeTfidf(tf,idf)
    test_data['tfidf'].loc[key] = tfidf
  except ZeroDivisionError:
    pass

CPU times: user 10min 30s, sys: 2.92 s, total: 10min 33s
Wall time: 10min 34s


In [None]:
# vocab_dict , arr = textProcessing(text[0])
# tf = computeTF(vocab_dict,arr)
# idf = computeIDF([vocab_dict])
# tfidf = computeTfidf(tf,idf)

In [26]:
#test_data['vocab_dict'],test_data['arr'] = test_data['abstract'].apply(lambda x : textProcessing(x))

In [41]:
test_data

Unnamed: 0,abstract,keyword,title,tfidf
0,A feedback vertex set of a graph G is a set S ...,"feedback vertex set,decycling set,2-degenerate...",A feedback vertex set of 2-degenerate graphs,"graph g,feedback vertex,vertex v,vertex set,ve..."
1,This article proposes techniques to predict th...,"performance,analytical modeling,pending hit,da...",Hybrid Analytical Modeling of Pending Cache Hi...,"limited number,uniform latency,superscalar mic..."
2,Autoimmune polyendocrinopathy candidiasis ecto...,"apeced,aire,chronic mucocutaneous candidiasis,...",Autoimmune polyendocrinopathy candidiasis ecto...,"various diseases,several lessons,recessive dis..."
3,"In this paper, we consider an enthalpy formula...","casting,thermal,conduction,convection,finite e...",Numerical solution of a three-dimensional soli...,"stefan problem,phase problem,numerical results..."
4,"In this research, a new type of manufacturing ...","feature recognition,rib,aircraft structural pa...",Definition and recognition of rib features in ...,"rib elements,local elements,structural parts,r..."
...,...,...,...,...
19995,Energy efficiency and transmission delay are v...,"energy efficiency,delay,unreliable links,wirel...",Energy-delay tradeoff in wireless multihop net...,"energy efficiency,wireless networks,physical p..."
19996,This paper describes the design and implementa...,"e-medical records,e-health,e-clinic,web-based,...",A Cyber Medical Center,"medical center,medical records,traditional sys..."
19997,This work describes a detailed simulation-base...,"wireless lan,quality of service,medium access ...",adapting wlan mac parameters to enhance voip c...,"new scheme,wlan parameters,wlan network,wirele..."
19998,This paper describes a conceptually simple but...,"interior point methods,ellipsoid method,multio...",An interior point multiobjective programming a...,"wider range,uncertain information,traditional ..."


In [54]:
evaluate(test_data['tfidf'],test_data['keyword'],test_data)

########################
Metrics
@1
F1:0.08285289412526706
P:0.91
R:0.04344140162474216
@3
F1:0.23565650575554
P:0.9408333333333332
R:0.13502992904701944
@5
F1:0.3642462818151996
P:0.9460900000000001
R:0.2264027117841467
@10
F1:0.6105036847025352
P:0.94711
R:0.45320779700101166
@30
F1:0.7550764557012816
P:0.6452949999999998
R:0.9193874406025488
#########################


### Inspec

In [None]:
!unzip '/content/drive/MyDrive/Colab Notebooks/Advanced NLP-Project/Project/Inspec.zip'

In [56]:
data = '/content/Inspec/docsutf8/'
keys = '/content/Inspec/keys/'


In [59]:
files = os.listdir(data)
key_files = os.listdir(keys)

In [60]:
len(files)

2000

In [61]:
text = []
for file in files[:2000]:
  with open('/content/Inspec/docsutf8/'+file, 'r') as in_file:
    data = in_file.read()
    text.append(data)

In [64]:
#candidates = []
references = []

for file in files[:2000]:
    # with open(data+file, 'r') as in_file: 
    #     text = in_file.read()
    #     candidates.append({'url':file,
    #                         'KeyPhrases':get_kp(text)})
    
    name = file.split('.')[0]
    with open(keys+name+'.key', 'r') as in_file:
        can = in_file.readlines()
        can = [line.rstrip('\n').split() for line in can]
        references.append(can)
    


# with open('result.json', 'w') as out_file:
#     for candidate in candidates:
#         json.dump(str(candidate), out_file)
#         out_file.write('\n')
with open('keys.json', 'w') as out_file:
    for ref in references:
        json.dump(ref, out_file)
        out_file.write('\n')

In [69]:
ref_phrases = []
for i in range(len(references)):
  phrases = [' '.join(ref) for ref in references[i]]
  ref_phrases.append(','.join(phrases))

In [70]:
inspec_data = pd.DataFrame(
    {'text': text,
     'keyword': ref_phrases
    })

In [71]:
inspec_data

Unnamed: 0,text,keyword
0,Wavelet-based level-of-detail representation o...,"3D object level of detail modeling system,wave..."
1,Neural networks in optimal filtration\nThe com...,"optimal filtering,neural networks,linear filte..."
2,A new high resolution color flow system using ...,"high resolution colour flow system,eigendecomp..."
3,New hub gears up for algorithmic exchange\nWar...,Warwick University Centre for Scientific Compu...
4,Geometric source separation: merging convoluti...,"geometric source separation,geometric beamform..."
...,...,...
1995,Academic libraries and community: making the c...,"academic libraries,community partnerships,camp..."
1996,CRONE control: principles and extension to tim...,"CRONE control,time-variant plants,asymptotical..."
1997,Mining the optimal class association rule set\...,"optimal class association rule set mining,mini..."
1998,Dynamic testing of inflatable structures using...,"thin-film torus,smart materials,satellite appl..."


In [72]:
inspec_data['keyword'] = inspec_data['keyword'].str.lower()

In [73]:
inspec_data['tfidf'] = ''

In [74]:
%%time
for key,val in enumerate(inspec_data.text):
  try:
    vocab_dict , arr = textProcessing(val)
    tf = computeTF(vocab_dict,arr)
    idf = computeIDF([vocab_dict])
    tfidf = computeTfidf(tf,idf)
    inspec_data['tfidf'].loc[key] = tfidf
  except ZeroDivisionError:
    pass

CPU times: user 58.2 s, sys: 421 ms, total: 58.6 s
Wall time: 58.7 s


In [75]:
evaluate(inspec_data['tfidf'],inspec_data['keyword'],inspec_data)

########################
Metrics
@1
F1:0.07797659374394664
P:0.9835
R:0.0406180012396357
@3
F1:0.21771943279323885
P:0.9883333333333333
R:0.12250416721885322
@5
F1:0.3382105504391887
P:0.9894
R:0.20440563555366723
@10
F1:0.576926098316625
P:0.989
R:0.4087391400053235
@30
F1:0.7902624322314816
P:0.7175333333333334
R:0.8851759127346382
#########################


# TextRank

In [76]:
def clean(text):
    text = text.lower()
    printable = set(string.printable)
    text = filter(lambda x: x in printable, text)
    text = "".join(list(text))
    return text

def TextScoring(text):
  cleaned_text = clean(text)
  text = word_tokenize(cleaned_text)
  pos_tag = nltk.pos_tag(text)
  wordnet_lemmatizer = WordNetLemmatizer()
  adjective_tags = ['JJ','JJR','JJS']
  lemmatized_text = []
  for word in pos_tag:
      if word[1] in adjective_tags:
          lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0],pos="a")))
      else:
          lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0])))
  pos_tag = nltk.pos_tag(lemmatized_text)
  stopwords = []
  wanted_POS = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS'] 
  for word in pos_tag:
      if word[1] not in wanted_POS:
          stopwords.append(word[0])
  punctuations = list(str(string.punctuation))
  stopwords = stopwords + punctuations
  processed_text = []
  for word in lemmatized_text:
      if word not in stopwords:
          processed_text.append(word)
  vocabulary = list(set(processed_text))
  vocab_len = len(vocabulary)

  weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32)

  score = np.zeros((vocab_len),dtype=np.float32)
  window_size = 3
  covered_coocurrences = []

  for i in range(0,vocab_len):
      score[i]=1
      for j in range(0,vocab_len):
          if j==i:
              weighted_edge[i][j]=0
          else:
              for window_start in range(0,(len(processed_text)-window_size)):
                  
                  window_end = window_start+window_size
                  
                  window = processed_text[window_start:window_end]
                  
                  if (vocabulary[i] in window) and (vocabulary[j] in window):
                      
                      index_of_i = window_start + window.index(vocabulary[i])
                      index_of_j = window_start + window.index(vocabulary[j])
                      
                      # index_of_x is the absolute position of the xth term in the window 
                      # (counting from 0) 
                      # in the processed_text
                        
                      if [index_of_i,index_of_j] not in covered_coocurrences:
                          weighted_edge[i][j]+=1/math.fabs(index_of_i-index_of_j)
                          covered_coocurrences.append([index_of_i,index_of_j])

  inout = np.zeros((vocab_len),dtype=np.float32)

  for i in range(0,vocab_len):
      for j in range(0,vocab_len):
          inout[i]+=weighted_edge[i][j]

  MAX_ITERATIONS = 50
  d=0.85
  threshold = 0.0001 #convergence threshold

  for iter in range(0,MAX_ITERATIONS):
      prev_score = np.copy(score)
      
      for i in range(0,vocab_len):
          
          summation = 0
          for j in range(0,vocab_len):
              if weighted_edge[i][j] != 0:
                  summation += (weighted_edge[i][j]/inout[j])*score[j]
                  
          score[i] = (1-d) + d*(summation)
      
      if np.sum(np.fabs(prev_score-score)) <= threshold: #convergence condition
          #print("Converging at iteration "+str(iter)+"....")
          break
  phrases = []

  phrase = " "
  for word in lemmatized_text:
      
      if word in stopwords:
          if phrase!= " ":
              phrases.append(str(phrase).strip().split())
          phrase = " "
      elif word not in stopwords:
          phrase+=str(word)
          phrase+=" "

  unique_phrases = []
  for phrase in phrases:
      if phrase not in unique_phrases:
          unique_phrases.append(phrase)

  for word in vocabulary:
      #print word
      for phrase in unique_phrases:
          if (word in phrase) and ([word] in unique_phrases) and (len(phrase)>1):
              unique_phrases.remove([word])

  phrase_scores = []
  keywords = []
  for phrase in unique_phrases:
      phrase_score=0
      keyword = ''
      for word in phrase:
          keyword += str(word)
          keyword += " "
          phrase_score+=score[vocabulary.index(word)]
      phrase_scores.append(phrase_score)
      keywords.append(keyword.strip())

  res = {keywords[i]: phrase_scores[i] for i in range(len(keywords))}
  sorted_index = np.flip(np.argsort(phrase_scores),0)
  keywords_num = len(keywords)
  final_keywords = []
  for i in range(0,keywords_num):
    final_keywords.append(str(keywords[sorted_index[i]]))
  return final_keywords


### KP 20K

In [82]:
%%time
test_data['textrank'] = test_data['abstract'].apply(lambda x: ",".join(TextScoring(x)))

CPU times: user 1h 13min 30s, sys: 2.66 s, total: 1h 13min 32s
Wall time: 1h 13min 38s


In [83]:
evaluate(test_data['textrank'],test_data['keyword'],test_data)

########################
Metrics
@1
F1:0.08421447574685026
P:0.92385
R:0.04416103114938584
@3
F1:0.23246069839880634
P:0.92825
R:0.13320699235767636
@5
F1:0.35913329645072317
P:0.9332400000000002
R:0.2231943669216439
@10
F1:0.6027119712350422
P:0.9355400000000001
R:0.44726481451268174
@30
F1:0.8050724486213954
P:0.6888683333333333
R:0.9785478703172245
#########################


### Inspec

In [84]:
inspec_data

Unnamed: 0,text,keyword,tfidf
0,Wavelet-based level-of-detail representation o...,"3d object level of detail modeling system,wave...","wavelet transform,range images,initial mesh,wa..."
1,Neural networks in optimal filtration\nThe com...,"optimal filtering,neural networks,linear filte...","neural networks,white noise,telegraph signal,s..."
2,A new high resolution color flow system using ...,"high resolution colour flow system,eigendecomp...","color flow,tissue motion,clutter rejection,vel..."
3,New hub gears up for algorithmic exchange\nWar...,warwick university centre for scientific compu...,"warwick university,uk exercise,typical 1960s,s..."
4,Geometric source separation: merging convoluti...,"geometric source separation,geometric beamform...","source separation,undesired interferences,sour..."
...,...,...,...
1995,Academic libraries and community: making the c...,"academic libraries,community partnerships,camp...","academic libraries,broader community,various u..."
1996,CRONE control: principles and extension to tim...,"crone control,time-variant plants,asymptotical...","variant plants,crone control,constant coeffici..."
1997,Mining the optimal class association rule set\...,"optimal class association rule set mining,mini...","class association,association rule,optimal rul..."
1998,Dynamic testing of inflatable structures using...,"thin-film torus,smart materials,satellite appl...","smart materials,vibration testing,inflated str..."


In [85]:
%%time
inspec_data['textrank'] = inspec_data['text'].apply(lambda x: ",".join(TextScoring(x)))

CPU times: user 5min 9s, sys: 198 ms, total: 5min 9s
Wall time: 5min 9s


In [86]:
evaluate(inspec_data['textrank'],inspec_data['keyword'],inspec_data)

########################
Metrics
@1
F1:0.07825176393917326
P:0.9865
R:0.040762354515670014
@3
F1:0.21655609182621555
P:0.9828333333333333
R:0.12185486437709807
@5
F1:0.33662442617902716
P:0.9846
R:0.20345775298480362
@10
F1:0.5754224742236236
P:0.9863999999999999
R:0.407680187668283
@30
F1:0.8688829420518646
P:0.7905166666666668
R:0.9707538140824862
#########################
