# Imports

In [1]:
import pandas as pd
import os
import sys
from IPython.core.display import HTML

In [2]:
sys.path.append('../')

In [3]:
import spacy
from bert_serving.client import BertClient
from keyword_extraction import UnsupervisedKeywordExtraction

# Read Sample

In [4]:
def read_text(dataset, _id):
    with open(f'../data/{dataset}/docsutf8/{_id}.txt', 'r') as file:
        return file.read()

In [5]:
def read_keys(dataset, _id):
    with open(f'../data/{dataset}/keys/{_id}.key', 'r') as file:
        return file.read()

In [6]:
dataset = 'Inspec'

In [7]:
sample_id = '1100'

### text

In [8]:
text = read_text(dataset, sample_id); text

'Evaluation of existing and new feature recognition algorithms. 2. Experimental\n\tresults\nFor pt.1 see ibid., p.839-851. This is the second of two papers investigating\n\tthe performance of general-purpose feature detection techniques. The\n\tfirst paper describes the development of a methodology to synthesize\n\tpossible general feature detection face sets. Six algorithms resulting\n\tfrom the synthesis have been designed and implemented on a SUN\n\tWorkstation in C++ using ACIS as the geometric modelling system. In\n\tthis paper, extensive tests and comparative analysis are conducted on\n\tthe feature detection algorithms, using carefully selected components\n\tfrom the public domain, mostly from the National Design Repository. The\n\tresults show that the new and enhanced algorithms identify face sets\n\tthat previously published algorithms cannot detect. The tests also show\n\tthat each algorithm can detect, among other types, a certain type of\n\tfeature that is unique to it. He

### keys

In [9]:
keys = read_keys(dataset, sample_id).strip().split('\n'); keys

['feature recognition algorithms',
 'general-purpose feature detection techniques',
 '\tNational Design Repository',
 'face sets',
 'convex hull',
 'concavity',
 'feature extraction',
 'mechanical engineering']

# Extract Keywords

## Load NLP spaCy model and BERT encoder 

In [10]:
bc = BertClient(output_fmt='list')
nlp = spacy.load("en_core_web_lg", disable=['ner'])

## Embedding method: naive

In [11]:
fi = UnsupervisedKeywordExtraction(nlp=nlp,
                                   dnn=bc,
                                   emb_method='naive',
                                   mmr_beta=0.5,
                                   top_n=10,
                                   alias_threshold=0.8)

In [12]:
marked_target, keywords, keyword_relevance = fi.fit(text)



In [13]:
HTML(marked_target)

In [14]:
keywords, keyword_relevance

([('existing and new feature recognition algorithms', 14, 61),
  ('Experimental\n\tresults', 66, 87),
  ('pt.1', 92, 96),
  ('general-purpose feature detection techniques', 186, 230),
  ('The\n\tfirst paper', 232, 248),
  ('the geometric modelling system', 476, 506),
  ('the feature detection algorithms', 583, 615),
  ('carefully selected components', 623, 652),
  ('the National Design Repository', 690, 720),
  ('the new and enhanced algorithms', 745, 776)],
 [0.9989953331181924,
  0.9296324017224857,
  0.9105035056963096,
  0.9972474119155336,
  0.9514043794282492,
  0.9835447027914311,
  1.0,
  0.9484435766211964,
  0.972619561391401,
  0.9808807225904848])

## Perturbation: Removal & embedding method: subtraction

In [15]:
fi = UnsupervisedKeywordExtraction(nlp=nlp,
                                   dnn=bc,
                                   perturbation='removal',
                                   emb_method='subtraction',
                                   mmr_beta=0.5,
                                   top_n=10,
                                   alias_threshold=0.8)

In [16]:
marked_target, keywords, keyword_relevance = fi.fit(text)



In [17]:
HTML(marked_target)

In [18]:
keywords, keyword_relevance

([('pt.1', 92, 96),
  ('the synthesis', 384, 397),
  ('a SUN\n\tWorkstation', 436, 454),
  ('the geometric modelling system', 476, 506),
  ('this paper', 512, 522),
  ('the feature detection algorithms', 583, 615),
  ('the public domain', 659, 676),
  ('previously published algorithms', 802, 833),
  ('The tests', 849, 858),
  ('complete coverage', 1061, 1078)],
 [-0.3685317406253594,
  0.3528443669230563,
  0.388833519386724,
  0.5459045478592817,
  1.0,
  0.4716227386880885,
  0.1749551698341253,
  -0.014784138612829277,
  0.4964478243838283,
  -0.5888127412154336])

## Perturbation: Replacement & Embedding method: subtraction

In [23]:
fi = UnsupervisedKeywordExtraction(nlp=nlp,
                                   dnn=bc,
                                   perturbation='replacement',
                                   emb_method='subtraction',
                                   mmr_beta=0.5,
                                   top_n=10,
                                   alias_threshold=0.8)

In [24]:
marked_target, keywords, keyword_relevance = fi.fit(text)



In [25]:
HTML(marked_target)

In [26]:
keywords, keyword_relevance

([('pt.1', 92, 96),
  ('the development', 259, 274),
  ('a SUN\n\tWorkstation', 436, 454),
  ('this paper', 512, 522),
  ('the feature detection algorithms', 583, 615),
  ('the public domain', 659, 676),
  ('The\n\tresults', 722, 734),
  ('previously published algorithms', 802, 833),
  ('a certain type', 921, 935),
  ('the algorithms', 985, 999)],
 [-0.14057790849116003,
  0.11232283028643997,
  0.4851078527700543,
  -0.05361408231167253,
  0.830364179989764,
  0.3967779564285788,
  0.22571965357085863,
  0.27025812603095717,
  -0.07893360220223258,
  1.0])