# From Scratch

In [None]:
!pip install spacy transformers

In [2]:
# Sample document used throughout the notebook: a short passage describing
# supervised learning. It is kept as one raw multi-line string, including the
# bracketed markers ([1], [2]) and the leading indentation on every line.
text = """
         Supervised learning is the machine learning task of 
         learning a function that maps an input to an output based 
         on example input-output pairs.[1] It infers a function 
         from labeled training data consisting of a set of 
         training examples.[2] In supervised learning, each 
         example is a pair consisting of an input object 
         (typically a vector) and a desired output value (also 
         called the supervisory signal). A supervised learning 
         algorithm analyzes the training data and produces an 
         inferred function, which can be used for mapping new 
         examples. An optimal scenario will allow for the algorithm 
         to correctly determine the class labels for unseen 
         instances. This requires the learning algorithm to  
         generalize from the training data to unseen situations 
         in a 'reasonable' way (see inductive bias).
      """

In [3]:
import spacy
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

In [4]:
# Candidate keywords are all unigrams and bigrams of the text, with English
# stop words removed by the vectorizer.
n_gram_range = (1, 2)
stop_words = "english"

# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([text])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older installs.
_get_names = getattr(count, "get_feature_names_out", None) or count.get_feature_names

# The newer API returns a NumPy array of numpy.str_; normalise to a plain
# list of Python str so slicing, display and the tokenizer behave as before.
all_candidates = [str(name) for name in _get_names()]

In [8]:
all_candidates[:6]

['algorithm',
 'algorithm analyzes',
 'algorithm correctly',
 'algorithm generalize',
 'allow',
 'allow algorithm']

We need part-of-speech (POS) tagging to get rid of verbs and keep only nouns and noun phrases as candidates.

In [9]:
# Run the small English spaCy pipeline over the text, then gather its noun
# chunks as a set of whitespace-trimmed, lower-cased strings.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
noun_phrases = {span.text.strip().lower() for span in doc.noun_chunks}

In [11]:
# Collect every token that spaCy tags as a noun.
# Tokens are lower-cased so they compare equal to the CountVectorizer
# candidates and to `noun_phrases`, both of which are lower-cased; without
# this, a capitalised noun (e.g. at the start of a sentence) could never match.
nouns = {token.text.lower() for token in doc if token.pos_ == "NOUN"}

In [12]:
# Single nouns plus multi-word noun phrases form the allow-list for candidates.
all_nouns = nouns | noun_phrases

In [13]:
# Keep only the n-gram candidates that are a noun or a noun phrase,
# preserving the order of `all_candidates`.
candidates = [phrase for phrase in all_candidates if phrase in all_nouns]

In [14]:
candidates

['algorithm',
 'bias',
 'class',
 'consisting',
 'data',
 'example',
 'examples',
 'function',
 'inductive bias',
 'input',
 'instances',
 'labels',
 'learning',
 'machine',
 'object',
 'output',
 'pair',
 'scenario',
 'set',
 'signal',
 'situations',
 'supervised learning',
 'task',
 'training',
 'unseen situations',
 'value',
 'vector',
 'way']

In [15]:
# Load the pretrained encoder and its matching tokenizer by checkpoint name.
# Both calls take the same `model_name`, so the tokenizer vocabulary always
# corresponds to the model weights.
model_name = "distilroberta-base"
# AutoModel loads the bare encoder; the recorded warning below about unused
# `lm_head.*` weights comes from this call.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=480.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=331070498.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [16]:
# Embed every candidate phrase in one padded batch.
candidate_tokens = tokenizer(candidates, padding=True, return_tensors="pt")

# Inference only: disable gradient tracking so no autograd graph is built
# and kept alive for the whole batch.
with torch.no_grad():
    candidate_embeddings = model(**candidate_tokens)["pooler_output"]

In [18]:
candidate_embeddings.shape

torch.Size([28, 768])

In [19]:
# Embed the whole document as a single sequence.
# truncation=True caps the input at the model's maximum sequence length, so a
# longer document is cut off instead of crashing the forward pass.
text_tokens = tokenizer([text], padding=True, truncation=True, return_tensors="pt")

# Inference only: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
    text_embedding = model(**text_tokens)["pooler_output"]

In [20]:
text_embedding.shape

torch.Size([1, 768])

In [23]:
# Convert both embedding tensors to NumPy arrays for scikit-learn.
# The names are rebound in place, so this cell is meant to run once per
# embedding computation.
candidate_embeddings, text_embedding = (
    candidate_embeddings.detach().numpy(),
    text_embedding.detach().numpy(),
)

Now that the embeddings are computed, we calculate the cosine similarity between the text embedding and each candidate embedding to find the candidates closest to the text.

In [24]:
# Number of keywords to keep.
top_k = 5

# Despite its name, `distances` holds cosine similarities (higher = closer);
# it has one row for the text and one column per candidate.
distances = cosine_similarity(text_embedding, candidate_embeddings)

# argsort is ascending, so the last `top_k` indices are the most similar
# candidates, listed from least to most similar.
keywords = list(map(candidates.__getitem__, distances[0].argsort()[-top_k:]))

In [25]:
keywords

['input', 'algorithm', 'examples', 'supervised learning', 'example']

# KEYBERT

KeyBERT is an interesting package that performs keyword (KW) extraction with BERT embeddings.

In [26]:
pip install keybert[all]

Collecting keybert[all]
  Downloading https://files.pythonhosted.org/packages/db/fb/822e7094457cd16319291e34aa97b2ef97620da01af94fe557e96a8cc6b9/keybert-0.3.0.tar.gz
Collecting sentence-transformers>=0.3.8
[?25l  Downloading https://files.pythonhosted.org/packages/cc/75/df441011cd1726822b70fbff50042adb4860e9327b99b346154ead704c44/sentence-transformers-1.2.0.tar.gz (81kB)
[K     |████████████████████████████████| 81kB 2.5MB/s 
Collecting torch<1.7.1,>=1.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/d9/74/d52c014fbfb50aefc084d2bf5ffaa0a8456f69c586782b59f93ef45e2da9/torch-1.7.0-cp37-cp37m-manylinux1_x86_64.whl (776.7MB)
[K     |████████████████████████████████| 776.8MB 20kB/s 
[?25hCollecting flair==0.7
[?25l  Downloading https://files.pythonhosted.org/packages/68/a0/a1b41fa2fcb23ff71ba9148af75211dcccc35b256dea821b36e1ee871848/flair-0.7-py3-none-any.whl (448kB)
[K     |████████████████████████████████| 450kB 39.1MB/s 
[?25hCollecting spacy>=3.0.1
[?25l  Downloadi

In [27]:
from keybert import KeyBERT

# Input passage for KeyBERT; it appears to be the same supervised-learning
# passage as `text` above, wrapped onto longer lines.
# NOTE(review): this rebinds `doc`, which earlier held the spaCy document
# produced by `nlp(text)`; re-running the noun-extraction cells after this
# cell would iterate over a plain string instead.
doc = """
         Supervised learning is the machine learning task of learning a function that
         maps an input to an output based on example input-output pairs.[1] It infers a
         function from labeled training data consisting of a set of training examples.[2]
         In supervised learning, each example is a pair consisting of an input object
         (typically a vector) and a desired output value (also called the supervisory signal). 
         A supervised learning algorithm analyzes the training data and produces an inferred function, 
         which can be used for mapping new examples. An optimal scenario will allow for the 
         algorithm to correctly determine the class labels for unseen instances. This requires 
         the learning algorithm to generalize from the training data to unseen situations in a 
         'reasonable' way (see inductive bias).
      """
# Build the keyword extractor, passing the embedding model by name.
kw_model = KeyBERT('distilbert-base-nli-mean-tokens')

HBox(children=(FloatProgress(value=0.0, max=244733649.0), HTML(value='')))




In [29]:
# Extract unigram and bigram keywords with KeyBERT's default ranking,
# dropping English stop words. This rebinds `keywords` from the from-scratch
# section.
keywords = kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(1, 2),
    stop_words="english",
)

In [30]:
keywords

[('learning algorithm', 0.6979),
 ('learning machine', 0.6327),
 ('machine learning', 0.6306),
 ('supervised learning', 0.5986),
 ('algorithm analyzes', 0.586)]

In [31]:
# Same extraction as the previous call, but with `use_mmr=True` and
# `diversity=0.7` passed to KeyBERT to diversify the returned phrases.
diversified_keywords = kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(1, 2),
    stop_words="english",
    use_mmr=True,
    diversity=0.7,
)

In [32]:
diversified_keywords

[('learning algorithm', 0.6979),
 ('labels unseen', 0.089),
 ('new examples', 0.2852),
 ('generalize training', 0.4835),
 ('algorithm correctly', 0.3867)]