In [24]:
import pandas as pd

## 1. Load the document (any text can be used here)

In [32]:
# Sample text from which keywords/keyphrases are extracted. It is passed
# verbatim (including its indentation and line breaks) to the vectorizer
# and to the embedding model, so replace the text freely but keep the name.
document = """
         Supervised learning is the machine learning task of 
         learning a function that maps an input to an output based 
         on example input-output pairs.[1] It infers a function 
         from labeled training data consisting of a set of 
         training examples.[2] In supervised learning, each 
         example is a pair consisting of an input object 
         (typically a vector) and a desired output value (also 
         called the supervisory signal). A supervised learning 
         algorithm analyzes the training data and produces an 
         inferred function, which can be used for mapping new 
         examples. An optimal scenario will allow for the algorithm 
         to correctly determine the class labels for unseen 
         instances. This requires the learning algorithm to  
         generalize from the training data to unseen situations 
         in a 'reasonable' way (see inductive bias).
      """

## 2. Create a list of candidate keywords from the document

### `n_gram_range` decides whether we extract single keywords or multi-word keyphrases (here `(4, 4)` means four-word phrases only)

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
# Stop-word setting handed to CountVectorizer: the string "english" selects
# scikit-learn's built-in English stop-word list.
stop_words = "english"

# (min_n, max_n) n-gram sizes handed to CountVectorizer; (4, 4) keeps only
# four-word phrases. Use (1, 1) for single keywords.
n_gram_range = (4, 4)

### We extract the candidate keywords/keyphrases using `CountVectorizer`

In [27]:
# Fit an n-gram vocabulary on the single document. The n-grams are built from
# the tokens that remain after stop-word removal, and each distinct n-gram
# becomes one candidate keyword/keyphrase.
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([document])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
# `.tolist()` keeps `candidates` a plain list of Python strings, as before.
if hasattr(count, "get_feature_names_out"):
    candidates = count.get_feature_names_out().tolist()
else:
    candidates = count.get_feature_names()
candidates

['algorithm analyzes training data',
 'algorithm correctly determine class',
 'algorithm generalize training data',
 'allow algorithm correctly determine',
 'analyzes training data produces',
 'based example input output',
 'called supervisory signal supervised',
 'class labels unseen instances',
 'consisting input object typically',
 'consisting set training examples',
 'correctly determine class labels',
 'data consisting set training',
 'data produces inferred function',
 'data unseen situations reasonable',
 'desired output value called',
 'determine class labels unseen',
 'example input output pairs',
 'example pair consisting input',
 'examples optimal scenario allow',
 'examples supervised learning example',
 'function labeled training data',
 'function maps input output',
 'function used mapping new',
 'generalize training data unseen',
 'inferred function used mapping',
 'infers function labeled training',
 'input object typically vector',
 'input output based example',
 'inpu

In [33]:
# Alternative embedding backend via spacy_sentence_bert (currently disabled).
# import spacy_sentence_bert
# nlp = spacy_sentence_bert.load_model('en_roberta_large_nli_stsb_mean_tokens')
# nlp

## 3. Embeddings
### We convert both the document and the candidate keywords/keyphrases into numerical vectors (embeddings) using the `sentence-transformers` library

In [29]:
from sentence_transformers import SentenceTransformer
# Pretrained sentence-embedding model; the same instance encodes both the
# document and the candidate phrases so their vectors are comparable.
model = SentenceTransformer(
    "distilbert-base-nli-mean-tokens"
)

In [30]:
# Encode the document first, then the candidates, with the same model.
# The document is wrapped in a one-element list so it is encoded as a batch
# of one, just like the list of candidate phrases.
doc_embedding, candidate_embeddings = (
    model.encode(texts) for texts in ([document], candidates)
)

## 4. Cosine Similarity
### We find the candidates that are most similar to the document using `cosine_similarity`

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

# Number of keyphrases to keep.
top_n = 5

# NOTE: despite its name, `distances` holds cosine *similarities* between the
# document (single row) and every candidate (columns); higher = more similar.
distances = cosine_similarity(doc_embedding, candidate_embeddings)

# argsort is ascending, so the last `top_n` positions are the most similar
# candidates; the resulting list therefore ends with the best match.
ranked_indices = distances.argsort()[0]
keywords = [candidates[i] for i in ranked_indices[-top_n:]]
keywords

['algorithm analyzes training data',
 'learning algorithm analyzes training',
 'supervised learning algorithm analyzes',
 'algorithm generalize training data',
 'learning algorithm generalize training']