In [42]:
import pandas as pd

## 1. We can load any data here

In [43]:
# Sample text to extract keyphrases from. The string is assembled from adjacent
# literals so that the line breaks and the trailing spaces are explicit.
document = (
    "\n"
    "         Supervised learning is the machine learning task of \n"
    "         learning a function that maps an input to an output based \n"
    "         on example input-output pairs.[1] It infers a function \n"
    "         from labeled training data consisting of a set of \n"
    "         training examples.[2] In supervised learning, each \n"
    "         example is a pair consisting of an input object \n"
    "         (typically a vector) and a desired output value (also \n"
    "         called the supervisory signal). A supervised learning \n"
    "         algorithm analyzes the training data and produces an \n"
    "         inferred function, which can be used for mapping new \n"
    "         examples. An optimal scenario will allow for the algorithm \n"
    "         to correctly determine the class labels for unseen \n"
    "         instances. This requires the learning algorithm to  \n"
    "         generalize from the training data to unseen situations \n"
    "         in a 'reasonable' way (see inductive bias).\n"
    "      "
)

## 2.  We create a list of candidate keywords from the document

### `n_gram_range` decides whether we extract single keywords or multi-word keyphrases

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
# (min_n, max_n) n-gram size passed to CountVectorizer as `ngram_range`;
# (4, 4) restricts the candidates to phrases of exactly four tokens.
n_gram_range = (4, 4)
# Passed to CountVectorizer as `stop_words`; "english" is expected to select
# scikit-learn's built-in English stop-word list.
stop_words = "english"

### We extract the keywords, keyphrases using CountVectorizer 

In [45]:
# Learn the vocabulary of candidate n-grams from the single document.
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([document])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Its replacement `get_feature_names_out()` returns a NumPy array, so convert
# it to a plain list to keep `candidates` the same type as before.
candidates = count.get_feature_names_out().tolist()
candidates

['algorithm analyzes training data',
 'algorithm correctly determine class',
 'algorithm generalize training data',
 'allow algorithm correctly determine',
 'analyzes training data produces',
 'based example input output',
 'called supervisory signal supervised',
 'class labels unseen instances',
 'consisting input object typically',
 'consisting set training examples',
 'correctly determine class labels',
 'data consisting set training',
 'data produces inferred function',
 'data unseen situations reasonable',
 'desired output value called',
 'determine class labels unseen',
 'example input output pairs',
 'example pair consisting input',
 'examples optimal scenario allow',
 'examples supervised learning example',
 'function labeled training data',
 'function maps input output',
 'function used mapping new',
 'generalize training data unseen',
 'inferred function used mapping',
 'infers function labeled training',
 'input object typically vector',
 'input output based example',
 'inpu

In [46]:
# import spacy_sentence_bert
# nlp = spacy_sentence_bert.load_model('en_roberta_large_nli_stsb_mean_tokens')
# nlp

## 3. Embeddings
## We convert both the document and the candidate keywords, keyphrases to numerical data using `SentenceTransformers`

In [47]:
from sentence_transformers import SentenceTransformer
# Instantiate the sentence-embedding model identified by this name
# (presumably downloaded/cached on first use — confirm for offline runs).
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [48]:
# Encode the whole document as a one-item batch (expected to yield a single embedding row).
doc_embedding = model.encode([document])
# Encode every candidate phrase; presumably one row per candidate, in the same order — confirm.
candidate_embeddings = model.encode(candidates)

### 4. Cosine Similarity
### We find candidates that are most similar to the document using `cosine_similarity`

In [49]:
from sklearn.metrics.pairwise import cosine_similarity

# How many keyphrases to keep
top_n = 5

# Cosine similarity of the document against every candidate phrase
distances = cosine_similarity(doc_embedding, candidate_embeddings)

# argsort is ascending, so the last `top_n` positions of the document's row
# are the best-matching candidates (listed from weakest to strongest match)
keywords = [candidates[position] for position in distances[0].argsort()[-top_n:]]
keywords

['algorithm analyzes training data',
 'learning algorithm analyzes training',
 'supervised learning algorithm analyzes',
 'algorithm generalize training data',
 'learning algorithm generalize training']

### 5. Diversification
### We diversify the keywords, keyphrases so that we do not get similar results

### 5.1 Max Sum Similarity
### We want to maximize the candidate similarity to the document whilst minimizing the similarity between candidates

In [55]:
import numpy as np
import itertools

def max_sum(doc_embedding, candidate_embeddings, words, top_n, nr_candidates):
    """Pick ``top_n`` mutually dissimilar keyphrases using Max Sum Similarity.

    The ``nr_candidates`` phrases most similar to the document are shortlisted
    first. Every ``top_n``-sized combination of that shortlist is then scored
    by the sum of its pairwise cosine similarities, and the combination with
    the smallest sum (the least redundant one) is returned.

    Parameters
    ----------
    doc_embedding : 2-D array
        Embedding of the document; only the first row of the resulting
        similarity matrix is used.
    candidate_embeddings : 2-D array
        Embeddings of the candidate phrases (assumed to be one row per entry
        of ``words``, in the same order).
    words : sequence of str
        Candidate phrases, indexed by row of ``candidate_embeddings``.
    top_n : int
        Number of phrases to return.
    nr_candidates : int
        Size of the shortlist. A value larger than the number of candidates
        shortlists all of them.

    Returns
    -------
    list of str
        The selected phrases, in shortlist order (ascending similarity to the
        document).

    Raises
    ------
    ValueError
        If ``top_n`` is larger than the number of shortlisted candidates.
    """
    # Similarity of each candidate to the document, and of candidates to each other
    doc_similarity = cosine_similarity(doc_embedding, candidate_embeddings)
    pairwise_similarity = cosine_similarity(candidate_embeddings, candidate_embeddings)

    # Shortlist the nr_candidates phrases most similar to the document
    shortlist_idx = list(doc_similarity.argsort()[0][-nr_candidates:])
    if top_n > len(shortlist_idx):
        raise ValueError(
            f"top_n ({top_n}) must not exceed the number of shortlisted "
            f"candidates ({len(shortlist_idx)}); increase nr_candidates."
        )

    # Index into the `words` argument rather than a notebook-level global so
    # the function works for whatever candidate list the caller passes in.
    shortlist_words = [words[index] for index in shortlist_idx]
    pairwise_similarity = pairwise_similarity[np.ix_(shortlist_idx, shortlist_idx)]

    # Exhaustively search for the combination whose members are least similar
    # to each other. Each pair is counted twice ((i, j) and (j, i)), which
    # scales every score equally and so does not change the winner.
    min_sim = np.inf
    best_combination = None
    for combination in itertools.combinations(range(len(shortlist_idx)), top_n):
        sim = sum(
            pairwise_similarity[i][j]
            for i in combination
            for j in combination
            if i != j
        )
        if sim < min_sim:
            best_combination = combination
            min_sim = sim

    return [shortlist_words[idx] for idx in best_combination]

### With a lower `nr_candidates` value, the results seem to be very similar to those of our original cosine similarity method

In [56]:
# Max Sum Similarity over a shortlist of the 10 best-matching candidates
max_sum(
    doc_embedding,
    candidate_embeddings,
    candidates,
    top_n=5,
    nr_candidates=10,
)

['machine learning task learning',
 'learning function maps input',
 'signal supervised learning algorithm',
 'algorithm analyzes training data',
 'algorithm generalize training data']

### A higher `nr_candidates` value produces more diverse keyphrases

In [57]:
# Max Sum Similarity over a wider shortlist of the 20 best-matching candidates
max_sum(
    doc_embedding,
    candidate_embeddings,
    candidates,
    top_n=5,
    nr_candidates=20,
)

['algorithm correctly determine class',
 'analyzes training data produces',
 'instances requires learning algorithm',
 'learning function maps input',
 'supervised learning machine learning']

### We need to keep `nr_candidates` below 20% of the total number of unique words in the document

### 5.2 Maximal Marginal Relevance
### MMR tries to minimize redundancy and maximize the diversity of results in text summarization tasks
### We start by selecting the keyword/keyphrase that is the most similar to the document. Then, we iteratively select new candidates that are both similar to the document and not similar to the already selected keywords and keyphrases

In [61]:
def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    """Select keyphrases with Maximal Marginal Relevance (MMR).

    The phrase most similar to the document is chosen first. Each following
    pick maximizes
    ``(1 - diversity) * sim(phrase, document) - diversity * max sim(phrase, picked)``,
    trading relevance to the document against redundancy with the phrases
    already selected.

    Parameters
    ----------
    doc_embedding : 2-D array
        Embedding of the document (assumed to contain a single row; the first
        pick takes an argmax over the flattened similarity matrix).
    word_embeddings : 2-D array
        Embeddings of the candidate phrases (assumed to be one row per entry
        of ``words``, in the same order).
    words : sequence of str
        Candidate phrases.
    top_n : int
        Number of phrases to return; capped at ``len(words)``.
    diversity : float
        Weight of the redundancy penalty. With 0 only relevance to the
        document counts; with 1 only dissimilarity to earlier picks counts.

    Returns
    -------
    list of str
        Selected phrases in the order they were picked. Empty when
        ``top_n <= 0`` or ``words`` is empty.
    """
    # Nothing requested or nothing to choose from
    if top_n <= 0 or len(words) == 0:
        return []

    # Similarity between each word/phrase and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)

    # Pairwise similarity among the words/phrases themselves
    word_similarity = cosine_similarity(word_embeddings)

    # Start with the phrase that best matches the document
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # Cap the number of rounds so the candidate pool is never exhausted
    # (argmax over an empty array would raise).
    for _ in range(min(top_n, len(words)) - 1):

        # Relevance of the remaining candidates to the document, and their
        # highest similarity to any already-selected keyword/keyphrase
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # MMR score: reward relevance, penalize redundancy
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Move the winner from the candidate pool to the selection
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

### If we set a relatively low diversity, then our results seem to be very similar to our original cosine similarity method

In [64]:
# MMR with a low diversity weight: relevance to the document dominates
mmr(
    doc_embedding,
    candidate_embeddings,
    candidates,
    top_n=5,
    diversity=0.2,
)

['learning algorithm generalize training',
 'supervised learning algorithm analyzes',
 'algorithm analyzes training data',
 'algorithm generalize training data',
 'supervised learning machine learning']

### A relatively high diversity score will create very diverse keyphrases

In [65]:
# MMR with a high diversity weight: redundancy with earlier picks is penalized heavily
mmr(
    doc_embedding,
    candidate_embeddings,
    candidates,
    top_n=5,
    diversity=0.7,
)

['learning algorithm generalize training',
 'data unseen situations reasonable',
 'new examples optimal scenario',
 'value called supervisory signal',
 'algorithm correctly determine class']