In [38]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
# Read the blog data CSV; fields are tab-separated.
df = pd.read_csv(
    '../../data/final/futurice_blog_data.csv',
    sep='\t',
    engine='python',
)

# Document to extract keywords from: the post at row position 33, noted by the
# author as the "Foresight and importance of foresight methods in business" blog.
# NOTE(review): confirm this row still matches that title in the current data file.
doc = df['text'].iloc[33]

In [43]:
# Length range of candidate phrases, in words: (1, 2) extracts single words
# and two-word phrases; use e.g. (2, 2) for phrases of exactly two words.
n_gram_range = (1, 2)

# Build the candidate vocabulary from the document itself, dropping English stop words.
count = CountVectorizer(ngram_range=n_gram_range, stop_words="english").fit([doc])

# `get_feature_names()` was removed in scikit-learn 1.2. Prefer its replacement
# `get_feature_names_out()` and fall back to the old method only on releases
# that predate it. `.tolist()` keeps `candidates` a plain list of strings.
if hasattr(count, "get_feature_names_out"):
    candidates = count.get_feature_names_out().tolist()
else:
    candidates = count.get_feature_names()



In [44]:
# Embed the document and every candidate phrase with the same
# 'distilbert-base-nli-mean-tokens' sentence-transformer model so that they
# can be compared with cosine similarity.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# The document is wrapped in a list so it is encoded as a batch of one.
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

In [45]:
# Number of keywords/keyphrases to keep.
top_n = 10

# Cosine similarity between the document and each candidate (one row, one column per candidate).
distances = cosine_similarity(doc_embedding, candidate_embeddings)

# argsort is ascending, so the last `top_n` positions are the candidates most
# similar to the document (the best match comes last in the list).
ranked_idx = distances.argsort()[0]
keywords = [candidates[index] for index in ranked_idx[-top_n:]]
keywords

['instagram',
 'invention bitcoin',
 'technology revolutionise',
 'started cryptocurrency',
 'cryptocurrency party',
 'twitter instagram',
 'driving hype',
 'numerous podcasts',
 'instagram social',
 'cryptocurrency rapidly']

## Diversification

The more diverse the keywords are, the less likely they are to represent the document as a whole.

Find the balance between accuracy of keywords and their diversity.

**Max Sum Similarity**

Maximize candidate similarity with document while minimizing similarity between the candidates.

In [46]:
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    """Pick `top_n` keywords that are relevant to the document yet dissimilar to each other.

    The `nr_candidates` words most similar to the document form a pool; from
    that pool the combination of `top_n` words with the smallest summed
    pairwise cosine similarity is returned.

    Parameters
    ----------
    doc_embedding : 2-D array-like
        Embedding of the document; only its first row is used for ranking.
    word_embeddings : 2-D array-like
        One embedding per entry of `words`, in the same order.
    words : sequence of str
        Candidate keywords/keyphrases.
    top_n : int
        Number of keywords to return.
    nr_candidates : int
        Size of the pool of most document-similar words to choose from.

    Returns
    -------
    list of str
        The selected keywords, ordered by ascending similarity to the document.

    Raises
    ------
    ValueError
        If `top_n` is larger than the candidate pool, so no combination exists.
    """
    # Use the embeddings/words that were passed in, not notebook-level globals.
    doc_similarity = cosine_similarity(doc_embedding, word_embeddings)
    pairwise_similarity = cosine_similarity(word_embeddings, word_embeddings)

    # Pool: the nr_candidates words most similar to the document.
    words_idx = list(doc_similarity.argsort()[0][-nr_candidates:])
    if top_n > len(words_idx):
        raise ValueError(
            f"top_n ({top_n}) cannot exceed the number of available candidates ({len(words_idx)})"
        )
    words_vals = [words[index] for index in words_idx]
    pool_similarity = pairwise_similarity[np.ix_(words_idx, words_idx)]

    # Exhaustively search for the combination whose members are least similar
    # to each other. The cost grows combinatorially with nr_candidates.
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(pool_similarity[i][j] for i in combination for j in combination if i != j)
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [47]:
# Max Sum Similarity yields a more diverse set of keyphrases: it selects top_n
# phrases out of the nr_candidates phrases most similar to the document.
#   low nr_candidates  => less diverse
#   high nr_candidates => more diverse
max_sum_sim(
    doc_embedding,
    candidate_embeddings,
    candidates,
    top_n=5,
    nr_candidates=10,
)

['technology revolutionise',
 'cryptocurrency party',
 'twitter instagram',
 'numerous podcasts',
 'cryptocurrency rapidly']

**Maximal Marginal Relevance**

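This is the selection strategy used by EmbedRank: pick the keyphrase most similar to the document first, then repeatedly add the candidate that is relevant to the document yet dissimilar to the keyphrases already chosen.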

In [48]:
def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    """Select keywords with Maximal Marginal Relevance (MMR).

    Starts from the word most similar to the document, then repeatedly adds
    the remaining word with the highest score

        (1 - diversity) * sim(word, document) - diversity * max sim(word, selected)

    Parameters
    ----------
    doc_embedding : 2-D array-like
        Embedding of the document; expected to hold a single row
        (e.g. the result of encoding a one-element list).
    word_embeddings : 2-D array-like
        One embedding per entry of `words`, in the same order.
    words : sequence of str
        Candidate keywords/keyphrases.
    top_n : int
        Maximum number of keywords to return; capped at `len(words)`.
    diversity : float
        Weight of the redundancy penalty; 0 ranks purely by document
        similarity, larger values favour dissimilar keywords.

    Returns
    -------
    list of str
        Selected keywords in the order they were picked. Empty when
        `top_n <= 0` or `words` is empty.
    """
    # Never try to pick more keywords than there are candidates; otherwise the
    # loop below would call argmax on an empty array.
    top_n = min(top_n, len(words))
    if top_n <= 0:
        return []

    # Similarity between each word and the document, and between all word pairs.
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Seed the selection with the single best-matching keyword/keyphrase.
    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        # Relevance of the remaining candidates to the document, and each
        # candidate's highest similarity to any already-selected keyword.
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # MMR score: reward relevance, penalise redundancy.
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Move the winner from the candidate pool to the selection.
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [50]:
# diversity=0.2 weights relevance to the document (0.8) well above
# dissimilarity to the keywords already selected (0.2).
mmr(
    doc_embedding,
    candidate_embeddings,
    candidates,
    top_n=5,
    diversity=0.2,
)

['cryptocurrency rapidly',
 'instagram social',
 'numerous podcasts',
 'driving hype',
 'cryptocurrency party']