# Topic Modeling


References:  
- https://towardsdatascience.com/keyword-extraction-with-bert-724efca412ea
- https://jaketae.github.io/study/zero-shot-classification/


In [135]:
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
import spacy
#from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from torch.nn import functional as F
import numpy as np
import itertools
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

## Get document

In [224]:
# Download the article and reduce it to plain paragraph text.
url = 'https://towardsdatascience.com/understanding-random-forest-58381e0602d2'
# Alternative article to experiment with:
#url='https://towardsdatascience.com/a-one-stop-shop-for-principal-component-analysis-5582fb7e0a9c'

# A timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() stops us from silently topic-modelling an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Headings could be collected as well; currently unused:
#h1s = soup.find_all(['h1','h2','h3'])
#h1s = [i.text for i in h1s]

# Only <p> elements are kept, so navigation, scripts and other page chrome
# that soup.text would include never reach the vectorizer.
bodytext = [paragraph.text for paragraph in soup.find_all('p')]
article_text = ' '.join(bodytext)

In [226]:
# Build the list of candidate keywords: uni-/bi-grams from the article that
# are also nouns or noun phrases according to spaCy.
n_gram_range = (1, 2)
stop_words = "english"

# Extract candidate words/phrases.
# The token pattern keeps tokens of 3+ letters/hyphens; it is a raw string so
# that `\-` reaches the regex engine without an invalid-escape warning.
vectorizer = CountVectorizer(
    ngram_range=n_gram_range,
    stop_words=stop_words,
    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}',
).fit([article_text])

# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    candidates = list(vectorizer.get_feature_names_out())
else:
    candidates = vectorizer.get_feature_names()

# Get noun phrases from article
nlp = spacy.load('en_core_web_sm')
doc = nlp(article_text)
noun_phrases = {chunk.text.strip().lower() for chunk in doc.noun_chunks}

# Get nouns from article.
# Lower-cased because CountVectorizer lower-cases its vocabulary by default;
# without this, a noun that only appears capitalised could never match.
nouns = {token.text.lower() for token in doc if token.pos_ == "NOUN"}

all_nouns = nouns.union(noun_phrases)

# Filter candidate topics to only those in the nouns set
candidates = [c for c in candidates if c in all_nouns]

## Transformer approach with BERT

In [170]:
# Embed the whole article and every candidate phrase with the same sentence
# encoder so that they live in one vector space.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding = model.encode([article_text])
candidate_embeddings = model.encode(candidates)

# Rank candidates by cosine similarity to the document embedding.
# NOTE: despite its name, `distances` holds similarities (higher = closer).
top_n = 5
distances = cosine_similarity(doc_embedding, candidate_embeddings)

# argsort is ascending, so the last `top_n` positions are the best matches.
ranked_indices = distances.argsort()[0]
best_indices = ranked_indices[-top_n:]
keywords = [candidates[position] for position in best_indices]
keywords

['massive alpha',
 'classification algorithms',
 'machine learning',
 'data science',
 'data scientist']

In [171]:
def max_sum_sim(doc_embedding, word_embeddings, top_n, nr_candidates, words=None):
    """Select ``top_n`` keywords that are relevant yet mutually dissimilar.

    The ``nr_candidates`` phrases most similar to the document are shortlisted,
    then every ``top_n``-sized combination of the shortlist is scored by the
    sum of pairwise cosine similarities between its members; the combination
    with the lowest score (i.e. the most diverse one) is returned.

    Parameters
    ----------
    doc_embedding : 2-D array-like, one row
        Embedding of the document.
    word_embeddings : 2-D array-like
        One embedding per candidate phrase.
    top_n : int
        Number of keywords to return.
    nr_candidates : int
        Size of the shortlist the combinations are drawn from. Cost grows
        combinatorially with this value.
    words : sequence of str, optional
        Phrases aligned row-for-row with ``word_embeddings``. When omitted the
        notebook-level ``candidates`` list is used, as the original did.

    Returns
    -------
    list of str
        The selected keywords, ordered by ascending similarity to the document.

    Raises
    ------
    ValueError
        If ``top_n`` is larger than the number of shortlisted phrases.
    """
    if words is None:
        # Backward compatible with existing 4-argument calls.
        words = candidates

    # Use the embeddings that were passed in, not the notebook global.
    relevance = cosine_similarity(doc_embedding, word_embeddings)
    pairwise = cosine_similarity(word_embeddings, word_embeddings)

    # Shortlist the nr_candidates phrases most similar to the document.
    # Slicing from an explicit start index avoids the `[-0:]` pitfall where
    # nr_candidates == 0 would silently select *every* phrase.
    ranked = relevance.argsort()[0]
    shortlist_size = max(0, min(nr_candidates, len(ranked)))
    words_idx = list(ranked[len(ranked) - shortlist_size:])
    if top_n > shortlist_size:
        raise ValueError(
            "top_n ({}) cannot exceed the number of shortlisted candidates ({})"
            .format(top_n, shortlist_size)
        )

    words_vals = [words[index] for index in words_idx]
    shortlist_sim = pairwise[np.ix_(words_idx, words_idx)]

    # Find the combination of words that are the least similar to each other
    min_sim = np.inf
    best_combination = ()
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(
            shortlist_sim[i][j]
            for i in combination
            for j in combination
            if i != j
        )
        if sim < min_sim:
            best_combination = combination
            min_sim = sim

    return [words_vals[idx] for idx in best_combination]

# Diversify the BERT keywords: choose the top_n least mutually similar phrases
# out of the 10 candidates closest to the document.
nr_candidates = 10
keywords = max_sum_sim(
    doc_embedding=doc_embedding,
    word_embeddings=candidate_embeddings,
    top_n=top_n,
    nr_candidates=nr_candidates,
)
keywords

['correct predictions',
 'decision trees',
 'massive alpha',
 'machine learning',
 'data scientist']

## Latent Dirichlet Allocation (LDA)

In [219]:
# Fit a single-topic LDA model on the article and list its 20 heaviest terms.
top_n = 1
text_vectorized = vectorizer.transform([article_text])
lda_model = LatentDirichletAllocation(n_components=top_n, max_iter=10, learning_method='online')
data_lda = lda_model.fit_transform(text_vectorized)

# Fetch the vocabulary once instead of rebuilding it for every selected term.
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: `topics` is overwritten on every iteration, so with more than one
# component only the last topic's terms survive (top_n is 1 here).
for topic in lda_model.components_:
    topics = [feature_names[i] for i in topic.argsort()[::-1][:20]]

# Combine 1-grams to 2-grams when 2-gram is in topics list
topic_lookup = set(topics)  # O(1) membership tests for the bigram checks
newtopics = []
wordsused = []
for i, top1 in enumerate(topics):
    for top2 in topics[i + 1:]:
        forward = ' '.join([top1, top2])
        backward = ' '.join([top2, top1])
        # Prefer the "top1 top2" order, exactly as the original if/elif did.
        if forward in topic_lookup:
            bigram = forward
        elif backward in topic_lookup:
            bigram = backward
        else:
            continue
        if bigram not in newtopics:
            newtopics.append(bigram)
            wordsused.extend([top1, top2])
    # Keep the term itself only if it was not absorbed into a bigram and is
    # not already listed.
    if (top1 not in wordsused) and (top1 not in newtopics):
        newtopics.append(top1)
newtopics

['random', 'forest', 'trees', 'tree', 'random forest', 'data', 'game', 'feature', 'decision', 'individual', 'split', 'different', 'node', 'make', 'expected', 'features', 'decision tree', 'training', 'like', 'uncorrelated']


['random forest',
 'trees',
 'decision tree',
 'data',
 'game',
 'feature',
 'individual',
 'split',
 'different',
 'node',
 'make',
 'expected',
 'features',
 'training',
 'like',
 'uncorrelated']

## Non-negative matrix factorization (NMF)

In [222]:
# Fit a single-component NMF model on the article and list its 20 heaviest terms.
top_n = 1
text_vectorized = vectorizer.transform([article_text])
nmf_model = NMF(n_components=top_n,max_iter=500)
data_nmf = nmf_model.fit_transform(text_vectorized)

# Fetch the vocabulary once instead of rebuilding it for every selected term.
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: `topics` is overwritten on every iteration, so with more than one
# component only the last component's terms survive (top_n is 1 here).
for topic in nmf_model.components_:
    topics = [feature_names[i] for i in topic.argsort()[::-1][:20]]

# Combine 1-grams to 2-grams when 2-gram is in topics list
topic_lookup = set(topics)  # O(1) membership tests for the bigram checks
newtopics = []
wordsused = []
for i, top1 in enumerate(topics):
    for top2 in topics[i + 1:]:
        forward = ' '.join([top1, top2])
        backward = ' '.join([top2, top1])
        # Prefer the "top1 top2" order, exactly as the original if/elif did.
        if forward in topic_lookup:
            bigram = forward
        elif backward in topic_lookup:
            bigram = backward
        else:
            continue
        if bigram not in newtopics:
            newtopics.append(bigram)
            wordsused.extend([top1, top2])
    # Keep the term itself only if it was not absorbed into a bigram and is
    # not already listed.
    if (top1 not in wordsused) and (top1 not in newtopics):
        newtopics.append(top1)
newtopics



['random forest',
 'decision tree',
 'trees',
 'data',
 'game',
 'feature',
 'individual',
 'different',
 'split',
 'features',
 'make',
 'expected',
 'node',
 'like',
 'uncorrelated',
 'model']