In [None]:
!pip install pyLDAvis umap unidecode
!pip install sentence-transformers umap-learn hdbscan bertopic pyyaml==5.4.1

In [None]:
import sys
import umap, re, string
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lib.plot_helper as phelper
import unidecode
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
import spacy, nltk
from nltk.corpus import stopwords

# spaCy English pipeline; its default stop-word collection is passed to CountVectorizer further down.
nlp = spacy.load('en_core_web_sm')
# Make sure the NLTK stop-word corpus is available locally before reading it.
nltk.download('stopwords')
# English stop words looked up by preprocess_text when filtering tokens.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1) Latent Dirichlet Allocation

In [None]:
def text_replace_series(series):
    """Strip HTML remnants from a pandas Series of strings.

    Removes ``<br/>`` tags, anchor elements and the entity prefixes ``&amp``,
    ``&gt`` and ``&lt``, and turns non-breaking spaces into ordinary spaces.

    Args:
        series: pandas Series whose values are strings.

    Returns:
        A new Series with all replacements applied in the order listed below.
    """
    # NOTE(review): the anchor pattern is greedy, so with several links in one
    # string everything between the first `<a` and the last `</a>` is removed.
    replacements = (
        ("(<br/>)", ""),
        ('(<a).*(>).*(</a>)', ''),
        ('(&amp)', ''),
        ('(&gt)', ''),
        ('(&lt)', ''),
        ('(\xa0)', ' '),
    )
    for pattern, substitute in replacements:
        # The patterns are regular expressions. pandas 2.0 switched the default
        # of Series.str.replace to literal matching, so regex=True must be explicit.
        series = series.str.replace(pattern, substitute, regex=True)
    return series

def preprocess_text(text, token=True):
    """Normalize raw text into lowercase content words.

    The text is transliterated to ASCII, every character other than a letter
    or '#' is replaced by a space, words of three characters or fewer are
    dropped, the rest is lowercased, and words found in the module-level
    ``stop_words`` are removed.

    Args:
        text: raw input string.
        token: when True (default) return the list of tokens; when False
            return the tokens joined by single spaces.

    Returns:
        list of str when ``token`` is true, otherwise a single str.
    """
    # Transliterate before stripping: run afterwards, unidecode only ever sees
    # ASCII letters and accented words have already been cut apart.
    text = unidecode.unidecode(text)
    text = re.sub('[^a-zA-Z#]', ' ', text)
    tokens = [t.lower() for t in text.split() if len(t) > 3]
    tokens = [t for t in tokens if t not in stop_words]
    # Optional lemmatization (disabled):
    # tokens = [t.lemma_ for t in nlp(' '.join(tokens))]

    # Join the tokens again to form a string.
    if not token:
        return ' '.join(tokens)

    return tokens

In [None]:
# Download the NPR articles and replace each one with its cleaned, space-joined tokens.
npr = pd.read_csv(
    'https://raw.githubusercontent.com/ChanCheeKean/datasets/main/nlp/npr.csv'
)
npr['Article'] = npr['Article'].apply(preprocess_text, token=False)
npr.head()

Unnamed: 0,Article
0,washington even policy bipartisan politics can...
1,donald trump used twitter preferred means comm...
2,donald trump unabashedly praising russian pres...
3,updated russian president vladimir putin says ...
4,photography illustration video data visualizat...


In [None]:
# Bag-of-words counts for LDA, with spaCy's stop words removed as well.
# max_df=0.95 ignores terms that appear in more than 95% of the documents (too frequent).
# min_df=2 ignores terms that appear in fewer than 2 documents (too rare).
# spaCy exposes its stop words as a set; scikit-learn accepts 'english', a list or None
# for `stop_words` (and validates this from 1.2 on), so the set is converted to a list.
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words=list(nlp.Defaults.stop_words))
dtm = cv.fit_transform(npr['Article'])

# The number of topics (n_components) must be chosen up front.
# doc_topic_prior (alpha): the higher it is, the more topics each document is composed of.
# topic_word_prior (beta): the higher it is, the more words each topic is composed of.
LDA = LatentDirichletAllocation(n_components=10, random_state=42, max_iter=20, n_jobs=-1)
# fit_transform fits the model and returns the document-topic matrix in one pass.
lda_topic_matrix = LDA.fit_transform(dtm)


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ll', 've'] not in stop_words.



In [None]:
# lda_topic_matrix: one row per document, one weight per topic (10 topics).
# lda_keys: index of the highest-weighted topic of every document.
# lda_dicts: number of documents assigned to each topic.
lda_keys = lda_topic_matrix.argmax(axis=1).tolist()
lda_dicts = dict(Counter(lda_keys))
lda_dicts

{0: 2816,
 1: 757,
 2: 925,
 3: 939,
 4: 1018,
 5: 1207,
 6: 1051,
 7: 331,
 8: 1604,
 9: 1344}

In [None]:
# Vocabulary size and one sample term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it and is already what the c-TF-IDF helper in this notebook calls.
cv_vocab = cv.get_feature_names_out()
print(len(cv_vocab))
print(cv_vocab[6000])

51763
bumbling


In [None]:
# lda_topic_matrix has one row per document and one column per topic;
# LDA.components_ has one row per topic and one column per vocabulary term.
print(lda_topic_matrix.shape)
print(LDA.components_.shape)

(11992, 10)
(10, 51763)


In [None]:
single_topic = LDA.components_[0]
# argsort is ascending, so the last 10 indices are the 10 highest-weighted
# terms of this topic, with the strongest term last.
cv_vocab = cv.get_feature_names_out()
[cv_vocab[index] for index in single_topic.argsort()[-10:]]

['going',
 'world',
 'years',
 'life',
 'know',
 'think',
 'time',
 'people',
 'says',
 'like']

In [None]:
top_words_dict = {}
# Build the vocabulary once instead of once per word
# (get_feature_names() no longer exists in scikit-learn >= 1.2).
cv_vocab = cv.get_feature_names_out()
# The user has to decide on a name for each topic from its top words.
for index, topic in enumerate(LDA.components_):
    top_words = [cv_vocab[i] for i in topic.argsort()[-10:]]
    print(top_words)
    top_words_dict[index] = ' '.join(top_words)

['going', 'world', 'years', 'life', 'know', 'think', 'time', 'people', 'says', 'like']
['year', 'national', 'farmers', 'land', 'climate', 'years', 'like', 'water', 'food', 'says']
['political', 'campaign', 'country', 'women', 'party', 'people', 'president', 'trump', 'clinton', 'said']
['state', 'told', 'justice', 'case', 'court', 'says', 'black', 'people', 'police', 'said']
['money', 'court', 'million', 'companies', 'said', 'state', 'government', 'federal', 'company', 'says']
['russia', 'news', 'administration', 'campaign', 'white', 'obama', 'house', 'president', 'said', 'trump']
['parents', 'album', 'education', 'song', 'schools', 'says', 'like', 'music', 'students', 'school']
['election', 'sanders', 'cruz', 'states', 'vote', 'clinton', 'voters', 'state', 'percent', 'trump']
['patients', 'medical', 'research', 'like', 'study', 'percent', 'care', 'people', 'health', 'says']
['told', 'attack', 'reports', 'country', 'government', 'military', 'city', 'people', 'said', 'says']


In [None]:
# Label every article with the column index of its largest LDA topic weight.
npr['Topic'] = lda_topic_matrix.argmax(axis=1)
npr.head()

Unnamed: 0,Article,Topic
0,washington even policy bipartisan politics can...,5
1,donald trump used twitter preferred means comm...,5
2,donald trump unabashedly praising russian pres...,5
3,updated russian president vladimir putin says ...,5
4,photography illustration video data visualizat...,4


In [None]:
# Bar chart of how many documents were assigned to each LDA topic.
# The plotted values are document counts, so the y axis is labelled accordingly
# (assumes `ytitle` is the y-axis title of the project-local helper).
phelper.get_bar(
    list(lda_dicts.keys()),
    list(lda_dicts.values()),
    ytitle='Number of Documents',
    title='LDA topic counts')

Filtering tokens by part-of-speech tag can improve the quality of the topics. <br/>
To retrieve the most important topic terms, the corpus can be divided into batches of a fixed size.

# 2) Latent Semantic Analysis

In [None]:
# 20 Newsgroups posts with headers, footers and quoted replies removed;
# `clean_doc` holds the cleaned, space-joined tokens of each post.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data
news_df = pd.DataFrame({'document': documents}).assign(
    clean_doc=lambda frame: frame['document'].apply(preprocess_text, token=False)
)
news_df.head()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure story seem biased disagree statement...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize principle strongest points wo...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss proposal much ...
4,"Well, I will have to change the scoring on my ...",well change scoring playoff pool unfortunately...


In [None]:
# TF-IDF features for LSA, built from the cleaned posts with scikit-learn's
# built-in English stop words; max_df=0.7 caps the document frequency of kept terms.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, smooth_idf=True)
X = vectorizer.fit_transform(news_df['clean_doc'])
# (number of documents, number of vocabulary terms)
X.shape

(11314, 64741)

In [None]:
# Latent Semantic Analysis: truncated SVD of the TF-IDF matrix X, keeping 20 components.
# fit_transform returns one row of component weights per document.

lsa_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)
lsa_topic_matrix = lsa_model.fit_transform(X)
# one row of components_ per component
len(lsa_model.components_)

20

In [None]:
# lsa_keys: index of the largest component weight of every document.
# lsa_dicts: number of documents per component.
lsa_keys = lsa_topic_matrix.argmax(axis=1).tolist()
lsa_dicts = dict(Counter(lsa_keys))
lsa_dicts

{0: 7457,
 1: 323,
 2: 267,
 3: 788,
 4: 272,
 5: 110,
 6: 407,
 7: 250,
 8: 7,
 9: 190,
 10: 124,
 11: 202,
 12: 210,
 13: 69,
 14: 128,
 15: 91,
 16: 98,
 17: 40,
 18: 192,
 19: 89}

In [None]:
# lsa_topic_matrix: one row per document, one column per SVD component (20 were requested);
# lsa_model.components_: one row per component, one column per vocabulary term.
print(lsa_topic_matrix.shape)
print(lsa_model.components_.shape)

(11314, 20)
(20, 64741)


In [None]:
top_words_dict = {}
# Build the vocabulary once instead of once per word
# (get_feature_names() no longer exists in scikit-learn >= 1.2).
lsa_vocab = vectorizer.get_feature_names_out()
# The user has to decide on a name for each topic from its top words.
for index, topic in enumerate(lsa_model.components_):
    top_words = [lsa_vocab[i] for i in topic.argsort()[-10:]]
    print(top_words)
    # Keep the words per component, as the LDA and NMF sections do.
    top_words_dict[index] = ' '.join(top_words)

['want', 'need', 'windows', 'thanks', 'time', 'good', 'think', 'people', 'like', 'know']
['files', 'mail', 'disk', 'video', 'scsi', 'file', 'card', 'drive', 'thanks', 'windows']
['cable', 'power', 'card', 'floppy', 'disk', 'controller', 'hard', 'drives', 'scsi', 'drive']
['league', 'play', 'hockey', 'thanks', 'season', 'players', 'year', 'games', 'team', 'game']
['number', 'algorithm', 'phone', 'data', 'escrow', 'keys', 'government', 'clipper', 'encryption', 'chip']
['soon', 'banks', 'gordon', 'surrender', 'skepticism', 'intellect', 'shameful', 'chastity', 'cadre', 'pitt']
['post', 'information', 'looking', 'address', 'email', 'info', 'advance', 'know', 'mail', 'thanks']
['modem', 'sale', 'price', 'driver', 'chip', 'drivers', 'cards', 'monitor', 'video', 'card']
['team', 'government', 'clipper', 'game', 'encryption', 'scsi', 'chip', 'windows', 'thanks', 'know']
['state', 'arab', 'government', 'jews', 'people', 'turkish', 'israeli', 'armenians', 'armenian', 'israel']
['really', 'right',

# 3) Non-negative Matrix Factorization

Use NMF (highest coefficients) instead of LDA (highest probabilities).

In [None]:
# Reload the NPR articles from scratch and replace each one with its cleaned, space-joined tokens.
npr = pd.read_csv(
    'https://raw.githubusercontent.com/ChanCheeKean/datasets/main/nlp/npr.csv'
)
npr['Article'] = npr['Article'].apply(preprocess_text, token=False)
npr.head()

Unnamed: 0,Article
0,washington even policy bipartisan politics can...
1,donald trump used twitter preferred means comm...
2,donald trump unabashedly praising russian pres...
3,updated russian president vladimir putin says ...
4,photography illustration video data visualizat...


In [None]:
# TF-IDF document-term matrix of the cleaned articles, followed by a 10-component NMF.
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = tfidf.fit_transform(npr['Article'])
nmf_model = NMF(n_components=10, random_state=42)
# fit_transform returns one row of component coefficients per document.
nmf_topic_matrix = nmf_model.fit_transform(dtm)

In [None]:
# Vocabulary size, topic-term matrix shape and document-topic matrix shape.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print(len(tfidf.get_feature_names_out()))
print(nmf_model.components_.shape)
print(nmf_topic_matrix.shape)

51757
(10, 51757)
(11992, 10)


In [None]:
# nmf_keys: index of the largest coefficient of every document.
# nmf_dicts: number of documents per topic.
nmf_keys = nmf_topic_matrix.argmax(axis=1).tolist()
nmf_dicts = dict(Counter(nmf_keys))
nmf_dicts

{0: 2567,
 1: 1171,
 2: 654,
 3: 573,
 4: 1402,
 5: 3116,
 6: 667,
 7: 355,
 8: 679,
 9: 808}

In [None]:
top_words_dict = {}
# Build the vocabulary once instead of once per word
# (get_feature_names() no longer exists in scikit-learn >= 1.2).
tfidf_vocab = tfidf.get_feature_names_out()
# The user has to decide on a name for each topic from its top words.
for index, topic in enumerate(nmf_model.components_):
    top_words = [tfidf_vocab[i] for i in topic.argsort()[-10:]]
    print(top_words)
    top_words_dict[index] = ' '.join(top_words)

['study', 'company', 'percent', 'china', 'years', 'like', 'water', 'food', 'people', 'says']
['presidential', 'obama', 'house', 'white', 'republican', 'campaign', 'donald', 'said', 'president', 'trump']
['republicans', 'plan', 'people', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']
['vote', 'party', 'state', 'delegates', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']
['forces', 'people', 'killed', 'reports', 'city', 'isis', 'attack', 'officers', 'said', 'police']
['women', 'life', 'people', 'album', 'know', 'really', 'song', 'think', 'like', 'music']
['parents', 'children', 'college', 'kids', 'teachers', 'student', 'education', 'schools', 'school', 'students']
['pregnant', 'microcephaly', 'cases', 'health', 'mosquitoes', 'disease', 'mosquito', 'women', 'virus', 'zika']
['putin', 'said', 'investigation', 'committee', 'president', 'flynn', 'intelligence', 'russian', 'comey', 'russia']
['case', 'order', 'president', 'state', 'feder

In [None]:
# Project the articles onto the fitted NMF components and label each article
# with the index of its largest coefficient.
# NOTE(review): nmf_topic_matrix (returned by fit_transform on the same dtm) is what
# nmf_dicts was counted from; confirm this second transform is intended, since the
# labels and the counts then come from two separate computations.
topic_results = nmf_model.transform(dtm)
npr['Topic'] = topic_results.argmax(axis=1)
npr.head(10)

Unnamed: 0,Article,Topic
0,washington even policy bipartisan politics can...,8
1,donald trump used twitter preferred means comm...,1
2,donald trump unabashedly praising russian pres...,8
3,updated russian president vladimir putin says ...,8
4,photography illustration video data visualizat...,6
5,want join yoga class hated beatific instructor...,5
6,publicly supported debunked claim vaccines cau...,0
7,standing airport exit debating whether snack y...,0
8,movies trying realistic perhaps summon batman ...,0
9,eighteen years year david fisher visited farm ...,0


In [None]:
# Bar chart of how many documents were assigned to each NMF topic.
# The plotted values are document counts, so the y axis is labelled accordingly
# (assumes `ytitle` is the y-axis title of the project-local helper).
phelper.get_bar(
    list(nmf_dicts.keys()),
    list(nmf_dicts.values()),
    ytitle='Number of Documents',
    title='NMF topic counts')

# 4) BERTopic

[Reference](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6)

[Acceleration](https://medium.com/rapids-ai/accelerating-topic-modeling-with-rapids-and-bert-models-be9909eeed2)

[Github](https://github.com/MaartenGr/BERTopic)

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
import hdbscan
import umap
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

### Embeddings
# Only the first 200 test posts are embedded (presumably to keep the run short).
data = fetch_20newsgroups(subset='test')['data']
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = model.encode(data[:200], show_progress_bar=True)

### Dimensionality reduction and clustering
# UMAP is stochastic: without a fixed random_state the reduced embeddings, and
# therefore the HDBSCAN clusters and every topic below, change on each run.
umap_embeddings = umap.UMAP(n_neighbors=15,
                            n_components=5,
                            metric='cosine',
                            random_state=42).fit_transform(embeddings)

cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                          metric='euclidean',
                          cluster_selection_method='eom').fit(umap_embeddings)

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
### c-tf-idf

def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute class-based TF-IDF scores.

    Args:
        documents: sequence of strings, one per class (all documents of a
            class joined into a single string).
        m: total number of original documents, used as the IDF numerator.
        ngram_range: n-gram range forwarded to CountVectorizer.

    Returns:
        (tf_idf, count): ``tf_idf`` is an array of shape (n_terms, n_classes)
        and ``count`` is the fitted CountVectorizer.
    """
    # vocabulary and raw term counts, shape = (n_classes, n_terms)
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()

    # term frequency within each class, shape = (n_terms, n_classes);
    # a class with no counted terms (e.g. only stop words) gets zeros instead of 0/0 = NaN
    words_per_class = t.sum(axis=1)
    tf = np.divide(
        t.T,
        words_per_class,
        out=np.zeros(t.T.shape, dtype=float),
        where=words_per_class != 0,
    )

    # total frequency of each term across all classes, shape = (n_terms,)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)

    # shape = (n_terms, n_classes)
    tf_idf = np.multiply(tf, idf)
    return tf_idf, count

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` highest-scoring terms of every topic.

    Args:
        tf_idf: array of shape (n_terms, n_classes) as produced by c_tf_idf.
        count: fitted vectorizer providing ``get_feature_names_out()``.
        docs_per_topic: frame with a ``Topic`` column, one row per class in
            the same order as the columns of ``tf_idf``.
        n: number of terms to keep per topic.

    Returns:
        dict mapping each topic label to a list of (term, score) tuples,
        highest score first.
    """
    vocabulary = count.get_feature_names_out()
    scores_by_class = tf_idf.T
    # ascending argsort: the last n columns of each row are its n best terms
    ranked = scores_by_class.argsort()[:, -n:]

    top_n_words = {}
    for row, label in enumerate(list(docs_per_topic.Topic)):
        best_first = ranked[row][::-1]
        top_n_words[label] = [
            (vocabulary[col], scores_by_class[row][col]) for col in best_first
        ]
    return top_n_words

def extract_topic_sizes(df):
    """Count the documents of every topic.

    Args:
        df: frame with ``Topic`` and ``Doc`` columns, one row per document.

    Returns:
        Frame with columns ``Topic`` and ``Size``, largest topic first.
    """
    docs_per_label = df.groupby(['Topic'])['Doc'].count()
    sizes = docs_per_label.reset_index()
    sizes = sizes.rename(columns={"Doc": "Size"})
    return sizes.sort_values("Size", ascending=False)

# Combine all documents of the same cluster into one "class document".
docs_df = pd.DataFrame(data[:200], columns=["Doc"])
docs_df['Topic'] = cluster.labels_
docs_df['Doc_ID'] = range(len(docs_df))
docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

# c-TF-IDF of every term in every class, shape = (n_terms, n_classes).
# m is the number of documents that were actually clustered (the 200 rows of
# docs_df), not the size of the whole test set held in `data`.
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(docs_df))

# Extract the top words of each class.
# Topic -1 refers to all documents that did not have any topic assigned.
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
print(top_n_words[0])

topic_sizes = extract_topic_sizes(docs_df)
topic_sizes.head(10)

[('edu', 0.06466089766227776), ('lines', 0.0486984982271199), ('subject', 0.048433677479566256), ('organization', 0.0456298712897377), ('university', 0.03711997733662), ('like', 0.027784886020919834), ('thanks', 0.02760292168944958), ('posting', 0.025378051608232605), ('nntp', 0.024339618799646304), ('host', 0.02415934488064391), ('et4000', 0.02193623298312932), ('know', 0.02163521671079672), ('disk', 0.020099952967842585), ('com', 0.0197870235118431), ('10', 0.019688092223412203), ('just', 0.01869961697448718), ('mail', 0.018144810328885062), ('drivers', 0.017947826986196718), ('13', 0.016751167657754258), ('grips', 0.016731763831636427)]


Unnamed: 0,Topic,Size
0,-1,92
2,1,64
1,0,44


In [None]:
### Topic reduction: repeatedly merge the smallest topic into the topic whose
### c-TF-IDF vector is most similar (alternatively, tweak min_cluster_size in hdbscan).

MAX_MERGES = 20  # upper bound on the number of merge steps

for _ in range(MAX_MERGES):
    # Rows of tf_idf.T are in the same order as docs_per_topic, so labels are
    # resolved by position instead of assuming they start at -1.
    topic_labels = docs_per_topic.Topic.tolist()
    # With two classes left, another merge would collapse everything into one.
    if len(topic_labels) <= 2:
        break

    # Pairwise cosine similarity between classes; ignore self-similarity.
    similarities = cosine_similarity(tf_idf.T)
    np.fill_diagonal(similarities, 0)

    # The class with the fewest documents is merged into its nearest class.
    topic_to_merge = extract_topic_sizes(docs_df).iloc[-1].Topic
    merge_row = topic_labels.index(topic_to_merge)
    topic_to_merge_into = topic_labels[int(np.argmax(similarities[merge_row]))]

    # Relabel the affected documents.
    docs_df.loc[docs_df.Topic == topic_to_merge, "Topic"] = topic_to_merge_into
    old_topics = docs_df.sort_values("Topic").Topic.unique()

    # Renumber the remaining labels contiguously; numbering starts at -1 only
    # when the outlier class is actually present.
    first_label = -1 if -1 in old_topics else 0
    map_topics = {old_topic: first_label + index for index, old_topic in enumerate(old_topics)}
    docs_df.Topic = docs_df.Topic.map(map_topics)
    docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

    # Re-calculate the topic words; m is the number of clustered documents.
    m = len(docs_df)
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m)
    top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)

topic_sizes = extract_topic_sizes(docs_df)
topic_sizes.head(10)

{-1: -1, 1: 0}


In [None]:
### use standard package
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

# 20 Newsgroups test posts with headers, footers and quoted replies removed.
docs = fetch_20newsgroups(subset='test',  remove=('headers', 'footers', 'quotes'))['data']
# BERTopic configured with the "all-MiniLM-L6-v2" embedding model.
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")
# Fit on the posts; presumably returns one topic label and one probability per document (confirm in the BERTopic docs).
topics, probs = topic_model.fit_transform(docs)
# Summary table of the fitted topics.
topic_model.get_topic_info()

# 5) Top2Vec
[Github](https://github.com/ddangelov/Top2Vec)