In [None]:
!pip install pyLDAvis umap unidecode
!pip install sentence-transformers umap-learn hdbscan bertopic pyyaml==5.4.1

In [None]:
import sys
import umap, re, string
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lib.plot_helper as phelper
import unidecode
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from tqdm import tqdm
import spacy, nltk
from nltk.corpus import stopwords

nlp = spacy.load('en_core_web_sm')
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1) Latent Dirichlet Allocation

1. p(topic t | document d): the proportion of words in document d that are assigned to topic t. Tries to capture how many words belong to the topic t for a given document d. Excluding the current word. If a lot of words from d belongs to t, it is more probable that word w belongs to t.
( #words in d with t +alpha/ #words in d with any topic+ k*alpha)

2. p(word w| topic t): the proportion of assignments to topic t over all documents that come from this word w. Tries to capture how many documents are in topic t because of word w.
LDA represents documents as a mixture of topics. Similarly, a topic is a mixture of words. If a word has high probability of being in a topic, all the documents having w will be more strongly associated with t as well. Similarly, if w is not very probable to be in t, the documents which contain the w will be having very low probability of being in t, because rest of the words in d will belong to some other topic and hence d will have a higher probability for those topic. So even if w gets added to t, it won’t be bringing many such documents to t.


[Beginner’s Guide to LDA](https://towardsdatascience.com/latent-dirichlet-allocation-lda-9d1cd064ffa2) | [Scratch](https://www.depends-on-the-definition.com/lda-from-scratch/)

## 1.1 Python Implementation

In [13]:
data, _ = fetch_20newsgroups(
    shuffle=True, random_state=1,
    remove=('headers', 'footers', 'quotes'),
    return_X_y=True,
)
data_samples = data[:10000]
tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=2, max_features=20000, stop_words='english'
)
tf = tf_vectorizer.fit_transform(data_samples)
vocabulary = tf_vectorizer.vocabulary_
print('Data Size', tf.shape)
print(len(vocabulary))

Data Size (10000, 20000)
20000


In [21]:
docs = []
for row in tf.toarray():
    # [2020,  2859,  2920,  2984,  3369, ...]
    present_words = np.where(row != 0)[0].tolist()
    present_words_with_count = []
    for word_idx in present_words:
        for count in range(row[word_idx]):
            present_words_with_count.append(word_idx)
    docs.append(present_words_with_count)

print(docs[0])

[2020, 2859, 2920, 2984, 3369, 3487, 4543, 4775, 5655, 5881, 6009, 6141, 6226, 7225, 7227, 7345, 8515, 8525, 8687, 8887, 9144, 9417, 9722, 10074, 10074, 10077, 10077, 10191, 10856, 11010, 11085, 11338, 11622, 11622, 11622, 11622, 12807, 13969, 14177, 14670, 14836, 14846, 14871, 15177, 15183, 15209, 15664, 16216, 16650, 16979, 17091, 17227, 17377, 17854, 18202, 18280, 18530, 19511]


In [22]:
D = len(docs)
V = len(vocabulary)
T = 10
# doc_topic_prior, Higher the alpha, higher topic per documents
alpha = 1 / T
# topic_word_prior, higher beta, higher number of words per topic
beta = 1 / T

z_d_n = [[0 for _ in range(len(d))] for d in docs]
theta_d_z = np.zeros((D, T))
phi_z_w = np.zeros((T, V))
n_d = np.zeros((D))
n_z = np.zeros((T))

## Initialize the parameters
for d, doc in enumerate(docs):
    for n, w in enumerate(doc):
        # assign a topic randomly to words
        z_d_n[d][n] = n % T

        # get the topic for word n in document m
        z = z_d_n[d][n]

        # keep track of our counts
        theta_d_z[d][z] += 1
        phi_z_w[z, w] += 1
        n_z[z] += 1
        n_d[d] += 1

for iteration in tqdm(range(10)):
    for d, doc in enumerate(docs):
        for n, w in enumerate(doc):

            # get the topic for word n in document m
            z = z_d_n[d][n]

            # decrement counts for word w with associated topic z
            theta_d_z[d][z] -= 1
            phi_z_w[z, w] -= 1
            n_z[z] -= 1

            # sample new topic from a multinomial according to our formula
            p_d_t = (theta_d_z[d] + alpha) / (n_d[d] - 1 + T * alpha)
            p_t_w = (phi_z_w[:, w] + beta) / (n_z + V * beta)
            p_z = p_d_t * p_t_w
            p_z /= np.sum(p_z)
            new_z = np.random.multinomial(1, p_z).argmax()

            # set z as the new topic and increment counts
            z_d_n[d][n] = new_z
            theta_d_z[d][new_z] += 1
            phi_z_w[new_z, w] += 1
            n_z[new_z] += 1

## 1.2 SKLearn Library

In [None]:
def text_replace_series(series):
    series = series.str.replace("(<br/>)", "")
    series = series.str.replace('(<a).*(>).*(</a>)', '')
    series = series.str.replace('(&amp)', '')
    series = series.str.replace('(&gt)', '')
    series = series.str.replace('(&lt)', '')
    series = series.str.replace('(\xa0)', ' ')
    return series

def preprocess_text(text, token=True):
    text = re.sub('[^a-zA-Z#]', ' ', text)
    text = unidecode.unidecode(text)
    text = [t.lower() for t in text.split() if len(t)>3]
    text = [t for t in text if t not in stop_words]
    # sent = nlp(' '.join(text)
    # text = [t.lemma_ for t in sent]

    # Join the characters again to form the string.
    if token == False:
        text = " ".join(text)
    return text

In [None]:
npr = pd.read_csv('https://raw.githubusercontent.com/ChanCheeKean/datasets/main/nlp/npr.csv')
npr['Article'] = npr['Article'].apply(lambda x: preprocess_text(x, token=False))
npr.head()

Unnamed: 0,Article
0,washington even policy bipartisan politics can...
1,donald trump used twitter preferred means comm...
2,donald trump unabashedly praising russian pres...
3,updated russian president vladimir putin says ...
4,photography illustration video data visualizat...


In [None]:
# add new stopwords
# max_df ignore terms that show up too frequent, min_df ignore terms too rare
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words=nlp.Defaults.stop_words)
dtm = cv.fit_transform(npr['Article'])

# n_components: number of topics
# doc_topic_prior, Higher the alpha, documents are composed of more topics
# topic_word_prior, Higher the beta, topics are composed of a large number of words
LDA = LatentDirichletAllocation(n_components=10, random_state=42, max_iter=20, n_jobs=-1)
# LDA.fit(dtm)
lda_topic_matrix = LDA.fit_transform(dtm)


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ll', 've'] not in stop_words.



In [None]:
# lda_topic_matrix comprise of 6 probabilties (of 10 topics) for 22641 documents
# lda_keys contains the topic each document belong to
# lda_dict stores the total number of documents of each topics
lda_keys = lda_topic_matrix.argmax(axis=1).tolist()
lda_dicts = {a : b for a, b in Counter(lda_keys).items()}
lda_dicts

{0: 2816,
 1: 757,
 2: 925,
 3: 939,
 4: 1018,
 5: 1207,
 6: 1051,
 7: 331,
 8: 1604,
 9: 1344}

In [None]:
# get the number of vocab
print(len(cv.get_feature_names()))
print(cv.get_feature_names()[6000])

51763
bumbling


In [None]:
# 10 topic, with assigned probabilities to each vocab
print(lda_topic_matrix.shape)
print(LDA.components_.shape)

(11992, 10)
(10, 51763)


In [None]:
single_topic = LDA.components_[0]
# returns the indices of that Top 5 words for this topic with highest probabilities.
[cv.get_feature_names()[index] for index in single_topic.argsort()[-10:]]

['going',
 'world',
 'years',
 'life',
 'know',
 'think',
 'time',
 'people',
 'says',
 'like']

In [None]:
top_words_dict = {}
# users got to dedice the topic name
for index, topic in enumerate(LDA.components_):
    top_words = [cv.get_feature_names()[i] for i in topic.argsort()[-10:]]
    print(top_words)
    top_words_dict[index] = ' '.join(top_words)

['going', 'world', 'years', 'life', 'know', 'think', 'time', 'people', 'says', 'like']
['year', 'national', 'farmers', 'land', 'climate', 'years', 'like', 'water', 'food', 'says']
['political', 'campaign', 'country', 'women', 'party', 'people', 'president', 'trump', 'clinton', 'said']
['state', 'told', 'justice', 'case', 'court', 'says', 'black', 'people', 'police', 'said']
['money', 'court', 'million', 'companies', 'said', 'state', 'government', 'federal', 'company', 'says']
['russia', 'news', 'administration', 'campaign', 'white', 'obama', 'house', 'president', 'said', 'trump']
['parents', 'album', 'education', 'song', 'schools', 'says', 'like', 'music', 'students', 'school']
['election', 'sanders', 'cruz', 'states', 'vote', 'clinton', 'voters', 'state', 'percent', 'trump']
['patients', 'medical', 'research', 'like', 'study', 'percent', 'care', 'people', 'health', 'says']
['told', 'attack', 'reports', 'country', 'government', 'military', 'city', 'people', 'said', 'says']


In [None]:
npr['Topic'] = lda_topic_matrix.argmax(axis=1)
npr.head()

Unnamed: 0,Article,Topic
0,washington even policy bipartisan politics can...,5
1,donald trump used twitter preferred means comm...,5
2,donald trump unabashedly praising russian pres...,5
3,updated russian president vladimir putin says ...,5
4,photography illustration video data visualizat...,4


In [None]:
phelper.get_bar(
    list(lda_dicts.keys()),
    list(lda_dicts.values()),
    ytitle='Number of Topics',
    title='LDA topic counts')

# 2) Latent Semantic Analysis

In [None]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
news_df = pd.DataFrame({'document':documents})
news_df['clean_doc'] = news_df['document'].apply(lambda x: preprocess_text(x, token=False))
news_df.head()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure story seem biased disagree statement...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize principle strongest points wo...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss proposal much ...
4,"Well, I will have to change the scoring on my ...",well change scoring playoff pool unfortunately...


In [None]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, smooth_idf=True)
X = vectorizer.fit_transform(news_df['clean_doc'])
X.shape

(11314, 64741)

In [None]:
lsa_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)
lsa_topic_matrix = lsa_model.fit_transform(X)
len(lsa_model.components_)

20

In [None]:
lsa_keys = lsa_topic_matrix.argmax(axis=1).tolist()
lsa_dicts = {a : b for a, b in Counter(lsa_keys).items()}
lsa_dicts

{0: 7457,
 1: 323,
 2: 267,
 3: 788,
 4: 272,
 5: 110,
 6: 407,
 7: 250,
 8: 7,
 9: 190,
 10: 124,
 11: 202,
 12: 210,
 13: 69,
 14: 128,
 15: 91,
 16: 98,
 17: 40,
 18: 192,
 19: 89}

In [None]:
# 10 topic, with assigned probabilities to each vocab
print(lsa_topic_matrix.shape)
print(lsa_model.components_.shape)

(11314, 20)
(20, 64741)


In [None]:
top_words_dict = {}
# users got to dedice the topic name
for index, topic in enumerate(lsa_model.components_):
    top_words = [vectorizer.get_feature_names()[i] for i in topic.argsort()[-10:]]
    print(top_words)

['want', 'need', 'windows', 'thanks', 'time', 'good', 'think', 'people', 'like', 'know']
['files', 'mail', 'disk', 'video', 'scsi', 'file', 'card', 'drive', 'thanks', 'windows']
['cable', 'power', 'card', 'floppy', 'disk', 'controller', 'hard', 'drives', 'scsi', 'drive']
['league', 'play', 'hockey', 'thanks', 'season', 'players', 'year', 'games', 'team', 'game']
['number', 'algorithm', 'phone', 'data', 'escrow', 'keys', 'government', 'clipper', 'encryption', 'chip']
['soon', 'banks', 'gordon', 'surrender', 'skepticism', 'intellect', 'shameful', 'chastity', 'cadre', 'pitt']
['post', 'information', 'looking', 'address', 'email', 'info', 'advance', 'know', 'mail', 'thanks']
['modem', 'sale', 'price', 'driver', 'chip', 'drivers', 'cards', 'monitor', 'video', 'card']
['team', 'government', 'clipper', 'game', 'encryption', 'scsi', 'chip', 'windows', 'thanks', 'know']
['state', 'arab', 'government', 'jews', 'people', 'turkish', 'israeli', 'armenians', 'armenian', 'israel']
['really', 'right',

# 3) Non-negative Matrix Factorization

Use NMF(highest coefficients) instead of LDA (highest probabilites).

In [None]:
npr = pd.read_csv('https://raw.githubusercontent.com/ChanCheeKean/datasets/main/nlp/npr.csv')
npr['Article'] = npr['Article'].apply(lambda x: preprocess_text(x, token=False))
npr.head()

Unnamed: 0,Article
0,washington even policy bipartisan politics can...
1,donald trump used twitter preferred means comm...
2,donald trump unabashedly praising russian pres...
3,updated russian president vladimir putin says ...
4,photography illustration video data visualizat...


In [None]:
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = tfidf.fit_transform(npr['Article'])
nmf_model = NMF(n_components=10, random_state=42)
nmf_topic_matrix = nmf_model.fit_transform(dtm)

In [None]:
print(len(tfidf.get_feature_names()))
print(nmf_model.components_.shape)
print(nmf_topic_matrix.shape)

51757
(10, 51757)
(11992, 10)


In [None]:
nmf_keys = nmf_topic_matrix.argmax(axis=1).tolist()
nmf_dicts = {a : b for a, b in Counter(nmf_keys).items()}
nmf_dicts

{0: 2567,
 1: 1171,
 2: 654,
 3: 573,
 4: 1402,
 5: 3116,
 6: 667,
 7: 355,
 8: 679,
 9: 808}

In [None]:
top_words_dict = {}
# users got to dedice the topic name
for index, topic in enumerate(nmf_model.components_):
    top_words = [tfidf.get_feature_names()[i] for i in topic.argsort()[-10:]]
    print(top_words)
    top_words_dict[index] = ' '.join(top_words)

['study', 'company', 'percent', 'china', 'years', 'like', 'water', 'food', 'people', 'says']
['presidential', 'obama', 'house', 'white', 'republican', 'campaign', 'donald', 'said', 'president', 'trump']
['republicans', 'plan', 'people', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']
['vote', 'party', 'state', 'delegates', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']
['forces', 'people', 'killed', 'reports', 'city', 'isis', 'attack', 'officers', 'said', 'police']
['women', 'life', 'people', 'album', 'know', 'really', 'song', 'think', 'like', 'music']
['parents', 'children', 'college', 'kids', 'teachers', 'student', 'education', 'schools', 'school', 'students']
['pregnant', 'microcephaly', 'cases', 'health', 'mosquitoes', 'disease', 'mosquito', 'women', 'virus', 'zika']
['putin', 'said', 'investigation', 'committee', 'president', 'flynn', 'intelligence', 'russian', 'comey', 'russia']
['case', 'order', 'president', 'state', 'feder

In [None]:
topic_results = nmf_model.transform(dtm)
npr['Topic'] = topic_results.argmax(axis=1)
npr.head(10)

Unnamed: 0,Article,Topic
0,washington even policy bipartisan politics can...,8
1,donald trump used twitter preferred means comm...,1
2,donald trump unabashedly praising russian pres...,8
3,updated russian president vladimir putin says ...,8
4,photography illustration video data visualizat...,6
5,want join yoga class hated beatific instructor...,5
6,publicly supported debunked claim vaccines cau...,0
7,standing airport exit debating whether snack y...,0
8,movies trying realistic perhaps summon batman ...,0
9,eighteen years year david fisher visited farm ...,0


In [None]:
phelper.get_bar(
    list(nmf_dicts.keys()),
    list(nmf_dicts.values()),
    ytitle='Number of Topics',
    title='NMF topic counts')

# 4) BerTopic

[Topic Modeling with BERT](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6) | [Accelerating Topic modeling with RAPIDS and BERT models](https://medium.com/rapids-ai/accelerating-topic-modeling-with-rapids-and-bert-models-be9909eeed2) | [BerTopic Github](https://github.com/MaartenGr/BERTopic)

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
import hdbscan
import umap
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

### Embeddings
data = fetch_20newsgroups(subset='test')['data']
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = model.encode(data[:200], show_progress_bar=True)

### Dimension Redution and Clustering
umap_embeddings = umap.UMAP(
    n_neighbors=15, n_components=5, metric='cosine').fit_transform(embeddings)

cluster = hdbscan.HDBSCAN(
    min_cluster_size=15, metric='euclidean', cluster_selection_method='eom').fit(umap_embeddings)

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
### c-tf-idf
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    # count of words in corpus
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)

    # shape = (n_class, 8553)
    t = count.transform(documents).toarray()
    # word frequency divide by sum of word in each documnts
    tf = np.divide(t.T, t.sum(axis=1))

    # sum of document count for each word, shape =(8353,)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    # shape = (8353, n_class)
    tf_idf = np.multiply(tf, idf)
    return tf_idf, count

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    # length = 8553
    words = count.get_feature_names_out()
    # length = n_class
    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T
    # shape = (n_class, n)
    indices = tf_idf_transposed.argsort()[:, -n:]

    top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
    return top_n_words

def extract_topic_sizes(df):
    topic_sizes = (
        df.groupby(['Topic'])
        .Doc
        .count()
        .reset_index()
        .rename({"Topic": "Topic", "Doc": "Size"}, axis='columns')
        .sort_values("Size", ascending=False))
    return topic_sizes

# combining all documents in same category as one
docs_df = pd.DataFrame(data[:200], columns=["Doc"])
docs_df['Topic'] = cluster.labels_
docs_df['Doc_ID'] = range(len(docs_df))
docs_per_topic = docs_df.groupby(['Topic'], as_index=False).agg({'Doc': ' '.join})

# find idf of each word in each document (8353, 3)
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(data))

# extract top words in each class
# topic name-1 refers to all documents that did not have any topics assigned.
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
print(top_n_words[0])

topic_sizes = extract_topic_sizes(docs_df)
topic_sizes.head(10)

[('edu', 0.06466089766227776), ('lines', 0.0486984982271199), ('subject', 0.048433677479566256), ('organization', 0.0456298712897377), ('university', 0.03711997733662), ('like', 0.027784886020919834), ('thanks', 0.02760292168944958), ('posting', 0.025378051608232605), ('nntp', 0.024339618799646304), ('host', 0.02415934488064391), ('et4000', 0.02193623298312932), ('know', 0.02163521671079672), ('disk', 0.020099952967842585), ('com', 0.0197870235118431), ('10', 0.019688092223412203), ('just', 0.01869961697448718), ('mail', 0.018144810328885062), ('drivers', 0.017947826986196718), ('13', 0.016751167657754258), ('grips', 0.016731763831636427)]


Unnamed: 0,Topic,Size
0,-1,92
2,1,64
1,0,44


In [None]:
### topic reduction, merging the topic vectors that were most similar to each other
# or tweak min_cluster_size in hdbscan

for i in range(20):
    # Calculate cosine similarity
    similarities = cosine_similarity(tf_idf.T)
    np.fill_diagonal(similarities, 0)

    # Extract label to merge into and from where
    # merge the topic with least doc
    topic_sizes = docs_df.groupby(['Topic']).count().sort_values("Doc", ascending=False).reset_index()
    topic_to_merge = topic_sizes.iloc[-1].Topic
    # the group index start from -1, thus + 1 in indexing
    # argmax to find the nearest class
    topic_to_merge_into = np.argmax(similarities[topic_to_merge + 1]) - 1

    # Adjust topics
    # change the topic name for affected document
    docs_df.loc[docs_df.Topic == topic_to_merge, "Topic"] = topic_to_merge_into
    old_topics = docs_df.sort_values("Topic").Topic.unique()

    # number of classes decreases, 10 --> 9
    map_topics = {old_topic: index - 1 for index, old_topic in enumerate(old_topics)}
    docs_df.Topic = docs_df.Topic.map(map_topics)
    docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

    # Re-Calculate new topic words
    m = len(data)
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m)
    top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)

topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)

{-1: -1, 1: 0}


In [None]:
### use standard package
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset='test',  remove=('headers', 'footers', 'quotes'))['data']
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()

# 5) BerTopic with LLama

In [None]:
!pip install bertopic datasets accelerate bitsandbytes xformers adjustText

In [None]:
from datasets import load_dataset
from torch import bfloat16
import transformers
from transformers import (
    LlamaTokenizer,
    LlamaForCausalLM,
    BitsAndBytesConfig,
)
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from bertopic.representation import (
    KeyBERTInspired, MaximalMarginalRelevance, TextGeneration)
from bertopic import BERTopic

In [None]:
dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"]
abstracts = dataset["abstract"]
titles = dataset["title"]
print(abstracts[13894])

Downloading readme:   0%|          | 0.00/986 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks in an encoder-decoder configuration. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer, based
solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to be
superior in quality while being more parallelizable and requiring significantly
less time to train. Our model achieves 28.4 BLEU on the WMT 2014
English-to-German translation task, improving over the existing best results,
including ensembles by over 2 BLEU. On the WMT 2014 English-to-French
translation task, our model establishes a new single-model state-of-the-art
BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction
of the training costs of the best models from the literature. We show that the
Transfor

In [None]:
model_id = 'daryl149/Llama-2-7b-hf'
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

tokenizer = LlamaTokenizer.from_pretrained(model_id)
model = LlamaForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
model.eval()

generator = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task='text-generation',
    temperature=0.0,
    max_new_tokens=500,
    repetition_penalty=1.1
)

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/745 [00:00<?, ?B/s]

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Downloading (…)lve/main/config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
prompt = "Could you explain to me how 4-bit quantization works as if I am 5?"
res = generator(prompt)
print(res[0]["generated_text"])

There are two BERTopic-specific tags that are of interest, namely [DOCUMENTS] and [KEYWORDS]:

* **[DOCUMENTS]** contain the top 5 most relevant documents to the topic
* **[KEYWORDS]** contain the top 10 most relevant keywords to the topic as generated through c-TF-IDF

This template will be filled according to each topic. And finally, we can combine this into our final prompt:



In [None]:
# System prompt describes information given to all conversations
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""

# Example prompt demonstrating the output we are looking for
example_prompt = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST] Environmental impacts of eating meat
"""

# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST]
"""

prompt = system_prompt + example_prompt + main_prompt

In [None]:
# Pre-calculate embeddings
embedding_model = SentenceTransformer("BAAI/bge-small-en")
embeddings = embedding_model.encode(abstracts, show_progress_bar=True)

umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42
)

hdbscan_model = HDBSCAN(
    min_cluster_size=150,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)

# Pre-reduce embeddings for visualization purposes
reduced_embeddings = umap_model.fit_transform(embeddings)

In [None]:
keybert = KeyBERTInspired()
mmr = MaximalMarginalRelevance(diversity=0.3)
llama2 = TextGeneration(generator, prompt=prompt)

# All representation models
representation_model = {
    "KeyBERT": keybert,
    "Llama2": llama2,
    "MMR": mmr,
}

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    top_n_words=10,
    verbose=True
)

# Train model
topics, probs = topic_model.fit_transform(abstracts, embeddings)

In [None]:
topic_model.get_topic_info()[1:4]

# 6) Topic Modeling with MMR and OpenAI

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
from hdbscan import HDBSCAN
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import openai
import os
openai.api_key = ''

In [None]:
ag_news_dataset = load_dataset("ag_news", split='test')
df_ag = pd.DataFrame(ag_news_dataset)
df_ag.sample(5)

In [None]:
# embeddings #
embed_model = SentenceTransformer("BAAI/bge-small-en")
text_embeddings = embed_model.encode(df_ag['text'].tolist(), show_progress_bar=True)
df_ag['embeddings'] = list(text_embeddings)

# dimension reduction #
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine')
reduced_text_embeddings = umap_model.fit_transform(text_embeddings)

# clustering
hdbscan_model = HDBSCAN(
    min_cluster_size=40,
    metric='euclidean',
    min_samples=5,
    prediction_data=False)

text_cluster = hdbscan_model.fit(reduced_text_embeddings)
df_ag['cluster'] = text_cluster.labels_
print(set(text_cluster.labels_))

In [None]:
top_n = 10
diversity = 0.5
cluster_dict = {}

for cluster, df in df_ag.groupby('cluster'):
    if cluster == -1:
        continue

    # find the most representative documents
    candidate_d = cosine_similarity(df['embeddings'].tolist(), df['embeddings'].tolist())
    candidate_d_sum = candidate_d.sum(axis=1)
    doc_list = [np.argmax(candidate_d.sum(axis=1))]
    candidates_idx = [i for i in range(len(df)) if i != doc_list[0]]

    # filter based on maximal marginal relevance
    for _ in range(top_n - 1):
        candidate_similarities = candidate_d.sum(axis=1)[candidates_idx]
        target_similarities = np.max(candidate_d[candidates_idx][:, doc_list], axis=1)

        # Calculate MMR
        mmr = (1 - diversity) * candidate_similarities - diversity * target_similarities
        # Update keywords & candidates
        mmr_idx = candidates_idx[np.argmax(mmr)]
        doc_list.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    cluster_dict[cluster] = {'doc': [df['text'].tolist()[idx] for idx in doc_list]}

In [None]:
def get_reponse(prompt, temperature=0):
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
        max_tokens=100)
    return response.choices[0].message["content"]

for i in cluster_dict.keys():

    # prompt
    documents = "\n".join([f"{c + 1}.{text}" for c, text in enumerate(cluster_dict[i]['doc'])])
    delimiter = "####"
    prompt = f"""\
    You will be provided with multiple documents that form the same cluster. \
    The documents will be delimited with {delimiter} characters. \
    Your task is to define a topic title that is well representing all the listed documents.\
    Output starts with 'Title: '

    Documents:
    {delimiter}{documents}{delimiter}
    """

    # response from openai
    llm_output = get_reponse(prompt)
    cluster_dict[i]['topic'] = llm_output

In [None]:
idx = 9
print(cluster_dict[idx]['topic'])
print("\n")
for i, d in enumerate(cluster_dict[idx]['doc']):
    print(f'{i}. {d} \n')