# Topic Modelling: LDA and BERTopic

In [None]:
!pip install bertopic

## Importing Libraries

In [None]:
import pandas as pd
import nltk
import spacy
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt

from bertopic import BERTopic

In [None]:
# Load spaCy's small English pipeline; `nlp` is used further down to
# lemmatize the headline tokens.
nlp = spacy.load("en_core_web_sm")

## Loading Data

In [None]:
# Read the ABC news headline sample from the Kaggle input directory and
# preview the first five rows.
data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/abc-news-sample/abcnews_sample.csv",
)
data.head(n=5)

## Text Preprocessing

In [None]:
# Whitespace-tokenize every headline and store the token lists in a new
# 'splitted_text' column.
data['splitted_text'] = data['headline_text'].apply(
    lambda headline: headline.split()
)

### Removing Stopwords

In [None]:
# Strip stopwords from each raw headline with gensim, then split the
# remaining text on whitespace to get one token list per headline.
processed_texts = [
    cleaned.split()
    for cleaned in map(remove_stopwords, data['headline_text'])
]

### Stemming

In [None]:
# Reduce every stopword-filtered token to its English Snowball stem,
# keeping one list of stems per headline.
ss = SnowballStemmer("english")
stemmed_texts = [list(map(ss.stem, tokens)) for tokens in processed_texts]

### Lemmatization

In [None]:
# Lemmatize the stemmed headlines with spaCy. Each token list is re-joined
# into a string, so spaCy applies its own tokenization before lemmatizing.
# nlp.pipe processes the headlines in batches instead of one call per
# headline, and the parser and NER components are skipped because only
# token.lemma_ is read here.
# NOTE(review): the input is already stemmed; lemmatizing stems may have
# little effect -- confirm that applying both steps is intended.
processed_texts = [
    [token.lemma_ for token in doc]
    for doc in nlp.pipe(
        (" ".join(tokens) for tokens in stemmed_texts),
        disable=["parser", "ner"],
    )
]

# Gensim

In [None]:
# Build the gensim token <-> integer-id mapping from the tokenized headlines.
dictionary = gensim.corpora.Dictionary(documents=processed_texts)

In [None]:
# Prune the vocabulary in place before building the bag-of-words corpus.
dictionary.filter_extremes(
    no_below=15,    # drop tokens that appear in fewer than 15 headlines
    no_above=0.1,   # drop tokens that appear in more than 10% of headlines
    keep_n=100000,  # then keep at most the 100000 most frequent tokens
)

In [None]:
# Convert every tokenized headline into its bag-of-words representation
# using the pruned dictionary.
bow_corpus = list(map(dictionary.doc2bow, processed_texts))

In [None]:
# Train a 3-topic gensim LDA model on the bag-of-words corpus (20 passes).
# random_state is fixed so that repeated runs start from the same
# initialization; 42 matches the seed used for the scikit-learn LDA models
# in this notebook.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=3,
                                       id2word=dictionary,
                                       passes=20,
                                       random_state=42)

In [None]:
# Print every topic of the gensim model. print_topics(-1) yields
# (topic id, weighted-word string) pairs, so the id goes under "Topic" and
# the word string under "Words".
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

# Sklearn

In [None]:
# Join each token list into one space-separated string, the input format
# used by CountVectorizer and BERTopic below. Entries that are already
# strings are kept unchanged so that re-running this cell does not split
# them into single characters.
processed_texts = [
    doc if isinstance(doc, str) else ' '.join(doc)
    for doc in processed_texts
]

In [None]:
# Learn the vocabulary from the joined headline strings and build the
# document-term count matrix in a single pass.
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(raw_documents=processed_texts)

In [None]:
# Fit a 3-topic scikit-learn LDA model on the document-term matrix, seeded
# for reproducibility.
lda = LatentDirichletAllocation(random_state=42, n_components=3)
lda.fit(X=dtm)

In [None]:
# Sweep the number of topics from 2 to 10. For each setting, refit LDA on
# the same document-term matrix and record its perplexity and
# log-likelihood score on that matrix. `lda` ends up bound to the last
# (10-topic) model.
perplexities, log_likelihoods = [], []
for num_topics in range(2, 11):
    lda = LatentDirichletAllocation(
        random_state=42,
        n_components=num_topics,
    ).fit(dtm)

    perplexities += [lda.perplexity(dtm)]
    log_likelihoods += [lda.score(dtm)]

In [None]:
# Plot both model-selection metrics side by side against the topic count.
plt.figure(figsize=(10, 4))

# One entry per panel: (scores, y-axis label, panel title).
panels = (
    (perplexities,
     "Perplexity (Lower is Better)",
     "Perplexity vs Number of Topics"),
    (log_likelihoods,
     "Log-Likelihood (Higher is Better)",
     "Log-Likelihood vs Number of Topics"),
)
for position, (scores, y_label, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(range(2, 11), scores, marker='o')
    plt.xlabel("Number of Topics")
    plt.ylabel(y_label)
    plt.title(heading)

plt.tight_layout()
plt.show()

In [None]:
!pip install bertopic

In [None]:
# Fit BERTopic on the joined headline strings and keep the topic assigned
# to each headline; the second return value is discarded.
# NOTE(review): processed_texts holds stemmed/lemmatized text at this
# point. Sentence-embedding models are usually fed raw text -- confirm
# whether data['headline_text'] should be used here instead.
model = BERTopic(
    embedding_model='paraphrase-MiniLM-L3-v2',
    min_topic_size=7,
    verbose=True,
)
headline_topics, _ = model.fit_transform(documents=processed_texts)

In [None]:
# Topic summary table from the fitted BERTopic model; its 'Topic' column is
# read in the cells below.
freq = model.get_topic_info()

In [None]:
# Number of distinct topic ids in the summary table.
len(set(freq['Topic']))

In [None]:
# Show the word representation of the topic listed in the second row of
# the summary table.
# NOTE(review): presumably row 0 is BERTopic's outlier topic (-1), which
# is why row 1 is inspected -- confirm.
model.get_topic(freq['Topic'].iloc[1])

In [None]:
# Bar charts of the top words, limited to six topics.
model.visualize_barchart(top_n_topics=6)

In [None]:
# Render BERTopic's built-in topic overview visualization.
model.visualize_topics()