<a href="https://colab.research.google.com/github/AravindReddy123/Aravind_INFO5731_Spring2023/blob/main/In_class_exercise_04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **The fourth in-class-exercise (40 points in total, 03/28/2022)**

Question description: Please use the text corpus you collected in your last in-class-exercise for this exercise. Perform the following tasks:

## (1) (10 points) Generate K topics by using LDA, the number of topics K should be decided by the coherence score, then summarize what are the topics. You may refer the code here: 

https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [None]:
# Write your code here


import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import nltk
from nltk.corpus import stopwords
import pandas as pd

# Load the dataset. `documents` is a DataFrame; the raw newsgroup posts are
# in its 'content' column.
documents = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')

# Resources needed by the stop-word list and by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

# Preprocess the documents: lowercase, tokenize, keep alphabetic non-stop-words.
# A set makes the per-token membership test O(1) instead of scanning a list.
stop_words = set(stopwords.words('english'))
preprocessed_documents = []
# Iterating a DataFrame directly yields its column *names*, so the text
# column has to be selected explicitly to get one post per iteration.
for document in documents['content']:
    tokens = nltk.word_tokenize(document.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    preprocessed_documents.append(tokens)

# Create a dictionary and corpus
dictionary = corpora.Dictionary(preprocessed_documents)
corpus = [dictionary.doc2bow(document) for document in preprocessed_documents]

# LDA training is stochastic. A fixed seed makes the sweep reproducible and
# guarantees that the final model trained below is the same model whose
# coherence was measured for that K.
LDA_SEED = 42
# Candidate topic counts; best_k is looked up in this sequence so the
# selection cannot drift out of sync with the range being swept.
k_values = range(2, 20)

# Compute coherence scores for different numbers of topics
coherence_scores = []
for k in k_values:
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus, id2word=dictionary, num_topics=k, random_state=LDA_SEED
    )
    coherence_model_lda = CoherenceModel(
        model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_v'
    )
    coherence_score = coherence_model_lda.get_coherence()
    coherence_scores.append(coherence_score)
    print(f"Number of topics: {k}, coherence score: {coherence_score}")

# Select the value of K that maximizes the coherence score
best_k = k_values[coherence_scores.index(max(coherence_scores))]

# Train the final LDA model (same seed as the sweep, see LDA_SEED above)
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus, id2word=dictionary, num_topics=best_k, random_state=LDA_SEED
)

# Print the topics: one line per topic listing its top words
topics = lda_model.show_topics(num_topics=best_k, formatted=False)
for i, topic in enumerate(topics):
    print(f"Topic {i}: {' '.join([word[0] for word in topic[1]])}")



## (2) (10 points) Generate K topics by using LSA, the number of topics K should be decided by the coherence score, then summarize what are the topics. You may refer the code here:

https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python

In [None]:
# Write your code here

import nltk
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel
from nltk.corpus import stopwords

# Load the corpus and preprocess the text
nltk.download('stopwords')
# A set makes the per-word membership test O(1) instead of scanning a list.
stop_words = set(stopwords.words('english'))
documents = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
# Iterating a DataFrame directly yields its column *names*; select the
# 'content' column so that each `document` is one newsgroup post.
texts = [
    [word for word in document.lower().split() if word not in stop_words]
    for document in documents['content']
]

# Build the gensim vocabulary and the bag-of-words view of every document.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Score each candidate topic count (1 through 9) by c_v coherence.
candidate_ks = list(range(1, 10))
coherence_scores = []
for k in candidate_ks:
    lsa_model = models.LsiModel(corpus, num_topics=k, id2word=dictionary)
    coherence_model = CoherenceModel(
        model=lsa_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence_scores.append(coherence_model.get_coherence())

# The first candidate reaching the highest score wins.
best_position = coherence_scores.index(max(coherence_scores))
optimal_k = candidate_ks[best_position]

# Fit LSA once more with the chosen topic count and list its topics.
lsa_model = models.LsiModel(corpus, num_topics=optimal_k, id2word=dictionary)
topics = lsa_model.print_topics(num_topics=optimal_k)

for topic_entry in topics:
    print(topic_entry)


In [None]:
pip install pyLDAvis

## (3) (10 points) Generate K topics by using  lda2vec, the number of topics K should be decided by the coherence score, then summarize what are the topics. You may refer the code here:

https://nbviewer.org/github/cemoody/lda2vec/blob/master/examples/twenty_newsgroups/lda2vec/lda2vec.ipynb

In [12]:
# Write your code here

# Import necessary libraries
import numpy as np
import tensorflow as tf
import os
import collections
import random
import lda2vec
import lda2vec.utils
import pandas as pd
import spacy
from sklearn.datasets import fetch_20newsgroups
# Load the corpus: four categories of the 20-newsgroups dataset.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups = fetch_20newsgroups(subset='all', categories=categories)

# `newsgroups.data` is a plain list of strings. Wrap it in a DataFrame with a
# 'text' column so that the `corpus_df['text']` lookup in the tokenization
# step works (indexing a list with a string raises TypeError).
corpus_df = pd.DataFrame({'text': newsgroups.data})

# Show a short summary instead of printing every document in the dataset.
print(f"Loaded {len(corpus_df)} documents")
print(corpus_df.head())

# Text preprocessing with spaCy; the parser and NER components are switched off.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def tokenize(text):
    """Return the lemmas of `text`, skipping stop words, punctuation and '-PRON-' lemmas."""
    lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        if token.lemma_ == '-PRON-':
            continue
        lemmas.append(token.lemma_)
    return lemmas

# One list of lemmas per document.
corpus = list(map(tokenize, corpus_df['text']))

# Build word <-> id lookups and re-encode every document as an array of ids.
# NOTE(review): confirm that the installed lda2vec package exposes
# `lda2vec.utils.flatten` and what it returns — TODO verify.
vocab = lda2vec.utils.flatten(corpus)
word2id = {word: position for word, position in zip(vocab, range(len(vocab)))}
id2word = dict((position, word) for word, position in word2id.items())
corpus = [
    np.array([word2id[word] for word in doc])
    for doc in corpus
]

# NOTE(review): `lda2vec.LDA2Vec(n_topics, n_iter, random_state)`,
# `.mixture_means`, `.mixture_variances` and `lda2vec.utils.evaluate` are used
# as-is below; confirm they exist in the installed lda2vec — TODO verify.

def _group_top_tokens(fitted_model):
    """Group words by the leading index of each ranked row.

    Sorts every row of `mixture_means . mixture_variances` in descending
    order, uses the first index of the row as the group key, and collects the
    `id2word` entries for the next ten indices under that key.
    """
    means = fitted_model.mixture_means
    variances = fitted_model.mixture_variances
    ranked_rows = means.dot(variances).argsort()[:, ::-1]
    grouped = collections.defaultdict(list)
    for row in ranked_rows:
        grouped[row[0]].extend([id2word[token] for token in row[1:11]])
    return grouped


# Fit one model per candidate topic count (2 through 19) and score it.
coherence_scores = {}
for k in range(2, 20):
    model = lda2vec.LDA2Vec(n_topics=k, n_iter=500, random_state=42)
    model.fit(corpus)
    coherence_scores[k] = lda2vec.utils.evaluate(_group_top_tokens(model))

# Keep the topic count with the highest score.
optimal_k = max(coherence_scores, key=coherence_scores.get)
print("Optimal number of topics:", optimal_k)

# Refit with the chosen topic count.
model = lda2vec.LDA2Vec(n_topics=optimal_k, n_iter=500, random_state=42)
model.fit(corpus)

# Report the grouped words of the final model.
tokens = _group_top_tokens(model)
for i, words in tokens.items():
    print("Topic {}: {}".format(i+1, ", ".join(words)))


In [None]:
# Write your code here

# Import necessary libraries
import numpy as np
import tensorflow as tf
import os
import collections
import random
import lda2vec
import lda2vec.utils
import pandas as pd
import spacy
from sklearn.datasets import fetch_20newsgroups
from lda2vec import Corpus
# Read the newsgroups JSON into a DataFrame; `corpus_df` and `documents`
# are two names for the same object.
documents = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
corpus_df = documents
print(corpus_df)

# Text preprocessing with spaCy; the parser and NER components are switched off.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def tokenize(text):
    """Return the lemmas of `text`, skipping stop words, punctuation and '-PRON-' lemmas."""
    kept = []
    for token in nlp(text):
        unwanted = token.is_stop or token.is_punct or token.lemma_ == '-PRON-'
        if not unwanted:
            kept.append(token.lemma_)
    return kept

# One list of lemmas per post in the 'content' column.
corpus = list(map(tokenize, corpus_df['content']))

# Build word <-> id lookups and re-encode every document as an array of ids.
# NOTE(review): confirm that `Corpus.compact_to_flat` accepts a list of token
# lists and returns a sized sequence of words — TODO verify.
corpus1 = Corpus()
vocab = corpus1.compact_to_flat(corpus)
word2id = {word: position for word, position in zip(vocab, range(len(vocab)))}
id2word = dict((position, word) for word, position in word2id.items())
corpus = [
    np.array([word2id[word] for word in doc])
    for doc in corpus
]

# NOTE(review): `lda2vec.LDA2Vec(n_topics, n_iter, random_state)`,
# `.mixture_means`, `.mixture_variances` and `lda2vec.utils.evaluate` are used
# as-is below; confirm they exist in the installed lda2vec — TODO verify.

# Fit one model per candidate topic count (2 through 19) and score it.
coherence_scores = {}
for k in range(2, 20):
    model = lda2vec.LDA2Vec(n_topics=k, n_iter=500, random_state=42)
    model.fit(corpus)

    # Sort each row of means . variances in descending order; the first index
    # of a row is the group key, the next ten indices give the words.
    m, s = model.mixture_means, model.mixture_variances
    topics = m.dot(s).argsort()[:, ::-1]
    tokens = collections.defaultdict(list)
    for topic in topics:
        lead, followers = topic[0], topic[1:11]
        tokens[lead] += [id2word[token] for token in followers]
    coherence_scores[k] = lda2vec.utils.evaluate(tokens)

# Keep the topic count with the highest score (first one wins on ties).
optimal_k = max(coherence_scores.items(), key=lambda entry: entry[1])[0]
print("Optimal number of topics:", optimal_k)

# Refit with the chosen topic count.
model = lda2vec.LDA2Vec(n_topics=optimal_k, n_iter=500, random_state=42)
model.fit(corpus)

# Same grouping as in the sweep, applied to the final model, then reported.
m, s = model.mixture_means, model.mixture_variances
topics = m.dot(s).argsort()[:, ::-1]
tokens = collections.defaultdict(list)
for topic in topics:
    lead, followers = topic[0], topic[1:11]
    tokens[lead] += [id2word[token] for token in followers]
for i, words in tokens.items():
    print("Topic {}: {}".format(i+1, ", ".join(words)))


In [None]:
pip install bertopic

## (4) (10 points) Generate K topics by using BERTopic, the number of topics K should be decided by the coherence score, then summarize what are the topics. You may refer the code here: 

https://colab.research.google.com/drive/1FieRA9fLdkQEGDIMYl0I3MCjSUKVF8C-?usp=sharing

In [None]:
# Write your code here


import numpy as np
import pandas as pd
import gensim
from sklearn.datasets import fetch_20newsgroups
from bertopic import BERTopic

# Load the corpus
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups = fetch_20newsgroups(subset='all', categories=categories)
texts = newsgroups.data


def bertopic_coherence(topic_model, docs, topn=10):
    """Return the c_v coherence of a fitted BERTopic model.

    Parameters
    ----------
    topic_model : a fitted BERTopic instance.
    docs : list of str, the documents the model was fitted on.
    topn : number of top words per topic used for the score.

    Returns
    -------
    float; NaN when the model has no scoreable topic.
    """
    # Tokenize with the model's own vectorizer so that the topic words and
    # the reference texts share one vocabulary.
    analyzer = topic_model.vectorizer_model.build_analyzer()
    tokenized_docs = [analyzer(doc) for doc in docs]
    dictionary = gensim.corpora.Dictionary(tokenized_docs)

    topic_word_lists = []
    for topic_id, word_scores in topic_model.get_topics().items():
        if topic_id == -1:
            # -1 is BERTopic's outlier bucket, not a real topic.
            continue
        # CoherenceModel rejects words missing from the dictionary; this also
        # drops the empty-string padding of short topics.
        words = [word for word, _ in word_scores[:topn] if word in dictionary.token2id]
        if words:
            topic_word_lists.append(words)
    if not topic_word_lists:
        return float('nan')

    # c_v is a sliding-window measure: it needs the tokenized texts, not a
    # bag-of-words corpus.
    coherence_model = gensim.models.CoherenceModel(
        topics=topic_word_lists,
        texts=tokenized_docs,
        dictionary=dictionary,
        coherence='c_v',
    )
    return coherence_model.get_coherence()


# Determine the optimal number of topics based on coherence score.
# BERTopic takes the requested topic count through `nr_topics`; each K needs
# its own fit. This is expensive because every fit re-embeds the documents.
k_values = list(range(2, 11))
coherence_scores = []
best_score, best_k, best_model, best_topics = float('-inf'), None, None, None
for k in k_values:
    candidate = BERTopic(nr_topics=k)
    candidate_topics, _ = candidate.fit_transform(texts)

    # Calculate coherence score
    coherence_score = bertopic_coherence(candidate, texts)
    coherence_scores.append(coherence_score)
    print(f"K={k}: Coherence Score={coherence_score:.4f}")

    # BERTopic is stochastic, so keep the scored model itself: a refit with
    # the same K would not be the model that earned this score.
    if coherence_score > best_score:
        best_score, best_k = coherence_score, k
        best_model, best_topics = candidate, candidate_topics

if best_model is None:
    raise RuntimeError("No BERTopic model produced a valid coherence score")

model, topics = best_model, best_topics
print(f"Best K by coherence: {best_k} (score={best_score:.4f})")

# Summarize the topics of the selected model by their top words.
for topic_id, word_scores in model.get_topics().items():
    if topic_id == -1:
        continue
    print(f"Topic {topic_id}: {' '.join(word for word, _ in word_scores)}")

## (5) (10 extra points) Compare the results generated by the four topic modeling algorithms, which one is better? You should explain the reasons in details.

In [None]:
# Write your answer here (no code needed for this question)



