In [62]:
import os
import gensim

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gsdmm import MovieGroupProcess

# Load data
data_folder = r"C:\Users\akhil\Desktop\NLP Assignment-2\comments1k"
docs = []
for file_name in os.listdir(data_folder):
    with open(os.path.join(data_folder, file_name), "r", encoding="utf-8") as f:
        doc = f.read()
        docs.append(doc)

# Create a CountVectorizer object and transform the documents
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X = vectorizer.fit_transform(docs)

# LDA model
lda = LatentDirichletAllocation(n_components=10, random_state=123)
lda.fit(X)

# Print out the top 8 words for each topic
for i, topic in enumerate(lda.components_):
    top_words = [vectorizer.get_feature_names()[j] for j in topic.argsort()[:-9:-1]]
    print("Topic {}: {}".format(i, top_words))

# # Visualize the LDA model
# lda_vis = pyLDAvis.gensim.prepare(lda, X, vectorizer)
# pyLDAvis.display(lda_vis)

# # GSDMM model
# mgp = MovieGroupProcess(K=10, alpha=0.1, beta=0.1, n_iters=30)
# y = mgp.fit(docs)

# Print out the top 8 words for each topic
for i, topic in enumerate(mgp.cluster_word_distribution):
    top_words = [vectorizer.get_feature_names()[j] for j in topic.argsort()[:-9:-1]]
    print("Topic {}: {}".format(i, top_words))

# # Visualize the GSDMM model
# doc_topic, topic_term, doc_lengths, term_frequency, vocab = mgp.get_topics()
# topic_term_dists = mgp.cluster_word_distribution
# gsdmm_vis = pyLDAvis.prepare(topic_term_dists, doc_topic, doc_lengths, vocab, term_frequency)
# pyLDAvis.display(gsdmm_vis)

# Biterm model
biterms = []
for doc in docs:
    words = doc.split()
    if len(words) > 1:
        biterms.extend(gensim.models.btm_corpus.Biterms(words, min_count=1).bitmerize(window=2))

dictionary = gensim.corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

btm = gensim.models.btm_model.BtmModel(biterms, dictionary, num_topics=10, passes=50)
print(btm.print_topics())

# # Visualize the Biterm model
# vis = gensim.models.btmvis.BtmVis(btm, corpus, dictionary)
# vis.display()

Topic 0: ['br', 'movie', 'film', 'like', 'story', 'great', 'good', 'life']
Topic 1: ['stewart', 'jeff', 'gannon', 'good', 'mann', 'like', 'dawson', 'movie']
Topic 2: ['br', 'film', 'series', 'good', 'like', 'just', 'story', 'star']
Topic 3: ['br', 'movie', 'film', 'good', 'like', 'just', 'great', 'movies']
Topic 4: ['movie', 'film', 'game', 'br', 'films', 'life', 'black', 'niven']
Topic 5: ['br', 'film', 'love', 'time', 'story', 'way', 'work', 'like']
Topic 6: ['br', 'film', 'movie', 'like', 'story', 'good', 'brosnan', 'just']
Topic 7: ['br', 'film', 'davies', 'christmas', 'scrooge', 'scott', 'story', 'people']
Topic 8: ['br', 'film', 'like', 'spielberg', 'david', 'best', 'movie', 'goldberg']
Topic 9: ['br', 'film', 'chess', 'luzhin', 'movie', 'world', 'watson', 'turturro']


NameError: name 'mgp' is not defined

In [60]:
!pip install git+https://github.com/rwalk/gsdmm.git

Collecting git+https://github.com/rwalk/gsdmm.git
  Cloning https://github.com/rwalk/gsdmm.git to c:\users\akhil\appdata\local\temp\pip-req-build-tmqr6c0g
  Resolved https://github.com/rwalk/gsdmm.git to commit 4ad1b6b6976743681ee4976b4573463d359214ee
Building wheels for collected packages: gsdmm
  Building wheel for gsdmm (setup.py): started
  Building wheel for gsdmm (setup.py): finished with status 'done'
  Created wheel for gsdmm: filename=gsdmm-0.1-py3-none-any.whl size=4631 sha256=c2a626272447408d257b708f0d7ca4e11e2a8bddb93ec6de1ac66afd97154dd9
  Stored in directory: C:\Users\akhil\AppData\Local\Temp\pip-ephem-wheel-cache-u0w7kjs_\wheels\81\2c\23\3ff788bcc6063bf30116ad1a06e75d3ba9aad3f7bc4aba765b
Successfully built gsdmm
Installing collected packages: gsdmm
Successfully installed gsdmm-0.1


  Running command git clone -q https://github.com/rwalk/gsdmm.git 'C:\Users\akhil\AppData\Local\Temp\pip-req-build-tmqr6c0g'


In [71]:
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

data_folder = r"C:\Users\akhil\Desktop\NLP Assignment-2\comments1k"
data = []
for file_name in os.listdir(data_folder):
    with open(os.path.join(data_folder, file_name), "r", encoding="utf-8") as f:
        doc = f.read()
        data.append(doc)


# Convert the corpus into a matrix of word counts
count_vectorizer = CountVectorizer()
doc_term_matrix = count_vectorizer.fit_transform(data)
words = count_vectorizer.get_feature_names()

# Train the LDA topic model with 10 topics
num_topics = 10
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(doc_term_matrix)

# Print the top 8 words for each topic
for topic_id, topic in enumerate(lda_model.components_):
    top_words_indices = topic.argsort()[-8:][::-1]
    top_words = [words[i] for i in top_words_indices]
    print(f"Topic {topic_id}: {top_words}")

# Assign topics to specific documents
doc_0_9_vector = count_vectorizer.transform([data[0]])
doc_1_7_vector = count_vectorizer.transform([data[1]])
doc_0_9_topic_distribution = lda_model.transform(doc_0_9_vector)[0]
doc_1_7_topic_distribution = lda_model.transform(doc_1_7_vector)[0]

# Print the topics assigned to the documents
print(f"Topics assigned to document 0_9.txt: {doc_0_9_topic_distribution}")
print(f"Topics assigned to document 1_7.txt: {doc_1_7_topic_distribution}")


Topic 0: ['miike', 'yokai', 'children', 'takashi', 'war', 'kids', 'hero', 'spirits']
Topic 1: ['the', 'and', 'of', 'to', 'it', 'is', 'this', 'in']
Topic 2: ['alvin', 'morse', 'david', 'foxx', 'jamie', 'sanders', 'santa', 'bristol']
Topic 3: ['the', 'and', 'of', 'is', 'br', 'in', 'to', 'it']
Topic 4: ['brosnan', 'kinnear', 'pierce', 'greg', 'his', 'bond', 'davis', 'humor']
Topic 5: ['the', 'to', 'and', 'he', 'of', 'is', 'in', 'his']
Topic 6: ['the', 'to', 'and', 'of', 'br', 'in', 'is', 'that']
Topic 7: ['henry', 'betty', 'boop', 'fleischer', 'little', 'store', 'comic', 'short']
Topic 8: ['price', 'vincent', 'sammo', 'magician', 'wax', 'mad', 'house', 'rinaldi']
Topic 9: ['jodie', 'hong', 'kong', 'fantasy', 'zu', 'summer', 'average', '1983']
Topics assigned to document 0_9.txt: [7.69251963e-04 7.69452311e-04 7.69241679e-04 7.69435072e-04
 7.69240231e-04 7.69336834e-04 9.93076228e-01 7.69268797e-04
 7.69314281e-04 7.69230982e-04]
Topics assigned to document 1_7.txt: [3.05826439e-04 3.0591

In [70]:
print(len(data))

1000


In [72]:
import numpy as np

In [74]:
np.argsort(doc_1_7_topic_distribution)

array([2, 9, 0, 8, 7, 5, 6, 4, 1, 3], dtype=int64)