In [1]:
import pandas as pd
from gensim import corpora, models
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# Load preprocessed data
df = pd.read_csv('output.csv')

# Convert preprocessed text back into lists of tokens
df['final_processed_text'] = df['final_processed_text'].apply(lambda x: x[1:-1].split(', '))

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(df['final_processed_text'])

# Convert document into the bag-of-words format = list of (token_id, token_count) tuples
corpus = [dictionary.doc2bow(text) for text in df['final_processed_text']]

# Determine the optimal number of topics
# You might want to explore different methods like the ones in this link:
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#17howtofindtheoptimalnumberoftopicsforlda

# Build LDA model (replace <num_topics> with the optimal number)
lda_model = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, random_state=42)

# Evaluate with coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=df['final_processed_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Interpret Results
# 1. Print the keywords in the 10 topics
print("\nTop words in each topic:")
for topic_id in range(lda_model.num_topics):
    topk = lda_model.show_topic(topic_id, 10)
    topk_words = [ w for w, _ in topk ]
    print(f'Topic {topic_id}: {", ".join(topk_words)}')

# 2. Visualize results (Optional, requires 'pyLDAvis' library)
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
lda_display = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

# 3. (Optional) Further analysis:
#    - Dominant topic per document
#    - Topic distribution across documents
#    - Find most representative documents for each topic
#    (Code for these can be added based on your specific needs)


Coherence Score:  0.4631661259181712

Top words in each topic:
Topic 0: 'government', 'would', 'u', 'people', 'state', 'security', 'law', 'enforcement', 'mr', 'time'
Topic 1: 'x', 'window', 'file', 'system', 'program', 'version', 'use', 'db', 'chip', 'bit'
Topic 2: 'one', 'people', 'would', 'dont', 'think', 'know', 'say', 'like', 'get', 'time'
Topic 3: 'key', 'use', 'chip', 'file', 'x', 'information', 'anonymous', 'system', 'number', 'algorithm'
Topic 4: 'db', 'b', 'phone', 'would', 'one', 'key', 'turkey', 'know', 'get', 'new'
Topic 5: 'shall', 'income', 'patent', 'would', 'one', 'season', 'que', 'supreme', 'slave', 'captain'
Topic 6: 'mov', 'pt', 'sex', 'evil', 'x', 'intent', 'inhabitant', 'pp', 'cx', 'b'
Topic 7: 'jew', 'x', 'db', 'character', 'int', 'char', 'election', 'font', 'bishop', 'verdict'
Topic 8: 'q', 'new', 'president', 'b', 'year', 'mr', 'st', 'v', 'gm', 'team'
Topic 9: 'would', 'privacy', 'also', 'president', 'one', 'de', 'secure', 'faith', 'u', 'secret'


ModuleNotFoundError: No module named 'pyLDAvis'

In [2]:
pip install pyLDAvis

Collecting pyLDAvis
  Obtaining dependency information for pyLDAvis from https://files.pythonhosted.org/packages/6b/5a/66364c6799f2362bfb9b7100bc1ce6ffcdfe7f17e8d2e85a591bfe427643/pyLDAvis-3.4.1-py3-none-any.whl.metadata
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Obtaining dependency information for funcy from https://files.pythonhosted.org/packages/d5/08/c2409cb01d5368dcfedcbaffa7d044cc8957d57a9d0855244a5eb4709d30/funcy-2.0-py2.py3-none-any.whl.metadata
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting FuzzyTM>=0.4.0 (from gensim->pyLDAvis)
  Obtaining dependency information for FuzzyTM>=0.4.0 from https://files.pythonhosted.org/packages/2d/30/074bac7a25866a2807c1005c7852c0139ac22ba837871fc01f16df29b9dc/FuzzyTM-2.0.9-py3-none-any.whl.metadata
  Downloading FuzzyTM-2.0.9-py3-none-any.whl.metadata (7.9 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim->pyLDAvis)
  Obtaining dependency information for pyfume

In [3]:
conda install -c conda-forge pyldavis


Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from gensim import corpora, models
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# Load preprocessed data
df = pd.read_csv('output.csv')

# Convert preprocessed text back into lists of tokens
df['final_processed_text'] = df['final_processed_text'].apply(lambda x: x[1:-1].split(', '))

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(df['final_processed_text'])

# Convert document into the bag-of-words format = list of (token_id, token_count) tuples
corpus = [dictionary.doc2bow(text) for text in df['final_processed_text']]

# Determine the optimal number of topics
# You might want to explore different methods like the ones in this link:
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#17howtofindtheoptimalnumberoftopicsforlda

# Build LDA model (replace <num_topics> with the optimal number)
lda_model = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, random_state=42)

# Evaluate with coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=df['final_processed_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Interpret Results
# 1. Print the keywords in the 10 topics
print("\nTop words in each topic:")
for topic_id in range(lda_model.num_topics):
    topk = lda_model.show_topic(topic_id, 10)
    topk_words = [ w for w, _ in topk ]
    print(f'Topic {topic_id}: {", ".join(topk_words)}')

# 2. Visualize results (Optional, requires 'pyLDAvis' library)
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
lda_display = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

# 3. (Optional) Further analysis:
#    - Dominant topic per document
#    - Topic distribution across documents
#    - Find most representative documents for each topic
#    (Code for these can be added based on your specific needs)


Coherence Score:  0.4631661259181712

Top words in each topic:
Topic 0: 'government', 'would', 'u', 'people', 'state', 'security', 'law', 'enforcement', 'mr', 'time'
Topic 1: 'x', 'window', 'file', 'system', 'program', 'version', 'use', 'db', 'chip', 'bit'
Topic 2: 'one', 'people', 'would', 'dont', 'think', 'know', 'say', 'like', 'get', 'time'
Topic 3: 'key', 'use', 'chip', 'file', 'x', 'information', 'anonymous', 'system', 'number', 'algorithm'
Topic 4: 'db', 'b', 'phone', 'would', 'one', 'key', 'turkey', 'know', 'get', 'new'
Topic 5: 'shall', 'income', 'patent', 'would', 'one', 'season', 'que', 'supreme', 'slave', 'captain'
Topic 6: 'mov', 'pt', 'sex', 'evil', 'x', 'intent', 'inhabitant', 'pp', 'cx', 'b'
Topic 7: 'jew', 'x', 'db', 'character', 'int', 'char', 'election', 'font', 'bishop', 'verdict'
Topic 8: 'q', 'new', 'president', 'b', 'year', 'mr', 'st', 'v', 'gm', 'team'
Topic 9: 'would', 'privacy', 'also', 'president', 'one', 'de', 'secure', 'faith', 'u', 'secret'


In [1]:
conda install -c conda-forge pyldavis


Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from gensim import corpora, models
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# Load preprocessed data
df = pd.read_csv('output.csv')

# Convert preprocessed text back into lists of tokens
df['final_processed_text'] = df['final_processed_text'].apply(lambda x: x[1:-1].split(', '))

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(df['final_processed_text'])

# Convert document into the bag-of-words format = list of (token_id, token_count) tuples
corpus = [dictionary.doc2bow(text) for text in df['final_processed_text']]

# Determine the optimal number of topics
# You might want to explore different methods like the ones in this link:
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#17howtofindtheoptimalnumberoftopicsforlda

# Build LDA model (replace <num_topics> with the optimal number)
lda_model = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, random_state=42)

# Evaluate with coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=df['final_processed_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Interpret Results
# 1. Print the keywords in the 10 topics
print("\nTop words in each topic:")
for topic_id in range(lda_model.num_topics):
    topk = lda_model.show_topic(topic_id, 10)
    topk_words = [ w for w, _ in topk ]
    print(f'Topic {topic_id}: {", ".join(topk_words)}')

# 2. Visualize results (Optional, requires 'pyLDAvis' library)
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
lda_display = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

# 3. (Optional) Further analysis:
#    - Dominant topic per document
#    - Topic distribution across documents
#    - Find most representative documents for each topic
#    (Code for these can be added based on your specific needs)


Coherence Score:  0.4631661259181712

Top words in each topic:
Topic 0: 'government', 'would', 'u', 'people', 'state', 'security', 'law', 'enforcement', 'mr', 'time'
Topic 1: 'x', 'window', 'file', 'system', 'program', 'version', 'use', 'db', 'chip', 'bit'
Topic 2: 'one', 'people', 'would', 'dont', 'think', 'know', 'say', 'like', 'get', 'time'
Topic 3: 'key', 'use', 'chip', 'file', 'x', 'information', 'anonymous', 'system', 'number', 'algorithm'
Topic 4: 'db', 'b', 'phone', 'would', 'one', 'key', 'turkey', 'know', 'get', 'new'
Topic 5: 'shall', 'income', 'patent', 'would', 'one', 'season', 'que', 'supreme', 'slave', 'captain'
Topic 6: 'mov', 'pt', 'sex', 'evil', 'x', 'intent', 'inhabitant', 'pp', 'cx', 'b'
Topic 7: 'jew', 'x', 'db', 'character', 'int', 'char', 'election', 'font', 'bishop', 'verdict'
Topic 8: 'q', 'new', 'president', 'b', 'year', 'mr', 'st', 'v', 'gm', 'team'
Topic 9: 'would', 'privacy', 'also', 'president', 'one', 'de', 'secure', 'faith', 'u', 'secret'


In [5]:
import pandas as pd
from gensim import corpora, models
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# Load your preprocessed data
df = pd.read_csv('output.csv')

# Convert preprocessed text back into lists of tokens
df['final_processed_text'] = df['final_processed_text'].apply(lambda x: x[1:-1].split(', '))

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(df['final_processed_text'])

# Convert document into the bag-of-words format = list of (token_id, token_count) tuples
corpus = [dictionary.doc2bow(text) for text in df['final_processed_text']]

# Determine the optimal number of topics (e.g., using coherence scores or other methods)
# ... (Your code to find the optimal number of topics)

# Build LDA model (replace <num_topics> with the optimal number)
lda_model = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, random_state=42)

# Evaluate with coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=df['final_processed_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Interpret Results

# 1. Print the keywords in the 10 topics
print("\nTop words in each topic:")
for topic_id in range(lda_model.num_topics):
    topk = lda_model.show_topic(topic_id, 10)
    topk_words = [ w for w, _ in topk ]
    print(f'Topic {topic_id}: {", ".join(topk_words)}')

# 2. Visualize results (Optional, requires 'pyLDAvis' library)
# ... (Your code for visualization if needed)

# 3. Further Analysis

# 3.1 Dominant Topic per Document
dominant_topics = []
topic_percentages = []
for i, corp in enumerate(corpus):
    topic_dist = lda_model[corp]
    dominant_topic = sorted(topic_dist, key=lambda x: x[1], reverse=True)[0][0]
    dominant_topics.append(dominant_topic)
    topic_percentages.append(topic_dist)

df['Dominant_Topic'] = dominant_topics
df['Topic_Perc_Contrib'] = topic_percentages

print(df[['final_processed_text', 'Dominant_Topic', 'Topic_Perc_Contrib']])

# 3.2 Topic Distribution Across Documents
topic_counts = df['Dominant_Topic'].value_counts()
topic_contribution = round(topic_counts/topic_counts.sum(), 4)
print(topic_contribution)

# 3.3 Most Representative Documents for Each Topic
sent_topics_sorteddf = pd.DataFrame()
sent_topics_outdf_grpd = df.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, grp.sort_values(['Topic_Perc_Contrib'], ascending=[0]).head(5)], axis=0)

# Check and modify columns here
sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# Print the column names so you can adjust `new_columns` accordingly
print("Current column names:", sent_topics_sorteddf.columns)

# Assuming the columns after grouping are ['Dominant_Topic', 'Topic_Perc_Contrib', 'final_processed_text']
# Modify this if it's different in your case
new_columns = ['Topic_Num', "Topic_Perc_Contrib", "Text"] 

# Ensure same number of new columns and current columns
if len(sent_topics_sorteddf.columns) != len(new_columns):
    raise ValueError("Number of new column names does not match the number of existing columns")

sent_topics_sorteddf.columns = new_columns
# Optionally, if you want to add keywords
# sent_topics_sorteddf["Keywords"] = sent_topics_sorteddf["Topic_Num"].apply(lambda x: ", ".join([w for w, _ in lda_model.show_topic(x, 10)]))

print(sent_topics_sorteddf)


Coherence Score:  0.4631661259181712

Top words in each topic:
Topic 0: 'government', 'would', 'u', 'people', 'state', 'security', 'law', 'enforcement', 'mr', 'time'
Topic 1: 'x', 'window', 'file', 'system', 'program', 'version', 'use', 'db', 'chip', 'bit'
Topic 2: 'one', 'people', 'would', 'dont', 'think', 'know', 'say', 'like', 'get', 'time'
Topic 3: 'key', 'use', 'chip', 'file', 'x', 'information', 'anonymous', 'system', 'number', 'algorithm'
Topic 4: 'db', 'b', 'phone', 'would', 'one', 'key', 'turkey', 'know', 'get', 'new'
Topic 5: 'shall', 'income', 'patent', 'would', 'one', 'season', 'que', 'supreme', 'slave', 'captain'
Topic 6: 'mov', 'pt', 'sex', 'evil', 'x', 'intent', 'inhabitant', 'pp', 'cx', 'b'
Topic 7: 'jew', 'x', 'db', 'character', 'int', 'char', 'election', 'font', 'bishop', 'verdict'
Topic 8: 'q', 'new', 'president', 'b', 'year', 'mr', 'st', 'v', 'gm', 'team'
Topic 9: 'would', 'privacy', 'also', 'president', 'one', 'de', 'secure', 'faith', 'u', 'secret'
               

# Topic 0 seems to be related around politics and law, where the weight of terms like "government" and "enforcement" are particularly high, indicating their significance in this topic.

# Topic 1 seems to be related around politics and country, where the weight of terms like "jew" and "london" are particularly high, indicating their significance in this topic.

# Topic 2 seems to be related around friendship, where the weight of terms like "partner" and "supreme" are particularly high, indicating their significance in this topic.

# Topic 3 seems to be related around military, where the weight of terms like "soldier" and "brave" are particularly high, indicating their significance in this topic.