In [None]:
import numpy as np
import pandas as pd
import re
import os
from pprint import pprint
import math

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models import TfidfModel

import spacy
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
%matplotlib inline

import logging
# Only records at ERROR level or above are emitted; each record is formatted
# as "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
# Suppress DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

from nltk.util import ngrams
from collections import Counter
# NOTE(review): `stopwords` is already imported earlier in this cell; this
# second import is redundant.
from nltk.corpus import stopwords
# NLTK's English stop-word list; read later by remove_stopwords().
stop_words = stopwords.words('english')

from pylab import rcParams    
# Set matplotlib's default figure resolution to 300 dpi.
rcParams['figure.dpi']=300

In [None]:
# Change the working directory so the relative CSV path below resolves.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will fail on any other machine — consider a configurable data
# directory instead.
os.chdir(r"C:\Users\fanyu\Desktop\Jupiter")
# Load the corpus into a DataFrame, decoding the file as latin1.
# Later cells read the `content` column from `df`.
df = pd.read_csv("WES.csv",encoding='latin1')

In [None]:
# One raw text per row of the `content` column, in row order.
# Missing values are replaced with an empty string and every entry is coerced
# to str, so the regex clean-up that follows never receives a float NaN
# (re.sub raises TypeError on non-string input). Row alignment with `df` is
# preserved because no rows are dropped.
data = df["content"].fillna("").astype(str).tolist()

In [None]:
# Remove parenthesised asides, e.g. "(see below)".
data = [re.sub(r'\([^)]*\)', '', sent) for sent in data]
# Collapse every run of whitespace (including newlines) into a single space.
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Drop apostrophes so contractions/possessives become a single token.
data = [re.sub("'", "", sent) for sent in data]
# Normalise the different spellings of the country name to "US".
# The trailing guard is (?!\w) rather than \b: the dotted forms end in ".",
# and \b can never match between "." and a following space/punctuation mark,
# so "U.S. " would otherwise be left untouched. Longer alternatives come first
# so "United States of America" and "U.S.A." are replaced as a whole.
# NOTE(review): the later tokenisation lowercases text, turning "US" into
# "us" — confirm the POS filter in lemmatization() does not discard it as a
# pronoun.
data = [
    re.sub(
        r'\b(?:United States of America|United States|America|U\.S\.A\.?|U\.S\.|USA)(?!\w)',
        'US',
        sent,
        flags=re.IGNORECASE,
    )
    for sent in data
]
# Normalise "United Nations" to "UN".
data = [re.sub(r'\bUnited Nations\b', 'UN', sent, flags=re.IGNORECASE) for sent in data]

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize documents, yielding one token list per input item.

    Each item is coerced with ``str`` and handed to gensim's
    ``simple_preprocess`` with ``deacc=True`` (accent stripping enabled).
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(text), deacc=True) for text in sentences)


# Materialise the generator: one list of tokens per cleaned document.
data_words = list(sent_to_words(data))

In [None]:
# Train a phrase (collocation) detector on the tokenized documents.
# min_count and threshold are passed straight through to gensim's Phrases.
bigram = Phrases(data_words, min_count=5, threshold=100)

# Wrap the trained detector in a Phraser, which is what make_bigrams() applies.
bigram_mod = Phraser(bigram)

# Sanity check: show the first document after phrase merging.
first_doc_with_phrases = bigram_mod[data_words[0]]
print(first_doc_with_phrases)

In [None]:
def remove_stopwords(texts):
    """Return a copy of ``texts`` with stop words removed from every document.

    Each document is converted with ``str`` and re-tokenized by
    ``simple_preprocess`` before filtering, so ``texts`` may hold raw strings
    or already-tokenized lists.

    Args:
        texts: iterable of documents.

    Returns:
        list[list[str]]: one filtered token list per document, in input order.
    """
    # Build the lookup set once per call: membership tests against the
    # module-level `stop_words` list are linear in the list length per token.
    stop_set = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the trained phrase model ``bigram_mod`` to every document.

    Returns a list with one transformed token sequence per input document,
    in input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with spaces and run through the
    module-level spaCy pipeline ``nlp`` (which must be loaded before this
    function is called).

    Args:
        texts: iterable of token lists.
        allowed_postags: coarse POS tags (``token.pos_``) to keep. The default
            is an immutable tuple rather than a shared mutable list.

    Returns:
        list[list[str]]: lemmas of the retained tokens, one list per document,
        in input order.
    """
    keep_pos = frozenset(allowed_postags)
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a batched stream, which avoids the
    # per-call overhead of invoking nlp() once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in keep_pos]
        for doc in nlp.pipe(joined_docs)
    ]

# spaCy pipeline used by lemmatization(); the parser and NER components are
# disabled when loading.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Stage 1: strip stop words from the tokenized documents.
data_words_nostops = remove_stopwords(data_words)

# Stage 2: merge detected phrases with the trained phrase model.
data_words_bigrams = make_bigrams(data_words_nostops)

# Stage 3: lemmatize, keeping nouns, adjectives, verbs and adverbs only.
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

# Preview the first processed document.
print(data_lemmatized[:1])


In [None]:
# Adjacent lemma pairs within each document (pairs never span two documents).
# Loop variables are deliberately NOT called `bigram`/`count`: `bigram` is the
# trained Phrases model from an earlier cell, and rebinding it here would
# break that cell's dependants on an out-of-order re-run.
bigrams = [pair for doc_lemmas in data_lemmatized for pair in ngrams(doc_lemmas, 2)]

bigram_counts = Counter(bigrams)

top_20_bigrams = bigram_counts.most_common(20)

print("Top 20 Bigrams:")
for pair, pair_count in top_20_bigrams:
    print(pair, "-", pair_count)

# Split into parallel label/value lists. Unlike `zip(*top_20_bigrams)`, this
# also works (yielding an empty chart) when no bigrams were found.
top_pairs = [pair for pair, _ in top_20_bigrams]
top_counts = [pair_count for _, pair_count in top_20_bigrams]
bar_positions = range(len(top_pairs))

fig, ax = plt.subplots()
ax.barh(bar_positions, top_counts, color='skyblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_pairs)
ax.set_xlabel('Count')
ax.set_ylabel('Bigram')
# NOTE(review): the title mentions "TG and NYT" while the input and output
# files are named WES — confirm the title matches this dataset.
ax.set_title("Top 20 Bigrams in TG and NYT")
# Most frequent bigram at the top of the chart.
ax.invert_yaxis()
# NOTE(review): absolute, machine-specific output path.
fig.savefig('C:/Users/fanyu/Desktop/Jupiter/top_20_bigrams_WES.png', bbox_inches='tight')
plt.show()

In [None]:
# Map every lemma to an integer id.
id2word = corpora.Dictionary(data_lemmatized)

texts = data_lemmatized

# Bag-of-words representation: one list of (term_id, count) per document.
corpus = [id2word.doc2bow(text) for text in texts]

# TF-IDF weights are fitted on the unfiltered corpus.
tfidf = TfidfModel(corpus,id2word=id2word)

# Terms whose TF-IDF weight in a document is below this value are removed
# from that document's bag-of-words.
low_value = 0.03
# Running record of every (document-level) dropped word, for inspection.
words = []
words_missing_in_tfidf = []
for i in range(len(corpus)):
    bow = corpus[i]
    doc_tfidf = tfidf[bow]
    tfidf_ids = {term_id for term_id, _ in doc_tfidf}

    low_value_words = [term_id for term_id, value in doc_tfidf if value < low_value]
    # Terms present in the bag-of-words but absent from the TF-IDF output
    # (gensim omits near-zero weights). Computed for THIS document before it
    # is used, so `drops` never carries over the previous document's ids.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    # The write-back must happen inside the loop so that every document is
    # filtered, not just the last one.
    drop_ids = set(drops)
    new_bow = [entry for entry in bow if entry[0] not in drop_ids]
    corpus[i] = new_bow

In [None]:
# One entry per candidate topic count, in sweep order (2, 3, ..., 20).
model_list = []
perplexity_values = []
coherence_values = []

for num_topics in range(2, 21):
    # random_state is fixed so repeated runs train the same models.
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=num_topics,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)
    model_list.append(lda_model)

    # LdaModel.log_perplexity returns the per-word likelihood bound (a
    # negative number), not a log-perplexity to exponentiate directly:
    # perplexity = 2 ** (-bound), lower is better.
    # NOTE(review): this is evaluated on the training corpus, not held-out data.
    log_perplexity = lda_model.log_perplexity(corpus)
    perplexity = float(2.0 ** (-log_perplexity))
    perplexity_values.append(perplexity)

    # c_v coherence is computed against the lemmatized texts.
    coherencemodel = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence = coherencemodel.get_coherence()
    coherence_values.append(round(coherence, 3))
    print(f'Num Topics: {num_topics} - Perplexity: {perplexity}, Coherence Score: {round(coherence, 3)}')

print('\nPerplexity Values: ', perplexity_values)
print('Coherence Values: ', coherence_values)

In [None]:
# X axis: the topic counts tried in the sweep (2 through 20).
num_topics_range = list(range(2, 21))

# Two side-by-side panels sharing one 10x5 inch figure.
fig, (perplexity_ax, coherence_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: perplexity per topic count.
perplexity_ax.plot(num_topics_range, perplexity_values, marker='o')
perplexity_ax.set_xlabel('Number of Topics')
perplexity_ax.set_ylabel('Perplexity')
perplexity_ax.set_title('Perplexity by Number of Topics')
perplexity_ax.set_xticks(num_topics_range)

# Right panel: coherence per topic count.
coherence_ax.plot(num_topics_range, coherence_values, marker='o')
coherence_ax.set_xlabel('Number of Topics')
coherence_ax.set_ylabel('Coherence Score')
coherence_ax.set_title('Coherence Score by Number of Topics')
coherence_ax.set_xticks(num_topics_range)

fig.tight_layout()
# Written to disk before show() so the saved image contains the drawn figure.
fig.savefig('C:/Users/fanyu/Desktop/Jupiter/Perplexity_Coherence (WES).png', bbox_inches='tight')
plt.show()

In [None]:
# Topic count chosen by hand after inspecting the perplexity/coherence plots.
best_num_topics = 6
# model_list starts at 2 topics, so the model with k topics sits at index k - 2.
best_lda_model = model_list[best_num_topics - 2]

# Show the top `num_words` weighted terms of every topic in the chosen model.
num_words = 10
topics = best_lda_model.print_topics(num_words=num_words)
for topic_id, topic_terms in topics:
    print(f'Topic {topic_id}: {topic_terms}')