In [2]:
import nltk
import gensim
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.chunk import ne_chunk
import zipfile
import numpy as np

nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\kreti\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\kreti\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
def process_file(file, filename, texts):
    content = file.read(filename)
    if type(content) == bytes:
        text = content.decode('utf-8')
        texts.append(text)

    if len(content.strip()) == 0:
        print("No text was found")
        return

In [4]:
def split_text(text):
    sentences = nltk.sent_tokenize(text)
    return [nltk.word_tokenize(sentence) for sentence in sentences]

In [5]:
def remove_stopwords(tokens):
    stop_words = set(stopwords.words("english"))
    stop_words = stop_words.union({"reuters", "bbc"})
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words and len(token)>3]
    filtered_tokens = [token for token in filtered_tokens if not pattern_symbols.match(token.lower())]
    return filtered_tokens

In [28]:
corpus = []
with zipfile.ZipFile("data/articles_2021-11-05_1000.zip", "r") as f:
    total_f = len(f.namelist())
    counter = 1
    for filename in f.namelist():
        counter += 1
        process_file(f, filename, corpus)
    f.close()

# Apply NER to identify named entities and their types
named_entities = []
for doc in corpus:
    tokens = word_tokenize(doc)
    tagged = nltk.pos_tag(tokens)
    entities = ne_chunk(tagged)
    for subtree in entities.subtrees():
        if len(subtree.leaves()) <= 6:
            named_entity = " ".join([word for word, tag in subtree.leaves()])
            named_entities.append(named_entity)

# Group similar named entities together
named_entity_clusters = {}
for entity in named_entities:
    entity_type = nltk.ne_chunk(nltk.pos_tag(word_tokenize(entity)))[0]
    if len(entity_type) == 1 and type(entity_type) != tuple:
        entity_type = entity_type.label()
    else:
        continue
    if entity_type not in named_entity_clusters:
        named_entity_clusters[entity_type] = []
    named_entity_clusters[entity_type].append(entity)

In [29]:
# Remove named entities from documents
stop_words = set(stopwords.words('english'))
documents = []
for doc in corpus:
    words = simple_preprocess(doc)
#     ext_words = []
#     for word in words:
#         if word not in stop_words:
#             ext_words.append(word)
#             if word in named_entities:
#                 ext_words.append(word)
#                 ext_words.append(word)
    words = [word for word in words if word not in stop_words]
    print(words)
    documents.append(words)

['istanbul', 'oct', 'reuters', 'turkish', 'president', 'tayyip', 'erdogan', 'said', 'saturday', 'told', 'foreign', 'ministry', 'expel', 'ambassadors', 'united', 'states', 'nine', 'western', 'countries', 'demanding', 'release', 'philanthropist', 'osman', 'kavala', 'seven', 'ambassadors', 'represent', 'turkey', 'nato', 'allies', 'expulsions', 'carried', 'would', 'open', 'deepest', 'rift', 'west', 'erdogan', 'years', 'power', 'kavala', 'contributor', 'numerous', 'civil', 'society', 'groups', 'prison', 'four', 'years', 'charged', 'financing', 'nationwide', 'protests', 'involvement', 'failed', 'coup', 'remained', 'detention', 'latest', 'trial', 'continues', 'denies', 'charges', 'joint', 'statement', 'oct', 'ambassadors', 'canada', 'denmark', 'france', 'germany', 'netherlands', 'norway', 'sweden', 'finland', 'new', 'zealand', 'united', 'states', 'called', 'speedy', 'resolution', 'kavala', 'case', 'urgent', 'release', 'summoned', 'foreign', 'ministry', 'called', 'statement', 'irresponsible', 

['tunis', 'oct', 'reuters', 'united', 'nations', 'libya', 'mission', 'said', 'saturday', 'country', 'parliament', 'amend', 'election', 'law', 'hold', 'presidential', 'parliamentary', 'elections', 'dec', 'originally', 'envisioned', 'peace', 'plan', 'house', 'representatives', 'hor', 'libya', 'eastern', 'based', 'parliament', 'issued', 'separate', 'laws', 'presidential', 'election', 'dec', 'parliamentary', 'election', 'would', 'held', 'later', 'unspecified', 'date', 'wrangling', 'elections', 'planned', 'part', 'wider', 'peace', 'push', 'also', 'brought', 'transitional', 'unity', 'government', 'office', 'threatened', 'derail', 'libya', 'efforts', 'end', 'decade', 'chaos', 'violence', 'critics', 'hor', 'well', 'rival', 'libyan', 'political', 'institutions', 'denounced', 'election', 'laws', 'intended', 'preserve', 'power', 'chamber', 'leadership', 'others', 'accused', 'critics', 'seeking', 'delay', 'divert', 'election', 'respecting', 'principle', 'simultaneous', 'presidential', 'parliamenta

In [30]:
# Create a dictionary and bag-of-words representation of the corpus
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

In [33]:
import copy

corpus_extended = copy.deepcopy(corpus)
corpus_len = len(corpus)
print(int(corpus_len*0.02))
# Add named entities to dictionary and corpus
for entity_type, entities in named_entity_clusters.items():
        processed_ents = [entity.lower() for entity in set(entities)]
        new_doc = [dictionary.doc2bow(simple_preprocess(entity)) for entity in processed_ents]
        new_doc = [item for entity in new_doc for item in entity]
        for i in range(int(corpus_len*0.02)):
            corpus_extended.append(new_doc)

print(len(corpus), len(corpus_extended))


20
1000 1120


In [13]:
def calculate_tus_score(coherence_model, lda_model):
    # Get the coherence scores for all topics in the model
    topic_coherence = coherence_model.get_coherence_per_topic()

    # Calculate the TUS score for each topic in the model
    tus_scores = []
    for i in range(lda_model.num_topics):
        topic_i = lda_model.show_topic(i)
        coherence_i = topic_coherence[i]
        coherence_sum = sum([topic_coherence[j] for j in range(len(lda_model.show_topics())) if j != i])
        tus_i = coherence_i - (1/(lda_model.num_topics-1)) * coherence_sum
        tus_scores.append(tus_i)
        
    return np.mean(np.abs(tus_scores))

In [8]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mxkretinin[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [15]:
# Define the hyperparameters to sweep over
sweep_config = {
    'name': 'NER_LDA',
    'method': 'grid',
    'metric': {
        'name': 'tus',
        'goal': 'minimize'
    },
    'parameters': {
        'num_topics': {'values': [6, 8, 10, 12]},
        'alpha': {'values': ['symmetric', 'asymmetric']}
    }
}

# used_corpus = corpus
used_corpus = corpus_extended

# Define the training function
def train():
    
    wandb.init()

    # Set the hyperparameters based on the wandb configuration
    num_topics = wandb.config.num_topics
#     passes = wandb.config.passes
#     iterations = wandb.config.iterations
    alpha = wandb.config.alpha
#     corpus_data = wandb.config.corpus_data
    
    # Create the LDA model
    lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=used_corpus,
                                id2word=dictionary,
                                num_topics=num_topics,
                                workers=19,
                                random_state=100,
                                chunksize=100,
                                passes=10,
                                iterations=200,
                                alpha=alpha,
                                per_word_topics=False
                                )

    # Compute coherence and perplexity
    coherence_model = CoherenceModel(model=lda_model,
                                     corpus=used_corpus,
                                     coherence='u_mass')
    coherence = coherence_model.get_coherence()

    perplexity = lda_model.log_perplexity(corpus)
    
    tus_score = calculate_tus_score(coherence_model, lda_model)
    
    # Log the results to wandb
    wandb.log({
        'coherence(U_mass)': coherence,
        'perplexity': perplexity,
        'tus': tus_score
    })

#     wandb.finish()

# Run the agent to sweep over the hyperparameters
sweep_id = wandb.sweep(sweep_config)

wandb.agent(sweep_id, function=train)

Create sweep with ID: 4z0u80mj
Sweep URL: https://wandb.ai/xkretinin/uncategorized/sweeps/4z0u80mj


[34m[1mwandb[0m: Agent Starting Run: t0i48cyn with config:
[34m[1mwandb[0m: 	alpha: symmetric
[34m[1mwandb[0m: 	num_topics: 6


VBox(children=(Label(value='0.001 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.187960…

0,1
coherence(U_mass),▁
perplexity,▁
tus,▁

0,1
coherence(U_mass),-1.10543
perplexity,-8.37341
tus,0.19184


[34m[1mwandb[0m: Agent Starting Run: up9m0zc9 with config:
[34m[1mwandb[0m: 	alpha: symmetric
[34m[1mwandb[0m: 	num_topics: 8


0,1
coherence(U_mass),▁
perplexity,▁
tus,▁

0,1
coherence(U_mass),-1.06102
perplexity,-8.37561
tus,0.18128


[34m[1mwandb[0m: Agent Starting Run: ejza40ks with config:
[34m[1mwandb[0m: 	alpha: symmetric
[34m[1mwandb[0m: 	num_topics: 10


VBox(children=(Label(value='0.001 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.194580…

0,1
coherence(U_mass),▁
perplexity,▁
tus,▁

0,1
coherence(U_mass),-1.25739
perplexity,-8.37812
tus,0.39318


[34m[1mwandb[0m: Agent Starting Run: erxzdk29 with config:
[34m[1mwandb[0m: 	alpha: symmetric
[34m[1mwandb[0m: 	num_topics: 12


VBox(children=(Label(value='0.001 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.194580…

0,1
coherence(U_mass),▁
perplexity,▁
tus,▁

0,1
coherence(U_mass),-1.2494
perplexity,-8.41
tus,0.37426


[34m[1mwandb[0m: Agent Starting Run: 31hlav5f with config:
[34m[1mwandb[0m: 	alpha: asymmetric
[34m[1mwandb[0m: 	num_topics: 6


VBox(children=(Label(value='0.001 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.194580…

0,1
coherence(U_mass),▁
perplexity,▁
tus,▁

0,1
coherence(U_mass),-1.128
perplexity,-8.37108
tus,0.20571


[34m[1mwandb[0m: Agent Starting Run: f3yl35s9 with config:
[34m[1mwandb[0m: 	alpha: asymmetric
[34m[1mwandb[0m: 	num_topics: 8


VBox(children=(Label(value='0.001 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.194580…

0,1
coherence(U_mass),▁
perplexity,▁
tus,▁

0,1
coherence(U_mass),-1.06603
perplexity,-8.37921
tus,0.18853


[34m[1mwandb[0m: Agent Starting Run: 8tp4mza4 with config:
[34m[1mwandb[0m: 	alpha: asymmetric
[34m[1mwandb[0m: 	num_topics: 10


VBox(children=(Label(value='0.001 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.686379…

0,1
coherence(U_mass),▁
perplexity,▁
tus,▁

0,1
coherence(U_mass),-1.15521
perplexity,-8.37863
tus,0.27895


[34m[1mwandb[0m: Agent Starting Run: s2m34476 with config:
[34m[1mwandb[0m: 	alpha: asymmetric
[34m[1mwandb[0m: 	num_topics: 12


VBox(children=(Label(value='0.005 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.880274…

0,1
coherence(U_mass),▁
perplexity,▁
tus,▁

0,1
coherence(U_mass),-1.22614
perplexity,-8.41087
tus,0.36969


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [None]:
# coherence_model = CoherenceModel(model=lda_model,
#                                      corpus=corpus,
#                                      coherence='u_mass')
# coherence = coherence_model.get_coherence()

# print(coherence)

In [35]:
import pyLDAvis.gensim_models

# used_corpus = corpus
used_corpus = corpus_extended

num_topics = 2 + len(named_entity_clusters)
print("num of topics: %s" % num_topics)

# Create the LDA model
lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=used_corpus,
                            id2word=dictionary,
                            num_topics=6,
                            workers=19,
                            random_state=100,
                            chunksize=100,
                            passes=10,
                            iterations=200,
                            alpha='asymmetric',
                            per_word_topics=False
                            )


# Prepare the data for visualization
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# Show the visualization
pyLDAvis.display(vis)

num of topics: 8


  default_term_info = default_term_info.sort_values(


In [36]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_tus(model):
    """
    Calculate Topic Uniqueness Score (TUS) for each topic in a given LDA model.
    Returns an array of TUS scores.
    """
    num_topics = model.num_topics
    word_probs = np.zeros((num_topics, len(model.id2word)))
    for topic_id in range(num_topics):
        word_probs[topic_id, :] = np.array([p for _, p in model.get_topic_terms(topic_id, len(model.id2word))])
    
    similarities = cosine_similarity(word_probs)
    np.fill_diagonal(similarities, 0) # set diagonal to 0 so a topic is not compared with itself
    tus_scores = np.mean(similarities, axis=1)
    tus_mean = np.mean(tus_scores)
    return tus_scores, tus_mean

tus_scores, tus_mean = calculate_tus(lda_model)
print(tus_scores)
print(tus_mean)


[0.67107518 0.65594279 0.65734442 0.55405214 0.66274131 0.49684864]
0.6163340811980053


In [None]:

import matplotlib.pyplot as plt


num_topics = lda_model.num_topics

# Get the coherence model for the LDA model
coherence_model = CoherenceModel(model=lda_model, corpus=used_corpus, coherence='u_mass')

# Get the coherence scores for all topics in the model
topic_coherence = coherence_model.get_coherence_per_topic()

# Calculate the TUS score for each topic in the model
tus_scores = []
for i in range(lda_model.num_topics):
    topic_i = lda_model.show_topic(i)
    coherence_i = topic_coherence[i]
    coherence_sum = sum([topic_coherence[j] for j in range(len(lda_model.show_topics())) if j != i])
    tus_i = coherence_i - (1/(len(lda_model.show_topics())-1)) * coherence_sum
    tus_scores.append(tus_i)

# Print the TUS score for each topic in the model
for i, score in enumerate(tus_scores):
    print(f"Topic {i+1}: TUS score = {score:.4f}")
mean_tus_score = np.mean(np.abs(tus_scores))
print(f"Mean TUS score = {mean_tus_score:.4f}")

In [None]:
print(tus_scores)

In [27]:
print(named_entity_clusters['PERSON'])

['Tayyip Erdogan', 'Osman Kavala', 'Joe Biden', 'Trude Maaseide', 'Tayyip Erdogan', 'David Sassoli', 'Osman Kavala', 'Jeppe Kofod', 'Selahattin Demirtas', 'Pacific Ocean', 'Tsugaru Strait', 'Pacific Ocean', 'Osumi Strait', 'Joe Biden', 'Boris Johnson', 'David Frost', 'Maros Sefcovic', 'Thomas Byrne', 'Olivier Lepretre', 'Annick Girardin', 'Jeffrey Feltman', 'Abdalla Hamdok', 'Sovereign Council', 'Park Inn', 'Natalya Nikonorova', 'Ann Linde', 'Guillermo Lasso', 'Matteo Salvini', 'Richard Gere', 'Giuseppe Conte', 'Richard Gere', 'Richard Gere', 'Matteo Salvini', 'Vanessa Nakate', 'Al Gore', 'Alok Sharma', 'John Kerry', 'Joe Biden', 'Build Back Better', 'Joe Biden', 'Liszt Quitel', 'Chenald Augustin', 'Ariel Henry', 'Jovenel Moise', 'Pfizer Inc', 'Joe Biden', 'Moderna Inc', 'Albert Bourla', 'Lee', 'Stanford University School', 'Vamil Divan', 'Drug Administration', 'Medicines Patent Pool', 'Bhavna Patel', 'Bhavna Patel', 'Bindiya Patel', 'Nicholls NEW', 'Chris Floyd', 'Virgin Atlantic', 'D