In [1]:
import spacy
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel
import zipfile
import pickle
from keras.models import load_model
import numpy as np
import re
from nltk.corpus import stopwords
import nltk

# nltk.download('stopwords')

# Tags matched by this pattern (numeric / temporal entity types, or the literal tag "Other")
# do not get the extra weighting applied in predict_text_2.
# Raw string literals are used so the regex escapes are not parsed as (invalid) Python string escapes.
pattern = re.compile(r'.*-(MONEY|QUANTITY|PERCENT|ORDINAL|DATE|TIME|CARDINAL)|(Other)')
# Matches tokens that consist solely of punctuation / quote characters.
pattern_symbols = re.compile(r'^[\.\\\/\[\]\(\),\-\'\"\?\!\“\”\’@:;–]+$')

In [1]:
# import sys
# !{sys.executable} -m pip install pyLDAvis

In [3]:
def split_text(text):
    """Split ``text`` into sentences and each sentence into word tokens.

    Uses NLTK's sentence and word tokenizers.

    Returns:
        A list with one entry per sentence; each entry is the list of that
        sentence's tokens.
    """
    tokenized_sentences = []
    for sentence in nltk.sent_tokenize(text):
        tokenized_sentences.append(nltk.word_tokenize(sentence))
    return tokenized_sentences

In [4]:
def remove_stopwords(tokens):
    """Return the tokens that are worth keeping for topic modelling.

    A token is dropped when its lower-cased form is an English stop word
    (or one of the source names "reuters" / "bbc"), when it has three or
    fewer characters, or when it consists only of punctuation symbols.
    The original casing and order of the surviving tokens is preserved.
    """
    blocked = set(stopwords.words("english")) | {"reuters", "bbc"}

    kept = []
    for token in tokens:
        lowered = token.lower()
        if lowered in blocked or len(token) <= 3:
            continue
        if pattern_symbols.match(lowered):
            continue
        kept.append(token)
    return kept

In [5]:
def process_file(file, filename, texts):
    """Read one archive member and append its decoded text to ``texts``.

    Args:
        file: Object exposing ``read(name)``; the notebook passes an open
            ``zipfile.ZipFile``.
        filename: Name of the member to read.
        texts: List that receives the decoded text (mutated in place).

    Members whose content is empty or whitespace-only are reported and
    skipped. Previously they were appended before the emptiness check ran,
    so blank documents still ended up in ``texts``.
    """
    content = file.read(filename)

    # Check for emptiness first so blank members never reach the corpus.
    if len(content.strip()) == 0:
        print("No text was found")
        return

    if isinstance(content, bytes):
        texts.append(content.decode('utf-8'))

In [6]:
def preprocess_text(text, word2idx, max_len):
    """Tokenise ``text`` and encode every sentence as a padded list of ids.

    Args:
        text: Raw document text.
        word2idx: Mapping from token to integer id; must contain the key
            'ENDPAD', whose id is used for padding.
        max_len: Minimum length every encoded sentence is padded to.

    Returns:
        ``(ids, sentences)`` where ``sentences`` is the tokenised text (one
        token list per sentence) and ``ids`` holds the matching id lists.
        Tokens missing from ``word2idx`` are encoded as 0.

    NOTE(review): sentences longer than ``max_len`` are not truncated, so
    their encoded rows are longer than ``max_len`` - confirm the model
    accepts that.
    """
    sentences = split_text(text)
    pad_id = word2idx['ENDPAD']

    ids = []
    for sentence_tokens in sentences:
        row = [word2idx[tok] if tok in word2idx else 0 for tok in sentence_tokens]
        missing = max_len - len(row)
        if missing > 0:
            row.extend([pad_id] * missing)
        ids.append(row)

    return ids, sentences

In [7]:
def predict_text_2(text, model, tags):
    """Tag every sentence of a preprocessed text and collect weighted tokens.

    Args:
        text: The ``(ids, sentences)`` pair returned by ``preprocess_text``:
            ``text[0]`` holds the encoded sentences, ``text[1]`` the matching
            token lists.
        model: Object with a Keras-style ``predict(batch, verbose=0)`` method.
            Its output is arg-maxed over the last axis - presumably shaped
            (1, sequence_length, n_tags); TODO confirm.
        tags: Sequence mapping a predicted class index to its tag name.

    Returns:
        ``(ents, labels)``. ``ents`` contains the lower-cased tokens longer
        than three characters, after stop-word filtering; a token whose tag
        does not match ``pattern`` is included twice. ``labels`` holds the tag
        of every such doubled token.

    The loop used to run over ``range(len(text))``. ``text`` is a 2-tuple, so
    only the first two sentences of each document were ever tagged and
    documents with fewer than two sentences raised IndexError. All sentences
    are now processed.
    """
    encoded_sentences, token_sentences = text[0], text[1]

    ents = []
    labels = []
    for encoded, tokens in zip(encoded_sentences, token_sentences):
        p = model.predict(np.array([encoded]), verbose=0)
        p = np.argmax(p, axis=-1)
        # Ignore predictions made for padding positions past the real tokens.
        for word, pred in zip(tokens, p[0][:len(tokens)]):
            if len(word) > 3:
                ents.append(word.lower())
                if not pattern.match(tags[pred]):
                    # append the same word once more to increase its statistics and weight artificially
                    ents.append(word.lower())
                    labels.append(tags[pred])
    ents = remove_stopwords(ents)
    return ents, labels

In [8]:
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance, used by lemmatize_tokens below.
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """Return a new list with every token replaced by its lemma.

    Each token is passed to the module-level ``lemmatizer`` on its own;
    order and length of the input are preserved.
    """
    return list(map(lemmatizer.lemmatize, tokens))

def get_labeled_words(model, texts, word2idx):
    """Run the tagging model over every text and return lemmatized tokens.

    Args:
        model: Tagging model; ``model.layers[0].output_shape[0][1]`` is used
            as the padding length (presumably the input layer's sequence
            length - TODO confirm).
        texts: Iterable of raw document strings.
        word2idx: Token -> id mapping forwarded to ``preprocess_text``.

    Returns:
        ``(lemmatized_text, proc_labels)``: one lemmatized token list and one
        tag list per input text.

    NOTE: the tag names are read from the notebook-level global ``tags``,
    which must be defined before this function is called.
    """
    # The padding length does not depend on the text, so look it up once.
    max_len = model.layers[0].output_shape[0][1]

    proc_ents = []
    proc_labels = []
    for text in texts:
        processed = preprocess_text(text, word2idx, max_len)
        # predict_text_2 already applies remove_stopwords to its result, so
        # filtering a second time here would be a no-op.
        ents, labels = predict_text_2(processed, model, tags)
        proc_ents.append(ents)
        proc_labels.append(labels)

    lemmatized_text = [lemmatize_tokens(doc) for doc in proc_ents]
    return lemmatized_text, proc_labels

def get_labeled_words_2(texts):
    """Model-free variant: tokenise, filter and lemmatize every text.

    Each text is split into sentences and tokens, the sentences are
    flattened into one token list, stop words are removed and the remaining
    tokens are lemmatized.

    Returns:
        A list with one lemmatized token list per input text.
    """
    filtered_docs = []
    for text in texts:
        flat_tokens = []
        for sentence in split_text(text):
            flat_tokens.extend(sentence)
        filtered_docs.append(remove_stopwords(flat_tokens))

    return [lemmatize_tokens(doc) for doc in filtered_docs]

In [None]:
# Collect the raw article texts from the selected archive.
texts = []
# Alternative archives used in earlier runs:
# with zipfile.ZipFile("data/articles_2021-11-05_1000.zip", "r") as f:
# with zipfile.ZipFile("data/articles_2023-01-07_2000.zip", "r") as f:
# with zipfile.ZipFile("data/articles_2023-02-04_500.zip", "r") as f:
with zipfile.ZipFile("data/articles_2023-02-09_1000.zip", "r") as f:
    # The with-statement closes the archive, so no explicit f.close() is needed.
    for filename in f.namelist():
        process_file(f, filename, texts)

# Load the tagging model together with its tag list and vocabulary.
model = load_model('models/my_model2/model.h5')
# ----------------------------#
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('models/my_model2/tags.pickle', 'rb') as handle:
    tags = pickle.load(handle)

with open('models/my_model2/words.pickle', 'rb') as handle:
    word2idx = pickle.load(handle)


# One lemmatized token list and one tag list per article.
proc_ents, proc_labels = get_labeled_words(model, texts, word2idx)

# proc_ents = get_labeled_words_2(texts)

print("Done")

In [None]:
# Build the gensim Dictionary (token -> integer id) from the per-document token lists in proc_ents
dictionary = Dictionary(proc_ents)

# Convert every document in proc_ents to its bag-of-words representation via Dictionary.doc2bow
corpus = [dictionary.doc2bow(text) for text in proc_ents]

# Train the LDA model on the corpus (disabled; training happens in custom_grid_search)
#lda_model = LdaModel(corpus, num_topics=6)

# Topic count referenced by the commented-out LdaMulticore experiment below
num_topics = 6

# lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
#                                         id2word=dictionary,
#                                         num_topics=num_topics,
#                                         random_state=100,
#                                         workers=7,
#                                         chunksize=10,
#                                         passes=20,
#                                         alpha='symmetric',
#                                         iterations=200,
#                                         per_word_topics=False)

# # Infer the topic distribution for each text
# # text_topics = [lda_model[c] for c in corpus]

# for i in range(num_topics):
#     topic_words = lda_model.show_topic(i)
#     tmp = [(word, prob) for word, prob in topic_words]
#     print("Topic %d: \n%s\n" % (i, tmp))

# # print(text_topics[0:10])

In [None]:
def progress_bar(iteration, total):
    """Draw a 100-character text progress bar on the current output line.

    Args:
        iteration: Number of steps completed so far.
        total: Total number of steps. A ``total`` of 0 is rendered as a
            finished bar instead of raising ZeroDivisionError.

    The line is rewritten in place (leading carriage return, no newline);
    a newline is emitted once ``iteration == total``.
    """
    total_len = 100
    if total:
        percent = 100 * (iteration / total)
        filled = int(total_len * iteration / total)
    else:
        # Nothing to do counts as done.
        percent = 100.0
        filled = total_len
    # Keep the bar inside its frame even if iteration is out of range.
    filled = max(0, min(total_len, filled))

    percent_part = ("{0:.2f}").format(percent)
    bar = '█' * filled + '-' * (total_len - filled)
    print(f'\r Progress: [{bar}] {percent_part}%', end='')
    if iteration == total:
        print()

In [None]:
def custom_grid_search(texts, corpus, id2word, hyperparams_list, coherence_cv_scores, coherence_umass_scores, perplexity_scores, dir_path, with_training):
    counter = 1

    if with_training:
        for elem in hyperparams_list:
            num_topics, alpha = elem.values()
            progress_bar(counter, len(hyperparams_list))
            counter += 1
            lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                        id2word=id2word,
                                                        num_topics=num_topics,
                                                        workers=19,
                                                        random_state=100,
                                                        chunksize=100,
                                                        passes=10,
                                                        alpha=alpha,
                                                        iterations=200,
                                                        per_word_topics=False)

            joblib.dump(lda_model, dir_path + str(num_topics) + 'topics_' + alpha + '_new.jl')

    start_time = time.time()
    counter = 1
    for elem in hyperparams_list:
        num_topics, alpha = elem.values()
        progress_bar(counter, len(hyperparams_list))
        counter += 1

        lda_model = joblib.load(dir_path + str(num_topics) + 'topics_' + alpha + '_new.jl')

        # Coherence model to get coherence score, based on currently used corpus and dictionary
        coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, texts=texts,
                                                           dictionary=id2word, coherence='c_v')

        coherence_cv_scores.append({"num_topics": num_topics,
                                 "C_V coherence": coherence_model_lda.get_coherence()})

        # Coherence model to get coherence score, based on currently used corpus and dictionary
        coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')

        coherence_umass_scores.append({"num_topics": num_topics,
                                 "U_Mass coherence": coherence_model_lda.get_coherence()})

        perplexity_scores.append({"num_topics": num_topics,
                                  "perplexity": lda_model.log_perplexity(corpus)})

    final_time = time.time() - start_time
    print("Total time spent: " + str(final_time) + " seconds")

    alpha = hyperparams_list[0]["alpha"]
#     joblib.dump(coherence_cv_scores, dir_path + "coherence_cv_scores_" + alpha + "_alpha.jl")
#     joblib.dump(coherence_umass_scores, dir_path + "coherence_umass_scores_" + alpha + "_alpha.jl")
#     joblib.dump(perplexity_scores, dir_path + "perplexity_scores_" + alpha + "_alpha.jl")

In [None]:
import joblib
import time

# One hyper-parameter set per topic count, with a symmetric document-topic prior
hyperparams_list = [{"num_topics": n, "alpha": "symmetric"} for n in (4, 6, 8, 10, 12, 14)]

# Same topic counts with an asymmetric prior (only used by the disabled call below)
hyperparams_list_2 = [{"num_topics": n, "alpha": "asymmetric"} for n in (4, 6, 8, 10, 12, 14)]

# Result lists filled in place by custom_grid_search
coherence_cv_scores, coherence_umass_scores, perplexity_scores = [], [], []
dir_path = "data/hyperparameter_tuning/"

custom_grid_search(
    texts=proc_ents,
    corpus=corpus,
    id2word=dictionary,
    hyperparams_list=hyperparams_list,
    coherence_cv_scores=coherence_cv_scores,
    coherence_umass_scores=coherence_umass_scores,
    perplexity_scores=perplexity_scores,
    dir_path=dir_path,
    with_training=True,
)

# custom_grid_search(proc_ents, corpus, dictionary, hyperparams_list_2,
#                    coherence_cv_scores, coherence_umass_scores, perplexity_scores, dir_path, True)

for header, scores in (("Coherence_cv:\n", coherence_cv_scores),
                       ("Coherence_umass:\n", coherence_umass_scores),
                       ("Perplexity:\n", perplexity_scores)):
    print(header + str(scores))

In [None]:
# Print the dictionary built from proc_ents as a quick sanity check
print(dictionary)

In [None]:
# from gensim.models import LdaModel
# from gensim.models import CoherenceModel
# import itertools

# # Define the range of hyperparameters to search over
# num_topics = [4, 6, 8, 10]
# chunksize = [100, 200, 300]
# passes = [10, 20, 30, 40]

# # Create a list of all possible hyperparameter combinations
# hyperparameters = list(itertools.product(num_topics, chunksize, passes))

# # Initialize variables to store the best model and coherence score
# best_model = None
# best_coherence_score = -float('inf')

# counter = 0
# progress_bar(counter, len(hyperparameters))

# # Loop through each hyperparameter combination and train an LDA model
# for params in hyperparameters:
    
#     num_topics, chunksize, passes = params
#     lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
#                          id2word=dictionary,
#                          workers=19,
#                          num_topics=num_topics,
#                          chunksize=chunksize,
#                          passes=passes,
#                          alpha='symmetric',
#                          per_word_topics=False)
    
#     # Compute the coherence score for the current model
#     coherence_model = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
#     coherence_score = coherence_model.get_coherence()
    
#     # If the coherence score is better than the best seen so far, update the best model and score
#     if coherence_score > best_coherence_score:
#         best_model = lda_model
#         best_coherence_score = coherence_score
    
#     counter += 1
#     progress_bar(counter, len(hyperparameters))
        
# joblib.dump(lda_model, "data/hyperparameter_tuning/" + str(best_model.num_topics) + 'topics.jl')
        
# # Print the best hyperparameters and coherence score
# print("Best hyperparameters: num_topics={}, chunksize={}, passes={}".format(best_model.num_topics, best_model.chunksize, best_model.passes))
# print("Best coherence score: {}".format(best_coherence_score))

In [None]:
import matplotlib.pyplot as plt

def visualize_results(data):
    """Show a bar chart of one score per topic count.

    Args:
        data: Non-empty list of dicts that share the same keys. The first
            key of the first dict supplies the x values, the second key the
            y values and the axis/title text.
    """
    keys = list(data[0].keys())
    x_key, y_key = keys[0], keys[1]

    x, y = [], []
    for entry in data:
        x.append(entry[x_key])
        y.append(entry[y_key])

    plt.bar(x, y, align='center')
    plt.xlabel('Number of Topics')
    plt.ylabel(y_key)
    plt.title('Histogram of {}'.format(y_key))

    plt.show()

In [None]:
# One bar chart per metric collected by the grid search
for scores in (coherence_cv_scores, coherence_umass_scores, perplexity_scores):
    visualize_results(scores)

In [None]:
# Reload the stored 4-topic model (symmetric alpha) and list the top words of each topic
lda_model = joblib.load(dir_path + str(4) + 'topics_symmetric_new.jl')
for i in range(4):
    topic_words = lda_model.show_topic(i)
    tmp = []
    for term, weight in topic_words:
        tmp.append((term, weight))
    print("Topic %d: \n%s\n" % (i, tmp))

In [None]:
import pyLDAvis.gensim_models

# Build the pyLDAvis data from the loaded LDA model, the bag-of-words corpus and the dictionary
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# Render the visualization as this cell's output
pyLDAvis.display(vis)

In [None]:
from sklearn.manifold import TSNE
import pandas as pd
import matplotlib.colors as mcolors
from bokeh.plotting import figure, show

def t_SNE_clustering(lda_model, corpus, perplexity=None):
    """Scatter-plot the documents' topic distributions after t-SNE reduction.

    Args:
        lda_model: Trained gensim LDA model; ``lda_model[corpus]`` must yield
            ``(topic_id, weight)`` pairs per document.
        corpus: Bag-of-words corpus with at least two documents.
        perplexity: t-SNE perplexity. When omitted the value is requested
            interactively via ``input()``, as before.

    Returns:
        None. A matplotlib figure is shown; nothing is plotted when fewer
        than two documents remain.

    Fixes over the previous version:
      * Weights are placed at their topic id. The rows from ``lda_model[corpus]``
        only list some topics, so packing the weights positionally put them
        in the wrong topic columns.
      * Colours wrap around when the model has more topics than the Tableau
        palette has colours (previously an IndexError).
      * Perplexity is validated: non-integer or non-positive input falls back
        to 30 and the value is capped below the number of samples.
      * The embedding is 2-D, matching the 2-D scatter plot.
    """
    if (len(corpus) < 2):
        print("Required 2 or more texts in the corpus")
        return

    n_topics = lda_model.num_topics

    # Dense (documents x topics) matrix; topics missing from a row stay at 0.
    topic_weights = []
    for row_list in lda_model[corpus]:
        row = row_list[0] if lda_model.per_word_topics else row_list
        weights = [0.0] * n_topics
        for topic_id, weight in row:
            weights[topic_id] = weight
        topic_weights.append(weights)

    # Array of topic weights
    arr = np.array(topic_weights)

    # Keep the well separated points (optional)
    arr = arr[np.amax(arr, axis=1) > 0.35]
    if len(arr) < 2:
        print("Required 2 or more well separated texts in the corpus")
        return

    # Dominant topic number in each doc
    topic_num = np.argmax(arr, axis=1)

    if perplexity is None:
        print("Choose perplexity (neigbors=3*perplexity):")
        try:
            perpl = int(input())
        except ValueError:
            perpl = 30
    else:
        perpl = perplexity

    if perpl <= 0:
        perpl = 30
    # t-SNE requires the perplexity to be smaller than the number of samples.
    perpl = min(perpl, len(arr) - 1)

    # tSNE Dimension Reduction
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca', n_jobs=7, perplexity=perpl)
    tsne_lda = tsne_model.fit_transform(arr)

    # Plot the topic clusters with matplotlib, one colour per dominant topic
    mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
    plt.figure(figsize=(12, 8))
    plt.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1], color=mycolors[topic_num % len(mycolors)])
    plt.show()

In [None]:
# Plot the t-SNE projection for the loaded model (prompts for the perplexity value)
t_SNE_clustering(lda_model, corpus)