# BERT+LDA

In [None]:
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
import re
import nltk
from nltk.tokenize import word_tokenize
from language_detector import detect_language

import pkg_resources
from symspellpy import SymSpell, Verbosity

# Module-level SymSpell instance used for typo correction.
sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
# Load the frequency dictionary only when the instance holds no words yet.
if not sym_spell.word_count:
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)


def f_base(s):
    """
    Apply the sentence-level normalization steps to a raw string.

    :param s: string to be processed
    :return: processed string: see comments in the source code for more info
    """
    # normalization 1: xxxThis is a --> xxx. This is a (missing delimiter)
    # Must run before lower-casing. The template uses a plain '.', because an
    # escaped '\.' in a replacement string emits a literal backslash.
    s = re.sub(r'([a-z])([A-Z])', r'\1. \2', s)
    # normalization 2: lower case
    s = s.lower()
    # normalization 3: "&gt", "&lt" (including the optional trailing ';')
    s = re.sub(r'&[gl]t;?', ' ', s)
    # normalization 4: letter repetition (if more than 2)
    s = re.sub(r'([a-z])\1{2,}', r'\1', s)
    # normalization 5: non-word repetition (if more than 1)
    s = re.sub(r'(\W)\1+', r'\1', s)
    # normalization 6: string * as delimiter
    s = re.sub(r'\*|\W\*|\*\W', '. ', s)
    # normalization 7: stuff in parenthesis, assumed to be less informative
    s = re.sub(r'\(.*?\)', '. ', s)
    # normalization 8: xxx[?!]. -- > xxx.
    s = re.sub(r'\W+?\.', '.', s)
    # normalization 9: [.?!] --> [.?!] xxx
    s = re.sub(r'(\.|\?|!)(\w)', r'\1 \2', s)
    # normalization 10: ' ing ', noise text
    s = re.sub(r' ing ', ' ', s)
    # normalization 11: noise text (followed by a period or a space)
    s = re.sub(r'product received for free[. ]', ' ', s)
    # normalization 12: phrase repetition
    s = re.sub(r'(.{2,}?)\1{1,}', r'\1', s)

    return s.strip()


# language detection
def f_lan(s):
    """
    Check whether the detected language of a string is an accepted one.

    :param s: string to be processed
    :return: boolean, True when detect_language(s) is 'English' or 'French'
    """
    detected = detect_language(s)
    # French is accepted too: some reviews are actually english but biased toward french
    accepted = {'English', 'French'}
    return detected in accepted


###############################
#### word level preprocess ####
###############################

# filtering out punctuations and numbers
def f_punct(w_list):
    """
    Keep only the purely alphabetic tokens.

    :param w_list: word list to be processed
    :return: new list without tokens for which isalpha() is False
    """
    kept = []
    for token in w_list:
        if token.isalpha():
            kept.append(token)
    return kept


# selecting nouns
def f_noun(w_list):
    """
    Keep only the tokens whose part-of-speech tag starts with 'NN'.

    :param w_list: word list to be processed
    :return: w_list with only nouns selected
    """
    nouns = []
    for token, tag in nltk.pos_tag(w_list):
        if tag[:2] == 'NN':
            nouns.append(token)
    return nouns


# typo correction
def f_typo(w_list):
    """
    Replace each word by its first SymSpell suggestion.

    :param w_list: word list to be processed
    :return: list of corrected words; words without any suggestion are dropped
    """
    corrected = []
    for token in w_list:
        candidates = sym_spell.lookup(token, Verbosity.CLOSEST, max_edit_distance=3)
        if not candidates:
            # no match: the word is dropped
            continue
        corrected.append(candidates[0].term)
    return corrected


# stemming if doing word-wise
# module-level PorterStemmer instance, used by f_stem below
p_stemmer = PorterStemmer()


def f_stem(w_list):
    """
    Stem every word with the module-level Porter stemmer.

    :param w_list: word list to be processed
    :return: new list holding the stem of each word, in the same order
    """
    stems = []
    for token in w_list:
        stems.append(p_stemmer.stem(token))
    return stems


# filtering out stop words
# English stop words list, extended with four extra words
en_stop = get_stop_words('en')
en_stop.extend(['game', 'play', 'player', 'time'])


def f_stopw(w_list):
    """
    Remove the words that appear in en_stop.

    :param w_list: word list to be processed
    :return: new list without the stop words
    """
    remaining = []
    for token in w_list:
        if token in en_stop:
            continue
        remaining.append(token)
    return remaining


def preprocess_sent(rw):
    """
    Sentence-level preprocessing of one raw review.

    :param rw: review to be processed
    :return: the f_base-normalized review, or None when f_lan rejects it
    """
    normalized = f_base(rw)
    return normalized if f_lan(normalized) else None


def preprocess_word(s):
    """
    Word-level preprocessing of an already sentence-preprocessed string.

    Steps, in order: tokenize, remove punctuation/numbers, select nouns,
    fix typos, stem, remove stop words.

    :param s: sentence to be processed
    :return: list of processed words, or None when s is empty/None
    """
    if not s:
        return None
    tokens = word_tokenize(s)
    # each step takes a word list and returns a new word list
    for step in (f_punct, f_noun, f_typo, f_stem, f_stopw):
        tokens = step(tokens)
    return tokens


In [2]:
import keras
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt


class Autoencoder:
    """
    Autoencoder for learning latent space representation
    architecture simplified for only one hidden layer
    """

    def __init__(self, latent_dim=32, activation='relu', epochs=200, batch_size=128):
        """
        :param latent_dim: size of the hidden (encoded) layer
        :param activation: activation used by both Dense layers
        :param epochs: number of training epochs used by fit
        :param batch_size: batch size used by fit
        """
        self.latent_dim = latent_dim
        self.activation = activation
        self.epochs = epochs
        self.batch_size = batch_size
        self.autoencoder = None
        self.encoder = None
        self.decoder = None
        self.his = None  # training history returned by the last fit

    def _compile(self, input_dim):
        """
        compile the computational graph

        :param input_dim: number of features of the input vectors
        """
        input_vec = Input(shape=(input_dim,))
        encoded = Dense(self.latent_dim, activation=self.activation)(input_vec)
        decoded = Dense(input_dim, activation=self.activation)(encoded)
        self.autoencoder = Model(input_vec, decoded)
        self.encoder = Model(input_vec, encoded)
        # stand-alone decoder that reuses the last layer of the autoencoder
        encoded_input = Input(shape=(self.latent_dim,))
        decoder_layer = self.autoencoder.layers[-1]
        self.decoder = Model(encoded_input, decoder_layer(encoded_input))
        self.autoencoder.compile(optimizer='adam', loss=keras.losses.mean_squared_error)

    def fit(self, X):
        """
        Train the autoencoder to reconstruct X and plot the loss curves.

        The graph is compiled on the first call. X is split into a train and
        a validation part with train_test_split.

        :param X: 2-D array of input vectors (X.shape[1] is the input size)
        """
        if self.autoencoder is None:
            self._compile(X.shape[1])
        X_train, X_test = train_test_split(X)
        # use the epochs / batch size given to the constructor
        self.his = self.autoencoder.fit(X_train, X_train,
                                        epochs=self.epochs,
                                        batch_size=self.batch_size,
                                        shuffle=True,
                                        validation_data=(X_test, X_test), verbose=0)

        # training vs. validation loss per epoch
        plt.figure(figsize=(10, 6), dpi=350)
        plt.plot(self.his.history['loss'], label='Оқыту кезіндегі қателесу')
        plt.plot(self.his.history['val_loss'], label='Тексеру кезіндері қателесу')
        plt.title('LDA_BERT моделі Оқыту кезіндегі қателесу мәндері')
        plt.xlabel('Эпохалар')
        plt.ylabel('Қателеу мәні')
        plt.legend()
        plt.show()

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from gensim import corpora
import gensim
from datetime import datetime
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
import keras
from keras.layers import Input, Dense
from keras.models import Model

def preprocess(docs, samp_size=None):
    """
    Preprocess a random sample of the documents.

    :param docs: indexable collection of raw texts
                 (NOTE(review): indexed as docs[idx] with idx in 0..len(docs)-1;
                 for a pandas Series this assumes a default integer index)
    :param samp_size: number of documents to sample; defaults to 100 and is
                      capped at len(docs)
    :return: (sentences, token_lists, idx_in): sentence-level results,
             word-level results and the sampled indices that were kept
    """
    if not samp_size:
        samp_size = 100

    print('Preprocessing raw texts ...')
    n_docs = len(docs)
    sentences = []  # sentence level preprocessed
    token_lists = []  # word level preprocessed
    idx_in = []  # index of sample selected
    if n_docs == 0:
        # nothing to sample from
        print('Preprocessing raw texts. Done!')
        return sentences, token_lists, idx_in

    # sample without replacement so that no document is processed twice
    samp = np.random.choice(n_docs, min(samp_size, n_docs), replace=False)
    for i, idx in enumerate(samp):
        sentence = preprocess_sent(docs[idx])
        token_list = preprocess_word(sentence)
        # documents that end up with no tokens are skipped
        if token_list:
            idx_in.append(idx)
            sentences.append(sentence)
            token_lists.append(token_list)
        print('{} %'.format(str(np.round((i + 1) / len(samp) * 100, 2))), end='\r')
    print('Preprocessing raw texts. Done!')
    return sentences, token_lists, idx_in


# define model object
class Topic_Model:
    """
    Topic model that either fits a gensim LDA model (method 'LDA') or
    clusters document vectors (methods 'TFIDF', 'BERT', 'LDA_BERT').

    The vocabulary, the LDA model, the TF-IDF vectorizer and the autoencoder
    are created during training and reused afterwards, so vectors computed
    for new documents stay compatible with the fitted cluster model.
    """

    def __init__(self, k=10, method='TFIDF'):
        """
        :param k: number of topics
        :param method: method chosen for the topic model
        :raises ValueError: if method is not a supported name
        """
        if method not in {'TFIDF', 'LDA', 'BERT', 'LDA_BERT'}:
            raise ValueError('Invalid method!')
        self.k = k
        self.dictionary = None  # id <-> term dictionary of the training data
        self.corpus = None  # bag-of-words of the training data
        self.cluster_model = None
        self.ldamodel = None
        self.tfidf = None  # fitted TfidfVectorizer (method 'TFIDF')
        self.vec = {}
        self.gamma = 15  # parameter for relative importance of lda
        self.method = method
        self.AE = None
        self.id = method + '_' + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

    def _bow(self, token_lists):
        """
        Convert tokenized documents to bag-of-words.

        The dictionary is built from the first documents seen and kept, so
        later documents are mapped onto the same term ids.

        :param token_lists: list of token lists
        :return: list of bag-of-words, one per document
        """
        if self.dictionary is None:
            self.dictionary = corpora.Dictionary(token_lists)
        corpus = [self.dictionary.doc2bow(text) for text in token_lists]
        if self.corpus is None:
            self.corpus = corpus
        return corpus

    def _lda_vectors(self, corpus):
        """
        Get the LDA vector representation (probabilistic topic assignments)

        :param corpus: list of bag-of-words
        :return: array with dimension (n_doc * n_topic)
        """
        vec_lda = np.zeros((len(corpus), self.k))
        for i, bow in enumerate(corpus):
            # topics not returned by the model keep probability 0
            for topic, prob in self.ldamodel.get_document_topics(bow):
                vec_lda[i, topic] = prob
        return vec_lda

    def vectorize(self, sentences, token_lists, method=None):
        """
        Get vector representations from selected methods

        :param sentences: sentence-level preprocessed documents
        :param token_lists: word-level preprocessed documents
        :param method: vectorization method, defaults to self.method
        :return: document vectors for the given method
        :raises ValueError: if method is not a supported name
        """
        # Default method
        if method is None:
            method = self.method

        # bag-of-words of these documents, using the training vocabulary
        corpus = self._bow(token_lists)

        if method == 'TFIDF':
            print('Getting vector representations for TF-IDF ...')
            if self.tfidf is None:
                self.tfidf = TfidfVectorizer()
                vec = self.tfidf.fit_transform(sentences)
            else:
                # reuse the fitted vocabulary so dimensions match the clusters
                vec = self.tfidf.transform(sentences)
            print('Getting vector representations for TF-IDF. Done!')
            return vec

        elif method == 'LDA':
            print('Getting vector representations for LDA ...')
            if self.ldamodel is None:
                self.ldamodel = gensim.models.ldamodel.LdaModel(self.corpus, num_topics=self.k, id2word=self.dictionary,
                                                                passes=20)
            vec = self._lda_vectors(corpus)
            print('Getting vector representations for LDA. Done!')
            return vec

        elif method == 'BERT':

            print('Getting vector representations for BERT ...')
            from sentence_transformers import SentenceTransformer
            model = SentenceTransformer('bert-base-nli-max-tokens')
            vec = np.array(model.encode(sentences, show_progress_bar=True))
            print('Getting vector representations for BERT. Done!')
            return vec

        elif method == 'LDA_BERT':

            vec_lda = self.vectorize(sentences, token_lists, method='LDA')
            vec_bert = self.vectorize(sentences, token_lists, method='BERT')
            # LDA part is scaled by gamma before concatenation
            vec_ldabert = np.c_[vec_lda * self.gamma, vec_bert]
            self.vec['LDA_BERT_FULL'] = vec_ldabert
            if self.AE is None:
                self.AE = Autoencoder()
                print('Fitting Autoencoder ...')
                self.AE.fit(vec_ldabert)
                print('Fitting Autoencoder Done!')
            vec = self.AE.encoder.predict(vec_ldabert)
            return vec

        raise ValueError('Invalid method!')

    def fit(self, sentences, token_lists, method=None, m_clustering=None):
        """
        Fit the topic model for selected method given the preprocessed data

        :param sentences: sentence-level preprocessed documents
        :param token_lists: word-level preprocessed documents
        :param method: method to fit, defaults to self.method
        :param m_clustering: clustering class called as m_clustering(k),
                             defaults to KMeans
        :return: None
        """
        # Default method
        if method is None:
            method = self.method
        # Default clustering method
        if m_clustering is None:
            m_clustering = KMeans

        # builds the dictionary / training corpus on the first call
        self._bow(token_lists)

        ####################################################
        #### Getting ldamodel or vector representations ####
        ####################################################

        if method == 'LDA':
            if self.ldamodel is None:
                print('Fitting LDA ...')
                self.ldamodel = gensim.models.ldamodel.LdaModel(self.corpus, num_topics=self.k, id2word=self.dictionary,
                                                                passes=20)
                print('Fitting LDA Done!')
        else:
            print('Clustering embeddings ...')
            self.cluster_model = m_clustering(self.k)
            # TF-IDF vocabulary is refit together with the clusters
            self.tfidf = None
            self.vec[method] = self.vectorize(sentences, token_lists, method)
            self.cluster_model.fit(self.vec[method])
            print('Clustering embeddings. Done!')

    def predict(self, sentences, token_lists, out_of_sample=None):
        """
        Predict topics for new_documents

        :param sentences: sentence-level preprocessed documents
        :param token_lists: word-level preprocessed documents
        :param out_of_sample: truthy to predict for the given documents;
                              None/False to predict for the training data
        :return: array of topic / cluster labels
        """
        # Default as False
        out_of_sample = bool(out_of_sample)

        vec = None
        if out_of_sample:
            corpus = [self.dictionary.doc2bow(text) for text in token_lists]
            if self.method != 'LDA':
                vec = self.vectorize(sentences, token_lists)
        else:
            corpus = self.corpus
            vec = self.vec.get(self.method, None)

        if self.method == "LDA":
            # label = most probable topic of each document
            lbs = np.array([max(self.ldamodel.get_document_topics(bow), key=lambda tp: tp[1])[0]
                            for bow in corpus])
        else:
            lbs = self.cluster_model.predict(vec)
        return lbs

In [None]:
from typing import Counter
from gensim.models.coherencemodel import CoherenceModel
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import umap


# create a 10x10 inch, 200 dpi figure at import time of this cell
plt.figure(figsize=(10, 10),dpi=200)


def plot_proj(embedding, lbs):
    """
    Scatter the first two embedding columns, one series per cluster label.

    Labels are taken as 0 .. (number of unique labels - 1); each legend entry
    shows the share of points carrying that label.

    :param embedding: UMAP (or other) embeddings
    :param lbs: labels
    """
    total = len(embedding)
    label_counts = Counter(lbs)
    n_clusters = len(np.unique(lbs))
    for cluster in range(n_clusters):
        share = label_counts[cluster] / total * 100
        plt.plot(embedding[:, 0][lbs == cluster], embedding[:, 1][lbs == cluster], '.', alpha=0.5,
                 label='cluster {}: {:.2f}%'.format(cluster, share))
    plt.legend()


def _cluster_top_words(token_lists, labels, topn=10):
    """Return, per cluster label (sorted), its `topn` most frequent tokens."""
    from collections import Counter

    counts = {}
    for tokens, label in zip(token_lists, labels):
        counts.setdefault(label, Counter()).update(tokens)
    return [[word for word, _ in counts[label].most_common(topn)]
            for label in sorted(counts) if counts[label]]


def get_coherence(model, token_lists, measure='c_v'):
    """
    Get model coherence from gensim.models.coherencemodel

    For 'LDA' the gensim model is scored directly. For the clustering
    methods the topics are the most frequent tokens of each cluster.

    :param model: Topic_Model object
    :param token_lists: token lists of the docs the model was fitted on
                        (paired in order with model.cluster_model.labels_)
    :param measure: coherence metrics
    :return: coherence score
    """
    if model.method == 'LDA':
        cm = CoherenceModel(model=model.ldamodel, texts=token_lists, corpus=model.corpus, dictionary=model.dictionary,
                            coherence=measure)
    else:
        # CoherenceModel expects topics as a list of token lists
        topics = _cluster_top_words(token_lists, model.cluster_model.labels_)
        cm = CoherenceModel(topics=topics, texts=token_lists, corpus=model.corpus, dictionary=model.dictionary,
                            coherence=measure)
    return cm.get_coherence()

def get_silhouette(model):
    """
    Compute the silhouette score of the clusters.

    :param model: Topic_Model object.
    :return: silhouette value, or the string 'N/A' for method 'LDA'.
    """
    if model.method != 'LDA':
        cluster_labels = model.cluster_model.labels_
        vectors = model.vec[model.method]
        return silhouette_score(vectors, cluster_labels)
    # the silhouette is not directly applicable to LDA
    return 'N/A'

def visualize(model):
    """
    Plot a 2-D UMAP projection of the model's document vectors.

    Prints a message and returns without plotting for method 'LDA'.

    :param model: Topic_Model object
    """
    if model.method == 'LDA':
        print("LDA модель не поддерживает этот тип визуализации.")
        return

    projection = umap.UMAP().fit_transform(model.vec[model.method])
    plot_proj(projection, model.cluster_model.labels_)
    # same points again, colored by cluster label, with one colorbar tick per cluster
    plt.scatter(projection[:, 0], projection[:, 1], c=model.cluster_model.labels_, cmap='Spectral', s=5)
    colorbar = plt.colorbar(boundaries=np.arange(model.k+1)-0.5)
    colorbar.set_ticks(np.arange(model.k))

    plt.show()

# Collect the top words of every topic
def get_topic_words(lda_model, num_topics):
    """
    :param lda_model: object exposing show_topic(topic_id) -> (word, weight) pairs
    :param num_topics: number of topics, ids 0 .. num_topics - 1
    :return: list with one space-joined string of top words per topic
    """
    return [
        ' '.join(term for term, _ in lda_model.show_topic(topic_id))
        for topic_id in range(num_topics)
    ]

# Word cloud generation for one topic
def get_wordcloud(topic_words, topic_num):
    """
    :param topic_words: list of space-joined topic word strings
    :param topic_num: index of the topic to draw
    """
    cloud = WordCloud(width=500, height=560, background_color='white', collocations=False)
    image = cloud.generate(topic_words[topic_num])
    plt.figure(figsize=(10, 7))
    plt.imshow(image, interpolation='bilinear')
    plt.axis("off")
    plt.show()



In [None]:
import pandas as pd



# Load the data
data_path = 'bbc-text.csv'  # Replace with the real path to your file
print("Загрузка данных...")
data = pd.read_csv(data_path)
print("Данные загружены.")

In [None]:

# preprocess returns three values: sentences, token lists and the sampled
# indices that were kept
sentences, token_lists, idx_in = preprocess(data['text'])
print("Предварительная обработка текстов завершена.")

In [None]:

method='LDA_BERT'

# Create and train the topic model
print("Начало обучения модели тематического моделирования...")
model = Topic_Model(k=5, method=method)
model.fit(sentences, token_lists)
print("Модель тематического моделирования обучена.")

import pickle

# Pickle the trained model object to '<method>_model.pkl'
with open(f'{method}_model.pkl', 'wb') as file:
    pickle.dump(model, file)


In [None]:


# Topic coherence analysis (c_v measure)
coherence = get_coherence(model, token_lists, measure='c_v')
print(f"Когерентность тем: {coherence}")


In [None]:

# Visualize the results (UMAP projection of the document vectors)
print("Визуализация результатов...")
visualize(model)


In [None]:
from gensim.models.coherencemodel import CoherenceModel
from sklearn.metrics import silhouette_score

# Evaluate the trained `model` (Topic_Model) on the preprocessed `token_lists`

# For LDA or LDA_BERT: coherence of the underlying gensim LDA model
if model.ldamodel is not None:
    lda_coherence = CoherenceModel(model=model.ldamodel, corpus=model.corpus, dictionary=model.dictionary, coherence='u_mass').get_coherence()
    lda_cv = CoherenceModel(model=model.ldamodel, texts=token_lists, dictionary=model.dictionary, coherence='c_v').get_coherence()
    print(f"LDA U-Mass Coherence: {lda_coherence}")
    print(f"LDA C_V Coherence: {lda_cv}")

# For the clustering-based methods (TF-IDF, BERT, LDA_BERT): silhouette of the clusters
if model.cluster_model is not None:
    labels = model.cluster_model.labels_
    silhouette_avg = silhouette_score(model.vec[model.method], labels)  # feature matrix used for clustering
    print(f"Silhouette Score: {silhouette_avg}")

In [None]:
num_topics = 5  

# Get the words of every topic
topic_words = get_topic_words(model.ldamodel, num_topics)

# Generate the word cloud for one topic, here the first one
print("Визуализация облака слов для выбранной темы...")
get_wordcloud(topic_words, topic_num=0)  # pass the topic words and the topic index
print("Процесс завершен.")

In [None]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel



# Display names for topic ids.
# NOTE(review): assumes topic ids 0-4 correspond to these categories; nothing
# in this file ties the learned topic order to the names — confirm.
topic_names = {0:'tech', 1:  'business', 2: 'sport', 3:'entertainment' , 4: 'politics'}

# Ten sample texts, two per category
test_texts = [
    # Tech
    "The rapid advancement in quantum computing has the potential to revolutionize industries by making data processing significantly faster.",
    "Emerging technologies such as blockchain and IoT are becoming pivotal in shaping the future landscape of digital transactions and smart homes.",
    
    # Business
    "Global markets are increasingly volatile, with trade tensions and geopolitical uncertainties affecting investor sentiment.",
    "Startups are finding it more challenging to secure funding as venture capitalists tighten their criteria in a post-pandemic economy.",
    
    # Sport
    "The sports world is eagerly anticipating the upcoming Olympics, where new records are expected to be set in various disciplines.",
    "Major League Baseball sees a historic season as a young rookie breaks the long-standing home run record.",
    
    # Entertainment
    "The film industry is seeing a shift towards streaming platforms, which are now premiering blockbuster movies directly to consumers at home.",
    "Virtual reality concerts are gaining popularity, offering an immersive experience for fans to see their favorite artists perform live.",
    
    # Politics
    "Election campaigns are increasingly relying on social media to engage with voters, raising concerns about misinformation and data privacy.",
    "International relations are tense as negotiations stall on climate change initiatives, with major countries failing to agree on emissions targets."
]



# Step 1: preprocess the test texts
# preprocess_word returns None for rejected/empty sentences; fall back to an
# empty token list so doc2bow always receives a list
processed_test_texts = [preprocess_word(preprocess_sent(text)) or [] for text in test_texts]

# Step 2: convert the processed test texts to bag-of-words with the model dictionary
test_corpus = [model.dictionary.doc2bow(text) for text in processed_test_texts]

# Step 3: get the topic distribution of the test texts
# For LDA or LDA_BERT (when LDA is part of the process)
if model.method in ['LDA', 'LDA_BERT']:
    test_topics = [model.ldamodel.get_document_topics(bow) for bow in test_corpus]
else:
    raise NotImplementedError("Әдістер тек LDA және LDA_BERT арналған")

for i, topics_distribution in enumerate(test_topics):
    print(f"\nМәтін {i+1}:")
    for topic, prob in topics_distribution:
        # look up the topic name by its index
        topic_name = topic_names.get(topic, f"Белгісіз мәтін {topic}")
        print(f"Тема '{topic_name}': {prob:.4f}")