Демонстрационный ноут по иерархической тематической модели TreeProfitTM на основе "плоской" версии ProfitTM.

Плюсы:
1) Автоматическое определение оптимального количества тем
2) Поддержка иерархической тематизации
3) Высокая интерпретируемость и декорреляция топиков
4) Скорость инференса
5) Гибкость настроек
6) Не требует много видеопамяти (можно относительно быстро натренировать даже на CPU)

Минусы:
1) Необходимость использовать нейронки на стадии тренировки модели
2) Агломеративная кластеризация медленная и требует много памяти (можно настроить)

Принцип работы ProfitTM:

Тренировка:
1) Разбиваем векторы текстов на кластеры с помощью агломеративной кластеризации
2) Выполняем слияние мелких кластеров с крупными на основе расстояний
3) Тренируем нейронку-классификатор с CenterLoss
4) Выполняем слияние мелких кластеров с крупными на основе размеров кластеров
5) Тренируем нейронку-классификатор с CenterLoss
6) Тренируем быстрый классификатор на получившихся классах (на стадии инференса нейронки не используются)
   
Инференс:
1) Быстрая классификация исходных векторов без нейронок


Создайте виртуальную среду python=3.10 и установите необходимые пакеты:

conda create -n profittm python=3.10
conda activate profittm
pip3 install torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0+cu117 -f https://download.pytorch.org/whl/cu117/torch_stable.html
pip install -r requirements.txt
python -m spacy download en_core_web_sm

Импорт зависимостей

In [None]:
from pathlib import Path
from pprint import pprint
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from copy import deepcopy
from datetime import datetime
from tqdm import tqdm

import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd.function import Function
import torch.optim as optim

from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset

from nltk.stem import WordNetLemmatizer
import re
from joblib import Parallel, delayed
import nltk
from nltk.corpus import stopwords
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('stopwords')

import gc
import os
import multiprocessing as mp
from gensim.models import Word2Vec
from copy import deepcopy
import gensim.downloader as api
from sklearn.feature_extraction.text import TfidfVectorizer
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import calinski_harabasz_score
from sklearn.linear_model import SGDClassifier
from scipy.spatial.distance import cosine
from sklearn.preprocessing import OneHotEncoder
from copy import deepcopy
from pprint import pprint
from scipy.spatial.distance import euclidean
from torch.jit import isinstance

import networkx as nx
from numpy import dtype
import uuid

from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
import networkx as nx

utils

In [None]:
def save(obj, path, verbose=True):
    """Pickle ``obj`` into the file at ``path``.

    Args:
        obj: Object to serialize with ``pickle``.
        path: Destination file; it is opened in binary write mode.
        verbose: When true, print a message before and after writing.
    """
    def _report(template):
        # Progress messages are emitted only in verbose mode.
        if verbose:
            print(template.format(path))

    _report("Saving object to {}")

    with open(path, "wb") as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

    _report("Object saved to {}")

def load(path, verbose=True):
    """Read and return the pickled object stored at ``path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    deserializing, so only files from a trusted source should be loaded.

    Args:
        path: File to read; it is opened in binary read mode.
        verbose: When true, print a message before and after reading.

    Returns:
        The deserialized object.
    """
    def _report(template):
        # Progress messages are emitted only in verbose mode.
        if verbose:
            print(template.format(path))

    _report("Loading object from {}")
    with open(path, "rb") as source:
        restored = pickle.load(source)
    _report("Object loaded from {}")
    return restored

CenterLossCompressor

In [None]:
class CenterLoss(nn.Module):
    """Center-loss module holding one learnable center vector per class.

    The loss value itself is computed by ``CenterlossFunc``; this module owns
    the ``centers`` parameter and validates the feature dimensionality.

    Args:
        num_classes: Number of rows in the ``centers`` matrix.
        feat_dim: Length of every center vector (and of every feature vector).
        size_average: When true the loss is divided by the batch size,
            otherwise by 1.
    """

    def __init__(self, num_classes, feat_dim, size_average=True):
        super(CenterLoss, self).__init__()
        self.centers = nn.Parameter(torch.randn(num_classes, feat_dim))
        self.centerlossfunc = CenterlossFunc.apply
        self.feat_dim = feat_dim
        self.size_average = size_average

    def forward(self, label, feat):
        """Return the center loss for a batch.

        Args:
            label: Tensor of class indices, one per sample.
            feat: Feature tensor; it is flattened to ``(batch, -1)``.

        Returns:
            The loss tensor produced by ``CenterlossFunc``.

        Raises:
            ValueError: If the flattened feature length differs from
                ``feat_dim``.
        """
        batch_size = feat.size(0)
        feat = feat.view(batch_size, -1)
        # The centers and the features must live in the same space.
        if feat.size(1) != self.feat_dim:
            # Adjacent literals are used instead of a backslash continuation
            # so that no indentation whitespace leaks into the message.
            raise ValueError(
                "Center's dim: {0} should be equal to input feature's "
                "dim: {1}".format(self.feat_dim, feat.size(1)))
        # The divisor is passed as a tensor created on the features' device.
        batch_size_tensor = feat.new_empty(1).fill_(
            batch_size if self.size_average else 1)
        loss = self.centerlossfunc(
            feat, label, self.centers, batch_size_tensor)
        return loss


class CenterlossFunc(Function):
    """Autograd function computing the center loss and its hand-written gradients."""

    @staticmethod
    def forward(ctx, feature, label, centers, batch_size):
        """Return half the summed squared distance to the class centers, divided by ``batch_size``."""
        ctx.save_for_backward(feature, label, centers, batch_size)
        assigned_centers = centers.index_select(0, label.long())
        squared_error = (feature - assigned_centers).pow(2).sum()
        return squared_error / 2.0 / batch_size

    @staticmethod
    def backward(ctx, grad_output):
        """Return gradients for ``feature`` and ``centers`` (``label`` and ``batch_size`` get ``None``)."""
        feature, label, centers, batch_size = ctx.saved_tensors
        class_ids = label.long()
        residual = centers.index_select(0, class_ids) - feature

        # Rebuilt on every call: one plus the number of batch samples per class.
        per_class_count = centers.new_ones(centers.size(0))
        per_class_count.scatter_add_(0, class_ids, centers.new_ones(label.size(0)))

        # Sum the (center - feature) differences into the row of each class.
        center_grad = centers.new_zeros(centers.size())
        center_grad.scatter_add_(
            0, class_ids.unsqueeze(1).expand(feature.size()), residual)
        center_grad = center_grad / per_class_count.view(-1, 1)

        feature_grad = - grad_output * residual / batch_size
        return feature_grad, None, center_grad / batch_size, None


class CenterLossNN(nn.Module):
    """Small convolutional network returning latent features and class log-probabilities.

    Three ``kernel_size=2`` convolutions (no padding) are followed by a linear
    layer producing the latent vector and a linear classification head.

    Args:
        x_shape: Indexable ``(n_samples, height, width)``; only ``x_shape[1]``
            and ``x_shape[2]`` are read. Inputs to ``forward`` are expected as
            ``(batch, 1, height, width)`` since the first convolution takes
            one input channel.
        n_classes: Size of the classification output.
        latent_dim: Size of the latent feature vector.
    """

    def __init__(self, x_shape, n_classes, latent_dim):
        super(CenterLossNN, self).__init__()
        self.conv1_1 = nn.Conv2d(1, 32, kernel_size=2)
        torch.nn.init.kaiming_normal_(self.conv1_1.weight)
        self.bn_1 = nn.BatchNorm2d(32)
        self.prelu1_1 = nn.PReLU()
        self.conv1_2 = nn.Conv2d(32, 64, kernel_size=2)
        torch.nn.init.kaiming_normal_(self.conv1_2.weight)
        self.bn_2 = nn.BatchNorm2d(64)
        self.do_1 = nn.Dropout2d(p=0.2)
        self.prelu1_2 = nn.PReLU()
        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=2)
        torch.nn.init.kaiming_normal_(self.conv2_1.weight)
        self.bn_3 = nn.BatchNorm2d(128)
        self.prelu3_2 = nn.PReLU()

        self.preluip1 = nn.PReLU()
        # Each unpadded kernel-2 convolution shrinks both spatial dimensions
        # by one, so after three of them the map is (height-3) x (width-3).
        # For height == 4 this equals the previous 128 * (width - 3).
        flattened_dim = 128 * (x_shape[1] - 3) * (x_shape[2] - 3)
        self.ip1 = nn.Linear(flattened_dim, latent_dim)
        self.ip2 = nn.Linear(latent_dim, n_classes, bias=True)

    def forward(self, x):
        """Return ``(latent_features, class_log_probabilities)`` for a batch."""
        x = self.prelu1_1(self.bn_1(self.conv1_1(x)))
        x = self.prelu1_2(self.do_1(self.bn_2(self.conv1_2(x))))
        x = self.prelu3_2(self.bn_3(self.conv2_1(x)))

        x = torch.flatten(x, start_dim=1, end_dim=-1)

        ip1 = self.preluip1(self.ip1(x))

        ip2 = self.ip2(ip1)
        ip2 = F.log_softmax(ip2, dim=1)

        return ip1, ip2

class MLP_Network(nn.Module):
    """Fully connected network returning latent features and class log-probabilities.

    Args:
        input_dim: Length of an input vector.
        output_dim: Number of classes.
        hidden_layer_dim: Width of every hidden layer and of the latent output.
        hidden_layers_num: Number of hidden blocks between the input block
            and the latent layer.
        dropout_rate: Dropout probability used inside every hidden block.
    """

    def __init__(self, input_dim, output_dim, hidden_layer_dim, hidden_layers_num=2, dropout_rate=0.2):
        super(MLP_Network, self).__init__()

        def _kaiming_init(module):
            # Only the linear layers are re-initialized.
            if isinstance(module, nn.Linear):
                nn.init.kaiming_uniform_(module.weight, mode='fan_in', nonlinearity='relu')

        self.input_layer = nn.Sequential(
            nn.Linear(input_dim, hidden_layer_dim),
            nn.BatchNorm1d(hidden_layer_dim),
            nn.PReLU()
        )
        self.input_layer.apply(_kaiming_init)

        # Every hidden block is created and initialized before the next one.
        hidden_blocks = []
        for _ in range(hidden_layers_num):
            block = nn.Sequential(
                nn.Linear(hidden_layer_dim, hidden_layer_dim),
                nn.BatchNorm1d(hidden_layer_dim),
                nn.Dropout(p=dropout_rate),
                nn.PReLU(),
            )
            block.apply(_kaiming_init)
            hidden_blocks.append(block)
        self.middle_layers = nn.Sequential(*hidden_blocks)

        # The latent head has no batch normalization.
        self.latent_output = nn.Sequential(
            nn.Linear(hidden_layer_dim, hidden_layer_dim),
            nn.PReLU(),
        )
        self.latent_output.apply(_kaiming_init)

        self.classes_output = nn.Linear(hidden_layer_dim, output_dim, bias=True)
        nn.init.kaiming_uniform_(self.classes_output.weight, mode='fan_in', nonlinearity='relu')

    def forward(self, x):
        """Return ``(latent_features, class_log_probabilities)`` for a batch."""
        hidden = self.middle_layers(self.input_layer(x))
        latent_features = self.latent_output(hidden)
        class_log_probs = F.log_softmax(self.classes_output(latent_features), dim=1)
        return latent_features, class_log_probs

class CompressorTrainDataset(Dataset):
    """Dataset yielding ``(features, label)`` tensor pairs placed on ``device``.

    Args:
        x: Indexable collection of feature vectors.
        y: Indexable collection of integer labels; its length is the dataset length.
        device: Target device for the produced tensors.
        stack_count: When greater than 1, every sample is repeated that many
            times as rows and gets an extra leading dimension.
    """

    def __init__(self, x, y, device, stack_count):
        self.x, self.y = x, y
        self.device = device
        self.stack_count = stack_count

    def __getitem__(self, id):
        sample = self.x[id]
        if self.stack_count > 1:
            # Repeat the vector row-wise, then add a leading axis of size 1.
            repeated = np.vstack([sample] * self.stack_count)
            features = torch.Tensor(repeated).unsqueeze(0)
        else:
            features = torch.Tensor(sample)

        target = torch.tensor(self.y[id], dtype=torch.int64)
        return features.to(self.device), target.to(self.device)

    def __len__(self):
        return len(self.y)

class CompressorTestDataset(Dataset):
    """Label-free dataset yielding feature tensors placed on ``device``.

    Args:
        x: Indexable collection of feature vectors; its length is the dataset length.
        device: Target device for the produced tensors.
        stack_count: When greater than 1, every sample is repeated that many
            times as rows and gets an extra leading dimension.
    """

    def __init__(self, x, device, stack_count):
        self.x = x
        self.device = device
        self.stack_count = stack_count

    def __getitem__(self, id):
        sample = self.x[id]
        if self.stack_count > 1:
            # Repeat the vector row-wise, then add a leading axis of size 1.
            repeated = np.vstack([sample] * self.stack_count)
            features = torch.Tensor(repeated).unsqueeze(0)
        else:
            features = torch.Tensor(sample)
        return features.to(self.device)

    def __len__(self):
        return len(self.x)

class CenterLossCompressor():
    """Trains an ``MLP_Network`` with a classification loss plus center loss
    and exposes the network's latent output as features.

    The model runs on CUDA when it is available and on the CPU otherwise.
    """

    def __init__(self):
        self.model = None
        # Fall back to the CPU instead of failing on machines without CUDA.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.latent_dim = None
        self.n_classes = None
        self.stack_count = None

    def fit(self, x, y, validation_part=0.05,
            batch_size=100, epochs=100, latent_dim=100, stack_count=1):
        """Train a fresh network on ``x`` with labels ``y``.

        Args:
            x: Sequence of feature vectors; ``len(x[0])`` is the input size.
            y: Sequence of integer labels. They are used to index the rows of
                the ``CenterLoss`` center matrix, which has
                ``len(np.unique(y))`` rows, so they presumably have to be
                contiguous ``0..n_classes-1`` -- TODO confirm against callers.
            validation_part: Currently unused.
            batch_size: Currently unused; the batch size is derived from
                ``len(x)`` (see below).
            epochs: Number of passes over the training data.
            latent_dim: Width of the hidden layers and of the latent output.
            stack_count: Forwarded to the dataset and remembered for
                ``predict``. NOTE(review): the MLP consumes flat vectors, so
                values other than 1 look unsupported -- verify.
        """
        self.n_classes = len(np.unique(y))
        self.latent_dim = latent_dim
        self.stack_count = stack_count

        # The convolutional CenterLossNN is an alternative backbone; the MLP
        # is the one in use.
        self.model = MLP_Network(input_dim=len(x[0]), output_dim=self.n_classes,
                                 hidden_layer_dim=latent_dim,
                                 hidden_layers_num=2,
                                 dropout_rate=0.05)
        self.model.to(self.device)
        self.model.train()

        loss_weight = 1
        # The model already emits log-probabilities; applying log-softmax to
        # normalized log-probabilities again (as CrossEntropyLoss does) leaves
        # them unchanged, so this acts as a negative log-likelihood loss.
        nllloss = nn.CrossEntropyLoss().to(self.device)
        centerloss = CenterLoss(self.n_classes, self.latent_dim).to(self.device)
        optimizer4nn = optim.Adam(self.model.parameters(), lr=0.001)
        optimzer4center = optim.Adam(centerloss.parameters(), lr=0.5)

        train_dataset = CompressorTrainDataset(x, y, device=self.device, stack_count=stack_count)

        # Roughly 1% of the data per batch, capped at 256 samples.
        compressor_batch_size = min(len(x) // 100 + 2, 256)
        print("Compressor batch size: {}".format(compressor_batch_size))
        train_dataloader = DataLoader(train_dataset, batch_size=compressor_batch_size, shuffle=True, drop_last=True)

        for epoch in range(epochs):
            for batch_x, batch_y in tqdm(train_dataloader, desc='CenterLossCompressor fit | Epoch {} of {}'.format(epoch+1, epochs)):
                ip1, pred = self.model(batch_x)
                loss = nllloss(pred, batch_y) + loss_weight * centerloss(batch_y, ip1)

                optimizer4nn.zero_grad()
                optimzer4center.zero_grad()
                loss.backward()
                optimizer4nn.step()
                optimzer4center.step()
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()

    def predict(self, x, batch_size=100):
        """Return the latent features of ``x`` as one stacked NumPy array.

        Args:
            x: Sequence of feature vectors.
            batch_size: Number of samples per forward pass.
        """
        self.model.eval()

        test_dataset = CompressorTestDataset(x, device=self.device, stack_count=self.stack_count)
        test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        features = []
        with torch.no_grad():
            for batch in test_dataloader:
                # Only the latent output is kept; class scores are discarded.
                latent, _ = self.model(batch)
                features.append(latent.data.cpu().numpy())
        features = np.vstack(features)
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()

        return features


StubClassifier

In [None]:
class StubClassifier():
    """Degenerate classifier that answers every query with one fixed label."""

    def __init__(self, stub_y):
        self.stub_y = stub_y

    def predict(self, x):
        """Return an array containing ``stub_y`` once per element of ``x``."""
        return np.array([self.stub_y] * len(x))

    def fit(self, x, y):
        """Do nothing and return ``self``."""
        return self

SimpleDataPreprocessor

In [None]:
class SimpleDataPreprocessor():
    """Text normalizer: lower-casing, letter filtering, lemmatization, stop-word removal."""

    def __init__(self):
        extra_stop_words = ['that', 'a', 'to', 'of', 'which', 'and', 'while', 'in', 'for', 'those', 'their', 'these',
                            'this', 'but', 'howev', 'it', 'also', 'the', 'onli', 'have', 'one', 't', 's', 'v', 'd', 'at', 'has', 'what']
        # Hand-picked words merged with the NLTK English list, de-duplicated.
        self.stop_words = list(set(extra_stop_words) | set(stopwords.words('english')))
        self.lemmatizer = WordNetLemmatizer()

    def preproc_doc_string(self, sample):
        """Normalize one document and return it as a space-joined token string.

        Returns ``'$$$STUB$$$'`` when no token survives the cleaning.
        """
        text = str(sample).lower()

        # Every run of characters that is neither a Latin/Cyrillic letter nor
        # whitespace becomes a single space.
        text = re.sub('[^A-Za-zА-Яа-я\\s\t]+', ' ', text)

        lemmas = [self.lemmatizer.lemmatize(token) for token in text.split()]

        # Re-split after joining, then drop the stop words.
        kept = [token for token in ' '.join(lemmas).split() if token not in self.stop_words]

        # Tokens produced by split() contain no whitespace, so the joined
        # string is already single-spaced and trimmed.
        cleaned = ' '.join(kept)
        if not cleaned:
            return '$$$STUB$$$'
        return cleaned

    def prerproc_docs(self, docs, n_jobs, remove_stub_strings=True):
        """Preprocess ``docs`` in parallel with joblib and return the results as a list.

        ``remove_stub_strings`` is accepted but not used.
        """
        job = delayed(self.preproc_doc_string)
        runner = Parallel(n_jobs, verbose=10)
        return runner(job(doc) for doc in docs)

    def get_uniq_text_list(self, preproc_texts):
        """Return the sorted unique values found in columns 0 and 1 of ``preproc_texts``."""
        both_columns = np.concatenate([preproc_texts[:, 0], preproc_texts[:, 1]])
        return sorted(np.unique(both_columns))


TfidfW2vVectorizer

In [None]:
def identity_tokenizer(text):
    """Return ``text`` unchanged (pass-through tokenizer for already tokenized input)."""
    return text

class TfidfW2vVectorizer():
    """Document vectorizer averaging (optionally TF-IDF weighted) Word2Vec word vectors."""

    def __init__(self):
        self.w2vModel = None
        self.w2v_dict = None
        self.tfidf_vectorizer = TfidfVectorizer()  # kaggle_all_the_news
        # Alternative for pre-tokenized input (kaggle_survey_2020):
        # TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, stop_words=None)

    def fit(self, corpus, vector_size=100, window=5,
            n_jobs=10, min_count=2, sample=1e-5, epochs=10, sg=0, seed=45):
        """Fit the TF-IDF model and train Word2Vec on the preprocessed corpus.

        Args:
            corpus: Iterable of raw documents.
            vector_size, window, min_count, sample, epochs, sg, seed:
                Forwarded to ``gensim.models.Word2Vec``.
            n_jobs: Worker count for preprocessing and Word2Vec training.

        Returns:
            ``self``.
        """
        corpus = SimpleDataPreprocessor().prerproc_docs(corpus, n_jobs, remove_stub_strings=False)

        print('Calculating TF-IDF weights')
        self.tfidf_vectorizer.fit(corpus)

        print('Fitting W2V model')
        self.make_w2v_dict(corpus, vector_size=vector_size, window=window, n_jobs=n_jobs,
                           min_count=min_count, sample=sample, epochs=epochs, sg=sg, seed=seed)
        self.cache_vectors = None
        self.use_cache_vectors = False
        return self

    def make_w2v_dict(self, docs, vector_size=128, window=5, n_jobs=10,
                      min_count=1, sample=0, epochs=100, sg=0, seed=45):
        """Train Word2Vec on whitespace-tokenized ``docs`` and cache a word -> vector dict."""
        tokenized = [doc.split() for doc in docs]

        self.w2vModel = Word2Vec(
            tokenized,
            vector_size=vector_size,
            window=window,
            workers=n_jobs,
            min_count=min_count,
            sample=sample,
            epochs=epochs,
            sg=sg,
            seed=seed)
        self.w2v_dict = dict(
            zip(self.w2vModel.wv.index_to_key, self.w2vModel.wv.vectors))
        # Release the tokenized corpus before returning.
        del tokenized
        gc.collect()

    def vectorize_docs(self, docs, use_tfidf=True, n_jobs=8):
        """Convert raw documents to a 2-D array with one vector per document.

        A document vector is the mean of the vectors of its known words, each
        multiplied by the word's TF-IDF weight when ``use_tfidf`` is true.
        Documents without any usable word get a zero vector.

        Args:
            docs: Iterable of raw documents.
            use_tfidf: Whether to weight word vectors by TF-IDF.
            n_jobs: Worker count for joblib; also the maximum number of
                batches. Non-positive values result in a single batch.

        Returns:
            ``numpy.ndarray`` with one row per document, in input order.
        """

        def vectorize_batch(docs, tfidf_vectorizer, w2v_dict, verbose=True, use_tfidf=True):
            docs = list(docs)
            tfidf_feats = tfidf_vectorizer.transform(docs) if use_tfidf else None
            tfidf_vocab = tfidf_vectorizer.vocabulary_ if use_tfidf else None
            # Shape of the fallback vector, looked up once per batch.
            zero_shape = next(iter(w2v_dict.values())).shape

            indices = range(len(docs))
            if verbose:
                indices = tqdm(indices, desc='Vectorizing docs')

            doc_vectors = []
            for i in indices:
                tfidf_row = tfidf_feats[i].toarray()[0] if use_tfidf else None
                word_vectors = []
                for token in docs[i].split():
                    if token not in w2v_dict:
                        continue
                    if not use_tfidf:
                        word_vectors.append(w2v_dict[token])
                        continue
                    # Words unknown to the TF-IDF vocabulary are skipped.
                    if token not in tfidf_vocab:
                        continue
                    word_vectors.append(tfidf_row[tfidf_vocab[token]] * w2v_dict[token])
                if word_vectors:
                    doc_vectors.append(np.mean(np.array(word_vectors), axis=0))
                else:
                    doc_vectors.append(np.zeros(zero_shape))
            return doc_vectors

        docs = SimpleDataPreprocessor().prerproc_docs(docs, n_jobs, remove_stub_strings=False)

        if len(docs) == 0:
            # Nothing to vectorize: return an empty matrix of the right width.
            return np.zeros((0,) + tuple(next(iter(self.w2v_dict.values())).shape))

        # Never create more batches than documents: an empty batch yields an
        # empty result that cannot be stacked with the others.
        n_batches = max(1, min(n_jobs, len(docs)))
        chunk_size = -(-len(docs) // n_batches)  # ceiling division
        doc_batches = [docs[start:start + chunk_size]
                       for start in range(0, len(docs), chunk_size)]

        # The fitted models are only read by the workers, so they are passed
        # as-is instead of being deep-copied once per job.
        batch_vectors = Parallel(n_jobs, verbose=10)(
            delayed(vectorize_batch)(batch, self.tfidf_vectorizer, self.w2v_dict,
                                     verbose=True, use_tfidf=use_tfidf)
            for batch in doc_batches)
        del doc_batches
        gc.collect()

        return np.vstack([vector for batch in batch_vectors for vector in batch])
        


ProfitTM

In [None]:
class ProfitTM():

    def __init__(self, SVM_C=1.0, n_jobs=10, verbose=1, name=None):
        """Create an untrained model.

        Args:
            SVM_C: Accepted but not used here.
            n_jobs: Forwarded to ``SGDClassifier``.
            verbose: Accepted but not used here.
            name: Optional label stored on the instance.
        """
        self.name = name
        self.topic_names = None
        self.topic_count = None

        self.feature_extractor = CenterLossCompressor()
        self.classifier = SGDClassifier(n_jobs=n_jobs)
    
    def fit(self, x, max_agg_elements=30000, target_n_clusters=20, opt_param_dev=0.0, n_optimal_steps=1,
            batch_size=20, base_epochs=25):
        """Discover topics in ``x`` and train the inference classifier.

        Steps: agglomerative clustering, distance-based cluster merge,
        feature-extractor + classifier training, size-based cluster merge and
        a final retraining on the merged labels.

        Args:
            x: Document vectors (array-like, one row per document).
            max_agg_elements: Upper bound on the number of rows used; larger
                inputs are randomly subsampled.
            target_n_clusters: Cluster count for the agglomerative step; it is
                replaced by 1 when there are not more rows than that.
            opt_param_dev: Accepted but not used here.
            n_optimal_steps: Accepted but not used here.
            batch_size: Forwarded to the feature extractor's ``fit``.
            base_epochs: Training epochs for the feature extractor.

        Returns:
            ``self``. ``topic_count`` is set on every return path.
        """
        # Always work on a private copy of the input.
        if not isinstance(x, np.ndarray):
            x = np.array(x)
        else:
            x = x.copy()

        if len(x) <= target_n_clusters:
            target_n_clusters = 1
        if len(x) == 1:
            # Duplicate the only sample so that there are two rows to cluster.
            x = np.vstack([x, x])

        if len(x) > max_agg_elements:
            # NOTE: this reseeds NumPy's global random generator.
            np.random.seed(45)
            subsample_ids = np.random.choice(len(x), size=max_agg_elements, replace=False)
            x = x[subsample_ids]

        self.clusterizer = AgglomerativeClustering(n_clusters=target_n_clusters, linkage='ward')
        self.clusterizer.fit(x)
        initial_labels = self.clusterizer.labels_

        distance_merged = self.distance_bazed_cluster_merge(x, initial_labels, metric='cosine', n_quantiles=20)

        if len(np.unique(distance_merged)) == 1:
            # A single topic needs no trained model.
            self.classifier = StubClassifier(stub_y=distance_merged[0])
            self.topic_count = 1
            print("Optimal clusters by distance merge is 1. Place stub 1 class classifier.")
            return self

        train_labels = distance_merged
        self.feature_extractor.fit(
            x,
            train_labels,
            batch_size=batch_size,
            epochs=base_epochs)
        x_compressor_features = self.feature_extractor.predict(x)
        self.classifier.fit(x_compressor_features, train_labels)

        predicted_labels = self.classifier.predict(x_compressor_features)
        size_merged = self.size_bazed_cluster_merge(x_compressor_features, predicted_labels, small_cluster_threshold=0.04, n_quantiles=20)

        if len(np.unique(size_merged)) == 1:
            # The size merge would collapse everything into one cluster, so
            # the first-stage model is kept; report the topics it was trained on.
            self.topic_count = len(np.unique(train_labels))
            print("Skip cluster size merge")
            return self

        train_labels = size_merged
        self.topic_count = len(np.unique(train_labels))
        self.feature_extractor.fit(
            x,
            train_labels,
            batch_size=batch_size,
            epochs=base_epochs)
        x_compressor_features = self.feature_extractor.predict(x)
        self.classifier.fit(x_compressor_features, train_labels)

        return self
    
    def get_topic_names(self, x, current_level):

        x = np.array(x)
        topic_dict = {}
        y = self.predict(x)
        uniq_y = np.unique(y)
        for i in range(len(uniq_y)):
            clust_x = x[y == uniq_y[i]]
            
            topic_names = str(current_level) + "_" + str(uniq_y[i])
            
            topic_dict[uniq_y[i]] = topic_names

        return topic_dict
    
    def get_features(self, x):
        
        estimates = self.feature_extractor.predict(x)
        return estimates

    def get_class_estimates(self, x):
        features = self.get_features(x)
        estimates = self.classifier.decision_function(features)
        return estimates

    def predict(self, x):
        """Return the predicted topic id for every row of ``x``.

        A ``StubClassifier`` receives the raw input; any other classifier
        receives the features produced by ``get_features``.
        """
        if not isinstance(self.classifier, StubClassifier):
            return self.classifier.predict(self.get_features(x))
        return self.classifier.predict(x)

    
    def find_optimal_clusters(
            self, x, target_n_clusters, target_n_clustersDev, n_optimal_steps):
        """Try several cluster counts and keep the best-scoring clusterizer.

        Candidate counts are spread evenly between
        ``int(dev * target) + 1`` and ``int((1 + (1 - dev)) * target)``.
        Each candidate is scored with the Calinski-Harabasz index; the search
        stops early on a candidate that yields a single label. The winner is
        stored in ``self.clusterizer``.
        """
        lowest = int((target_n_clustersDev) * target_n_clusters) + 1
        highest = int((1 + (1 - target_n_clustersDev)) * target_n_clusters)
        candidates = np.linspace(lowest, highest, n_optimal_steps, dtype=int)

        best_clusterizer, best_n_clusters = None, None
        best_score = -1e20
        for n_clusters in candidates:
            candidate = AgglomerativeClustering(
                n_clusters=n_clusters, linkage='ward')
            candidate.fit(x)
            labels = candidate.labels_

            # A single label cannot be scored; accept it and stop searching.
            if len(np.unique(labels)) == 1:
                best_clusterizer = deepcopy(candidate)
                best_n_clusters = n_clusters
                break

            score = calinski_harabasz_score(x, labels=labels)
            print(
                '{} | n_clusters = {} | score = {}'.format(
                    'Stub', n_clusters, score))
            if score > best_score:
                best_clusterizer = deepcopy(candidate)
                best_score = score
                best_n_clusters = n_clusters
        print('Best score at {}: {}'.format(best_n_clusters, best_score))

        self.clusterizer = deepcopy(best_clusterizer)

    def size_bazed_cluster_merge(
            self, x, y, small_cluster_threshold=0.04, n_quantiles=20):
        
        y = y.copy()
        # get size threshold
        x = np.array(x)
        uniq_y = np.unique(y)
        clust_sizes = []
        for i in range(len(uniq_y)):
            clust_x = x[y == uniq_y[i]]
            clust_size = len(clust_x)
            clust_sizes.append(clust_size)
        clust_sizes = np.array(clust_sizes)
        size_quantiles = pd.DataFrame({'clust_size': clust_sizes})
        print('size quantiles')
        pprint(
            pd.qcut(
                size_quantiles['clust_size'],
                n_quantiles,
                duplicates='drop').value_counts().index)
        size_quantiles = list(
            sorted(
                list(
                    pd.qcut(
                        size_quantiles['clust_size'],
                        n_quantiles,
                        duplicates='drop').value_counts().index)))
        #size_quantiles[0].left = abs(size_quantiles[0].left)

        size_threshold = None
        relative_borders = []
        # last max change can be at the end of sorted quantiles
        # define optimal threshold as the max change
        for i in range(len(size_quantiles) - 1):
            relative_border = size_quantiles[i].right / abs(size_quantiles[i].left)
            relative_borders.append(relative_border)
        # if no changes then don't merge
        if len(relative_borders) == 0:
            return y
        max_relative_border_id = np.argmax(relative_borders)
        max_relative_border = relative_borders[max_relative_border_id]
        # if there was no big change between sorted quantile sizes
        # then there are no small trash clusters
        print('Max relative border: {}'.format(max_relative_border))
        size_threshold = size_quantiles[max_relative_border_id].right

        # find small clusters
        merge_dict = {}
        max_cluster_size = max(clust_sizes)
        relative_sizes = []
        for i in range(len(uniq_y)):
            clust_x = x[y == uniq_y[i]]
            clust_size = len(clust_x)
            relative_size = clust_size / max_cluster_size
            relative_sizes.append(relative_size)
            if clust_size <= size_threshold:
                # if relative_size <= small_cluster_threshold:
                merge_dict[uniq_y[i]] = []

        print('relative sizes')
        pprint(relative_sizes)
        print('Max cluster size: {}'.format(max_cluster_size))
        # find nearest big cluster for small cluster
        for small_cluster_id in merge_dict.keys():
            small_cluster = x[y == small_cluster_id]
            small_cluster_center = np.mean(small_cluster, axis=0)
            min_dist = 1e30
            best_big_cluser_id = None
            for big_cluster_id in uniq_y:
                if small_cluster_id == big_cluster_id:
                    continue

                big_cluster = x[y == big_cluster_id]
                big_cluster_size = len(big_cluster)
                if big_cluster_size <= size_threshold:  # don't merge with other small
                    continue

                big_cluster_center = np.mean(big_cluster, axis=0)
                dist = cosine(small_cluster_center, big_cluster_center)
                if dist < min_dist:
                    best_big_cluser_id = big_cluster_id
            merge_dict[small_cluster_id].append(best_big_cluser_id)
        pprint(merge_dict)

        optimal_y = self.merge_clusters(y, merge_dict)
        return optimal_y

    def distance_bazed_cluster_merge(
            self, x, y, metric='cosine', n_quantiles=20):
            
        # get centers of each cluster as mean of top N words closest to center
        y = y.copy()
        x = np.array(x)
        uniq_y = np.unique(y)
        centers = []
        for i in range(len(uniq_y)):
            clust_x = x[y == uniq_y[i]]
            clusters_center = np.mean(clust_x, axis=0)
            centers.append(clusters_center)
        centers = np.array(centers)

        # get distance threshold for merging
        distances = []
        for i in range(len(centers)):
            for j in range(len(centers)):
                if i == j:
                    continue
                if metric == 'cosine':
                    dist = cosine(centers[i], centers[j])
                else:
                    dist = euclidean(centers[i], centers[j])
                distances.append(dist) 
        if len(distances) == 0:
            return y
        distances = pd.DataFrame({'dist': distances})
        
        pprint(pd.qcut(distances['dist'], n_quantiles, duplicates='drop'))
        distance_quantiles = list(
            sorted(
                list(
                    pd.qcut(
                        distances['dist'],
                        n_quantiles,
                        duplicates='drop').value_counts().index)))

        # if no quantiles then don't merge
        if len(distance_quantiles) == 0:
            return y

        #####################################
        distance_threshold = None
        relative_borders = []
        # last max change can be at the end of sorted quantiles
        # define optimal threshold as the max change
        for i in range(len(distance_quantiles) - 1):
            relative_border = distance_quantiles[i].right / abs(distance_quantiles[i].left)
            relative_borders.append(relative_border)
        # if no changes then don't merge
        if len(relative_borders) == 0:
            return y
        max_relative_border_id = np.argmax(relative_borders)
        max_relative_border = relative_borders[max_relative_border_id]
        # if there was no big change between sorted quantile sizes
        # then there are no small trash clusters
        print('Max relative distance border: {}'.format(max_relative_border))
        distance_threshold = distance_quantiles[max_relative_border_id].right
        #####################################

        #####################################
        #distance_threshold = distance_quantiles[0].right
        #####################################

        # get clusters which centers are closer than distance threshold
        merge_dict = {}
        for i in range(len(centers)):
            merge_dict[uniq_y[i]] = []
            for j in range(len(centers)):
                if i == j:
                    continue
                if metric == 'cosine':
                    dist = cosine(centers[i], centers[j])
                else:
                    dist = euclidean(centers[i], centers[j])
                if dist <= distance_threshold:
                    merge_dict[uniq_y[i]].append(uniq_y[j])
            if len(merge_dict[uniq_y[i]]) == 0:
                merge_dict[uniq_y[i]].append(-1)

        optimal_y = self.merge_clusters(y, merge_dict)
        return optimal_y

    def merge_clusters(self, y, merge_dict):
        """Collapse groups of cluster labels into single labels and renumber.

        Args:
            y: indexable, mutable sequence of cluster labels (one per sample).
                It is modified in place and is also the returned object.
            merge_dict: maps a cluster label to the labels it should be merged
                with. An entry whose value contains ``-1`` proposes no merge.

        Returns:
            ``y`` itself, with merged labels replaced and the surviving labels
            renumbered ``0..k-1`` in ``np.unique`` order.

        The intermediate merge lists and the final topic count are printed.
        """
        # 1. Candidate groups: every label together with its merge partners,
        #    sorted, de-duplicated (first occurrence wins), largest first.
        groups = []
        for label, partners in merge_dict.items():
            if -1 in partners:
                continue
            group = sorted([label, *partners])
            if group not in groups:
                groups.append(group)
        groups.sort(key=len, reverse=True)
        print(groups)

        # 2. Walk the distinct group sizes from largest to smallest. Groups of
        #    the current size lose the members of every smaller group that is
        #    fully contained in at least one group of the current size.
        sizes = sorted({len(group) for group in groups}, reverse=True)
        for rank in range(len(sizes) - 1):
            size_now = sizes[rank]
            smaller_sizes = sizes[rank + 1:]
            targets = []
            target_positions = []
            smaller = []
            for pos, group in enumerate(groups):
                group_len = len(group)
                if group_len == size_now:
                    targets.append(set(group))
                    target_positions.append(pos)
                elif group_len > 1 and group_len in smaller_sizes:
                    smaller.append(set(group))
            contained_ids = list({
                j
                for target in targets
                for j, candidate in enumerate(smaller)
                if candidate == candidate.intersection(target)
            })
            # The subtraction is applied to every target, not only to the
            # one that actually contains the smaller group.
            for pos, target in zip(target_positions, targets):
                for j in contained_ids:
                    target = target.difference(smaller[j])
                groups[pos] = target

        # 3. Keep each distinct group that still has at least two members.
        distinct = []
        for group in groups:
            members = set(group)
            if len(members) > 1 and members not in distinct:
                distinct.append(members)
        print(distinct)

        # 4. Grow a component from each group with a single forward pass over
        #    the groups after it; keep it only if it is disjoint from every
        #    component accepted so far.
        clean_components = []
        for start in range(len(distinct)):
            merged = set()
            for other in distinct[start:]:
                if not merged or merged & other:
                    merged = merged | other
            if all(not (merged & accepted) for accepted in clean_components):
                clean_components.append(merged)
        print(clean_components)

        # 5. Rewrite labels in place: each component collapses to its smallest
        #    label, then the remaining labels are renumbered consecutively.
        for component in clean_components:
            smallest = min(component)
            for pos in range(len(y)):
                if y[pos] in component:
                    y[pos] = smallest
        relabel = {old: new for new, old in enumerate(np.unique(y))}
        for pos in range(len(y)):
            y[pos] = relabel[y[pos]]

        print('Optimal topics count = {}'.format(len(np.unique(y))))

        return y


TreeProfitTM

In [None]:
class TreeProfitTM():
    """Hierarchical topic model built as a tree of flat ``ProfitTM`` models.

    Every tree node owns one flat model (``self.node``). Documents that a
    node assigns to a topic are handed to the child stored under that topic
    in ``self.childs``, for at most ``max_depth`` levels.
    """

    def __init__(self, max_depth=None, current_level=0, parents_name=None):
        """Create an unfitted tree node.

        Args:
            max_depth: number of levels in the tree. Must be set before
                fitting or predicting.
            current_level: zero-based depth of this node.
            parents_name: only used to decide ``isRoot``; it is not stored.
        """
        self.node = None
        self.childs = {}
        self.topic_names = None
        self.topic_count = None
        self.max_depth = max_depth
        self.current_level = current_level
        self.tree_name = str(uuid.uuid4())
        self.isRoot = current_level == 0 and parents_name is None

    def _depth_limit(self):
        """Return ``max_depth``; raise ``ValueError`` early if it is unset."""
        if self.max_depth is None:
            raise ValueError(
                "TreeProfitTM.max_depth is None; pass max_depth (>= 1) "
                "before fitting or predicting")
        return self.max_depth

    def _route_to_children(self, y, all_row_ids, pred_ids):
        """Yield ``(child, row_ids)`` for each predicted topic that has a child.

        ``row_ids`` are indices into the full input matrix. Topics without a
        child are skipped, so their rows keep NaN for the deeper levels.
        """
        if not self.childs:
            return
        rows = all_row_ids if pred_ids is None else all_row_ids[pred_ids]
        for topic in np.unique(y):
            child = self.childs.get(topic)
            if child is not None:
                yield child, rows[y == topic]

    def fit(self, x):
        """Fit this node on ``x`` and, recursively, one child per topic.

        Args:
            x: document vectors, indexed row by row (``x[i]``).

        Raises:
            ValueError: if ``max_depth`` was never set.
        """
        depth_limit = self._depth_limit()

        # The root creates its own flat model; a child has its (unfitted)
        # model assigned by the parent before fit() is called on it.
        if self.current_level == 0:
            self.node = ProfitTM()
        self.node.fit(x)

        self.topic_names = self.node.get_topic_names(x, current_level=self.current_level)
        self.topic_count = self.node.topic_count

        if self.current_level + 1 >= depth_limit:
            return

        # Group the training rows by predicted topic in a single pass.
        y = np.asarray(self.node.predict(x))
        topic_docs = {topic: [] for topic in np.unique(y)}
        for i in range(len(y)):
            topic_docs[y[i]].append(x[i])

        # Create all children first, then fit them, in np.unique order.
        for topic in topic_docs:
            child = TreeProfitTM(self.max_depth, self.current_level + 1)
            child.node = ProfitTM()
            self.childs[topic] = child
        for topic, docs in topic_docs.items():
            self.childs[topic].fit(docs)

    def predict(self, x, return_vectors=False):
        """Predict one topic label per tree level for every row of ``x``.

        Returns:
            Float array of shape ``(len(x), max_depth)``; column ``k`` holds
            the level-``k`` label and NaN marks levels with no prediction.
        """
        shared_predicts = self.prepare_to_predict(x)
        all_row_ids = np.arange(len(x))
        shared_predicts = self.hierarchical_predict(x, all_row_ids, shared_predicts, pred_ids=None)

        if return_vectors:
            # NOTE(review): convert_predicts_to_vectors is not defined in this
            # class, so return_vectors=True raises AttributeError unless the
            # method is supplied elsewhere - confirm the intended conversion.
            shared_predicts = self.convert_predicts_to_vectors(shared_predicts)

        return shared_predicts

    def prepare_to_predict(self, x):
        """Return a ``(len(x), max_depth)`` float matrix filled with NaN."""
        return np.full((len(x), self._depth_limit()), np.nan)

    def hierarchical_predict(self, text_vectors, all_row_ids, shared_predicts, pred_ids=None):
        """Write this node's labels into ``shared_predicts`` and recurse.

        Args:
            text_vectors: full input matrix; must support indexing with an
                integer array, since children select rows via ``pred_ids``.
            all_row_ids: row indices of ``text_vectors``.
            shared_predicts: output matrix, modified in place.
            pred_ids: rows routed to this node; ``None`` means all rows.

        Returns:
            ``shared_predicts``.
        """
        if self.current_level >= self._depth_limit():
            return shared_predicts

        batch = text_vectors if pred_ids is None else text_vectors[pred_ids]
        y = np.asarray(self.node.predict(batch))

        if self.current_level == 0:
            shared_predicts[:, 0] = y
        else:
            shared_predicts[pred_ids, self.current_level] = y

        for child, child_ids in self._route_to_children(y, all_row_ids, pred_ids):
            child.hierarchical_predict(text_vectors, all_row_ids, shared_predicts, child_ids)

        return shared_predicts

    def extract_features(self, x):
        """Return the per-level latent features of every row of ``x``.

        Returns:
            Float array of shape ``(len(x), max_depth * latent_dim)``; level
            ``k`` occupies the ``k``-th block of ``latent_dim`` columns and
            NaN marks levels for which no features were computed.
        """
        shared_predicts = self.prepare_to_feature_extraction(x)
        all_row_ids = np.arange(len(x))
        shared_predicts = self.hierarchical_feature_extraction(x, all_row_ids, shared_predicts, pred_ids=None)

        return shared_predicts

    def prepare_to_feature_extraction(self, x):
        """Return a NaN matrix sized from this node's ``latent_dim``."""
        width = self._depth_limit() * self.node.feature_extractor.latent_dim
        return np.full((len(x), width), np.nan, dtype=np.float64)

    def hierarchical_feature_extraction(self, text_vectors, all_row_ids, shared_predicts, pred_ids=None):
        """Write this node's features into ``shared_predicts`` and recurse.

        Arguments and return value mirror ``hierarchical_predict``; the
        values written come from ``self.node.get_features``.
        """
        if self.current_level >= self._depth_limit():
            return shared_predicts

        batch = text_vectors if pred_ids is None else text_vectors[pred_ids]
        y = np.asarray(self.node.predict(batch))
        topic_level_features = self.node.get_features(batch)

        # NOTE(review): the column offset uses this node's latent_dim while
        # the matrix was sized from the node extract_features() was called on;
        # assumes every node shares one latent_dim - confirm.
        insert_size = self.node.feature_extractor.latent_dim
        if self.current_level == 0:
            shared_predicts[:, :insert_size] = topic_level_features
        else:
            first_col = self.current_level * insert_size
            shared_predicts[pred_ids, first_col:first_col + insert_size] = topic_level_features

        for child, child_ids in self._route_to_children(y, all_row_ids, pred_ids):
            child.hierarchical_feature_extraction(text_vectors, all_row_ids, shared_predicts, child_ids)

        return shared_predicts

TopicInterpreter

In [None]:
class TopicInterpreter():
    """Name topics by the vocabulary words closest to each topic's centroid."""

    def __init__(self, vectorizer=None):
        """Store an optional vectorizer; ``fit`` replaces it with a new one."""
        self.vectorizer = vectorizer

    def fit(self, corpus, vector_size=100, window=5,
            n_jobs=10, min_count=2, sample=1e-5, epochs=10, sg=0, seed=45):
        """Create a new ``TfidfW2vVectorizer`` and fit it on ``corpus``.

        All keyword parameters are forwarded positionally, in this order, to
        ``TfidfW2vVectorizer.fit``.
        """
        self.vectorizer = TfidfW2vVectorizer()
        self.vectorizer.fit(corpus, vector_size, window, n_jobs, min_count, sample, epochs, sg, seed)

    def make_topic_dict(self, text_vectors, topic_labels, level, n_jobs=8, topn=5):
        """Map every encoded topic label to a space-joined list of words.

        Args:
            text_vectors: one vector per document.
            topic_labels: per-document labels, one column per tree level.
            level: deepest level to include in the encoded label.
            n_jobs: accepted for interface compatibility; not used here.
            topn: number of nearest words that make up a topic name.

        Returns:
            dict ``{encoded_label: "word1 word2 ..."}``.
        """
        x = np.array(text_vectors)
        y = self.encode_topic_labels(topic_labels, level)
        word_vectors = self.vectorizer.w2vModel.wv

        topic_dict = {}
        for label in np.unique(y):
            # The topic centroid is the mean vector of the topic's documents.
            center = np.mean(x[y == label], axis=0)
            nearest = word_vectors.most_similar(positive=center, topn=topn)
            topic_dict[label] = " ".join(item[0] for item in nearest)

        return topic_dict

    def get_topic_names(self, texts, topic_labels, level, n_jobs=8, topn=5):
        """Vectorize ``texts`` and return ``make_topic_dict`` for them."""
        x = self.vectorizer.vectorize_docs(texts, use_tfidf=True, n_jobs=n_jobs)
        return self.make_topic_dict(x, topic_labels, level, n_jobs=n_jobs, topn=topn)

    def encode_topic_labels(self, topic_labels, level=None):
        """Join the labels of levels ``0..level`` with ``"_"`` for each row.

        Args:
            topic_labels: per-document labels, one column per level. It is
                only read, never modified.
            level: deepest level to include; ``None`` selects the last
                column, which requires ``topic_labels`` to have ``.shape``.

        Returns:
            NumPy array with one encoded label string per row.
        """
        if level is None:
            level = topic_labels.shape[1] - 1

        merged_labels = [
            "_".join(str(row[j]) for j in range(level + 1))
            for row in topic_labels
        ]

        return np.array(merged_labels)

Так как ноут демонстрационный, всё чтение и сохранение данных происходит рядом с ноутом. Поместите тестовый .xlsx в подпапку data рядом с ноутом (код ниже читает data/articles1.xlsx). Метки топиков и остальные артефакты сохраняются в папку с ноутом.

Чтение данных

In [None]:
# Load the demo corpus from data/<file_name>.xlsx; the column is named "text".
file_name = "articles1"
parsed_data = pd.read_excel(
    Path("data", "{}.xlsx".format(file_name)),
    names=["text"],
)

Тренировка векторизатора текстов (W2V + TF-IDF веса)

In [None]:
# Train the text vectorizer on the de-duplicated, shuffled corpus and save it.
vectorizer = TfidfW2vVectorizer()
# dict.fromkeys() drops duplicates while keeping first-seen order. A plain
# set() orders strings by their per-process randomized hash, so the seeded
# shuffle below would still produce a different order on every run.
train_texts = list(dict.fromkeys(parsed_data["text"].to_list()))
random.seed(45)
random.shuffle(train_texts)
vectorizer.fit(train_texts, vector_size=384, window=5,
               n_jobs=8, min_count=2, sample=1e-5, epochs=100, sg=0, seed=45)
save(vectorizer, Path(".", "vectorizer.pkl"))

Получение векторов текстов

In [None]:
# Reload the fitted vectorizer, embed every document of the raw "text"
# column (duplicates are kept here) and store the resulting vectors.
vectorizer = load(Path(".", "vectorizer.pkl"))
texts_list = parsed_data["text"].to_list()
vectorized_texts = vectorizer.vectorize_docs(
    texts_list,
    use_tfidf=True,
    n_jobs=8,
)
save(vectorized_texts, Path(".", "vectorized_texts.pkl"))

Тренировка тематической модели

In [None]:
# Fit a two-level topic tree on the stored document vectors and save it.
vectorized_texts = load(Path(".", "vectorized_texts.pkl"))
topic_model = TreeProfitTM(max_depth=2)
topic_model.fit(vectorized_texts)
save(topic_model, Path(".", "topic_model.pkl"))

Получение меток топиков

In [None]:
# Reload the model and vectors, predict per-level topic labels and save them.
topic_model = load(Path(".", "topic_model.pkl"))
vectorized_texts = load(Path(".", "vectorized_texts.pkl"))
topic_labels = topic_model.predict(vectorized_texts, return_vectors=False)
# Save the labels computed on the line above (`pred_y` is not defined in
# this notebook, so saving it raised NameError).
save(topic_labels, Path(".", "topic_labels.pkl"))

In [None]:
# Fit the interpreter's own vectorizer on the raw texts and save it.
topic_interpreter = TopicInterpreter()
topic_interpreter.fit(
    texts_list,
    vector_size=384,
    window=5,
    n_jobs=8,
    min_count=2,
    sample=1e-5,
    epochs=100,
    sg=0,
    seed=45,
)
save(topic_interpreter, Path(".", "topic_interpreter.pkl"))

# Topic names for the first level only, then for the first two levels.
topic_names_0 = topic_interpreter.get_topic_names(
    texts_list, topic_labels, level=0, n_jobs=8)
pprint(topic_names_0)
topic_names_1 = topic_interpreter.get_topic_names(
    texts_list, topic_labels, level=1, n_jobs=8)
pprint(topic_names_1)