In [None]:
from typing import Optional
from gensim.models.phrases import Phrases, Phraser
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans, KMeans, AgglomerativeClustering, Birch, SpectralClustering, DBSCAN
from sklearn.utils import class_weight
from natasha import NamesExtractor, MorphVocab
from scipy.sparse import hstack, vstack
from collections import Counter
import copy
import tensorflow as tf
import tensorflow_text
import tensorflow_hub
import hdbscan
import scipy.sparse
import torch
import re
import pandas as pd
import numpy as np
import pymorphy2
import os
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def remove_trash(text: str) -> str:
    """Strip every character that is not a Cyrillic letter, a digit or a space.

    Runs of disallowed characters are deleted outright (they are not replaced
    by a space), so ``"да,нет"`` becomes ``"данет"``.

    Args:
        text: Raw input string.

    Returns:
        The input with all disallowed characters removed.
    """
    disallowed = re.compile('[^А-Яа-яЁё0-9 ]+')
    return disallowed.sub('', text)

# Shared pymorphy2 analyzer, created once at module level and reused by lemmatize().
morph = pymorphy2.MorphAnalyzer()

def lemmatize(text: str) -> str:
    """Replace every word of ``text`` with its normal form.

    The text is split on whitespace; words shorter than three characters are
    dropped. For each remaining word the first parse returned by the
    module-level ``morph`` analyzer supplies the normal form.

    Args:
        text: Input string.

    Returns:
        The normal forms joined by single spaces (empty string if no word
        is kept).
    """
    lemmas = [
        morph.parse(token)[0].normal_form
        for token in text.split()
        if len(token) >= 3
    ]
    return " ".join(lemmas)

def get_gramms(series: pd.Series) -> list:
    """Collect single words and detected phrases from every document.

    Each value of ``series`` is split on whitespace. Two gensim ``Phrases``
    models are trained: one on the tokenised corpus and one on the output of
    the first model. For every document, tokens containing ``'_'`` (i.e.
    tokens joined by a phrase model) are gathered into a set and appended to
    the document's own tokens:

    * from the first model's output only the first joined token is taken;
    * from the second model's output every joined token is taken.

    NOTE(review): stopping after the first joined token of the first model
    looks asymmetric with the second pass - confirm this is intended.

    Args:
        series: Series of whitespace-separated text documents.

    Returns:
        A flat list over all documents of their tokens and phrases, with
        ``'_'`` replaced by a space and tokens equal to ``'не'`` left out.
    """
    docs = [text.split() for text in series.values.tolist()]

    # First-pass and second-pass phrase models built from the corpus.
    bigram_model = Phrases(docs)
    trigram_model = Phrases(bigram_model[docs])

    for tokens in docs:
        phrases = set()

        # Only the first joined token of the first pass is kept.
        first_joined = next((tok for tok in bigram_model[tokens] if '_' in tok), None)
        if first_joined is not None:
            phrases.add(first_joined)

        # Every joined token of the second pass is kept.
        phrases.update(tok for tok in trigram_model[bigram_model[tokens]] if '_' in tok)

        tokens.extend(phrases)

    return [
        token.replace('_', ' ')
        for tokens in docs
        for token in tokens
        if token != 'не'
    ]

def remove_most_common(series: pd.Series, gramms: list, top_n: int = 100) -> pd.Series:
    """Remove the most frequent words and phrases from every document.

    The ``top_n`` most frequent entries of ``gramms`` are lower-cased and
    removed, as whole words/phrases, from the lower-cased documents.

    Args:
        series: Series of text documents.
        gramms: Flat list of words and phrases; their frequencies decide what
            is removed.
        top_n: How many of the most frequent entries to remove (default 100).

    Returns:
        A Series of lower-cased documents with the frequent entries removed
        and whitespace collapsed to single spaces (leading/trailing
        whitespace stripped), so a fully emptied document becomes ``''``.
    """
    def _squash(text: str) -> str:
        # Removing words leaves holes; collapse them so that emptied rows
        # really compare equal to ''.
        return ' '.join(text.split())

    frequent = {
        phrase.lower()
        for phrase, _ in Counter(gramms).most_common(top_n)
        if phrase
    }
    if not frequent:
        return series.apply(lambda row: _squash(row.lower()))

    # Longest first: otherwise a single word listed before a phrase that
    # starts with it would match first and the phrase would never be removed.
    ordered = sorted(frequent, key=lambda phrase: (-len(phrase), phrase))

    # re.escape keeps regex metacharacters in the phrases literal.
    pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, ordered)) + r')\b')
    return series.apply(lambda row: _squash(pattern.sub('', row.lower())))

In [None]:
# Check whether PyTorch can see a CUDA device (the cell output shows the boolean).
torch.cuda.is_available()

# Preprocessing

In [None]:
# Load the raw dataset; the file is read as pipe-separated.
df = pd.read_csv('data.csv', sep='|')

In [None]:
# Summarise columns, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# Strip junk characters and lemmatise the question and answer texts.
# The two steps are chained so that lemmatisation runs on the cleaned text;
# applying lemmatize to the raw column would discard the remove_trash result.
df['q_edit'] = df['question'].apply(remove_trash).apply(lemmatize)
df['a_edit'] = df['answer'].apply(remove_trash).apply(lemmatize)

In [None]:
# Remove the most frequent words and phrases from the customers' questions.
gramms = get_gramms(df['q_edit'])
df['q_edit'] = remove_most_common(df['q_edit'], gramms)

In [None]:
# Remove the most frequent words and phrases from the consultants' answers.
# Note: this rebinds `gramms`, replacing the list computed for the questions.
gramms = get_gramms(df['a_edit'])
df['a_edit'] = remove_most_common(df['a_edit'], gramms)

In [None]:
# Drop rows whose cleaned question or answer is empty.
# Whitespace-only strings count as empty too: removing frequent words can
# leave nothing but spaces behind. `.copy()` detaches the result from the
# original frame so that later column assignments do not hit a slice.
has_text = df['q_edit'].str.strip().ne('') & df['a_edit'].str.strip().ne('')
df = df[has_text].copy()

In [None]:
# Persist the preprocessed frame (the index is written as the first column).
# NOTE(review): this overwrites the input file 'data.csv' and switches the
# separator from '|' (used when reading it) to ';', so re-running the notebook
# from the top would read a file in a different format. Consider a separate
# output path - confirm first that nothing else relies on this file name.
df.to_csv('data.csv', sep=';', encoding='utf-8')

# BERT

In [None]:
# Sentence-embedding model. The commented-out lines are alternative models
# that can be swapped in; only the last, uncommented line is active.
# model = SentenceTransformer('average_word_embeddings_glove.6B.300d')
# model = SentenceTransformer('average_word_embeddings_komninos')
# model = SentenceTransformer('average_word_embeddings_levy_dependency')
# model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
# model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
# model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
# model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')
# model = SentenceTransformer('quora-distilbert-multilingual_part')
# model = SentenceTransformer('stsb-xlm-r-multilingual_part')
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

In [None]:
# Compute embeddings for the questions and the answers.
# Plain lists are passed instead of pandas Series: `encode` indexes its input
# by position, while integer indexing on a Series is label-based, and the row
# labels of `df` have gaps after the empty rows were dropped.
questions_emb = model.encode(df['q_edit'].tolist())
answers_emb = model.encode(df['a_edit'].tolist())

In [None]:
# Save the question embeddings to disk.
# The context manager closes the file even if pickling fails.
with open('questions_pickle', 'wb') as questions_pickle:
    pickle.dump(questions_emb, questions_pickle)

In [None]:
# Save the answer embeddings to disk.
# The context manager closes the file even if pickling fails.
with open('answers_pickle', 'wb') as answers_pickle:
    pickle.dump(answers_emb, answers_pickle)

In [None]:
# Load the question embeddings from disk.
# The context manager closes the file handle after reading.
# Only unpickle files produced by this notebook: pickle can execute arbitrary code.
with open('questions_pickle', 'rb') as questions_pickle:
    questions_emb = pickle.load(questions_pickle)

In [None]:
# Load the answer embeddings from disk.
# The context manager closes the file handle after reading.
# Only unpickle files produced by this notebook: pickle can execute arbitrary code.
with open('answers_pickle', 'rb') as answers_pickle:
    answers_emb = pickle.load(answers_pickle)

# Clustering

In [None]:
# Number of clusters requested from the clustering algorithm.
_n_clusters = 200
# Active clustering algorithm; the commented-out lines below are alternatives.
clmethod = MiniBatchKMeans(n_clusters=_n_clusters, random_state=42)
# clmethod = KMeans(n_clusters=_n_clusters, random_state=42)
# clmethod = AgglomerativeClustering(n_clusters=_n_clusters, linkage="ward")
# clmethod = AgglomerativeClustering(n_clusters=_n_clusters, linkage="average", affinity="euclidean")
# clmethod = Birch(n_clusters=_n_clusters)
# NOTE(review): the SpectralClustering variant uses psutil, which the import cell does not import.
# clmethod = SpectralClustering(n_clusters=_n_clusters, eigen_solver=None, random_state=42, affinity="rbf", assign_labels="discretize", n_jobs=psutil.cpu_count())
# clmethod = DBSCAN(min_samples=10, eps=0.9)
# clmethod = hdbscan.HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True, gen_min_span_tree=False, leaf_size=40, metric='euclidean', min_cluster_size=5, min_samples=None, p=None)

In [None]:
# Cluster questions and answers independently with identically configured
# estimators. The answers get their own copy so that `clmethod` stays fitted
# on the questions; fitting both with one object would leave it describing
# the answers only.
a_clmethod = copy.deepcopy(clmethod)
q_clids = clmethod.fit_predict(questions_emb)
a_clids = a_clmethod.fit_predict(answers_emb)

In [None]:
# Attach the cluster ids to the frame.
# `assign` builds a new frame instead of writing into `df` in place; the frame
# is a filtered slice, so in-place column assignment can trigger pandas'
# SettingWithCopyWarning.
df = df.assign(q_cluster=q_clids, a_cluster=a_clids)

In [None]:
# Weights linking each question cluster to answer clusters.
# weights[i] maps an answer-cluster id to the share of questions in question
# cluster i whose first matching row in `cons_answ` (same deal, created at or
# after the question) belongs to that answer cluster. Clusters with no hits
# are omitted from the mapping.
# NOTE(review): `cons_answ` is not defined in the visible part of the
# notebook - it must be loaded before this cell runs.
# The tqdm progress wrapper is gone because the import cell does not import
# tqdm. The inner frame is `cluster_i`; `cluster_0` was never defined.
weights = []
for i in range(_n_clusters):
    cluster_i = df[df['q_cluster'] == i]
    hits = Counter()
    for deal_number, date in zip(cluster_i['deal'], cluster_i['created_at']):
        later_answers = cons_answ[
            (cons_answ['deal'] == deal_number) & (cons_answ['created_at'] >= date)
        ]['cluster']
        if later_answers.empty:
            # No answer recorded for this question.
            continue
        hits[int(later_answers.iloc[0])] += 1
    # Normalise by cluster size; keys are kept in ascending cluster order.
    weight = {cluster: count / cluster_i.shape[0] for cluster, count in sorted(hits.items())}
    weights.append(weight)

In [None]:
# Show the 20 questions closest to the centroid of question cluster 0.
# The centroid is taken as the mean of the question embeddings assigned to the
# cluster rather than read from `clmethod.cluster_centers_`, which reflects
# whatever data the estimator was fitted on last.
cluster_id = 0
top_n = 20
questions_arr = np.asarray(questions_emb)
in_cluster = (df['q_cluster'] == cluster_id).to_numpy()
centroid = questions_arr[in_cluster].mean(axis=0)
distances = np.linalg.norm(questions_arr - centroid, axis=1)
# Stable sort keeps the original row order among equal distances.
indexes = np.argsort(distances, kind='stable')[:top_n].tolist()
df.iloc[indexes]['question']

# Не подошло

### TF-IDF

In [None]:
# Russian stop-word list from NLTK.
stop_russian = stopwords.words('russian')

# TF-IDF features fitted on the raw `question` column (not on `q_edit`).
text_transformer = TfidfVectorizer(stop_words=stop_russian)
text = text_transformer.fit_transform(df["question"])

### USE

In [None]:
# Load the model referenced by this TF-Hub handle / path
# (presumably a local copy of the multilingual Universal Sentence Encoder - verify).
use_model = tensorflow_hub.load(r"universal-sentence-encoder-multilingual_3")

In [None]:
# Compute embeddings for the questions.
# NOTE(review): this rebinds `questions_emb`, replacing the SentenceTransformer embeddings computed earlier.
questions_emb = use_model(df['q_edit'])

In [None]:
# Compute embeddings for the answers.
# NOTE(review): stored as `answer_emb`, while the rest of the notebook uses `answers_emb` - confirm which name is intended.
answer_emb = use_model(df['a_edit'])

In [None]:
# Disabled: encodes cons_answ['message'] in chunks of 5000 rows and concatenates the results.
# NOTE(review): `cons_answ` is not defined in the visible part of the notebook.
# cons_answ_emb = use_model(cons_answ['message'][:5000])
# for i in range(1, cons_answ['message'].shape[0] // 5000):
#     cons_answ_emb2 = use_model(cons_answ['message'][i*5000:(i+1)*5000])
#     cons_answ_emb = tf.concat([cons_answ_emb, cons_answ_emb2], 0)
# if (i+1)*5000 < cons_answ['message'].shape[0]:
#     cons_answ_emb2 = use_model(cons_answ['message'][(i+1)*5000:])
#     cons_answ_emb = tf.concat([cons_answ_emb, cons_answ_emb2], 0)
# cons_answ_emb = np.array(cons_answ_emb)