In [66]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search, cos_sim

from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier


from utils.operate_data import load_dataset as dd_load_dataset
from utils.autocomplete import yolo_utils as yolo

from tqdm import tqdm
tqdm.pandas()

%matplotlib inline

# get data

In [2]:
df_meta, _ = dd_load_dataset('/local_data/meta_data/', df_name = 'meta.pickle', files_df_name=None)
df_meta.reset_index(inplace=True)

df_prepared = pd.read_pickle('/local_data/meta_data/prepared_files.pickle')
df_prepared = df_prepared[df_prepared.extension == '.pdf']
df_prepared = df_prepared[df_prepared.content.agg(len) != 0]
df_prepared = df_prepared.loc[df_prepared.groupby('doc_id')['order'].idxmax()]
df_prepared.rename(columns={'content': 'text'}, inplace=True)

df_content = df_prepared.merge(df_meta, on='doc_id')
df_content = df_content[['doc_id', 'text', 'content']]

In [3]:
# sentence model 'bert-base-nli-mean-tokens'
model_sentence = SentenceTransformer('/local_data/models/text_matching/')

model_summarization = AutoModelForSeq2SeqLM.from_pretrained( '/local_data/summarization/models/csebuetnlp_mT5_m2o_russian_crossSum/')
tokenizer = AutoTokenizer.from_pretrained('/local_data/summarization/data/csebuetnlp_mT5_m2o_russian_crossSum')

# Train

## get summarization

In [4]:
def summarizer(text: str, tokenizer: AutoTokenizer, model = AutoModelForSeq2SeqLM) -> str:
    
    inputs = tokenizer(text, max_length=1024, truncation=True, return_tensors="pt").input_ids
    outputs = model.generate(inputs, max_new_tokens=200, do_sample=False)
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# при необходимости получить суммаризацию. для экспериментов достаточно пользоваться фичёй content

# df_content['predict_summary'] = df_content['text'].progress_apply(lambda row: summarizer(row, tokenizer, model_summarization))

## get embeddings

In [115]:
%%time
corpus = df_content['content'].to_list()
# корпус эмбедингов для обучения модели
corpus_embeddings = model_sentence.encode(corpus, convert_to_tensor=True)
# для каждого документа создадим эмбединг и сохраним его в df формате torch.tensor
df_content['embedding'] = df_content['content'].apply(lambda row: model_sentence.encode([row], convert_to_tensor=True))

CPU times: user 2h 40min 36s, sys: 1min 43s, total: 2h 42min 19s
Wall time: 10min 13s


In [81]:
corpus_embeddings.shape

torch.Size([8058, 768])

## set clusters

In [6]:
%%time
# изменить фичу content на predict_summary при необходимости

# Normalize the embeddings to unit length
np_corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

CPU times: user 3.85 ms, sys: 3.69 ms, total: 7.55 ms
Wall time: 6.65 ms


In [7]:
%%time
# Perform kmean clustering
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=0.04,  
                                           affinity = 'cosine', linkage ='average')
clustering_model.fit(np_corpus_embeddings)
cluster_assignment = clustering_model.labels_

CPU times: user 6.48 s, sys: 95.9 ms, total: 6.58 s
Wall time: 6.58 s


In [10]:
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])

In [11]:
# df_content['cluster'] = clustering_model.fit_predict(np_corpus_embeddings)

In [12]:
print(f'количество документов: {df_content.shape[0]},\nколичество кластеров: {len(clustered_sentences.items())}')

количество документов: 8058,
количество кластеров: 1084


### Set only big clusters

In [82]:
df_clusters = pd.DataFrame(corpus_embeddings)
df_clusters['doc_id'] = df_content['doc_id']
df_clusters['cluster'] = cluster_assignment
df_clusters.columns = df_clusters.columns.map(str)

In [83]:
df_clusters.cluster.nunique()

1084

In [84]:
# создание списка кластеров имеющих менее "x" экземпляров

x = 10
num_clusters = df_clusters.cluster.value_counts().to_dict()

single_clusters = []
for key, val in num_clusters.items():
    if val <= x:
        single_clusters.append(key)
len(single_clusters)

992

In [85]:
# кластеры имеющие количество экземпляров менее 20 объединяются в один кластер
last_cluster = len(clustered_sentences.items()) + 1 # нэйминг последнего объеденённого кластера
df_clusters['cluster'] = df_clusters['cluster'].apply(lambda row: last_cluster  if row in single_clusters else row)

# назначение кластеров для исходного датафрейма
# df_content['cluster'] = cluster_assignment
# df_content['cluster'] = df_content['cluster'].apply(lambda row: last_cluster  if row in single_clusters else row)

In [87]:
df_clusters.cluster.nunique()

93

## train model for clusters predict

### train test split

In [91]:
features = df_clusters.loc[:, ~df_clusters.columns.isin(['cluster', 'doc_id'])]
target = df_clusters['cluster']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

### train model

In [28]:
cat = CatBoostClassifier(random_state=42)

In [29]:
%%time
cat.fit(X_train,y_train, verbose=False, plot=False)

CPU times: user 17h 45min 7s, sys: 1min 6s, total: 17h 46min 14s
Wall time: 46min 45s


<catboost.core.CatBoostClassifier at 0x7f453b187b80>

In [30]:
predicts = cat.predict(X_test)

In [31]:
precision_score(y_test, predicts, average='weighted'), recall_score(y_test, predicts, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


(0.8221921882631194, 0.8134822167080231)

In [32]:
cat.save_model('/local_data/models/text_matching/catboost_clusters.cbm')

# Predict

## get query document

для исключения утечек в предсказание кластера, исследуются только документы входящие в тестовую выборку

In [92]:
test_indexes = X_test.index.to_list()

In [116]:
random_index = 5
doc_id = df_content.iloc[test_indexes[random_index]]['doc_id']
query = df_prepared[df_prepared.doc_id == doc_id]['text'].values[0]

## get_summarization

In [117]:
%%time
summary = [summarizer(query, tokenizer, model_summarization)]
summary

CPU times: user 4min 3s, sys: 159 ms, total: 4min 3s
Wall time: 21.3 s


['О 9-м заседании межправительственной российско-македонской комиссии торгово-экономического научно-технического сотрудничества']

## get embedding summary df

In [95]:
sentence_embedding = model_sentence.encode(summary, convert_to_tensor=True)

In [96]:
df_sentence = pd.DataFrame(sentence_embedding)
df_sentence.columns = df_sentence.columns.map(str)

In [97]:
df_sentence

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.815912,0.444192,0.759135,0.43191,1.408002,-0.139997,-0.069306,0.727926,-0.252959,0.063424,...,-0.385519,-1.170283,0.424569,0.128625,-0.255495,-1.378363,-0.272698,-0.172365,0.408232,0.502972


## predict cluster

In [98]:
model = CatBoostClassifier()      # parameters not required.
cat = model.load_model('/local_data/models/text_matching/catboost_clusters.cbm')

In [100]:
predict_cluster = cat.predict(df_sentence).item()
predict_cluster

1085

## take data from the predicted cluster

In [113]:
%%time
df_cls = df_clusters[df_clusters.cluster == predict_cluster].reset_index()

# corpus_cluster = df_cls['content'].to_list()
# embeddings_corpus = model_sentence.encode(features, convert_to_tensor=True)

CPU times: user 13.7 ms, sys: 6 µs, total: 13.7 ms
Wall time: 12 ms


## semantic search

In [78]:
%%time
top = semantic_search(sentence_embedding, embeddings_corpus, score_function=cos_sim, top_k= 5)[0]
matching = {}
for doc in top:
    doc_id = df_cls.iloc[doc['corpus_id']]['doc_id']
    matching[doc_id] = doc['score']

CPU times: user 1.85 ms, sys: 6 µs, total: 1.86 ms
Wall time: 1.46 ms


In [79]:
matching

{'d9557575-901f-4f9d-8143-02232f1807df': 0.9909748435020447,
 'a0641dff-85dd-4583-a667-fd52172cb8dc': 0.9893757104873657,
 '438ccbab-bb23-4f90-a545-1d5b1f61f8b7': 0.9857959747314453,
 '7f1b2109-2bbb-450c-8bfa-bface99eaff4': 0.9769965410232544,
 '238ce70a-7f7c-47f1-b2b6-0f006d5ffde0': 0.9741600751876831}

# show documents

In [63]:
df_cls

Unnamed: 0,index,doc_id,text,content,cluster
0,88,02f3049d-6989-44df-85c1-be9d91344b09,департамент планирования территориального разв...,О согласовании проекта внесения изменений в ге...,194
1,97,0327b03d-ec0d-4e5a-abfe-77eded58c00b,департамент планирования россии территориально...,О согласовании проекта внесения изменений в ге...,194
2,116,03c5beee-b00f-4fe0-a80f-f43103223c65,российская едерация самарская область вольш сч...,О проекте изменений в Генеральный план,194
3,159,04eae954-bbab-499c-b688-76477232a50d,федеральная служба охраны российской федерации...,О согласовании проектов внесения изменений в г...,194
4,164,052302de-b70c-481c-98e2-7b482f347c8a,стер ство ергети российской федерации энерго р...,О направлении информации о внесении изменений ...,194
...,...,...,...,...,...
97,7652,f2ef995c-3676-4b81-b756-3235c226cf3d,департамент планирования российской федерации ...,О согласовании проекта внесения изменений в ге...,194
98,7799,f70f608f-9522-4cd5-ab27-612b993b16ca,администрация сельского поселения старое вечка...,О согласовании проекта внесения изменений в Ге...,194
99,7972,fcff336d-8f17-4fd1-a0eb-986b0f4b1d5f,федеральная служба департамент планирования те...,О вопросах согласования проекта внесения измен...,194
100,8036,ff52c5f0-d059-458c-80fc-66d18ae88785,министерство экономического развития российско...,о проекте генерального плана сельского поселен...,194
