In [1]:
# Mount Google Drive under /content/drive so the dataset stored there can be read below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install the third-party packages this notebook imports (pymystem3, emoji, pandarallel).
# NOTE(review): `sumy` is installed but not referenced in the visible cells, and no
# versions are pinned - consider pinning them for reproducibility.
!python -m pip install pymystem3
!python -m pip install emoji
!python -m pip install pandarallel
!python -m pip install sumy



In [3]:
import warnings, string, emoji
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings about assignments on slices.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

# NOTE(review): `string` is already imported above; the duplicate is harmless.
import nltk, string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem

from pandarallel import pandarallel
# Enable `parallel_apply` on pandas objects: 8 workers, with progress bars.
pandarallel.initialize(progress_bar=True, nb_workers=8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
# Fetch the NLTK resources used below: the 'stopwords' corpus and 'punkt'
# (presumably the tokenizer model behind word_tokenize).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Column names for the export; the CSV itself is read without a header row.
heads = [
    "id", "text", "sent", "deadline", "is_doctor_message", "contract_id",
    "created_at", "updated_at", "is_answered", "warning_sent", "deadline_sent",
    "answered", "is_read", "current_doctor_id", "notification_sent",
    "is_filtered", "is_auto", "is_agent", "agent_id", "action_link",
    "action_name", "only_doctor", "is_urgent", "forward_to_doctor",
    "action_onetime", "action_used", "action_deadline", "only_patient",
    "action_big", "author_id", "author_role", "is_mailing", "registrator_id",
    "is_declined", "reply_to_id", "action_type", "is_warning",
]

# Load the raw message table from Drive, then label its columns
# (the assignment fails if the file does not have exactly len(heads) columns).
messages = pd.read_csv(
    "/content/drive/MyDrive/meds_public_messages.csv",
    header=None,
)
messages.columns = heads

# Preview a few random rows.
messages.sample(5)

Unnamed: 0,id,text,sent,deadline,is_doctor_message,contract_id,created_at,updated_at,is_answered,warning_sent,...,only_patient,action_big,author_id,author_role,is_mailing,registrator_id,is_declined,reply_to_id,action_type,is_warning
139450,190616,"Доброе утро. Я не знаю, видите ли Вы свой граф...",2021-04-24 05:45:50,2021-05-01 05:45:50,True,3611,2021-04-24 05:45:50,2021-12-31 18:10:34,True,False,...,False,False,4178,лечащий врач,False,,False,,action,False
206757,1651458,кровь и мочу я сдаю 24 числа.,2024-04-12 07:21:24,2024-04-13 07:21:24,False,12156,2024-04-12 07:21:24,2024-04-12 07:27:41,True,False,...,False,False,11725,Пациент,False,,False,,action,False
134055,35319,17 февраля,2020-02-06 07:39:50,2020-02-07 13:39:50,False,1396,2020-02-06 07:39:50,2020-10-12 12:22:41,True,False,...,False,False,1760,пациент,False,,False,,action,False
17047,288595,"Сегодня антибиотик добавили, завтра посмотрим",2021-09-13 19:28:31,2021-09-20 19:28:31,True,3767,2021-09-13 19:28:31,2022-02-27 17:01:37,True,False,...,False,False,4051,лечащий врач,False,,False,,action,False
17171,429197,Назначений врача не было.,2021-12-24 20:03:37,2021-12-25 20:03:37,False,4973,2021-12-24 20:03:37,2023-04-07 10:08:48,True,True,...,False,False,5365,Пациент,False,,False,,action,False


In [6]:
# Keep patient-authored rows whose text is not the voice-message placeholder,
# restricted to the id/text columns, and drop rows with a missing id or text.
patient_messages = messages.loc[
    (messages["is_doctor_message"] == False) & (messages["text"] != "Голосовое сообщение"),
    ["id", "text"],
].dropna()
patient_messages.sample(5)

Unnamed: 0,id,text
99616,53032,Елена Игоревна здравствуйте Я нашла только Д3...
91721,77348,прикрепляю фото щитовидной железы
30080,527890,"А при таких результатах, есть ли шанс вылечить..."
22084,557827,Нам назначен сейчас витамин Д (длительно) и ко...
42771,1079264,"Добрый день, спасибо большое 🙏 если вдруг до 1..."


# Векторизация с помощью Navec

In [7]:
# NLTK's Russian stop-word list, stored as a set for fast membership checks.
STOP_WORDS = set(stopwords.words("russian"))
# Greeting phrases to strip from messages (capitalized, mostly two-word phrases).
HELLO_WORDS = set(['Доброе утро', "Здравствуйте", "Добрый день", "Добрый вечер"])
# Single shared pymystem3 lemmatizer instance used by lemmatize_msg.
LEMMATIZER = Mystem()

def remove_punct(message: str) -> str:
    """Strip punctuation and digits from ``message``, normalise whitespace, lowercase.

    Every character of ``string.punctuation`` plus newline, ``-«»№`` and the
    ASCII digits is replaced by a single space (not deleted), so words that
    were separated only by such a character stay separate:
    ``"Привет,\\nмир"`` becomes ``"привет мир"`` rather than ``"приветмир"``.

    Args:
        message: Raw message text.

    Returns:
        The lowercased text with single spaces between the remaining words;
        an empty string if nothing is left.
    """
    removable = string.punctuation + "\n-«»№1234567890"
    # Map each removable character to a space instead of dropping it.
    to_space = str.maketrans(dict.fromkeys(removable, " "))

    # split() with no argument collapses runs of whitespace and trims the ends.
    return " ".join(message.translate(to_space).split()).lower()

def remove_emoji(text: str) -> str:
    """Return ``text`` with every emoji removed (each is replaced by an empty string)."""
    without_emoji = emoji.replace_emoji(text, replace="")
    return without_emoji

def remove_stopwords(message: str) -> str:
    """Remove greeting phrases and Russian stop words from ``message``.

    The entries of ``HELLO_WORDS`` are capitalized and mostly two-word
    phrases, so comparing them with individual tokens of lowercased text never
    matches. They are therefore removed as whole phrases, case-insensitively
    and only on word boundaries, before the text is tokenized and filtered
    against ``STOP_WORDS``.

    Args:
        message: Message text (typically already lowercased and stripped of
            punctuation).

    Returns:
        The remaining tokens joined by single spaces; an empty string if
        nothing is left.
    """
    import re  # local import: `re` is not imported at the top of the notebook

    # Longest greeting first, so a longer phrase wins over a shorter one it contains.
    greetings = sorted(HELLO_WORDS, key=len, reverse=True)
    if greetings:
        # Allow any run of whitespace between the words of a greeting.
        alternatives = (
            r"\s+".join(re.escape(part) for part in greeting.split())
            for greeting in greetings
        )
        pattern = r"\b(?:" + "|".join(alternatives) + r")\b"
        message = re.sub(pattern, " ", message, flags=re.IGNORECASE)

    # Check tokens against STOP_WORDS directly rather than rebuilding a set union per token.
    return " ".join(word for word in word_tokenize(message) if word not in STOP_WORDS)

def lemmatize_msg(message: str) -> str:
    """Lemmatize ``message`` token by token and re-join the lemmas with spaces.

    Each token from ``word_tokenize`` is passed to the shared ``LEMMATIZER``
    on its own, and only the first element of the returned list is kept.
    """
    lemmas = []
    for token in word_tokenize(message):
        pieces = LEMMATIZER.lemmatize(token)
        lemmas.append(pieces[0])
    return " ".join(lemmas)

def preproc(patient_messages):
    """Return a cleaned copy of ``patient_messages`` with a normalised ``text`` column.

    Steps: drop rows with missing values, strip punctuation and digits, strip
    emoji, remove stop words, drop rows whose text became empty, lemmatize.

    The input frame is left untouched. The previous version dropped rows and
    overwrote ``text`` in place, so later cells reading the raw
    ``patient_messages["text"]`` silently received half-preprocessed text.

    Args:
        patient_messages: DataFrame with at least a string ``text`` column.

    Returns:
        A new DataFrame with the same columns, holding only the rows whose
        cleaned text is non-empty.
    """
    # Work on an independent copy so the caller's frame keeps its raw text.
    cleaned = patient_messages.dropna().copy()

    cleaned["text"] = (
        cleaned["text"]
        .parallel_apply(remove_punct)
        .parallel_apply(remove_emoji)
        .parallel_apply(remove_stopwords)
    )

    # Messages made up only of punctuation, emoji or stop words are now empty.
    # Copy after filtering so the assignment below does not write to a slice.
    cleaned = cleaned[cleaned["text"] != ""].copy()
    cleaned["text"] = cleaned["text"].parallel_apply(lemmatize_msg)
    return cleaned

In [8]:
# Run the full text-cleaning pipeline; each parallel_apply stage shows its own progress bar.
patient_messages_preproc = preproc(patient_messages)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11180), Label(value='0 / 11180')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11180), Label(value='0 / 11180')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11180), Label(value='0 / 11180')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=10600), Label(value='0 / 10600')))…

In [9]:
# Download the pretrained Navec archive (per its file name: hudlit, 500K words, 300-d)
# into the working directory and install the `navec` package.
# NOTE(review): re-running this cell downloads the archive again; `wget -nc` would skip it.
!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar
!pip install navec

In [10]:
from navec import Navec

# Load the embeddings from the archive in the working directory.
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)

def text_2_vec(text):
    """Embed ``text`` as the mean of its words' Navec vectors.

    Words are obtained with ``text.split()``. A word missing from the
    ``navec`` vocabulary contributes a zero vector, so it still counts toward
    the mean.

    Args:
        text: Whitespace-separated, preprocessed message text.

    Returns:
        A 1-D array of length 300. Text with no words yields a zero vector
        rather than a NaN scalar, so results can always be stacked into a
        matrix.
    """
    dim = 300  # vector size of the loaded "300d" Navec pack
    vectors = []

    for word in text.split():
        try:
            vectors.append(navec[word])
        except KeyError:
            # Out-of-vocabulary word. Only the lookup failure is caught, so
            # interrupts and unrelated errors propagate.
            vectors.append(np.zeros(dim))

    if not vectors:
        return np.zeros(dim)

    return np.array(vectors).mean(axis=0)

In [11]:
# One row per preprocessed message: its id, the cleaned text and the averaged Navec vector.
df = pd.DataFrame({
    "id": patient_messages_preproc["id"],
    "message": patient_messages_preproc["text"].values,
    "message_emb": patient_messages_preproc["text"].parallel_apply(text_2_vec),
})

# Stack the per-message vectors into one array for scikit-learn
# (2-D as long as every vector has the same length).
emb_matrix = np.array(df["message_emb"].tolist())

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=10600), Label(value='0 / 10600')))…

In [12]:
from joblib import Parallel, delayed
from sklearn.metrics import silhouette_samples
from tqdm import tqdm

In [13]:
n_jobs = 8

def parallel_silhouette_samples(X, labels, metric='cosine', n_jobs=1):
    """Return the mean silhouette coefficient of ``labels`` over all rows of ``X``.

    The previous version split the rows into ``n_jobs`` contiguous chunks and
    ran ``silhouette_samples`` on each chunk separately. A sample's silhouette
    depends on its distances to every other sample, so for ``n_jobs > 1`` the
    result depended on the number of workers, and a chunk holding a single
    label raised. The whole dataset is now scored in one call; ``n_jobs`` is
    passed through ``silhouette_samples``' extra keyword arguments to the
    pairwise-distance computation, which does the parallel work.

    Args:
        X: Feature matrix, one row per sample.
        labels: Cluster label of each row of ``X``.
        metric: Distance metric name understood by scikit-learn.
        n_jobs: Number of parallel jobs for the distance computation.

    Returns:
        The mean of the per-sample silhouette coefficients.
    """
    per_sample = silhouette_samples(X, labels, metric=metric, n_jobs=n_jobs)
    return per_sample.mean()

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import MiniBatchKMeans

# Mean silhouette for each candidate cluster count k = 5, 10, ..., 45.
silhouette_scores = []

for k in tqdm(range(5, 50, 5)):
    # Mini-batch k-means on the embeddings, then score the resulting labels.
    kmeans = MiniBatchKMeans(random_state=42, batch_size=1000, n_clusters=k)
    kmeans.fit(emb_matrix)
    score = parallel_silhouette_samples(X=emb_matrix, labels=kmeans.labels_, metric="cosine")
    silhouette_scores += [score]

 56%|█████▌    | 5/9 [16:18<12:59, 194.84s/it]

In [None]:
# Silhouette score against the number of clusters for the Navec embeddings.
fig, ax = plt.subplots()
ax.plot(range(5, 50, 5), silhouette_scores, 'bx-')
ax.set_xlabel('Кол-во кластеров')
ax.set_ylabel('Силуэтное расстояние')
ax.set_title('Силуэтный метод для эмбеддингов Navec')
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Number of clusters used for the final KMeans fit.
# NOTE(review): hard-coded; the recorded silhouette sweep output stops at 5/9 - confirm this choice.
optimal_k = 15

In [None]:
# Project the embeddings onto 50 principal components before clustering.
# random_state makes the projection repeatable if PCA picks its randomized SVD
# solver, matching the fixed seed used for the k-means fits.
pca = PCA(n_components=50, random_state=42)
data_pca = pca.fit_transform(emb_matrix)

In [None]:
# Final clustering: seeded KMeans with optimal_k clusters on the PCA-reduced embeddings.
kmeans = KMeans(n_clusters=optimal_k, random_state=42).fit(data_pca)

In [None]:
# Attach each message's cluster label. The labels are positional: they follow the
# row order of emb_matrix, which was built from df.message_emb.
df["cluster"] = kmeans.labels_
df.head()

In [None]:
# Eyeball five random messages from cluster 0.
df[df.cluster == 0].sample(5)

# Векторизация с помощью transformers

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load the pretrained model and its tokenizer.
model_name = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embeddings(texts):
    """Return the [CLS] embedding(s) of ``texts`` from the loaded transformer model.

    Args:
        texts: A single string, or a sequence of strings to embed as one batch.

    Returns:
        For a single string, a 1-D array (unchanged from before). For a
        sequence, a 2-D array with one row per input text; the previous
        version silently returned only the first text's embedding.
    """
    single = isinstance(texts, str)
    batch = [texts] if single else list(texts)

    # Tokenize the batch, padding to the longest text and truncating over-long ones.
    inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")

    # Inference only: gradients are not needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # The [CLS] token is the first position of last_hidden_state.
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    return embeddings[0] if single else embeddings

In [None]:
# Embed every message one row at a time through pandarallel.
# Note this reads patient_messages["text"], not patient_messages_preproc.
patient_messages["embedding"] = patient_messages["text"].parallel_apply(get_embeddings)

In [None]:
# Stack the transformer embeddings into one array. This rebinds emb_matrix,
# which previously held the Navec embedding matrix.
emb_matrix = np.array(patient_messages.embedding.to_list())

In [None]:
# patient_messages_sample = patient_messages.sample(30000)

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import MiniBatchKMeans

# Mean silhouette for each candidate cluster count k = 5, 10, ..., 45 (transformer embeddings).
silhouette_scores = []

for k in tqdm(range(5, 50, 5)):
    # Mini-batch k-means on the embeddings, then score the resulting labels.
    kmeans = MiniBatchKMeans(random_state=42, batch_size=1000, n_clusters=k)
    kmeans.fit(emb_matrix)
    score = parallel_silhouette_samples(X=emb_matrix, labels=kmeans.labels_, metric="cosine")
    silhouette_scores += [score]

In [None]:
# Silhouette score against the number of clusters for the transformer embeddings.
fig, ax = plt.subplots()
ax.plot(range(5, 50, 5), silhouette_scores, 'bx-')
ax.set_xlabel('Кол-во кластеров')
ax.set_ylabel('Силуэтное расстояние')
ax.set_title('Силуэтный метод для transformers')
plt.show()

In [None]:
# Number of clusters used for the final KMeans fit on the transformer embeddings.
# NOTE(review): hard-coded; confirm it against the silhouette plot above.
optimal_k = 15

In [None]:
# Project the embeddings onto 50 principal components before clustering.
# random_state makes the projection repeatable if PCA picks its randomized SVD
# solver, matching the fixed seed used for the k-means fits.
pca = PCA(n_components=50, random_state=42)
data_pca = pca.fit_transform(emb_matrix)

In [None]:
# Final clustering: seeded KMeans with optimal_k clusters on the PCA-reduced embeddings.
kmeans = KMeans(n_clusters=optimal_k, random_state=42).fit(data_pca)

In [None]:
# Attach each message's cluster label. The labels are positional: they follow the
# row order of emb_matrix, which was built from patient_messages.embedding.
patient_messages["cluster"] = kmeans.labels_
patient_messages.head()

In [None]:
# Eyeball five random messages from cluster 10.
patient_messages[patient_messages.cluster == 10].sample(5)

In [None]:
# Distinct cluster labels in ascending order.
np.sort(patient_messages["cluster"].unique())