In [None]:
!pip install umap-learn
!pip install hdbscan

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
import hdbscan

import torch.nn.functional as F
import torch

from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader, TensorDataset

from tqdm import tqdm

import umap

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Use the first CUDA GPU when one is visible, otherwise run on the CPU.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)

In [None]:
table_path = "/kaggle/input/full-processed-dataset/full_processed_dataset.csv"

# Read the table, discard every row with at least one missing value and make
# sure parsed_text holds plain Python strings. The original row labels are
# kept (the index is not reset).
data = (
    pd.read_csv(table_path)
    .dropna(axis=0, how='any')
    .assign(parsed_text=lambda frame: frame.parsed_text.astype(str))
)

data.head()

In [None]:
# Collect the index labels of rows whose text is a scraped login page.
# The lookup works on the actual index labels: after dropna() the labels are
# no longer the contiguous range 0..len(data)-1, so iterating over
# range(data.shape[0]) with .loc would raise KeyError on a dropped label and
# would never visit labels >= len(data).
login_page_marker = 'Страница входа Восстановить Зарегистрироваться'
is_login_page = data['parsed_text'].str.contains(login_page_marker, regex=False)
lol = data.index[is_login_page.to_numpy()].tolist()
print(lol)

In [None]:
data.loc[lol, :]

In [None]:
data.drop(index=lol, inplace=True)

In [None]:
data['base_category_nm'].unique()

In [None]:
def draw_wordcloud(texts, max_words=1000, width=1000, height=500):
    """Build a word-cloud image from a collection of strings.

    Args:
        texts: iterable of strings; they are joined with single spaces.
        max_words: upper bound on the number of words shown.
        width: image width passed to WordCloud.
        height: image height passed to WordCloud.

    Returns:
        The image produced by ``WordCloud.to_image()``.
    """
    cloud = WordCloud(
        background_color='white',
        max_words=max_words,
        width=width,
        height=height,
    )
    corpus = ' '.join(texts)
    # generate() returns the WordCloud itself, so the calls can be chained.
    return cloud.generate(corpus).to_image()

In [None]:
draw_wordcloud(data[data["base_category_nm"] == 'Образование']["parsed_text"])

In [None]:
print('общий размер датасета', data.shape[0])
print('количество элементов не из образования', data[data['base_category_nm'] != 'Образование']['parsed_text'].count())
print('количество элементов из образования', data[data['base_category_nm'] == 'Образование']['parsed_text'].count())
print('максимальный размер строки в датасете', max(len(i) for i in data['parsed_text']))

In [None]:
plt.hist([len(i) for i in data['parsed_text']])
plt.show()

In [None]:
# Tokenizer and encoder come from the same checkpoint; the encoder is moved
# to the selected device.
checkpoint = 'intfloat/multilingual-e5-large'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint).to(device)

In [None]:
data['text'] = data['parsed_text'].apply(lambda x: "query: " + x)
data.head()

In [None]:
dataset = Dataset.from_pandas(data[data["base_category_nm"] == 'Образование'])

In [None]:
%%time

TEST_SIZE = 0.3
SPLIT_RANDOM_SEED = 42
MAX_LENGTH = 512  # maximum number of tokens kept per text; longer inputs are truncated

# The whole dataset is tokenized in a single call, so padding=True pads every
# sequence to the longest one in the dataset (at most MAX_LENGTH tokens).
batch_dict = tokenizer(
    dataset["text"],
    max_length=MAX_LENGTH,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Pair token ids with their attention masks so a DataLoader can batch them.
trans_dataset = TensorDataset(batch_dict['input_ids'], batch_dict['attention_mask'])

In [None]:
# One sequence per step; shuffle stays off so the order of the embeddings
# matches the row order of the dataset.
batch_size = 1
data_loader = DataLoader(
    trans_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
%%time

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool token vectors over the positions selected by the attention mask.

    Positions where ``attention_mask`` is zero are zeroed out before summing
    along dim 1, and the sum is divided by the number of non-masked positions
    of each sequence.

    Args:
        last_hidden_states: token-level hidden states; dim 1 is the token axis.
        attention_mask: mask with the same leading dims; non-zero marks a real token.

    Returns:
        One pooled vector per sequence.
    """
    is_padding = ~attention_mask.unsqueeze(-1).bool()
    token_sum = last_hidden_states.masked_fill(is_padding, 0.0).sum(dim=1)
    token_count = attention_mask.sum(dim=1).unsqueeze(-1)
    return token_sum / token_count

# Encode every text and stack the pooled sentence embeddings into `result`
# (one row per dataset row, in dataset order).
model.eval()
embedding_chunks = []
# Pure inference: no_grad stops autograd from recording a graph and keeping
# the activations of every forward pass alive.
with torch.no_grad():
    for input_ids, attention_mask in tqdm(data_loader, desc='Trans'):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        out = model(input_ids, attention_mask=attention_mask)
        embeddings = average_pool(out.last_hidden_state, attention_mask)

        res = embeddings.cpu().numpy().astype("float")
        embedding_chunks.append(res)

# Concatenate once at the end; np.append inside the loop re-copies the whole
# accumulated array on every iteration (quadratic in the number of batches).
result = np.concatenate(embedding_chunks, axis=0) if embedding_chunks else None

In [None]:
result.shape

In [None]:
import gc
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Seeded so that the cluster ids inspected in the cells below refer to the
# same groups on every run (k-means initialization is random otherwise).
kmeans = KMeans(n_clusters=4, random_state=SPLIT_RANDOM_SEED).fit(result)

In [None]:
draw_wordcloud(data[data["base_category_nm"] == 'Образование'][kmeans.labels_ == 0].parsed_text)

In [None]:
data[data["base_category_nm"] == 'Образование'][kmeans.labels_ == 0]

In [None]:
draw_wordcloud(data[data["base_category_nm"] == 'Образование'][kmeans.labels_ == 1].parsed_text)

In [None]:
data[data["base_category_nm"] == 'Образование'][kmeans.labels_ == 1]

In [None]:
draw_wordcloud(data[data["base_category_nm"] == 'Образование'][kmeans.labels_ == 2].parsed_text)

In [None]:
data[data["base_category_nm"] == 'Образование'][kmeans.labels_ == 2]

In [None]:
draw_wordcloud(data[data["base_category_nm"] == 'Образование'][kmeans.labels_ == 3].parsed_text)

In [None]:
data[data["base_category_nm"] == 'Образование'][kmeans.labels_ == 3]

In [None]:
# Standardize every embedding dimension before the UMAP projection.
feature_scaler = StandardScaler()
scaled_penguin_data = feature_scaler.fit_transform(result)

In [None]:
%%time

# UMAP is stochastic; a fixed seed keeps the 2-D layout (and every scatter
# plot and distance statistic computed from it) reproducible across runs.
# NOTE: with a fixed random_state UMAP runs single-threaded, so it is slower.
reducer = umap.UMAP(random_state=SPLIT_RANDOM_SEED)
embedding = reducer.fit_transform(scaled_penguin_data)
embedding.shape

In [None]:
# Colour each projected point by its k-means label using the default seaborn palette.
palette = sns.color_palette()
point_colors = [palette[x] for x in kmeans.labels_]
plt.scatter(embedding[:, 0], embedding[:, 1], c=point_colors)

plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the Education feature', fontsize=24);

In [None]:
dbscan = DBSCAN(eps=0.05).fit(result)

In [None]:
set(dbscan.labels_)

In [None]:
embedding[dbscan.labels_ == 0]

In [None]:
# 0.5
fig, ax = plt.subplots()
for lbl in np.arange(-1, 6):
    ax.scatter(
        embedding[dbscan.labels_ == lbl][:, 0],
        embedding[dbscan.labels_ == lbl][:, 1],
        c=sns.color_palette()[lbl],
        label=lbl)

ax.legend()
ax.grid(True)

plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the Education feature DBSCAN', fontsize=24)

plt.show()

In [None]:
# 0.05
fig, ax = plt.subplots()
for lbl in np.arange(-1, 6):
    ax.scatter(
        embedding[dbscan.labels_ == lbl][:, 0],
        embedding[dbscan.labels_ == lbl][:, 1],
        c=sns.color_palette()[lbl],
        label=lbl)

ax.legend()
ax.grid(True)

plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the Education feature DBSCAN', fontsize=24)

plt.show()

In [None]:
# 5
fig, ax = plt.subplots()
for lbl in np.arange(-1, 6):
    ax.scatter(
        embedding[dbscan.labels_ == lbl][:, 0],
        embedding[dbscan.labels_ == lbl][:, 1],
        c=sns.color_palette()[lbl],
        label=lbl)

ax.legend()
ax.grid(True)

plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the Education feature DBSCAN', fontsize=24)

plt.show()

In [None]:
from sklearn.decomposition import PCA

# Project the embeddings onto their first 10 principal components.
pca = PCA(n_components=10)
kek = pca.fit_transform(result)

In [None]:
spectr = SpectralClustering(n_jobs=-1).fit(result[:1000])

In [None]:
set(spectr.labels_)

In [None]:
from collections import Counter
lol = Counter(spectr.labels_)
lol

In [None]:
fig, ax = plt.subplots()
for lbl in np.arange(0, 8):
    ax.scatter(
        embedding[:1000][spectr.labels_ == lbl][:, 0],
        embedding[:1000][spectr.labels_ == lbl][:, 1],
        c=sns.color_palette()[lbl],
        label=lbl)

ax.legend()
ax.grid(True)

plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the Education feature SPECTR', fontsize=24)

plt.show()

In [None]:
data[data["base_category_nm"] == 'Образование'][:1000][spectr.labels_ == 7]

In [None]:
hdbscan_model = hdbscan.HDBSCAN(cluster_selection_epsilon=5).fit(result)

In [None]:
from collections import Counter
lol = Counter(hdbscan_model.labels_)
lol

In [None]:
set(hdbscan_model.labels_)

In [None]:
data['ЕГЭ'] = data['text'].apply(lambda x: (x.find('ЕГЭ') != -1) or (x.find('егэ') != -1))

In [None]:
new_data = data[data["base_category_nm"] == 'Образование']
new_data['cluster'] = hdbscan_model.labels_

In [None]:
new_data[new_data['ЕГЭ'] == True]['cluster'].value_counts()

In [None]:
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

ege_points = embedding[(new_data['ЕГЭ'] == True)]

In [None]:
ege_distances = euclidean_distances(ege_points, ege_points)

mean_ege_distance = np.mean(ege_distances)
print(f"Среднее расстояние внутри группы ЕГЭ: {mean_ege_distance}")

In [None]:
unique_labels = new_data['cluster'].unique()

mean_distances_within_groups = {}
for lbl in unique_labels:
    if lbl == -1:
        continue
    group_points = embedding[(new_data['cluster'] == lbl)]
    group_distances = euclidean_distances(group_points, group_points)
    mean_distance = np.mean(group_distances)
    mean_distances_within_groups[lbl] = mean_distance

for lbl, mean_distance in mean_distances_within_groups.items():
    print(f"Среднее расстояние внутри группы {lbl}: {mean_distance}")

In [None]:
# Mean distance between the points of every unordered pair of clusters
# (each pair is computed once, for lbl1 < lbl2; noise label -1 is skipped).
mean_distances_between_groups = {}
cluster_labels = [lbl for lbl in unique_labels if lbl != -1]

for lbl1 in cluster_labels:
    for lbl2 in cluster_labels:
        if not lbl1 < lbl2:
            continue
        group1_points = embedding[(new_data['cluster'] == lbl1)]
        group2_points = embedding[(new_data['cluster'] == lbl2)]
        distances_between_groups = euclidean_distances(group1_points, group2_points)
        mean_distance = distances_between_groups.mean()
        mean_distances_between_groups[(lbl1, lbl2)] = mean_distance

for pair, mean_distance in mean_distances_between_groups.items():
    lbl1, lbl2 = pair
    print(f"Среднее расстояние между группами {lbl1} и {lbl2}: {mean_distance}")

In [None]:
# Every HDBSCAN cluster (noise excluded) on the UMAP plane, with the flagged
# 'ЕГЭ' points drawn on top in red.
fig, ax = plt.subplots()
for lbl in unique_labels:
    if lbl != -1:
        cluster_points = embedding[new_data['cluster'] == lbl]
        ax.scatter(
            cluster_points[:, 0],
            cluster_points[:, 1],
            label=f'Кластер {lbl}'
        )

ax.scatter(ege_points[:, 0], ege_points[:, 1], c='red', label='ЕГЭ')

ax.legend()
ax.grid(True)
ax.set_aspect('equal', 'datalim')
ax.set_title('UMAP projection of the Education feature', fontsize=24)
plt.show()

In [None]:
new_data['cluster'].unique()

In [None]:
new_data.to_csv('education.csv')

In [None]:
# Fixed palette of 25 hex colours, indexed by cluster label in the plots
# below (a label of -1 picks the last entry via negative indexing).
colors = [
    '#000000', '#ff0000', '#00ff00', '#0000ff', '#ff9900',
    '#99cc00', '#cc00cc', '#ffcc00', '#33ccff', '#009900',
    '#cc99cc', '#ff3300', '#66ffcc', '#99ffcc', '#ffcc99',
    '#ff9999', '#cccccc', '#993366', '#ffcc66', '#669999',
    '#ff6600', '#6633ff', '#9966ff', '#6699cc', '#339933',
]

In [None]:
# Flagged 'ЕГЭ' points only, split by a hand-picked set of HDBSCAN labels.
ege_mask = new_data['ЕГЭ'] == True

fig, ax = plt.subplots()
for lbl in [-1, 20, 21, 6]:
    selected_points = embedding[ege_mask & (new_data['cluster'] == lbl)]
    ax.scatter(
        selected_points[:, 0],
        selected_points[:, 1],
        c=colors[lbl],
        label=lbl)

ax.legend()
ax.grid(True)

ax.set_aspect('equal', 'datalim')
ax.set_title('UMAP projection of the Education feature HDBSCAN', fontsize=24)

plt.show()

In [None]:
fig, ax = plt.subplots()
for lbl in np.arange(-1, 22):
    ax.scatter(
        embedding[hdbscan_model.labels_ == lbl][:, 0],
        embedding[hdbscan_model.labels_ == lbl][:, 1],
        c=colors[lbl],
        label=lbl)

ax.legend()
ax.grid(True)

plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the Education feature HDBSCAN', fontsize=24)

plt.show()

In [None]:
data[data["base_category_nm"] == 'Образование'][hdbscan_model.labels_ == 21]

In [None]:
data[data["base_category_nm"] == 'Образование'][hdbscan_model.labels_ == 20]

In [None]:
data[data["base_category_nm"] == 'Образование'][hdbscan_model.labels_ == 17]

In [None]:
data[data["base_category_nm"] == 'Образование'][hdbscan_model.labels_ == 1]

In [None]:
data[data["base_category_nm"] == 'Образование'][hdbscan_model.labels_ == 8]

In [None]:
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=2).fit(result)

In [None]:
from collections import Counter
lol = Counter(hdbscan_model.labels_)
lol

In [None]:
set(hdbscan_model.labels_)

In [None]:
data[data["base_category_nm"] == 'Образование'][hdbscan_model.labels_ == 164]

In [None]:
data[data["base_category_nm"] == 'Образование'][hdbscan_model.labels_ == 134]

In [None]:
data[data["base_category_nm"] == 'Образование'][hdbscan_model.labels_ == -1]

In [None]:
fig, ax = plt.subplots()
for lbl in np.arange(-1, 22):
    ax.scatter(
        embedding[hdbscan_model.labels_ == lbl][:, 0],
        embedding[hdbscan_model.labels_ == lbl][:, 1],
        c=colors[lbl],
        label=lbl)

ax.legend()
ax.grid(True)

plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the Education feature HDBSCAN', fontsize=24)

plt.show()