In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn import model_selection, ensemble, metrics, cluster
import umap

In [None]:
from functools import partial

import torch
from transformers import AutoTokenizer, AutoModel

# One checkpoint id feeds both loaders so the tokenizer and the encoder
# are always taken from the same pretrained model.
RUBERT_CHECKPOINT = "cointegrated/rubert-tiny2"

tokenizer = AutoTokenizer.from_pretrained(RUBERT_CHECKPOINT)
model = AutoModel.from_pretrained(RUBERT_CHECKPOINT)

def embed_bert_cls(text, model, tokenizer):
    """Return the normalised first-token embedding of ``text`` as a NumPy array.

    Args:
        text: Input handed straight to ``tokenizer``.
        model: Callable encoder exposing ``.device`` and returning an object
            with a ``last_hidden_state`` tensor.
        tokenizer: Callable producing a mapping of tensors; it is invoked with
            ``padding=True, truncation=True, return_tensors='pt'``.

    Returns:
        The first row of the normalised embeddings, moved to CPU and
        converted with ``.numpy()``.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Every tokenizer output must sit on the same device as the model.
    inputs_on_device = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Inference only: no autograd bookkeeping needed for the forward pass.
    with torch.no_grad():
        encoder_output = model(**inputs_on_device)

    # Position 0 along the sequence axis is the slot this function treats as
    # [CLS]; normalize() rescales each row to unit L2 norm by default.
    cls_vectors = torch.nn.functional.normalize(encoder_output.last_hidden_state[:, 0, :])
    return cls_vectors[0].cpu().numpy()


def rubert_embeddings(s: pd.Series) -> np.ndarray:
    """Embed every element of ``s`` and stack the vectors into one 2-D array.

    Each value is passed on its own to ``embed_bert_cls`` together with the
    module-level ``model`` and ``tokenizer``. Row ``i`` of the result is the
    embedding of the ``i``-th element of ``s``.
    """
    # Bind the shared model/tokenizer once; only the text varies per element.
    embed_one = partial(embed_bert_cls, model=model, tokenizer=tokenizer)
    per_text_vectors = s.map(embed_one)
    return np.vstack(per_text_vectors.values)


def umap_embeddings(X: np.ndarray) -> np.ndarray:
    """Standardise ``X`` column-wise and project it to 3 dimensions with UMAP.

    Args:
        X: 2-D array with one sample per row and one feature per column.

    Returns:
        The result of ``UMAP.fit_transform`` on the standardised data, using
        ``n_components=3``, the cosine metric and a fixed ``random_state``.
    """
    feature_std = X.std(0)
    # A constant feature has zero spread; dividing by 0 would fill that column
    # with NaN/inf and poison the UMAP input. Dividing by 1 instead leaves the
    # (already centred) column at 0. ones_like keeps the original dtype.
    safe_std = np.where(feature_std == 0, np.ones_like(feature_std), feature_std)
    normalized_embeddings = (X - X.mean(0)) / safe_std
    return umap.UMAP(
        random_state=100,
        n_neighbors=10,
        n_components=3,
        n_jobs=1,
        metric="cosine",
        min_dist=0.1,
    ).fit_transform(normalized_embeddings)


def optics_clusterize(X: np.ndarray) -> np.ndarray:
    """Standardise ``X`` column-wise and cluster its rows with OPTICS.

    Args:
        X: 2-D array with one sample per row and one feature per column.

    Returns:
        The ``labels_`` attribute of the fitted ``cluster.OPTICS`` estimator
        (Euclidean metric), one entry per row of ``X``.
    """
    feature_std = X.std(0)
    # A constant feature has zero spread; dividing by 0 would fill that column
    # with NaN/inf, which the estimator cannot work with. Dividing by 1 instead
    # leaves the (already centred) column at 0. ones_like keeps the dtype.
    safe_std = np.where(feature_std == 0, np.ones_like(feature_std), feature_std)
    normalized_embeddings = (X - X.mean(0)) / safe_std
    return cluster.OPTICS(metric="euclidean").fit(normalized_embeddings).labels_

In [None]:
# Example usage: build df_train_data from the competition files
# (adjust the paths below to wherever the CSV/TSV files live).
df_train_groups = pd.read_csv("/kaggle/input/web-document-analysis/train_groups.csv")
df_test_groups = pd.read_csv("/kaggle/input/web-document-analysis/test_groups.csv")
# The titles file is tab-separated.
df_docs_titles = pd.read_csv("/kaggle/input/web-document-analysis/docs_titles.tsv", sep="\t")

# Inner join on doc_id: only rows whose doc_id occurs in both frames survive.
df_train_data = pd.merge(df_train_groups, df_docs_titles, on="doc_id", how="inner")

In [None]:
# Group inspected in this cell; change it to visualise another group.
GROUP_ID = 66

# .copy() so the columns added below never write into df_train_data.
df_train_samples = df_train_data[df_train_data["group_id"] == GROUP_ID].copy()

# Embed the titles once, then derive both the 3-D projection and the
# cluster labels from the same embedding matrix.
rubert_embeddings_train = rubert_embeddings(df_train_samples["title"])
umap_embeddings_train = umap_embeddings(rubert_embeddings_train)
optics_labels_train = optics_clusterize(rubert_embeddings_train)

# Labels and targets are stored as strings (presumably so plotly treats them
# as discrete categories rather than a numeric scale).
df_train_samples["cluster_label"] = [str(label) for label in optics_labels_train]
df_train_samples["target_name"] = df_train_samples["target"].map(str)

# One plot axis per UMAP component.
df_train_samples["x"] = umap_embeddings_train[:, 0]
df_train_samples["y"] = umap_embeddings_train[:, 1]
df_train_samples["z"] = umap_embeddings_train[:, 2]

px.scatter_3d(
    df_train_samples,
    x="x",
    y="y",
    z="z",
    symbol="target_name",
    color="cluster_label",
    template="plotly_dark",
    # hover_data is documented as a list of column names; a bare string is
    # not accepted by every plotly release.
    hover_data=["title"],
).show()