In [1]:
%pip install kagglehub -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import polars as pl
import torch
import transformers
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm
import kagglehub
from pathlib import Path

In [3]:
path = Path(kagglehub.dataset_download('vfomenko/russian-news-2020'))
df = pl.read_csv(path / "news.csv").select("text", "publication_date")

print(df.head())

shape: (5, 2)
┌─────────────────────────────────┬───────────────────────────┐
│ text                            ┆ publication_date          │
│ ---                             ┆ ---                       │
│ str                             ┆ str                       │
╞═════════════════════════════════╪═══════════════════════════╡
│ В 1930-е годы Советский Союз о… ┆ 2020-08-30T00:01:00+03:00 │
│ Олимпийская чемпионка по фигур… ┆ 2020-08-31T20:04:00+03:00 │
│ Российский врач-диетолог Римма… ┆ 2020-08-31T20:07:00+03:00 │
│ В 2019 году телеканал «Ю» запу… ┆ 2020-08-30T00:04:00+03:00 │
│ Актер  Михаил Ефремов  система… ┆ 2020-08-31T18:27:00+03:00 │
└─────────────────────────────────┴───────────────────────────┘


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('deepvk/RuModernBERT-base', attn_implementation="flash_attention_2")
model = AutoModel.from_pretrained('deepvk/RuModernBERT-base').to(device)
model.eval()

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.75M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/837 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

2025-05-29 20:26:18.441088: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748550378.653199      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748550378.716487      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

ModernBertModel(
  (embeddings): ModernBertEmbeddings(
    (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (drop): Dropout(p=0.0, inplace=False)
  )
  (layers): ModuleList(
    (0): ModernBertEncoderLayer(
      (attn_norm): Identity()
      (attn): ModernBertAttention(
        (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
        (rotary_emb): ModernBertRotaryEmbedding()
        (Wo): Linear(in_features=768, out_features=768, bias=False)
        (out_drop): Identity()
      )
      (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): ModernBertMLP(
        (Wi): Linear(in_features=768, out_features=2304, bias=False)
        (act): GELUActivation()
        (drop): Dropout(p=0.0, inplace=False)
        (Wo): Linear(in_features=1152, out_features=768, bias=False)
      )
    )
    (1-21): 21 x ModernBertEncoderLayer(
      (attn_norm): LayerNorm((768,), eps=1e-05, e

In [8]:
import torch.nn as nn

model = nn.DataParallel(model)

In [9]:
texts = df['text'].to_list()
timestamps = df['publication_date']
embeddings = []

batch_size = 4
for i in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[i:i+batch_size]
    inputs = tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        max_length=8192,
        return_tensors='pt'
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeddings.tolist())

  0%|          | 0/5419 [00:00<?, ?it/s]

In [10]:
result_df = pl.DataFrame({
    'text': texts,
    'timestamp': timestamps,
    'embedding': embeddings,
})

print(result_df.head())

shape: (5, 3)
┌─────────────────────────────────┬───────────────────────────┬─────────────────────────────────┐
│ text                            ┆ timestamp                 ┆ embedding                       │
│ ---                             ┆ ---                       ┆ ---                             │
│ str                             ┆ str                       ┆ list[f64]                       │
╞═════════════════════════════════╪═══════════════════════════╪═════════════════════════════════╡
│ В 1930-е годы Советский Союз о… ┆ 2020-08-30T00:01:00+03:00 ┆ [1.074591, -1.174109, … -0.412… │
│ Олимпийская чемпионка по фигур… ┆ 2020-08-31T20:04:00+03:00 ┆ [-0.124826, -0.899925, … -0.11… │
│ Российский врач-диетолог Римма… ┆ 2020-08-31T20:07:00+03:00 ┆ [0.705892, -1.752971, … -0.415… │
│ В 2019 году телеканал «Ю» запу… ┆ 2020-08-30T00:04:00+03:00 ┆ [0.944776, -1.135635, … -0.155… │
│ Актер  Михаил Ефремов  система… ┆ 2020-08-31T18:27:00+03:00 ┆ [0.129445, -0.584635, … 0.0956… │
└─────

In [11]:
result_df.write_parquet('russian_news_embeddings.parquet')