In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the labelled, tab-separated training file and discard its
# "Unnamed: 0" column in the same expression.
df = pd.read_csv("preprocessed_training.tsv", sep='\t').drop(columns=["Unnamed: 0"])

# Preview the first ten rows.
df.head(10)

Unnamed: 0,id,amount,preprocessed_text,category
0,1,15300.0,участие конференция майкоп договор,SERVICE
1,2,40200.0,оказание услуга договор,SERVICE
2,3,1440.0,оплата порошок стиральный ariel color automat ...,NON_FOOD_GOODS
3,4,240000000.0,возврат денежный средство договор заём ндс,LOAN
4,5,1360000.0,оплата дог соглый оплата сброс загрязнять веще...,NOT_CLASSIFIED
5,6,1820000.0,оплата дог финансовый аренда акт приём-передач...,LEASING
6,7,4900.0,оплата мицеллярный вода чистый линия цветочный...,NON_FOOD_GOODS
7,8,3250.0,оплата стиральный порошок счёт,NON_FOOD_GOODS
8,9,5000.0,оплата договор счёт мясной деликатес ндс,FOOD_GOODS
9,10,1840000.0,оплата договор процентный заём ндс,LOAN


In [None]:
# (rows, columns) of the labelled frame.
df.shape

(500, 4)

In [None]:
# Read the tab-separated pseudo-labelled file; its label column is `pseudo_labels`.
df_pseudo = pd.read_csv("pseudo_labeled_data_main.tsv", sep='\t')
# Preview the first ten rows.
df_pseudo.head(10)

Unnamed: 0,id,amount,preprocessed_text,pseudo_labels
0,1,40500.0,тур поездка договор,SERVICE
1,2,32600.0,оказание услуга договор,SERVICE
2,3,4710.0,оплата штраф,NOT_CLASSIFIED
3,4,30900.0,лечение договор,SERVICE
4,5,13200.0,оплата основный долг период договор оао второй...,LOAN
5,6,4210.0,оплата бульон роллтон домашний куриный 90г счёт,FOOD_GOODS
6,7,4240.0,комиссионный вознаграждение валютный перевод,BANK_SERVICE
7,8,4630.0,государственный пошлина,TAX
8,9,8000.0,лечение договор,SERVICE
9,10,1310000.0,оплата счёт рамка договор финансовый аренда ндс,LEASING


In [None]:
# (rows, columns) of the pseudo-labelled frame.
df_pseudo.shape

(25000, 4)

In [None]:
# Number of rows whose pseudo label is missing (NaN).
df_pseudo['pseudo_labels'].isna().sum()

11291

In [None]:
# Keep only the rows that actually carry a pseudo label; the original row
# index of the surviving rows is preserved.
df_pseudo = df_pseudo.dropna(subset=['pseudo_labels'])
df_pseudo.shape

(13709, 4)

In [None]:
# Give the pseudo-label column the same name as the label column of `df`
# so that both frames line up when stacked.
df_pseudo = df_pseudo.rename({'pseudo_labels': 'category'}, axis='columns')

# Stack `df` on top of `df_pseudo` and renumber the rows 0..n-1.
df_train = pd.concat(
    [df, df_pseudo],
    ignore_index=True,
)

df_train.shape

(14209, 4)

In [None]:
# Number of rows per category in the combined training frame.
df_train['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
SERVICE,3019
FOOD_GOODS,2622
NON_FOOD_GOODS,2083
LOAN,1555
NOT_CLASSIFIED,1251
LEASING,1220
REALE_STATE,996
TAX,792
BANK_SERVICE,671


## Создание эмбеддингов текстового столбца

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Define the Dataset
class TextDataset(Dataset):
    """Map-style dataset that serves raw text strings by position.

    Args:
        texts: Any iterable of strings (list, tuple, pandas Series, ...).
            The items are copied into a plain list on construction, so
            ``dataset[i]`` always means "the i-th item in iteration order".
            A pandas Series would otherwise be indexed by *label*, which
            breaks as soon as its index is not 0..n-1 (for example after
            filtering rows).
    """

    def __init__(self, texts):
        # Snapshot into a list so that indexing is positional.
        self.texts = list(texts)

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the text stored at position ``idx``."""
        return self.texts[idx]

# 2. Load the tokenizer and the encoder, and place the encoder on `device`
tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny')
model_rubert = AutoModel.from_pretrained('cointegrated/rubert-tiny').to(device)

texts = df_train['preprocessed_text']

# 3. Wrap the texts in a Dataset and read them in fixed-order batches
#    (no shuffling, so output row i corresponds to input row i)
text_dataset = TextDataset(texts)
batch_size = 512
data_loader = DataLoader(text_dataset, batch_size=batch_size, shuffle=False)

# 4. Encode the data batch by batch
embeddings = []

model_rubert.eval()  # inference mode for the encoder's layers
with torch.no_grad():  # no gradients are needed for feature extraction
    for batch in tqdm(data_loader, desc="Processing batches", total=len(data_loader), ncols=100):
        # Pad every text to the longest one in the batch and truncate over-long ones.
        encoded_inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")

        outputs = model_rubert(
            input_ids=encoded_inputs['input_ids'].to(device),
            attention_mask=encoded_inputs['attention_mask'].to(device),
        )

        # Keep only the hidden state at sequence position 0 of every text
        # (the [CLS] position for BERT-style tokenizers).
        cls_embeddings = outputs.last_hidden_state[:, 0]
        embeddings.append(cls_embeddings)

# Stack the per-batch results into one [num_texts, embedding_dim] tensor.
final_embeddings = torch.cat(embeddings)

print(final_embeddings.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/241k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/468k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.7M [00:00<?, ?B/s]

Processing batches: 100%|███████████████████████████████████████████| 28/28 [00:02<00:00, 12.50it/s]

torch.Size([14209, 312])





In [None]:
# Move the whole embedding matrix to host memory in a single transfer and
# then split it into one NumPy row per sample. Calling `.cpu()` per row
# would trigger one device-to-host copy for every sample.
final_embeddings_list = list(final_embeddings.cpu().numpy())

# One embedding vector per row of the training frame.
df_train['text_embed'] = final_embeddings_list

df_train[['preprocessed_text', 'text_embed']].head()

Unnamed: 0,preprocessed_text,text_embed
0,участие конференция майкоп договор,"[-0.3839216, 0.029149303, 0.041437216, 0.09084..."
1,оказание услуга договор,"[-0.73675907, -0.35754037, -0.26379448, -0.297..."
2,оплата порошок стиральный ariel color automat ...,"[-0.012440931, -0.09139417, -0.20764157, -0.69..."
3,возврат денежный средство договор заём ндс,"[-1.0594479, -0.52556884, -0.5544399, -0.36923..."
4,оплата дог соглый оплата сброс загрязнять веще...,"[-0.2358977, -0.50058454, -0.5359852, -0.76608..."


In [None]:
# Inspect what is stored in `text_embed`: the container type of one cell,
# then the scalar type of its first element.
print(type(df_train['text_embed'][0]))
print(type(df_train['text_embed'][0][0]))

<class 'numpy.ndarray'>
<class 'numpy.float32'>


## Нормализация amount

In [None]:
from sklearn.preprocessing import MinMaxScaler

import joblib

# Rescale `amount` with min-max scaling; the fitted scaler stays in `scaler`.
# NOTE(review): the scaler is fitted on every row of df_train, i.e. before the
# train/validation split further down, so validation rows influence the
# scaling range — confirm this is acceptable.
scaler = MinMaxScaler()

df_train['amount_normalized'] = scaler.fit_transform(df_train[['amount']])

# Persist the fitted scaler next to the label encoder: new data has to be
# scaled with exactly these min/max values before it is fed to the model.
joblib.dump(scaler, 'amount_scaler.pkl')

df_train[['amount', 'amount_normalized']].head()

Unnamed: 0,amount,amount_normalized
0,15300.0,3e-05
1,40200.0,7.9e-05
2,1440.0,2e-06
3,240000000.0,0.479999
4,1360000.0,0.002719


## Объединение данных

In [None]:
import torch.nn as nn
import numpy as np

# Output width of the amount projection (passed as `output_dim` to
# AmountProcessor below). This is not the text-embedding width: the
# embedding tensor printed earlier in this notebook has 312 columns.
hidden_size = 768

class AmountProcessor(nn.Module):
    """Project an input through a single linear layer followed by ReLU.

    Args:
        input_dim: Size of the last dimension of the input.
        output_dim: Size of the last dimension of the output.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Return ``ReLU(linear(x))``; all outputs are non-negative."""
        return self.activation(self.linear(x))

# The projection below is never trained in this notebook: it is a fixed,
# randomly initialised linear+ReLU map. Seed the RNG so it is the same on
# every run.
torch.manual_seed(42)
amount_processor = AmountProcessor(input_dim=1, output_dim=hidden_size)

text_embeddings_array = np.stack(df_train['text_embed'].values)  # [num_samples, text_embed_dim]
amount_normalized_array = df_train['amount_normalized'].values.reshape(-1, 1)  # [num_samples, 1]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
amount_processor = amount_processor.to(device)
amount_processor.eval()

# Persist the projection weights: the classifier is trained on features
# produced by exactly these weights, so they are needed to build features
# for new data.
torch.save(amount_processor.state_dict(), 'amount_processor.pth')

text_embeddings_tensor = torch.tensor(text_embeddings_array, dtype=torch.float32).to(device)
amount_normalized_tensor = torch.tensor(amount_normalized_array, dtype=torch.float32).to(device)

# Pure feature extraction: no autograd graph is needed.
with torch.no_grad():
    processed_amount = amount_processor(amount_normalized_tensor)  # [num_samples, hidden_size]

# Text embedding columns first, projected amount columns after them.
combined_features = torch.cat((text_embeddings_tensor, processed_amount), dim=1)  # [num_samples, text_embed_dim + hidden_size]

print("Размер объединённых признаков:", combined_features.shape)


Размер объединённых признаков: torch.Size([14209, 1080])


In [None]:
# Detach from autograd, bring the matrix to host memory and convert it to
# nested Python lists (one list of floats per sample).
combined_features_cpu = combined_features.detach().cpu().numpy()
combined_features_list = combined_features_cpu.tolist()

# One combined feature vector per row of the training frame.
df_train['combined_features'] = combined_features_list

df_train[['preprocessed_text', 'text_embed', 'amount_normalized', 'combined_features']].head(1)

Unnamed: 0,preprocessed_text,text_embed,amount_normalized,combined_features
0,участие конференция майкоп договор,"[-0.3839216, 0.029149303, 0.041437216, 0.09084...",3e-05,"[-0.3839215934276581, 0.02914930321276188, 0.0..."


## Создание обучающей архитектуры

### 1. Подготовка данных

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib

In [None]:
# Feature matrix (one row per sample) and the raw string labels.
X = np.stack(df_train['combined_features'].values)
y = df_train['category'].values

# Map category names to integer class ids and keep the fitted encoder on
# disk so the ids can be decoded later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

joblib.dump(label_encoder, 'label_encoder.pkl')

# Hold out 15% of the samples for validation; the split is reproducible
# through the fixed random_state.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.15,
    random_state=42,
)

# Features become float32 tensors, class ids become int64 tensors.
X_train_tensor, X_val_tensor = (
    torch.tensor(features, dtype=torch.float32) for features in (X_train, X_val)
)
y_train_tensor, y_val_tensor = (
    torch.tensor(targets, dtype=torch.long) for targets in (y_train, y_val)
)

print("Размер тренировочных данных:", X_train_tensor.shape, y_train_tensor.shape)
print("Размер валидационных данных:", X_val_tensor.shape, y_val_tensor.shape)

Размер тренировочных данных: torch.Size([12077, 1080]) torch.Size([12077])
Размер валидационных данных: torch.Size([2132, 1080]) torch.Size([2132])


In [None]:
# Category name -> integer class id, in the order the encoder stores its classes.
labels_mapping = {label: index for index, label in enumerate(label_encoder.classes_)}
print(labels_mapping.keys())

dict_keys(['BANK_SERVICE', 'FOOD_GOODS', 'LEASING', 'LOAN', 'NON_FOOD_GOODS', 'NOT_CLASSIFIED', 'REALE_STATE', 'SERVICE', 'TAX'])


### 2. Настройка модели классификации

In [None]:
class ClassificationModel(nn.Module):
    """Feed-forward classifier: repeated Linear -> ReLU -> Dropout, then a Linear head.

    Args:
        input_dim: Number of input features.
        hidden_dims: Iterable with the width of each hidden layer, in order.
            May be empty, in which case the model is a single linear layer
            from ``input_dim`` to ``num_classes``.
        num_classes: Number of output logits.
        dropout_prob: Dropout probability applied after every hidden ReLU.
    """

    def __init__(self, input_dim, hidden_dims, num_classes, dropout_prob=0.3):
        super().__init__()
        layers = []

        # Width of whatever feeds the next layer; starts at the input width.
        in_features = input_dim
        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(in_features, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_prob))
            in_features = hidden_dim

        # Use the running width rather than hidden_dims[-1], so an empty
        # hidden_dims (or a non-indexable iterable) still builds a valid head.
        layers.append(nn.Linear(in_features, num_classes))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return raw class logits with shape ``[..., num_classes]``."""
        return self.net(x)

In [None]:
import torch.optim as optim

# Seed PyTorch's global RNG so that weight initialisation (and everything
# that later draws from it by default, such as DataLoader shuffling and
# dropout) is repeatable when the notebook is re-run on the same setup.
torch.manual_seed(42)

# Network shape: input width comes from the data, output width from the
# number of encoded classes.
input_dim = X_train_tensor.shape[1]
hidden_dims = [1024, 512, 256]
num_classes = len(label_encoder.classes_)

model = ClassificationModel(input_dim=input_dim, hidden_dims=hidden_dims, num_classes=num_classes)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Multi-class classification on raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

print(model)
print(f"Функция потерь: {criterion}")
print(f"Оптимизатор: {optimizer}")

ClassificationModel(
  (net): Sequential(
    (0): Linear(in_features=1080, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=1024, out_features=512, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=512, out_features=256, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.3, inplace=False)
    (9): Linear(in_features=256, out_features=9, bias=True)
  )
)
Функция потерь: CrossEntropyLoss()
Оптимизатор: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)


### 3. Настройка DataLoader

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Training split: mini-batches of 32, reshuffled on every pass.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)

# Validation split: batches of 64 in a fixed order.
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64)

print(f"Количество батчей в обучении: {len(train_loader)}")
print(f"Количество батчей валидации: {len(val_loader)}")

Количество батчей в обучении: 378
Количество батчей валидации: 34


### 4. Реализация цикла обучения

In [None]:
def train_one_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module mapping a feature batch to class logits.
        train_loader: Iterable of ``(X_batch, y_batch)`` tensor pairs.
        criterion: Loss callable taking ``(logits, targets)``. Its
            ``reduction`` attribute is honoured when it is ``'mean'`` or
            ``'sum'``; a callable without that attribute is assumed to
            return a batch mean.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(avg_loss, accuracy)``: the loss averaged over *samples* and the
        fraction of samples whose arg-max prediction equals the target.
        Both are measured while the weights are still being updated.

    Raises:
        ZeroDivisionError: If ``train_loader`` yields no samples.
    """
    model.train()
    # A mean-reduced loss is a per-batch average. Averaging those averages
    # over-weights a short final batch, so each one is scaled back to a
    # batch sum and the total is divided by the number of samples.
    mean_reduced = getattr(criterion, 'reduction', 'mean') == 'mean'
    total_loss = 0.0
    correct = 0
    total = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        batch_size = y_batch.size(0)

        optimizer.zero_grad()

        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        loss.backward()
        optimizer.step()

        total_loss += (loss.item() * batch_size) if mean_reduced else loss.item()
        # Predicted class = index of the largest logit.
        _, predicted = outputs.max(1)
        correct += predicted.eq(y_batch).sum().item()
        total += batch_size

    avg_loss = total_loss / total
    accuracy = correct / total
    return avg_loss, accuracy

def validate(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` without updating its weights.

    Args:
        model: Module mapping a feature batch to class logits.
        val_loader: Iterable of ``(X_batch, y_batch)`` tensor pairs.
        criterion: Loss callable taking ``(logits, targets)``. Its
            ``reduction`` attribute is honoured when it is ``'mean'`` or
            ``'sum'``; a callable without that attribute is assumed to
            return a batch mean.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(avg_loss, accuracy)``: the loss averaged over *samples* and the
        fraction of samples whose arg-max prediction equals the target.

    Raises:
        ZeroDivisionError: If ``val_loader`` yields no samples.
    """
    model.eval()
    # A mean-reduced loss is a per-batch average. Averaging those averages
    # over-weights a short final batch, so each one is scaled back to a
    # batch sum and the total is divided by the number of samples.
    mean_reduced = getattr(criterion, 'reduction', 'mean') == 'mean'
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            batch_size = y_batch.size(0)

            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            total_loss += (loss.item() * batch_size) if mean_reduced else loss.item()
            # Predicted class = index of the largest logit.
            _, predicted = outputs.max(1)
            correct += predicted.eq(y_batch).sum().item()
            total += batch_size

    avg_loss = total_loss / total
    accuracy = correct / total
    return avg_loss, accuracy

### 5. Обучение модели

In [None]:
# Number of full passes over the training data.
num_epochs = 10

# Each epoch: one training pass, then an evaluation on the validation split.
# The printed metrics are, in order: loss and accuracy on the training data,
# then loss and accuracy on the validation data.
for epoch in range(num_epochs):
    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = validate(model, val_loader, criterion, device)

    print(f"Эпоха {epoch+1}/{num_epochs}")
    print(f"  Обучение    - Потери: {train_loss:.4f}, Точность: {train_acc:.4f}")
    print(f"  Валидация   - Потери: {val_loss:.4f}, Точность: {val_acc:.4f}")

Эпоха 1/10
  Обучение    - Потери: 0.8739, Точность: 0.6759
  Валидация   - Потери: 0.4116, Точность: 0.8640
Эпоха 2/10
  Обучение    - Потери: 0.3682, Точность: 0.8709
  Валидация   - Потери: 0.2683, Точность: 0.9142
Эпоха 3/10
  Обучение    - Потери: 0.2895, Точность: 0.8972
  Валидация   - Потери: 0.2489, Точность: 0.9259
Эпоха 4/10
  Обучение    - Потери: 0.2491, Точность: 0.9131
  Валидация   - Потери: 0.2254, Точность: 0.9296
Эпоха 5/10
  Обучение    - Потери: 0.2448, Точность: 0.9161
  Валидация   - Потери: 0.1975, Точность: 0.9301
Эпоха 6/10
  Обучение    - Потери: 0.2311, Точность: 0.9184
  Валидация   - Потери: 0.1874, Точность: 0.9367
Эпоха 7/10
  Обучение    - Потери: 0.2265, Точность: 0.9223
  Валидация   - Потери: 0.2428, Точность: 0.9071
Эпоха 8/10
  Обучение    - Потери: 0.2297, Точность: 0.9221
  Валидация   - Потери: 0.2094, Точность: 0.9235
Эпоха 9/10
  Обучение    - Потери: 0.2201, Точность: 0.9222
  Валидация   - Потери: 0.2168, Точность: 0.9278
Эпоха 10/10
  Обуче

In [None]:
# Save only the model parameters (the state dict); loading them requires
# constructing a ClassificationModel with the same layer sizes first.
torch.save(model.state_dict(), 'model.pth')

# Alternative (disabled): save the whole model object, architecture and parameters together.
# torch.save(model, 'model_full.pth')