# In [None]:
import os
import re
import sqlite3

import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

# Resources needed by the preprocessing helpers: sentence/word tokenizer
# models and the stop-word lists.
# Recent NLTK releases make word_tokenize look up 'punkt_tab' instead of
# 'punkt', so both are fetched to work across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Hyperparameters
model_name = 'DeepPavlov/rubert-base-cased'
batch_size = 16
epochs = 3
learning_rate = 2e-5
max_length = 200  # maximum sequence length handed to the tokenizer

# Prefer the GPU when one is available, otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device.type)

# Load the pretrained tokenizer and the classification model, then move the
# model to the selected device.
# NOTE(review): the classification head is hard-coded to 100 outputs; confirm
# this covers the number of distinct labels in the training data.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=100).to(device)

# Stop-word sets (sets give O(1) membership tests in remove_stop_words)
russian_stopwords, english_stopwords = (
    set(stopwords.words(language)) for language in ('russian', 'english')
)

# Text preprocessing helpers
def clean_text_from_unnecessary_symbols(text):
    """Drop everything except Cyrillic/Latin letters and whitespace.

    Digits, punctuation and any other symbols are deleted (not replaced by a
    space), then runs of whitespace are collapsed to a single space and the
    result is stripped.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    # 'ё' and 'Ё' lie outside the contiguous а-я / А-Я code-point ranges, so
    # they must be listed explicitly or they would be stripped from words.
    text = re.sub(r'[^а-яА-ЯёЁa-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def remove_urls(text):
    """Return ``text`` with http(s):// links and www.-prefixed links removed.

    Each match (up to the next whitespace character) is replaced by an empty
    string; surrounding whitespace is left untouched.
    """
    return re.compile(r'https?://\S+|www\.\S+').sub('', text)

def remove_stop_words(text):
    """Tokenize ``text`` with NLTK and drop Russian and English stop words.

    Tokens are compared case-insensitively against both stop-word sets; the
    surviving tokens keep their original case and are joined with single
    spaces.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        lowered = token.lower()
        if lowered in russian_stopwords or lowered in english_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def clean_text_for_model(text):
    """Run the full preprocessing pipeline on one text.

    Steps, in order: strip URLs, drop stop words, then remove every
    non-letter symbol and normalize whitespace.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    # URLs must be removed first: tokenizing during stop-word removal splits
    # "https://host/path" into separate tokens, after which the URL pattern
    # no longer matches and URL fragments leak into the cleaned text.
    text = remove_urls(text)
    text = remove_stop_words(text)
    text = clean_text_from_unnecessary_symbols(text)
    return text

# Load and preprocess the data
conn = sqlite3.connect('yt_videos_for_train_labeled.db')
try:
    videos_df = pd.read_sql('SELECT * FROM videos', conn)
finally:
    # Release the database handle as soon as the table is in memory.
    conn.close()

# Missing titles/descriptions become empty strings; without fillna, astype(str)
# would inject the literal words "None"/"nan" into the training text.
videos_df['text'] = (
    videos_df['video_title'].fillna('').astype(str)
    + " "
    + videos_df['video_description'].fillna('').astype(str)
)
videos_df['text'] = videos_df['text'].apply(clean_text_for_model)

X = videos_df['text']
y = videos_df['profession']

# Encode the labels, then split the data.
# The encoder is fitted on the full label column before splitting so that a
# label occurring only in the validation split cannot make transform() fail.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(y)

# Fail early with a clear message instead of an out-of-range target error
# inside the loss computation.
if len(label_encoder.classes_) > model.config.num_labels:
    raise ValueError(
        f"Found {len(label_encoder.classes_)} classes but the model head has only "
        f"{model.config.num_labels} outputs"
    )

train_texts, val_texts, train_labels, val_labels = train_test_split(
    X, encoded_labels, test_size=0.2, random_state=42
)

# Tokenize the texts
def encode_texts(texts):
    """Tokenize texts into fixed-length tensors for the model.

    Every text is padded or truncated to ``max_length`` tokens (special
    tokens included).

    Args:
        texts: Iterable of strings.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of tensors, one row per
        input text.
    """
    # A single batched call replaces the per-text encode_plus loop.
    # padding='max_length' supersedes the deprecated pad_to_max_length flag,
    # and truncation is requested explicitly so over-long texts are cut to
    # max_length instead of relying on an implicit fallback.
    encoded = tokenizer(
        list(texts),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']

train_inputs, train_masks = encode_texts(train_texts)
val_inputs, val_masks = encode_texts(val_texts)

# Build the DataLoaders.
# Labels are converted to int64 explicitly: the classification loss requires
# long targets, and the integer width of a NumPy array is platform-dependent.
train_data = TensorDataset(train_inputs, train_masks, torch.tensor(train_labels, dtype=torch.long))
train_sampler = RandomSampler(train_data)  # reshuffle the training set every epoch
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(val_inputs, val_masks, torch.tensor(val_labels, dtype=torch.long))
validation_sampler = SequentialSampler(validation_data)  # fixed order for evaluation
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Optimizer and learning-rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set to that class's former defaults to keep the same
# optimization settings.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# One scheduler step is taken per batch, so the schedule spans all batches
# of all epochs; the rate decays linearly with no warmup.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Model training function
def train(model, train_dataloader):
    """Fine-tune ``model`` for ``epochs`` passes over ``train_dataloader``.

    Uses the module-level ``optimizer``, ``scheduler`` and ``device``. The
    scheduler is stepped once per batch, right after the optimizer.

    Args:
        model: Model to train; called with input ids, attention mask and labels.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
    """
    model.train()
    for _ in range(epochs):
        for input_ids, attention_mask, labels in train_dataloader:
            # Move the whole batch to the device the model lives on.
            input_ids, attention_mask, labels = (
                tensor.to(device) for tensor in (input_ids, attention_mask, labels)
            )

            # Clear gradients left over from the previous step.
            model.zero_grad()
            # Passing labels makes the model compute the loss itself.
            loss = model(
                input_ids,
                token_type_ids=None,
                attention_mask=attention_mask,
                labels=labels,
            ).loss
            loss.backward()
            optimizer.step()
            scheduler.step()

# Model evaluation function
def evaluate(model, validation_dataloader):
    """Evaluate ``model`` on the validation set.

    Args:
        model: Model to evaluate; called with input ids and attention mask.
        validation_dataloader: Yields ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        A tuple ``(accuracy, report)`` where ``accuracy`` is the result of
        ``accuracy_score`` and ``report`` is the text produced by
        ``classification_report``.
    """
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for b_input_ids, b_input_mask, b_labels in validation_dataloader:
            outputs = model(
                b_input_ids.to(device),
                token_type_ids=None,
                attention_mask=b_input_mask.to(device),
            )
            # The predicted class is the index of the largest logit.
            predictions.extend(outputs.logits.argmax(dim=1).cpu().tolist())
            # Labels are only compared on the host, so they never need to be
            # moved to the device.
            true_labels.extend(b_labels.tolist())

    # classification_report raises ValueError when target_names is longer
    # than the set of labels actually observed (e.g. a class missing from the
    # validation split), so the full label index range is passed explicitly.
    class_names = [str(name) for name in label_encoder.classes_]
    report = classification_report(
        true_labels,
        predictions,
        labels=list(range(len(class_names))),
        target_names=class_names,
        zero_division=0,  # classes with no samples/predictions score 0 without warnings
    )
    return accuracy_score(true_labels, predictions), report

# Train the model, then evaluate it on the validation split
train(model, train_dataloader)
accuracy, report = evaluate(model, validation_dataloader)
print("Accuracy:", accuracy)
print(report)

# Save the model weights and the tokenizer
model_path = 'text_model.pth'
tokenizer_path = 'tokenizer/'

torch.save(model.state_dict(), model_path)
# exist_ok=True replaces the separate exists() check and avoids the race
# between checking for the directory and creating it.
os.makedirs(tokenizer_path, exist_ok=True)
tokenizer.save_pretrained(tokenizer_path)
