In [None]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


# Разбиение по датасетам

In [None]:
import re
import emoji
import pandas as pd

def clean_text(text):
    """Normalise a review: lowercase it, blank out emoji and punctuation, squeeze whitespace.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned string with single spaces between tokens and no
        leading/trailing whitespace.
    """
    cleaned = emoji.replace_emoji(str(text).lower(), replace=' ')
    # First pass turns runs of punctuation/control characters into a space,
    # second pass collapses whatever whitespace runs are left.
    for pattern in (r'[!"№;%:?*()\-=\/\\|@#$^&{}\[\]\'.,~`\n\r\t]+', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.strip()

def is_english(text):
    """Return True when ``text`` is non-empty and made up only of ASCII letters, digits and whitespace."""
    ascii_only = re.fullmatch(r"[A-Za-z0-9\s]+", text)
    return ascii_only is not None

def count_by_rating(df, name):
    """Print ``name`` on its own line (preceded by a blank line) and then the
    number of rows per value of ``df['rating']``, ordered by rating."""
    print(f"\n{name}")
    per_rating = df['rating'].value_counts()
    print(per_rating.sort_index())

def remove_duplicates_between_datasets(df1, df2):
    """Drop from both frames every row whose 'text' occurs in both of them.

    Args:
        df1: Frame with a 'text' column.
        df2: Frame with a 'text' column.

    Returns:
        Tuple ``(df1, df2)`` of filtered frames; the inputs are not modified.
    """
    # A df1 text is "shared" exactly when it also appears in df2.
    in_both = df1['text'].isin(df2['text'])
    shared_texts = df1.loc[in_both, 'text']
    only_in_first = df1[~in_both]
    only_in_second = df2[~df2['text'].isin(shared_texts)]
    return only_in_first, only_in_second

# Load RuReviews: the file is read line by line and the last whitespace-separated
# token of each line is treated as the sentiment label.
file_path_ru = "/content/drive/MyDrive/Датасеты/RuReviews.csv"
# Textual label -> numeric rating (the same 1/3/5 values the frames are filtered by below).
# NOTE(review): "neautral" looks like a typo, but rating 3 is populated in the
# counts printed later, so it presumably matches the spelling in the data file —
# confirm before changing it.
labels = {"positive": 5, "negative": 1, "neautral": 3}
data_ru = []

with open(file_path_ru, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        words = line.split()
        # Lines that are empty or do not end with a known label are skipped silently.
        if words and words[-1] in labels:
            rating = labels[words[-1]]
            # Everything before the label is the review text (whitespace runs become single spaces).
            text = " ".join(words[:-1])
            data_ru.append([rating, text])

ru = pd.DataFrame(data_ru, columns=["rating", "text"])
# Normalise the text, then filter: drop empty reviews, reviews consisting only of
# ASCII letters/digits/whitespace, and reviews with fewer than two words.
ru["text"] = ru["text"].astype(str).apply(clean_text)
ru = ru[ru["text"].str.strip() != ""]
ru = ru[~ru["text"].apply(is_english)]
ru = ru[ru["text"].str.split().str.len() > 1]
# Keep only the first occurrence of each cleaned text.
ru = ru.drop_duplicates(subset='text', keep='first')

# Load the WB reviews; the CSV is expected to provide 'rating' and 'text' columns.
wb = pd.read_csv('/content/drive/MyDrive/Датасеты/WB.csv')
# Drop incomplete rows BEFORE the string cast: astype(str) turns a missing text
# into the literal "nan", after which dropna can no longer detect it.
# .copy() gives an independent frame so the column assignment below is unambiguous.
wb = wb.dropna(subset=['rating', 'text']).copy()
wb['text'] = wb['text'].astype(str).apply(clean_text)
# Drop reviews consisting only of ASCII letters/digits/whitespace.
wb = wb[~wb["text"].apply(is_english)]
# Keep reviews with at least four words (stricter than the two-word minimum used for RuReviews).
wb = wb[wb["text"].str.split().str.len() > 3]
# Keep only the first occurrence of each cleaned text.
wb = wb.drop_duplicates(subset='text', keep='first')

# Remove every review text that appears in both corpora (from both of them), so
# no split built below can contain the same text from two sources.
ru, wb = remove_duplicates_between_datasets(ru, wb)

# Class balance of each corpus after cleaning and de-duplication.
count_by_rating(ru, "RuReviews после 1 очистки")
count_by_rating(wb, "WB после 1 очистки")
###########################################################################################
# Test set 1 (RuReviews only): the first 7000 rows of each rating class, taken
# in frame order (head), not sampled at random.
test_pos = ru[ru["rating"] == 5].head(7000)
test_neu = ru[ru["rating"] == 3].head(7000)
test_neg = ru[ru["rating"] == 1].head(7000)

test_df1 = pd.concat([test_pos, test_neu, test_neg], ignore_index=True)
# Shuffle with a fixed seed so the written file is reproducible.
test_df1 = test_df1.sample(frac=1, random_state=42).reset_index(drop=True)
test_df1 = test_df1.drop_duplicates(subset='text', keep='first')
test_df1.to_csv("/content/drive/MyDrive/Датасеты/RuReviews_test_21000.csv", index=False)

# Remove the selected rows from the pool (by original index label) so later
# splits cannot reuse them.
ru = ru.drop(test_pos.index)
ru = ru.drop(test_neu.index)
ru = ru.drop(test_neg.index)
count_by_rating(ru, "RuReviews после 2 очистки")
###########################################################################################
# Test set 2 (mixed): the first 3500 rows per rating class from each of the two
# corpora, i.e. up to 21000 rows in total.
test_pos_ru = ru[ru["rating"] == 5].head(3500)
test_neu_ru = ru[ru["rating"] == 3].head(3500)
test_neg_ru = ru[ru["rating"] == 1].head(3500)
test_pos_wb = wb[wb["rating"] == 5].head(3500)
test_neu_wb = wb[wb["rating"] == 3].head(3500)
test_neg_wb = wb[wb["rating"] == 1].head(3500)

test_df2 = pd.concat([
    test_pos_ru, test_neu_ru, test_neg_ru,
    test_pos_wb, test_neu_wb, test_neg_wb
], ignore_index=True)

# Shuffle with a fixed seed so the written file is reproducible.
test_df2 = test_df2.sample(frac=1, random_state=42).reset_index(drop=True)
test_df2 = test_df2.drop_duplicates(subset='text', keep='first')
test_df2.to_csv("/content/drive/MyDrive/Датасеты/RuWB_test_21000.csv", index=False)
# Remove the selected rows from both pools so later splits cannot reuse them.
wb = wb.drop(test_pos_wb.index)
wb = wb.drop(test_neu_wb.index)
wb = wb.drop(test_neg_wb.index)
ru = ru.drop(test_pos_ru.index)
ru = ru.drop(test_neu_ru.index)
ru = ru.drop(test_neg_ru.index)
count_by_rating(ru, "RuReviews после 3 очистки")
count_by_rating(wb, "WB после 2 очистки")
###########################################################################################
# Validation set (mixed): built the same way as the mixed test set — the first
# 3500 rows per rating class from each corpus, taken from what is left in the pools.
val_pos_ru = ru[ru["rating"] == 5].head(3500)
val_neu_ru = ru[ru["rating"] == 3].head(3500)
val_neg_ru = ru[ru["rating"] == 1].head(3500)
val_pos_wb = wb[wb["rating"] == 5].head(3500)
val_neu_wb = wb[wb["rating"] == 3].head(3500)
val_neg_wb = wb[wb["rating"] == 1].head(3500)

val_df = pd.concat([
    val_pos_ru, val_neu_ru, val_neg_ru,
    val_pos_wb, val_neu_wb, val_neg_wb
], ignore_index=True)

# Shuffle with a fixed seed so the written file is reproducible.
val_df = val_df.sample(frac=1, random_state=42).reset_index(drop=True)
val_df = val_df.drop_duplicates(subset='text', keep='first')
val_df.to_csv("/content/drive/MyDrive/Датасеты/RuWB_validation_21000.csv", index=False)
# Remove the selected rows from both pools so the training sets cannot reuse them.
wb = wb.drop(val_pos_wb.index)
wb = wb.drop(val_neu_wb.index)
wb = wb.drop(val_neg_wb.index)
ru = ru.drop(val_pos_ru.index)
ru = ru.drop(val_neu_ru.index)
ru = ru.drop(val_neg_ru.index)
count_by_rating(ru, "RuReviews после 4 очистки")
count_by_rating(wb, "WB после 3 очистки")
###########################################################################################
# Balanced training set: every remaining RuReviews row, topped up with WB rows
# so that each rating class targets TRAIN_PER_CLASS rows.
TRAIN_PER_CLASS = 56000

train_pos_ru = ru[ru["rating"] == 5]
train_neu_ru = ru[ru["rating"] == 3]
train_neg_ru = ru[ru["rating"] == 1]

# Clamp at zero: DataFrame.head(n) with a negative n returns all rows EXCEPT the
# last |n|, so an over-full RuReviews class would otherwise pull in almost the
# whole WB class instead of nothing.
pos_deficit = max(0, TRAIN_PER_CLASS - len(train_pos_ru))
neu_deficit = max(0, TRAIN_PER_CLASS - len(train_neu_ru))
neg_deficit = max(0, TRAIN_PER_CLASS - len(train_neg_ru))

train_pos_wb = wb[wb["rating"] == 5].head(pos_deficit)
train_neu_wb = wb[wb["rating"] == 3].head(neu_deficit)
train_neg_wb = wb[wb["rating"] == 1].head(neg_deficit)

train_df4 = pd.concat([
    train_pos_ru, train_neu_ru, train_neg_ru,
    train_pos_wb, train_neu_wb, train_neg_wb
], ignore_index=True)

# Shuffle with a fixed seed so the written file is reproducible.
train_df4 = train_df4.sample(frac=1, random_state=42).reset_index(drop=True)
train_df4 = train_df4.drop_duplicates(subset='text', keep='first')
train_df4.to_csv("/content/drive/MyDrive/Датасеты/RuWB_train_balanced.csv", index=False)
# Remove the used rows from both pools; RuReviews is fully consumed at this point.
wb = wb.drop(train_pos_wb.index)
wb = wb.drop(train_neu_wb.index)
wb = wb.drop(train_neg_wb.index)
ru = ru.drop(train_pos_ru.index)
ru = ru.drop(train_neu_ru.index)
ru = ru.drop(train_neg_ru.index)
count_by_rating(ru, "RuReviews после 5 очистки")
count_by_rating(wb, "WB после 4 очистки")
###########################################################################################
# Imbalanced training variants: each variant extends the previous one with more
# neutral (rating 3) WB reviews. The cumulative extras are 19600 / 30800 / 42000 /
# 56000 rows, i.e. 35% / 55% / 75% / 100% of 56000 (the per-class size used for
# the balanced set) — presumably what the 35/55/75/100 file-name suffixes mean.
def _extend_with_neutral(base_df, pool, n_extra, out_path):
    """Append neutral reviews from ``pool`` to ``base_df`` and write the result to CSV.

    Args:
        base_df: Training frame to extend (left unmodified).
        pool: Frame of still-unused reviews with 'rating' and 'text' columns.
        n_extra: Number of rating-3 rows to take from the top of ``pool``.
        out_path: Destination CSV path.

    Returns:
        Tuple ``(extended, added, remaining_pool)``: the shuffled, de-duplicated
        training frame, the rows that were added, and ``pool`` without those rows.
    """
    added = pool[pool["rating"] == 3].head(n_extra)
    extended = pd.concat([base_df, added], ignore_index=True)
    # Fixed seed keeps the row order of the written file reproducible.
    extended = extended.sample(frac=1, random_state=42).reset_index(drop=True)
    extended = extended.drop_duplicates(subset='text', keep='first')
    extended.to_csv(out_path, index=False)
    return extended, added, pool.drop(added.index)


train_df_35, neutral_add_35, wb = _extend_with_neutral(
    train_df4, wb, 19600, "/content/drive/MyDrive/Датасеты/RuWB_train_35.csv")
count_by_rating(wb, "WB после 5 очистки")
###########################################################################################
train_df_55, neutral_add_55, wb = _extend_with_neutral(
    train_df_35, wb, 11200, "/content/drive/MyDrive/Датасеты/RuWB_train_55.csv")
count_by_rating(wb, "WB после 6 очистки")
###########################################################################################
train_df_75, neutral_add_75, wb = _extend_with_neutral(
    train_df_55, wb, 11200, "/content/drive/MyDrive/Датасеты/RuWB_train_75.csv")
count_by_rating(wb, "WB после 7 очистки")
###########################################################################################
train_df_100, neutral_add_100, wb = _extend_with_neutral(
    train_df_75, wb, 14000, "/content/drive/MyDrive/Датасеты/RuWB_train_100.csv")
count_by_rating(wb, "WB после 8 очистки")
###########################################################################################
# Final overview: class distribution of every dataset written above, labelled by file name.
print('\n Распределение отзывов по классам в получившихся датасетах \n')
for _split_frame, _split_file in (
    (test_df1, "RuReviews_test_21000.csv"),
    (test_df2, "RuWB_test_21000.csv"),
    (val_df, "RuWB_validation_21000.csv"),
    (train_df4, "RuWB_train_balanced.csv"),
    (train_df_35, "RuWB_train_35.csv"),
    (train_df_55, "RuWB_train_55.csv"),
    (train_df_75, "RuWB_train_75.csv"),
    (train_df_100, "RuWB_train_100.csv"),
):
    count_by_rating(_split_frame, _split_file)


RuReviews после 1 очистки
rating
1    27724
3    27407
5    27797
Name: count, dtype: int64

WB после 1 очистки
rating
1    116972
3    120905
5    106372
Name: count, dtype: int64

RuReviews после 2 очистки
rating
1    20724
3    20407
5    20797
Name: count, dtype: int64

RuReviews после 3 очистки
rating
1    17224
3    16907
5    17297
Name: count, dtype: int64

WB после 2 очистки
rating
1    113472
3    117405
5    102872
Name: count, dtype: int64

RuReviews после 4 очистки
rating
1    13724
3    13407
5    13797
Name: count, dtype: int64

WB после 3 очистки
rating
1    109972
3    113905
5     99372
Name: count, dtype: int64

RuReviews после 5 очистки
Series([], Name: count, dtype: int64)

WB после 4 очистки
rating
1    67696
3    71312
5    57169
Name: count, dtype: int64

WB после 5 очистки
rating
1    67696
3    51712
5    57169
Name: count, dtype: int64

WB после 6 очистки
rating
1    67696
3    40512
5    57169
Name: count, dtype: int64

WB после 7 очистки
rating
1    67696


# Проверка на наличие одинаковых отзывов

In [None]:
import pandas as pd

# Reload the splits from the CSV files written by the splitting cell.
# Only the balanced training file is loaded here; the RuWB_train_35/55/75/100
# variants are not part of this check.
train = pd.read_csv('/content/drive/MyDrive/Датасеты/RuWB_train_balanced.csv')
val = pd.read_csv('/content/drive/MyDrive/Датасеты/RuWB_validation_21000.csv')
test1 = pd.read_csv('/content/drive/MyDrive/Датасеты/RuReviews_test_21000.csv')
test2 = pd.read_csv('/content/drive/MyDrive/Датасеты/RuWB_test_21000.csv')

def check(df, dataset_name):
    """Report rows of ``df`` whose 'text' value occurs more than once.

    Args:
        df: Frame with 'text' and 'rating' columns.
        dataset_name: Label used in the printed message.

    Returns:
        Number of rows that take part in a duplicate group (every copy is
        counted, not only the extra ones).
    """
    repeated_mask = df.duplicated(subset=['text'], keep=False)
    repeated_rows = df.loc[repeated_mask]
    n_repeated = len(repeated_rows)
    if n_repeated == 0:
        print(f"Дубликатов не найдено в {dataset_name}.")
        return n_repeated
    print(f"Найдено {n_repeated} дубликатов в {dataset_name}.")
    print(repeated_rows[['text', 'rating']])
    return n_repeated

# Per-split check: duplicates inside each file on its own.
num_for_train, num_for_val, num_for_test1, num_for_test2 = (
    check(split_df, split_name)
    for split_df, split_name in (
        (train, "train"),
        (val, "val"),
        (test1, "test1"),
        (test2, "test2"),
    )
)

# Cross-split check: a text shared by any two files shows up as a duplicate
# once all four frames are stacked.
combined_df = pd.concat([train, val, test1, test2], ignore_index=True)
duplicates = combined_df[combined_df.duplicated(subset='text', keep=False)]

if not duplicates.empty:
    print(f"Общее количество дубликатов: {len(duplicates)}")
    print(duplicates)
else:
    print("Нет дубликатов в объединенном датасете")


Дубликатов не найдено в train.
Дубликатов не найдено в val.
Дубликатов не найдено в test1.
Дубликатов не найдено в test2.
Нет дубликатов в объединенном датасете


# Обучение

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, f1_score
from torch.utils.data import Dataset

class RuWBDataset(Dataset):
    """Map-style dataset that tokenizes one review per access.

    Args:
        texts: Indexable collection of review strings.
        labels: Indexable collection of integer class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding=..., truncation=...,
            max_length=..., return_tensors="pt")``; it must return a mapping with
            ``input_ids`` and ``attention_mask`` tensors that have a leading
            batch dimension of size 1.
        max_length: Length every encoded sequence is padded or truncated to.
            Defaults to 512, the value that used to be hard-coded.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize review ``idx`` and return it together with its label.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (batch dimension
            removed) and ``labels`` as a ``torch.long`` scalar tensor.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            # Fixed-length padding keeps every item the same size, so items can
            # be stacked into a batch without a padding collator.
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            # return_tensors="pt" yields a (1, max_length) batch; drop the batch axis.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Load the splits. This run trains on the "55" training variant; validation and
# both test sets are the fixed files written by the splitting cell.
train_df = pd.read_csv('/content/drive/MyDrive/Датасеты/RuWB_train_55.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Датасеты/RuWB_validation_21000.csv')
test_ru_df = pd.read_csv('/content/drive/MyDrive/Датасеты/RuReviews_test_21000.csv')
test_wb_df = pd.read_csv('/content/drive/MyDrive/Датасеты/RuWB_test_21000.csv')
# Ratings 1/3/5 -> contiguous class ids 0/1/2.
# NOTE(review): a rating outside this mapping becomes NaN after .map — assumes
# the files contain only 1, 3 and 5; confirm.
label_mapping = {1: 0, 3: 1, 5: 2}
train_df['label'] = train_df['rating'].map(label_mapping)
val_df['label'] = val_df['rating'].map(label_mapping)
test_ru_df['label'] = test_ru_df['rating'].map(label_mapping)
test_wb_df['label'] = test_wb_df['rating'].map(label_mapping)
# Plain Python lists of texts and numpy arrays of labels for each split.
train_texts = train_df['text'].tolist()
y_train = train_df['label'].values
val_texts = val_df['text'].tolist()
y_val = val_df['label'].values
test_ru_texts = test_ru_df['text'].tolist()
y_ru_test = test_ru_df['label'].values
test_wb_texts = test_wb_df['text'].tolist()
y_wb_test = test_wb_df['label'].values

# Pretrained checkpoint with a 3-way classification head; moved to the GPU when one is available.
tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruBert-base")
model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruBert-base", num_labels=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Wrap every split in the tokenizing dataset defined above.
train_dataset = RuWBDataset(train_texts, y_train, tokenizer)
val_dataset = RuWBDataset(val_texts, y_val, tokenizer)
test_ru_dataset = RuWBDataset(test_ru_texts, y_ru_test, tokenizer)
test_wb_dataset = RuWBDataset(test_wb_texts, y_wb_test, tokenizer)

def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        Dict with the single key ``"macro_f1"``.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    return {"macro_f1": f1_score(gold, predicted, average='macro')}

# Training configuration: evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is selected by the "macro_f1" value returned
# from compute_metrics.
training_args = TrainingArguments(
    output_dir="./RuBERT_last",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.1,
    logging_dir="./logs",
    load_best_model_at_end=True,
    report_to="none",
    metric_for_best_model="macro_f1"
)

# The validation split is used for the per-epoch evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

# Persist the model and its tokenizer side by side; the "55" in the directory
# name matches the training file variant loaded above.
trainer.save_model("/content/drive/MyDrive/RuBERT_55")
tokenizer.save_pretrained("/content/drive/MyDrive/RuBERT_55")

# Evaluate on every split: predict, take the argmax class, then print the
# per-class report and the macro-F1 as a percentage rounded to two decimals.
# Note that the first pass runs inference over the entire training set.
train_predictions = trainer.predict(train_dataset)
y_train_pred = train_predictions.predictions.argmax(axis=1)
print(classification_report(y_train, y_train_pred))
print("F1 (Train):", round(f1_score(y_train, y_train_pred, average='macro')*100, 2))

val_predictions = trainer.predict(val_dataset)
y_val_pred = val_predictions.predictions.argmax(axis=1)
print(classification_report(y_val, y_val_pred))
print("F1 (Validation):", round(f1_score(y_val, y_val_pred, average='macro')*100, 2))

# NOTE(review): "RuReviwes" in the label below is a typo for "RuReviews"; it is
# left unchanged here because it is runtime output.
test_ru_predictions = trainer.predict(test_ru_dataset)
y_test_ru_pred = test_ru_predictions.predictions.argmax(axis=1)
print(classification_report(y_ru_test, y_test_ru_pred))
print("F1 (RuReviwes test):", round(f1_score(y_ru_test, y_test_ru_pred, average='macro')*100, 2))

test_wb_predictions = trainer.predict(test_wb_dataset)
y_test_wb_pred = test_wb_predictions.predictions.argmax(axis=1)
print(classification_report(y_wb_test, y_test_wb_pred))
print("F1 (WB test):", round(f1_score(y_wb_test, y_test_wb_pred, average='macro')*100, 2))