In [None]:
!pip install razdel



In [None]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

import torch
import transformers
import torch.nn as nn
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from razdel import tokenize, sentenize
import copy

tqdm.pandas()

# Use the GPU when one is visible; otherwise fall back to the CPU so that the
# later `.to(device)` calls do not fail on CPU-only runtimes.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

ModuleNotFoundError: ignored

In [None]:
# Encoder and tokenizer are loaded from the same pretrained checkpoint.
PRETRAINED_CHECKPOINT = 'DeepPavlov/rubert-base-cased-sentence'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
bert = AutoModel.from_pretrained(PRETRAINED_CHECKPOINT)

In [None]:
# Load the reviews file: one "<id>\t<text>" record per line.
texts, ids = [], []
# Explicit UTF-8 so the Cyrillic text is decoded the same way on every platform.
with open('/content/train_reviews.txt', encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\r\n')
        if not line:
            # A blank (e.g. trailing) line carries no record.
            continue
        # Split on the first tab only, so a tab inside the review text
        # does not break the two-value unpacking.
        text_id, text = line.split('\t', 1)
        texts.append(text)
        ids.append(text_id)

In [None]:
# Hold out part of the reviews as a dev set; texts and ids are split in lockstep.
# random_state pins the shuffle so the split files written further down are
# the same on every Restart & Run All.
train_texts, dev_texts, train_ids, dev_ids = train_test_split(texts, ids, random_state=42)

In [None]:
# Route every line of the category-sentiment file to the train or dev list,
# according to which split the review id in its first column belongs to.
train_sentiment, dev_sentiment = [], []
# Sets give O(1) membership tests; the id lists would be scanned once per line.
train_id_set, dev_id_set = set(train_ids), set(dev_ids)
with open('/content/train_cats.txt', encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\r\n')
        text_id = line.split('\t', 1)[0]
        if text_id in train_id_set:
            train_sentiment.append(line)
        if text_id in dev_id_set:
            dev_sentiment.append(line)

In [None]:
# Route every line of the aspects file to the train or dev list, according to
# which split the review id in its first column belongs to.
# NOTE(review): this path is relative while the other data files are opened
# from /content/ — confirm the working directory is the same folder.
train_aspects, dev_aspects = [], []
# Sets give O(1) membership tests; the id lists would be scanned once per line.
train_id_set, dev_id_set = set(train_ids), set(dev_ids)
with open('train_aspects.txt', encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\r\n')
        text_id = line.split('\t', 1)[0]
        if text_id in train_id_set:
            train_aspects.append(line)
        if text_id in dev_id_set:
            dev_aspects.append(line)

In [None]:
def _write_tsv(path, header, rows):
    """Write `header` followed by one line per item of `rows` to `path`.

    Args:
        path: destination file; it is overwritten.
        header: header line, without the trailing newline.
        rows: iterable of already tab-joined lines, without trailing newlines.
    """
    # Explicit UTF-8 so the Cyrillic text is written identically on every platform.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(header + "\n")
        for row in rows:
            print(row, file=f)


# Persist the train/dev split of the reviews and of the category sentiments.
# NOTE(review): train_aspects / dev_aspects are not written here — confirm
# whether that is intentional.
_write_tsv('/content/train_split_reviews.txt', "id\ttext",
           (f"{i}\t{l}" for i, l in zip(train_ids, train_texts)))
_write_tsv('/content/dev_reviews.txt', "id\ttext",
           (f"{i}\t{l}" for i, l in zip(dev_ids, dev_texts)))
_write_tsv('/content/train_split_cats.txt', "id\taspect\tsentiment", train_sentiment)
_write_tsv('/content/dev_cats.txt', "id\taspect\tsentiment", dev_sentiment)

In [None]:
import pandas as pd

# Read the reviews file into a DataFrame.
# NOTE(review): read_csv takes the first line as the header and the merge
# below needs an `id` column in both files, whereas the plain-text loader
# earlier in the notebook treats every line as a record — confirm whether
# these files start with a header row.
reviews_df = pd.read_csv('/content/train_reviews.txt', delimiter='\t')

# Read the category-sentiment file (train_cats.txt) into a DataFrame.
sentiment_df = pd.read_csv('/content/train_cats.txt', delimiter='\t')

# Join the two frames on 'id'; a review is repeated once per matching sentiment row.
merged_df = pd.merge(reviews_df, sentiment_df, on='id')

# Training data for the classifier. `.copy()` makes it an independent frame,
# so later column assignments do not raise SettingWithCopyWarning.
training_data = merged_df[['text', 'sentiment']].copy()

training_data.head()


Unnamed: 0,text,sentiment
0,Буквально на днях отмечали с мужем наш небольш...,positive
1,Буквально на днях отмечали с мужем наш небольш...,positive
2,Буквально на днях отмечали с мужем наш небольш...,absence
3,Буквально на днях отмечали с мужем наш небольш...,positive
4,Буквально на днях отмечали с мужем наш небольш...,positive


In [None]:
def clause_splitter(text):
    """Split a text into short lowercase clauses with character offsets.

    Each sentence produced by `sentenize` is lowercased, cut at every
    punctuation mark in `splitters`, and pieces of eight or more words are
    additionally cut at the first conjunction from `conj` that occurs inside
    them. Pieces of five characters or fewer are discarded.

    Args:
        text: the text to split.

    Returns:
        A list of `(clause, start, end)` tuples in order of appearance.
        `start`/`end` are `sent.start` plus the clause position inside the
        lowercased sentence, so `end - start == len(clause)`.
        NOTE(review): the offsets line up with the original text only if
        `str.lower()` keeps the sentence length unchanged — confirm for the data.
    """
    splitters = ':;,().…'

    conj = ['а', 'но', 'однако', 'чтобы', 'потому что', 'если',
        'несмотря на', 'хотя', 'так что', 'или', 'либо',]

    result = []
    for sent in sentenize(text):
        s = sent.text.lower()

        # Cut the sentence at every punctuation splitter in turn.
        parts = [s]
        for splt in splitters:
            parts = [piece for part in parts for piece in part.split(splt)]

        good_parts = []
        for part in parts:
            part = part.strip(' ')
            if len(part) <= 5:
                continue
            if len(part.split(' ')) < 8:
                good_parts.append(part)
                continue
            # Long piece: cut it at the first conjunction found inside it
            # (a conjunction that merely opens the piece does not count).
            for c in conj:
                if f' {c} ' in part and not part.startswith(f'{c} '):
                    good_parts.extend(part.split(f' {c} '))
                    break
            else:
                good_parts.append(part)

        # Finally, locate every clause in the sentence. The pieces are in
        # order of appearance and do not overlap, so the search resumes after
        # the previous clause; a repeated clause therefore gets its own
        # offsets instead of those of its first occurrence.
        cursor = 0
        for gp in good_parts:
            if len(gp) <= 5:
                continue
            # Locate the stripped text so the span matches the returned clause.
            clause = gp.strip(' ')
            pos = s.find(clause, cursor)
            if pos == -1:
                # Not expected; fall back to a search from the sentence start.
                pos = s.index(clause)
            cursor = pos + len(clause)
            result.append((clause, sent.start + pos, sent.start + cursor))
    return result

In [None]:
# Replace every review text with its list of (clause, start, end) tuples.
# `assign` builds a new frame instead of writing into what may be a slice of
# merged_df, which is what triggered pandas' SettingWithCopyWarning.
training_data = training_data.assign(text=training_data['text'].apply(clause_splitter))

# One row per clause. A review that produced no clauses explodes to NaN and
# would break the tuple unpacking below, so such rows are dropped.
df_exploded = training_data.explode('text').dropna(subset=['text']).copy()

# Unpack the tuples into separate columns.
df_exploded[['text', 'start', 'end']] = pd.DataFrame(df_exploded['text'].tolist(), index=df_exploded.index)

# Show the final DataFrame with the transformed data.
df_exploded

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['text'] = training_data['text'].apply(clause_splitter)


Unnamed: 0,text,sentiment,start,end
0,буквально на днях отмечали с мужем наш небольш...,positive,0,64
0,выбрали для романтического вечера макарену и о...,positive,66,134
0,меню отменное,positive,135,148
0,столько всего,positive,150,163
0,что решили заходить почаще,positive,165,191
...,...,...,...,...
1064,кстати,positive,958,964
1064,она бесплатная,positive,966,980
1064,в общем и вдвоем романтик и веселой компанией ...,positive,982,1055
1064,ждем лета,positive,1057,1066


In [None]:
# Inspect the frame after clause splitting: its 'text' column now holds the
# list of (clause, start, end) tuples returned by clause_splitter.
training_data

Unnamed: 0,text,sentiment
0,[(буквально на днях отмечали с мужем наш небол...,positive
1,[(буквально на днях отмечали с мужем наш небол...,positive
2,[(буквально на днях отмечали с мужем наш небол...,absence
3,[(буквально на днях отмечали с мужем наш небол...,positive
4,[(буквально на днях отмечали с мужем наш небол...,positive
...,...,...
1060,"[(мороз и солнце - день чудесный!, 0, 31), (за...",positive
1061,"[(мороз и солнце - день чудесный!, 0, 31), (за...",positive
1062,"[(мороз и солнце - день чудесный!, 0, 31), (за...",positive
1063,"[(мороз и солнце - день чудесный!, 0, 31), (за...",positive


In [None]:
# Keep every clause span as a (start, end) tuple column.
df_exploded['expanded_span'] = list(zip(df_exploded['start'], df_exploded['end']))

def map_aspect(row):
    """Return the sentiment of the first df_exploded row whose span contains `row`'s span.

    A span contains the row when `span_start <= row['start']` and
    `row['end'] <= span_end`. "absence" is returned when no span does.

    The comparison is vectorised over the whole frame; the original
    pure-Python scan of every span for every row was quadratic.

    NOTE(review): spans are compared across all rows of df_exploded, including
    clauses that come from other reviews, and every row is contained in its own
    span, so the "absence" fallback looks unreachable here — confirm that this
    is the intended labelling.
    """
    starts = df_exploded['start'].to_numpy()
    ends = df_exploded['end'].to_numpy()
    contains = (starts <= row['start']) & (ends >= row['end'])
    if contains.any():
        # argmax on a boolean mask gives the position of the first True.
        return df_exploded['sentiment'].to_numpy()[contains.argmax()]
    return "absence"

# Apply the function to every row.
df_exploded['labels'] = df_exploded.apply(map_aspect, axis=1)

# Вывод окончательного DataFrame с преобразованными данными
df_exploded

Unnamed: 0,text,sentiment,start,end,expanded_span,labels
0,буквально на днях отмечали с мужем наш небольш...,positive,0,64,"(0, 64)",positive
0,выбрали для романтического вечера макарену и о...,positive,66,134,"(66, 134)",positive
0,меню отменное,positive,135,148,"(135, 148)",positive
0,столько всего,positive,150,163,"(150, 163)",positive
0,что решили заходить почаще,positive,165,191,"(165, 191)",positive
...,...,...,...,...,...,...
1064,кстати,positive,958,964,"(958, 964)",positive
1064,она бесплатная,positive,966,980,"(966, 980)",positive
1064,в общем и вдвоем романтик и веселой компанией ...,positive,982,1055,"(982, 1055)",positive
1064,ждем лета,positive,1057,1066,"(1057, 1066)",positive


In [None]:
# Keep only the clause text and its label; 'labels' is exposed as 'label'.
df_new = df_exploded[["text", "labels"]].rename(columns={"labels": "label"})

In [None]:
# 70 % of the clauses go to training; the remaining 30 % is halved into
# validation and test sets.
# NOTE(review): rows are split individually, so clauses of the same review
# can land in different subsets — confirm this is acceptable for evaluation.
train_data, remaining_data = train_test_split(df_new, random_state=42, test_size=0.3)
val_data, test_data = train_test_split(remaining_data, random_state=42, test_size=0.5)

# Report the size of every subset.
for subset_title, subset in (("Train", train_data), ("Validation", val_data), ("Test", test_data)):
    print(f"{subset_title} data size:", len(subset))


Train data size: 19491
Validation data size: 4177
Test data size: 4177


In [None]:
def _encode_clauses(clauses):
    """Tokenize `clauses`, padding or truncating every sequence to 50 tokens."""
    return tokenizer.batch_encode_plus(
        clauses,
        max_length = 50,
        padding = 'max_length',
        truncation = True
    )


# The three subsets are encoded with identical settings.
tokens_train = _encode_clauses(train_data["text"])
tokens_val = _encode_clauses(val_data["text"])
tokens_test = _encode_clauses(test_data["text"])

In [None]:
# Sentiment name -> integer class id; the id is the position in this tuple.
labels = {
    sentiment_name: class_id
    for class_id, sentiment_name in enumerate(
        ("positive", "negative", "neutral", "both", "absence")
    )
}

In [None]:
# Turn the 'label' column of every subset into a list of integer class ids.
# The names train_data / val_data / test_data are rebound from DataFrames to
# plain lists here, so this cell cannot be re-run without re-creating the frames.
train_data = list(map(labels.__getitem__, train_data["label"]))
val_data = list(map(labels.__getitem__, val_data["label"]))
test_data = list(map(labels.__getitem__, test_data["label"]))

In [None]:
batch_size = 8


def _to_tensors(encoded, class_ids):
    """Return (input_ids, attention_mask, targets) tensors for one subset."""
    return (
        torch.tensor(encoded['input_ids']),
        torch.tensor(encoded['attention_mask']),
        torch.tensor(class_ids),
    )


# Token ids, attention masks and targets of every subset as tensors.
train_seq, train_mask, train_y = _to_tensors(tokens_train, train_data)
val_seq, val_mask, val_y = _to_tensors(tokens_val, val_data)
test_seq, test_mask, test_y = _to_tensors(tokens_test, test_data)

In [None]:
# Wrap the tensors into datasets (train_data / val_data are rebound once more).
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)

# Training batches are drawn in random order, validation batches in order.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(train_data, batch_size = batch_size, sampler = train_sampler)
val_dataloader = DataLoader(val_data, batch_size = batch_size, sampler = val_sampler)

In [None]:
# Freeze the encoder: none of its weights will receive gradients.
for encoder_weight in bert.parameters():
    encoder_weight.requires_grad = False

In [None]:
class BERT_Sentiment(nn.Module):
    """Five-way classification head on top of a BERT-style encoder.

    The second element returned by the encoder is passed through
    fc1 -> ReLU -> dropout -> fc2 -> fc3 -> log-softmax.
    """

    def __init__(self, bert):
        """Store the encoder `bert` and build the 768 -> 512 -> 256 -> 5 head."""
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(p=0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(in_features=768, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=256)
        self.fc3 = nn.Linear(in_features=256, out_features=5)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-probabilities over the five classes, one row per input.

        Args:
            sent_id: token ids passed to the encoder.
            mask: attention mask passed to the encoder.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used.
        _sequence_output, pooled = self.bert(sent_id, attention_mask=mask, return_dict=False)
        hidden = self.dropout(self.relu(self.fc1(pooled)))
        # NOTE(review): fc2 feeds fc3 with no activation in between, so the two
        # layers act as one linear map — confirm this is intended.
        scores = self.fc3(self.fc2(hidden))
        return self.softmax(scores)

In [None]:
# Build the classifier around the encoder and move it to the target device.
model = BERT_Sentiment(bert)
model = model.to(device)

# transformers.AdamW is deprecated in favour of torch.optim.AdamW (and is not
# exported by newer transformers releases). eps and weight_decay are set
# explicitly to the defaults of the transformers implementation, because
# torch's own defaults differ (eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# The model's forward pass already ends in LogSoftmax, so the matching
# criterion is NLLLoss. CrossEntropyLoss would apply log-softmax a second
# time (a no-op on normalised log-probabilities, but misleading).
cross_entropy = nn.NLLLoss()
# Number of passes over the training set.
epochs = 10


In [None]:
def train():
    """Run one optimisation pass over `train_dataloader`.

    Returns:
        A tuple `(avg_loss, total_preds)`: the mean per-batch loss and the
        model outputs of all batches concatenated along axis 0.
    """
    model.train()
    running_loss = 0
    batch_outputs = []

    for batch in tqdm(train_dataloader, total = len(train_dataloader)):
        sent_id, mask, targets = (tensor.to(device) for tensor in batch)

        model.zero_grad()
        outputs = model(sent_id, mask)
        loss = cross_entropy(outputs, targets)
        loss.backward()
        running_loss += loss.item()

        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        batch_outputs.append(outputs.detach().cpu().numpy())

    avg_loss = running_loss / len(train_dataloader)
    return avg_loss, np.concatenate(batch_outputs, axis = 0)

In [None]:
def evaluate():
    """Score the model on `val_dataloader` without computing gradients.

    Returns:
        A tuple `(avg_loss, total_preds)`: the mean per-batch loss and the
        model outputs of all batches concatenated along axis 0.
    """
    model.eval()
    running_loss = 0
    batch_outputs = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, total = len(val_dataloader)):
            sent_id, mask, targets = (tensor.to(device) for tensor in batch)
            outputs = model(sent_id, mask)
            running_loss += cross_entropy(outputs, targets).item()
            batch_outputs.append(outputs.detach().cpu().numpy())

    avg_loss = running_loss / len(val_dataloader)
    return avg_loss, np.concatenate(batch_outputs, axis = 0)

In [None]:
# Lowest validation loss seen so far; the weights of that epoch are checkpointed.
best_valid_loss = float('inf')

# Per-epoch loss history.
train_losses, valid_losses = [], []

for epoch in range(epochs):
    print('\n Epoch{:} / {:}'.format(epoch+1, epochs))

    train_loss, _ = train()
    valid_loss, _ = evaluate()
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    # Save the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    print(f'\nTraining loss: {train_loss:.3f}')
    print(f'Validation loss: {valid_loss:.3f}')


 Epoch1 / 10


100%|██████████| 2437/2437 [01:11<00:00, 34.09it/s]
100%|██████████| 523/523 [00:15<00:00, 34.67it/s]



Training loss: 0.620
Validation loss: 0.571

 Epoch2 / 10


100%|██████████| 2437/2437 [01:12<00:00, 33.61it/s]
100%|██████████| 523/523 [00:15<00:00, 34.49it/s]



Training loss: 0.563
Validation loss: 0.561

 Epoch3 / 10


100%|██████████| 2437/2437 [01:12<00:00, 33.51it/s]
100%|██████████| 523/523 [00:15<00:00, 34.00it/s]



Training loss: 0.556
Validation loss: 0.555

 Epoch4 / 10


100%|██████████| 2437/2437 [01:12<00:00, 33.41it/s]
100%|██████████| 523/523 [00:15<00:00, 34.12it/s]



Training loss: 0.552
Validation loss: 0.551

 Epoch5 / 10


100%|██████████| 2437/2437 [01:12<00:00, 33.66it/s]
100%|██████████| 523/523 [00:15<00:00, 34.16it/s]



Training loss: 0.548
Validation loss: 0.552

 Epoch6 / 10


100%|██████████| 2437/2437 [01:13<00:00, 33.07it/s]
100%|██████████| 523/523 [00:15<00:00, 33.85it/s]



Training loss: 0.547
Validation loss: 0.548

 Epoch7 / 10


100%|██████████| 2437/2437 [01:13<00:00, 33.20it/s]
100%|██████████| 523/523 [00:15<00:00, 34.23it/s]



Training loss: 0.544
Validation loss: 0.546

 Epoch8 / 10


100%|██████████| 2437/2437 [01:13<00:00, 33.18it/s]
100%|██████████| 523/523 [00:15<00:00, 34.03it/s]



Training loss: 0.541
Validation loss: 0.547

 Epoch9 / 10


100%|██████████| 2437/2437 [01:13<00:00, 33.17it/s]
100%|██████████| 523/523 [00:15<00:00, 34.14it/s]



Training loss: 0.539
Validation loss: 0.542

 Epoch10 / 10


100%|██████████| 2437/2437 [01:13<00:00, 33.20it/s]
100%|██████████| 523/523 [00:15<00:00, 33.28it/s]



Training loss: 0.538
Validation loss: 0.540
