**Attention в контексте естественного языка:**
В контексте обработки естественного языка механизм attention позволяет моделям сосредотачивать внимание на различных словах или фразах в предложении в зависимости от контекста. Такие модели эффективно улавливают семантические зависимости и работают с последовательностями переменной длины.

Основная идея трансформера — использование механизма внимания для обработки последовательностей входных данных, например текстов.

## Применение одного из трансформеров (BERT) к задаче классификации отзывов клиентов.


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import torch
import pandas as pd

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the reviews CSV from the mounted drive
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably a saved
# index — consider index_col=0 if that is confirmed
df = pd.read_csv("/content/drive/MyDrive/reviews.csv")
df.head()

Unnamed: 0,date,review,target
0,September 04 2022,Very poor customer serviceVery poor customer s...,1
1,August 13 2023,Your app's functionality to redeem…Your app's ...,1
2,September 01 2023,WU has been obnoxious about money…WU has been ...,1
3,August 01 2023,After I sent the moneyAfter I sent the money. ...,1
4,August 06 2023,Money lostOn 08/16 I send money to Mexico. Alt...,1


In [None]:
# Стандартизируем наши данные
def standardize_text(df, text_field):
    """Normalize the text column ``text_field`` of ``df`` in place.

    The substitutions are applied in order: URLs are removed, then any
    leftover "http", then @-mentions; every character outside the allowed
    set becomes a space; remaining "@" becomes "at"; runs of whitespace
    collapse to one space; finally the text is lower-cased.

    Args:
        df: DataFrame holding the text column. It is modified in place.
        text_field: Name of the string column to clean.

    Returns:
        The same ``df`` object, with ``text_field`` overwritten.
    """
    # Order matters: the character filter must keep "@" so the later
    # "@" -> "at" rule still sees it.
    substitutions = (
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
        (r"\s+", " "),
    )

    text = df[text_field]
    for pattern, replacement in substitutions:
        # regex=True is required: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would silently skip the cleaning.
        text = text.str.replace(pattern, replacement, regex=True)

    df[text_field] = text.str.lower()
    return df


# standardize_text overwrites the column on df and returns df itself, so
# clean_data and df refer to the same DataFrame
clean_data = standardize_text(df, "review")

  df[text_field] = df[text_field].str.replace(r"http\S+", "")
  df[text_field] = df[text_field].str.replace(r"@\S+", "")
  df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
  df[text_field] = df[text_field].str.replace(r"\s+", " ")


In [None]:
# Preview the first rows after cleaning
clean_data.head()

Unnamed: 0,date,review,target
0,September 04 2022,very poor customer servicevery poor customer s...,1
1,August 13 2023,your app's functionality to redeem your app's ...,1
2,September 01 2023,wu has been obnoxious about money wu has been ...,1
3,August 01 2023,after i sent the moneyafter i sent the money i...,1
4,August 06 2023,money loston 08 16 i send money to mexico alth...,1


In [None]:
# The same checkpoint name is used for both the tokenizer and the model
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# num_labels=5 sizes the classification head for five classes
# NOTE(review): presumably the five classes are review ratings 1-5 — confirm against the `target` column
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use the first CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Split review texts and targets into train/test parts (20% held out, fixed seed)
# NOTE(review): test_data / test_labels are not used by any later cell visible here
train_data, test_data, train_labels, test_labels = train_test_split(
    clean_data['review'].tolist(),
    clean_data['target'].tolist(),
    test_size=0.2, random_state=42
)

In [None]:
# Shift every label down by one before building the tensor
# NOTE(review): assumes targets are 1..5 so they map onto class indices 0..4 — confirm
train_labels = [label - 1 for label in train_labels]
train_labels = torch.tensor(train_labels, dtype=torch.long).to(device)

In [None]:
# Every review is padded or truncated to max_length tokens; the whole
# tokenized training set is moved to `device` at once
max_length = 128
tokenized_train = tokenizer(train_data, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt').to(device)

# Dataset items are (input_ids, attention_mask, label), in that order
train_dataset = TensorDataset(tokenized_train['input_ids'], tokenized_train['attention_mask'], train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [None]:
from tqdm import tqdm

def train_model(model, train_dataloader, optimizer, num_epochs=3):
    """Train ``model`` on ``train_dataloader`` and print per-epoch metrics.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments; its output must expose ``.loss``
            and ``.logits``.
        train_dataloader: Iterable of batches indexed as
            ``(input_ids, attention_mask, labels)``.
        optimizer: Optimizer that updates the model parameters.
        num_epochs: Number of passes over ``train_dataloader``.

    Returns:
        None. After each epoch the mean per-sample loss and the training
        accuracy are printed.
    """
    # Run on whatever device the model parameters already live on.
    device = next(model.parameters()).device

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        correct_predictions = 0
        total_samples = 0

        progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)
        for batch in progress_bar:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            # Clear gradients before the forward/backward pass so nothing
            # left over from earlier work leaks into this step.
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            batch_size = len(labels)
            # The returned loss is a mean over the batch (default reduction of
            # the classification loss), so weight it by the batch size; the
            # running total divided by total_samples is then a true
            # per-sample average instead of being ~batch_size times too small.
            total_loss += loss.item() * batch_size

            predictions = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (predictions == labels).sum().item()
            total_samples += batch_size

            progress_bar.set_postfix({'Loss': total_loss / total_samples, 'Accuracy': correct_predictions / total_samples})

        average_loss = total_loss / total_samples
        accuracy = correct_predictions / total_samples
        print(f'Epoch {epoch + 1}/{num_epochs}: Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}')

In [None]:
# AdamW over all model parameters with a learning rate of 2e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 20
# NOTE(review): the log below reports training accuracy only; no held-out
# evaluation is run in the cells visible here
train_model(model, train_dataloader, optimizer, num_epochs)



Epoch 1/20: Loss: 0.0438, Accuracy: 0.3514




Epoch 2/20: Loss: 0.0338, Accuracy: 0.5216




Epoch 3/20: Loss: 0.0276, Accuracy: 0.6457




Epoch 4/20: Loss: 0.0223, Accuracy: 0.7202




Epoch 5/20: Loss: 0.0174, Accuracy: 0.7937




Epoch 6/20: Loss: 0.0142, Accuracy: 0.8341




Epoch 7/20: Loss: 0.0095, Accuracy: 0.9096




Epoch 8/20: Loss: 0.0064, Accuracy: 0.9394




Epoch 9/20: Loss: 0.0052, Accuracy: 0.9486




Epoch 10/20: Loss: 0.0033, Accuracy: 0.9736




Epoch 11/20: Loss: 0.0025, Accuracy: 0.9798




Epoch 12/20: Loss: 0.0019, Accuracy: 0.9880




Epoch 13/20: Loss: 0.0015, Accuracy: 0.9880




Epoch 14/20: Loss: 0.0013, Accuracy: 0.9889




Epoch 15/20: Loss: 0.0010, Accuracy: 0.9928




Epoch 16/20: Loss: 0.0018, Accuracy: 0.9846




Epoch 17/20: Loss: 0.0012, Accuracy: 0.9885




Epoch 18/20: Loss: 0.0010, Accuracy: 0.9894




Epoch 19/20: Loss: 0.0013, Accuracy: 0.9865


                                                                                           

Epoch 20/20: Loss: 0.0007, Accuracy: 0.9962




In [None]:
# Sample review used to probe the fine-tuned classifier.
positive_review = "Western Union is one of the best ways to transfer money to your loved one’s. Very reliable and always on time!"

# Encode with the same fixed-length padding/truncation settings as the training data.
tokenized_positive_review = tokenizer(
    positive_review,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
).to(device)

# Inference only: eval mode, no gradient tracking.
model.eval()
with torch.no_grad():
    inputs = {
        field: tokenized_positive_review[field].to(device)
        for field in ('input_ids', 'attention_mask')
    }
    outputs = model(**inputs)

# Predicted class index plus one (training labels were shifted down by one).
predicted_rating = outputs.logits.argmax(dim=1).item() + 1

print(f"\nPositive Review:\n{positive_review}\nPredicted Rating: {predicted_rating}")


Positive Review:
Western Union is one of the best ways to transfer money to your loved one’s. Very reliable and always on time!
Predicted Rating: 4


In [None]:
# Sample complaint used to probe the fine-tuned classifier.
negative_review = '''The app stop The app stop, I had to refresh twice the page'''

# Encode with the same fixed-length padding/truncation settings as the training data.
tokenized_negative_review = tokenizer(
    negative_review,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
).to(device)

# Inference only: eval mode, no gradient tracking.
model.eval()
with torch.no_grad():
    inputs = {
        field: tokenized_negative_review[field].to(device)
        for field in ('input_ids', 'attention_mask')
    }
    outputs = model(**inputs)

# Predicted class index plus one (training labels were shifted down by one).
predicted_rating_negative = outputs.logits.argmax(dim=1).item() + 1

print(f"\nNegative Review:\n{negative_review}\nPredicted Rating: {predicted_rating_negative}")


Negative Review:
The app stop The app stop, I had to refresh twice the page
Predicted Rating: 2


### Генерация текста (GPT-2, цепи Маркова, ruGPT)

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer and language-model head.
# NOTE: this rebinds the globals `model_name`, `tokenizer` and `model`,
# replacing the BERT classifier objects created earlier.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

def generate_text(prompt, max_length=50):
    """Generate a continuation of ``prompt`` with the global ``model`` and ``tokenizer``.

    Args:
        prompt: Text the generation starts from.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first returned sequence decoded to a string, with special
        tokens removed.
    """
    # Keep the inputs on the same device as the model so the function also
    # works after the model has been moved to a GPU.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    # Single unpadded prompt: every position is a real token. ones_like keeps
    # the integer dtype and device of input_ids.
    attention_mask = torch.ones_like(input_ids)

    output = model.generate(
        input_ids,
        max_length=max_length,
        attention_mask=attention_mask,
        num_beams=5,
        no_repeat_ngram_size=2,
        # top_k / top_p / temperature only take effect when sampling is
        # enabled; without do_sample=True they are ignored.
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Continue an English prompt using the default max_length of generate_text
prompt_eng = "The quick brown fox"
generated_text_eng = generate_text(prompt_eng)
print(generated_text_eng)



The quick brown foxes have been known to eat a variety of foods, including fruits, vegetables, nuts, and seeds.

In the wild, they can be found in many parts of the United States, Canada, Australia, New Zealand,


In [None]:
! pip install markovify

Collecting markovify
  Downloading markovify-0.9.4.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting unidecode (from markovify)
  Downloading Unidecode-1.3.7-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: markovify
  Building wheel for markovify (setup.py) ... [?25l[?25hdone
  Created wheel for markovify: filename=markovify-0.9.4-py3-none-any.whl size=18606 sha256=41f3eec0d09c299f3bcac583561a8f2840cd6362b7410af62010af1879a82daf
  Stored in directory: /root/.cache/pip/wheels/ca/8c/c5/41413e24c484f883a100c63ca7b3b0362b7c6f6eb6d7c9cc7f
Successfully built markovify
Installing collected packages: unidecode, markovify
Successfully installed markovify-0.9.4 unidecode-1.3.7


In [None]:
import markovify

# Read the whole corpus into memory as UTF-8 text
# NOTE(review): relative path — dostoevsky.txt must be in the working directory
with open("dostoevsky.txt", "r", encoding="utf-8") as f:
    russian_text = f.read()

# Build the markovify text model from the corpus
russian_model = markovify.Text(russian_text)

In [None]:
# Generate and print three sentences from the Markov model
# NOTE(review): make_sentence() may return None when it cannot build a
# sentence, which would print "None" — confirm against the markovify docs
for _ in range(3):
    russian_generated_text = russian_model.make_sentence()
    print(russian_generated_text)

А народ наш и к нему шло.
Будьте уверены, Алексей Федорович, все и холодно усмехнулся, подумав это, потому что вы — княгиня, вдова, богачка и, в двадцати от эшафота, около которого стоял народ и оберегайте сердце его.
Бабушка гневно на них каждый раз весело выходит, а под столом сторублевую бумажку поднял и осмотрел даже с некоторым волнением. — Совершенно, совершенно не слыхав того, что он уже умер, и обещал его некролог.


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the sberbank-ai/rugpt2large checkpoint with the GPT-2 model/tokenizer classes.
# NOTE: this rebinds the globals `model_name`, `model` and `tokenizer` again.
model_name = "sberbank-ai/rugpt2large"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.39G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

In [None]:
def generate_text(prompt, max_length=100, temperature=1.0):
    """Generate a continuation of ``prompt`` with the global ``model`` and ``tokenizer``.

    Args:
        prompt: Text the generation starts from.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The first returned sequence decoded to a string, with special
        tokens removed.
    """
    # Keep the inputs on the same device as the model so the function also
    # works after the model has been moved to a GPU.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # Explicit attention mask: the single prompt has no padding, so every
    # position is marked as a real token. ones_like keeps the integer dtype
    # and device of input_ids.
    attention_mask = torch.ones_like(input_ids)

    output = model.generate(
        input_ids,
        max_length=max_length,
        attention_mask=attention_mask,
        num_beams=5,
        no_repeat_ngram_size=2,
        # Sampling must be enabled, otherwise the caller's `temperature`
        # (and top_k / top_p) is ignored.
        do_sample=True,
        temperature=temperature,
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Generate from a one-word Russian prompt
# NOTE(review): the prompt ends with a space, which is encoded as part of the
# input; the doubled spaces in the output presumably follow from it — confirm
prompt = "Буря "
generated_text = generate_text(prompt, max_length=50, temperature=0.8)
print(generated_text)

Буря  на  море  стихла,  и  мы

вышли в открытое море. Волнение на море было умеренным, но ветер дул с такой силой, что, казалось, он вот-вот сорвет с


In [None]:
# Generate from a definition-style prompt with the same max_length and temperature
prompt = "Внимание - это"
generated_text = generate_text(prompt, max_length=50, temperature=0.8)
print(generated_text)

Внимание - это все, что я могу сделать для вас.

- Спасибо, - сказал я. - Вы очень добры, но я не нуждаюсь в вашей доброте. Я просто хочу, чтобы меня оставили в покое, и я


In [None]:
# Generate from a question prompt (note the trailing space after the question mark)
prompt = "Как спасти мир? "
generated_text = generate_text(prompt, max_length=50, temperature=0.8)
print(generated_text)

Как спасти мир? 
Даниил Александрович Гранин

Евгений Александрович Евтушенко 

Виктор Петрович Астафьев  

Александр Александрович Солженицын. «Архипелаг ГУЛАГ»
В сборник вошли произведения


In [None]:
# Generate from the opening words of a sentence
prompt = "В жизни бывает два гуся"
generated_text = generate_text(prompt, max_length=50, temperature=0.8)
print(generated_text)

В жизни бывает два гуся, — сказал он.

— Как это? Что ты имеешь в виду? Я не понимаю, о чем ты говоришь. Ты хочешь сказать, что я не гусь, а просто гусыня?



### Машинный перевод с помощью MarianMT (Helsinki-NLP/opus-mt-en-ru)

In [None]:
!pip install "transformers[sentencepiece]"

Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers[sentencepiece])
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint name; "en-ru" presumably means English-to-Russian, which matches
# the translations printed below
mname = "Helsinki-NLP/opus-mt-en-ru"

# NOTE: rebinds the globals `tokenizer` and `model` to the translation model
tokenizer = AutoTokenizer.from_pretrained(mname)
model = AutoModelForSeq2SeqLM.from_pretrained(mname)

# Save the tokenizer and the model into the same local directory
tokenizer.save_pretrained('./model/en-ru-local')
model.save_pretrained('./model/en-ru-local')

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
input_text = "Hello, how are you?"

# Tokenize the text, generate with no extra generation arguments, then decode
# the first returned sequence without special tokens
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(**inputs)
translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(translated_text)


Привет, как дела?


In [None]:
# Same tokenize / generate / decode steps, applied to an unfinished sentence
input_text = "My name is"

inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(**inputs)
translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(translated_text)

Меня зовут
