In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torchtext import data
from torch.utils.data import DataLoader
from torch.optim import AdamW

from tqdm.auto import tqdm
import evaluate

#!pip install datasets transformers evaluate
#!pip install accelerate -U

from datasets import Dataset, DatasetDict

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import get_scheduler
from transformers import pipeline

## Подготовка данных

In [13]:
# Load the raw listings from a ';'-separated CSV file.
# NOTE(review): the absolute path looks like a Colab Google Drive mount — confirm before running elsewhere.
df = pd.read_csv(
    '/content/drive/MyDrive/Инлайн/meatinfo.csv',
    sep=';',
)
df

Unnamed: 0,text,mtype
0,12 частей баранина 12 частей баранина,Баранина
1,"Баранина, 12 частей, зам. цена 260 руб.",Баранина
2,"Баранина, 12 частей, зам. цена 315 руб.",Баранина
3,"Баранина, 12 частей, охл.",Баранина
4,"Баранина, 12 частей, охл. цена 220 руб.",Баранина
...,...,...
17888,"Ягнятина, шея, бк",Ягнятина
17889,Язык ягненка (н.зеландия) Отварные языки ягнят...,Ягнятина
17890,"Ягнятина, язык, зачищ. цена 100 руб.",Ягнятина
17891,"Як, задние части, 1 категория цена 550 руб.",Як


In [14]:
# Class distribution of the raw `mtype` column (before any clean-up).
df['mtype'].value_counts()

Говядина                                                                                   8422
Свинина                                                                                    3050
Кура                                                                                       1571
Индейка                                                                                    1337
Баранина                                                                                   1116
Цыпленок                                                                                    942
Кролик                                                                                      334
Утка                                                                                        195
Оленина                                                                                     193
Конина                                                                                      176
Гусь                                    

In [None]:
# Inspect the rows whose `mtype` is one of the values that are not meat-type
# names (month-like strings, prices, lot identifiers).
df.loc[
    df['mtype'].isin([
        'Feb-20',
        '125р.',
        'OFFAL EXP №4407 Аргентина',
        '(OFFAL EXP №4407 Аргентина)',
        'Mar-20',
        '295,00 руб|кг',
    ])
]

In [5]:
# Canonicalise the category names: fix casing and typos, and map the stray
# non-category values (month-like strings, prices, lot ids, a full listing
# string) to the meat type chosen for them.
# The mapping is applied to the `mtype` column only: a frame-wide
# `df.replace(...)` would also rewrite any `text` cell that is exactly equal
# to one of the keys below.
df = df.assign(
    mtype=df['mtype'].replace({
        'свинина': 'Свинина',
        'свиниеа': 'Свинина',
        'Говядина, полутуши, 1 категория,  охл., Россия, подвес, В наличии, 10 тонн, 270 руб. кг' : 'Говядина',
        'говядина': 'Говядина',
        ' Лопаточная часть (Chuck) буйвол ': 'Буйволятина',
        'утка': 'Утка',
        'цыпленок': 'Цыпленок',
        'индейка': 'Индейка',
        'Feb-20' : 'Кура',
        '125р.' : 'Кура',
        'Mar-20' : 'Кура',
        '(OFFAL EXP №4407 Аргентина)' : 'Говядина',
        'OFFAL EXP №4407 Аргентина' : 'Говядина',
        '295,00 руб|кг' : 'Свинина',
    })
)

In [None]:
# Class distribution of `mtype` again, to check the effect of the clean-up.
df['mtype'].value_counts()

In [None]:
# Label vocabulary: the distinct `mtype` values in order of first appearance.
# A label's position in this list is used as its integer class id.
all_labels = [*df['mtype'].unique()]
all_labels

In [None]:
# Encode every category name as its position in `all_labels`.
# A single dict lookup on the `mtype` column replaces the former per-label,
# frame-wide `df.replace(...)` loop, which (a) would turn any `text` cell that
# is exactly equal to a label name into an integer and (b) scanned the whole
# frame once per label, with a linear `list.index` search each time.
label_to_id = {name: idx for idx, name in enumerate(all_labels)}
df = df.assign(mtype=df['mtype'].map(label_to_id))
df.sample(10)

In [None]:
# Rename the target column to `label`.
# Reassigning instead of `inplace=True` keeps the step free of in-place
# mutation, so the frame object shown by earlier cells is not altered.
df = df.rename(columns={'mtype': 'label'})
df

In [10]:
# Force the dtypes used downstream: `text` as str, `label` as int.
for column, dtype in (('text', str), ('label', int)):
    df[column] = df[column].astype(dtype)

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Wrap the pandas splits as Hugging Face datasets.
# `Dataset.from_pandas` is the constructor meant for DataFrames (`from_dict`
# expects a plain mapping of column name -> values); `preserve_index=False`
# keeps the shuffled pandas index from being stored as an extra column.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
test_dataset = Dataset.from_pandas(test_data, preserve_index=False)
my_dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

## Обучение модели

In [14]:
# Tokenizer that belongs to the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(
    "seara/rubert-tiny2-russian-sentiment"
)

In [15]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level ``tokenizer``.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its result is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Tokenize both splits; `batched=True` hands the function batches of examples.
tokenized_datasets = my_dataset_dict.map(
    tokenize_function,
    batched=True,
)

Map:   0%|          | 0/14314 [00:00<?, ? examples/s]

Map:   0%|          | 0/3579 [00:00<?, ? examples/s]

In [16]:
# Drop the raw text and rename the target column to `labels`
# (the key later read from each batch during evaluation).
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label", "labels")
)

In [17]:
# Display the DatasetDict summary (splits, feature names, row counts).
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14314
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3579
    })
})

In [18]:
# Have the datasets return torch tensors when indexed.
tokenized_datasets.set_format(type="torch")

In [19]:
# Pull the two tokenized splits out of the DatasetDict.
train_dataset, test_dataset = tokenized_datasets["train"], tokenized_datasets["test"]

In [20]:
# Batches of 8; only the training loader shuffles.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
eval_dataloader = DataLoader(dataset=test_dataset, batch_size=8)

In [21]:
# Load the pretrained checkpoint with a classification head sized for our labels.
# The head size is derived from the label vocabulary instead of a hard-coded 36,
# so it cannot drift from the data. `ignore_mismatched_sizes=True` lets the
# checkpoint's own head be discarded and re-initialised at the new size.
model = AutoModelForSequenceClassification.from_pretrained(
    "seara/rubert-tiny2-russian-sentiment",
    num_labels=len(all_labels),
    ignore_mismatched_sizes=True,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at seara/rubert-tiny2-russian-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([36]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 312]) in the checkpoint and torch.Size([36, 312]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# AdamW over all model parameters.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [23]:
# Linear schedule without warm-up, spanning every optimisation step of the run.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [25]:
# One progress-bar tick per optimisation step.
progress_bar = tqdm(total=num_training_steps)

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The batch carries `labels`, so the forward pass returns the loss as well.
        loss = model(**batch).loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()  # clear gradients before the next step
        progress_bar.update(1)

  0%|          | 0/5370 [00:00<?, ?it/s]

In [26]:
# Accuracy on the held-out split.
metric = evaluate.load("accuracy")
model.eval()
with torch.no_grad():  # no gradients are needed for evaluation
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**batch).logits
        # The predicted class is the index of the largest logit.
        metric.add_batch(
            predictions=logits.argmax(dim=-1),
            references=batch["labels"],
        )

metric.compute()

{'accuracy': 0.9636770047499301}

In [34]:
# Persist the fine-tuned model together with its tokenizer, so the directory
# can be reloaded on its own without naming the original checkpoint again.
model.save_pretrained('/content/drive/MyDrive/Инлайн/Model')
tokenizer.save_pretrained('/content/drive/MyDrive/Инлайн/Model')

## Тестирование

In [3]:
# Class names in class-id order: position i is the name for the model's `LABEL_i`.
# Index 19 is the literal string 'nan' — presumably rows with a missing mtype; TODO confirm.
all_labels = [
    'Баранина', 'Ягнятина', 'Индейка', 'Говядина',        # 0-3
    'Свинина', 'Кура', 'Цыпленок', 'Гусь',                # 4-7
    'Буйволятина', 'Оленина', 'Конина', 'Телятина',       # 8-11
    'Кролик', 'Утка', 'Куропатка', 'Парагвай',            # 12-15
    'Перепел', 'Глухарь', 'Страус', 'nan',                # 16-19
    'Заяц', 'Кенгуру', 'Изюбр', 'Кабан',                  # 20-23
    'Коза', 'Косуля', 'Лось', 'Марал',                    # 24-27
    'Медвежатина', 'Бобер', 'Цесарка', 'Нутрия',          # 28-31
    'Рябчик', 'Тетерев', 'Фазан', 'Як',                   # 32-35
]

In [23]:
# Reload the fine-tuned classifier from disk; the tokenizer is loaded under the
# same checkpoint name that was used for training.
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Инлайн/Model")
tokenizer = AutoTokenizer.from_pretrained("seara/rubert-tiny2-russian-sentiment")

# Sample listing to classify.
input_text = 'Продам перепела и перепелинную разделку гост и халяль по хорошей цене .Тел:'

clf = pipeline(task="text-classification", model=model, tokenizer=tokenizer)
answer = clf(input_text)
print(answer)

[{'label': 'LABEL_16', 'score': 0.6944593191146851}]


In [24]:
# The pipeline labels look like 'LABEL_<id>'; take the id after the underscore
# and translate it back to the class name.
answer_label = answer[0]['label']
label_parts = answer_label.split('_')
index = int(label_parts[1])
print('text: ', input_text)
print('label: ', all_labels[index])

text:  Продам перепела и перепелинную разделку гост и халяль по хорошей цене .Тел:
label:  Перепел
