In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
# Local checkpoint directory of the fine-tuned classifier to evaluate.
MODEL_NAME = "./models/self_multi"
# MODEL_NAME = "./models/sbert_plus_multi"
MODEL_TASK = "sentiment-analysis"
# Longest input (in tokens) fed to the model; longer texts are truncated.
MAX_LENGTH = 128

# Class names in model-output order, plus the id<->name maps stored in the model config.
LABELS = ['greeting', 'how are you', 'unknown']
LABEL_MAP = dict(enumerate(LABELS))
ID_MAP = {name: idx for idx, name in LABEL_MAP.items()}

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, id2label=LABEL_MAP, label2id=ID_MAP)
# Encoding options (truncation / max_length) are call-time arguments of the tokenizer;
# passing them to `from_pretrained` does not change how texts are encoded, so they
# are handed to the pipeline instead, which forwards them on every tokenizer call.
# `return_tensors` is set by the pipeline itself and padding is handled by its batching.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
clf = pipeline(
    task=MODEL_TASK,
    model=model,
    tokenizer=tokenizer,
    top_k=len(LABELS),  # return a score for every label, not only the best one
    truncation=True,
    max_length=MAX_LENGTH,
)

In [2]:
import pandas as pd
import numpy as np

DATASET_NAME = "./datasets/ru-plus.csv"
# Fixed seed so the synthetic samples (and therefore the reported metrics) are
# identical on every Restart & Run All.
SEED = 42
# Number of synthetic two-intent samples to add.
N_COMBINED = 50

# Integer class code -> multi-hot target vector. Code 3 is the synthetic
# combined class built below from one label-0 text and one label-1 text.
LABEL_VECTORS = {0: [1, 0, 0], 1: [0, 1, 0], 2: [0, 0, 1], 3: [1, 1, 0]}

df = pd.read_csv(DATASET_NAME, delimiter="|")
df.columns = ['text', 'label']
df['label'] = df['label'].astype(int)
df_label_0 = df[df["label"] == 0]
df_label_1 = df[df["label"] == 1]

# One seeded generator shared by both draws; sampling with replacement mirrors
# independent single-row draws repeated N_COMBINED times.
rng = np.random.RandomState(SEED)
sampled_0 = df_label_0['text'].sample(n=N_COMBINED, replace=True, random_state=rng).tolist()
sampled_1 = df_label_1['text'].sample(n=N_COMBINED, replace=True, random_state=rng).tolist()
# Join the two texts with a comma; the second part is lower-cased so the
# result reads as a single sentence.
multi_class = [
    {'text': f"{text_0}, {text_1.lower()}", 'label': 3}
    for text_0, text_1 in zip(sampled_0, sampled_1)
]

df = pd.concat([df, pd.DataFrame(multi_class)], ignore_index=True)

# `Series.map` silently turns unknown codes into NaN; fail loudly instead.
unexpected_codes = set(df['label']) - set(LABEL_VECTORS)
assert not unexpected_codes, f"unexpected label codes in dataset: {sorted(unexpected_codes)}"
df['label'] = df['label'].map(LABEL_VECTORS)

texts = df['text'].tolist()
labels = df['label'].tolist()

In [3]:
# Run the classification pipeline over every evaluation text in one call.
# Each entry of `results` is a list of {'label': ..., 'score': ...} dicts.
results = clf(texts)

In [4]:
# A label counts as predicted only when its score is strictly above this value.
THRESHOLD = 0.9

# `zip` stops at the shortest input, so a length mismatch would silently drop samples.
assert len(texts) == len(labels) == len(results), "texts, labels and results must be aligned"

counter = 0  # number of samples whose thresholded prediction differs from the target
mapped_results = []  # one 0/1 vector per sample, in LABELS order

for text, label, result in zip(texts, labels, results):
    # Label name -> 0/1 decision for this sample.
    res = {el['label']: int(el['score'] > THRESHOLD) for el in result}
    # Build the vector from LABELS so the order cannot drift from the model's
    # label list; a label missing from the pipeline output counts as not predicted.
    mapped = [res.get(name, 0) for name in LABELS]
    mapped_results.append(mapped)
    if mapped != label:
        counter += 1
        print(f"Текст: {text}")
        print(f"Предсказано: {mapped}, Значение: {label}, Результат: {result}")
        print()

print(f"Всего ошибочно: {counter}")

Текст: Что расскажешь?
Предсказано: [0, 0, 0], Значение: [0, 1, 0], Результат: [{'label': 'how are you', 'score': 0.8623228073120117}, {'label': 'unknown', 'score': 0.2193164825439453}, {'label': 'greeting', 'score': 0.005776830483227968}]

Текст: Нормально?
Предсказано: [0, 0, 0], Значение: [0, 0, 1], Результат: [{'label': 'how are you', 'score': 0.7760507464408875}, {'label': 'unknown', 'score': 0.2226688712835312}, {'label': 'greeting', 'score': 0.007004767190665007}]

Текст: Как насчжёт кино вечером?
Предсказано: [0, 0, 0], Значение: [0, 0, 1], Результат: [{'label': 'unknown', 'score': 0.8348129391670227}, {'label': 'how are you', 'score': 0.40166231989860535}, {'label': 'greeting', 'score': 0.0037557545583695173}]

Всего ошибочно: 3


In [5]:
import evaluate
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])

# Flatten predictions and targets to 1-D, so every (sample, label) cell is scored
# as its own binary decision; accuracy here is therefore per cell, not exact-match
# per sample.
flat_predictions = np.array(mapped_results).reshape(-1)
flat_references = np.array(labels).reshape(-1)
scores = metric.compute(predictions=flat_predictions, references=flat_references)

# Last expression of the cell: a one-line summary with four decimals per metric.
', '.join(f"{key}: {value:.4f}" for key, value in scores.items())

'accuracy: 0.9964, f1: 0.9954, precision: 1.0000, recall: 0.9909'