Лабораторная работа №4: Автоматическое распознавание эмоциональной окраски речи

Цель: Анализировать эмоциональную окраску речи с помощью методов NLP и глубоких нейронных сетей.
Задания:
- Собрать данные из социальных медиа (например, Twitter или VKontakte) и разметить их вручную по категориям эмоций (радость, грусть, гнев и др.). Метрики оценки: точность классификации (Accuracy), Recall, Precision, F1-score.
- Применить предобученную модель (RoBERTa, XLNet) для анализа эмоциональной окраски текста. Модели для сравнения: SVM, Naive Bayes, KNN.


In [32]:
import torch
import torch.nn as nn

import pandas as pd
import numpy as np

from datasets import Dataset, DatasetDict 
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import label_binarize

from peft import (LoraConfig, 
                  prepare_model_for_kbit_training, 
                  get_peft_model,
                  PeftModelForSequenceClassification,
                  PeftConfig)

from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
    AutoModelForCausalLM,
    Gemma3ForCausalLM)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

import bitsandbytes as bnb
import evaluate
import os
import warnings

# Silence library warnings to keep the cell outputs readable.
warnings.filterwarnings('ignore')

# Physical GPU exposed to this process. The environment variables must be set
# before CUDA is initialised, i.e. before the first torch.cuda call below.
graphic_card = '0'
os.environ['CUDA_VISIBLE_DEVICES'] = graphic_card
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'

# Once CUDA_VISIBLE_DEVICES holds a single id, the exposed GPU is always logical
# index 0 inside the process, whatever physical id was selected above.
gpu_device = 'cuda:0'
device = torch.device(gpu_device if torch.cuda.is_available() else 'cpu')

# Select the CUDA device only when one exists; an unconditional
# torch.cuda.set_device(0) raises on CPU-only hosts.
if device.type == 'cuda':
    torch.cuda.set_device(device)

In [3]:
def load_parquet_dataset(path):
    """Read the parquet file at ``path`` and return it as a ``Dataset``.

    The file is loaded through pandas; the DataFrame index is not carried
    over as a column (``preserve_index=False``).
    """
    frame = pd.read_parquet(path)
    loaded = Dataset.from_pandas(frame, preserve_index=False)
    return loaded

# Parquet files of the two splits, relative to the working directory.
train_path = 'data/train-00000-of-00001.parquet'
test_path = 'data/test-00000-of-00001.parquet'

# Both splits go through the same loader.
train_ds, test_ds = (
    load_parquet_dataset(split_path) for split_path in (train_path, test_path)
)

# Bundle the splits under the 'train' / 'test' keys used by the later cells.
dataset = DatasetDict({'train': train_ds, 'test': test_ds})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [5]:
# Integer class id -> emotion name; the position in the list is the class id.
LABEL_MAP = dict(enumerate(
    ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
))

def load_parquet_dataset(path):
    """Load a parquet split and add a human-readable ``label_str`` column.

    Args:
        path: Location of a parquet file containing at least a ``label``
            column with the integer class ids listed in ``LABEL_MAP``.

    Returns:
        A ``Dataset`` with the original columns plus ``label_str``; the
        DataFrame index is not kept as a column.

    Raises:
        ValueError: If any value in ``label`` has no entry in ``LABEL_MAP``.
            Without this check such rows would silently receive a missing
            ``label_str``.
    """
    df = pd.read_parquet(path)
    df['label_str'] = df['label'].map(LABEL_MAP)

    # Series.map leaves unmapped ids as NaN; surface them instead of
    # carrying missing class names into the dataset.
    unmapped = df.loc[df['label_str'].isna(), 'label'].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f'Labels without an entry in LABEL_MAP in {path!r}: {unmapped.tolist()}'
        )

    return Dataset.from_pandas(df, preserve_index=False)

# Same parquet files as before, reloaded with the label-aware loader.
train_path = 'data/train-00000-of-00001.parquet'
test_path = 'data/test-00000-of-00001.parquet'

train_ds, test_ds = map(load_parquet_dataset, (train_path, test_path))

# Rebuild the split container so both splits carry the extra column.
dataset = DatasetDict({'train': train_ds, 'test': test_ds})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_str'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'label', 'label_str'],
        num_rows: 2000
    })
})

In [6]:
# Convert through the dataset's own pandas export instead of pd.DataFrame(...),
# which iterates over the split row by row to build the frame.
df = dataset['train'].to_pandas()
df.head()

Unnamed: 0,text,label,label_str
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger


In [16]:
# Emotion name -> class id (list position), plus the inverse lookup used to
# turn predicted ids back into names.
class2id = {
    emotion: idx
    for idx, emotion in enumerate(
        ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
    )
}
id2class = {idx: emotion for emotion, idx in class2id.items()}

# Hub id of the checkpoint used for both the tokenizer and the model.
hugging_face_model_id = 'google/gemma-3-4b-it'

# Right padding matters here: the classifier wrapper defined later reads the
# logits at position sum(attention_mask) - 1, which is the last real token
# only when padding is appended on the right.
tokenizer = AutoTokenizer.from_pretrained(
    hugging_face_model_id,
    padding_side='right',
    add_bos_token=True,
    trust_remote_code=True
)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    """Tokenize a batch of comments for classification.

    Every text in ``examples['text']`` is prefixed with a fixed instruction,
    then tokenized with truncation and padding to exactly 64 tokens. The
    integer classes from ``examples['label']`` are attached under ``'labels'``.

    Returns:
        The tokenizer output with an added ``'labels'`` entry.
    """
    instruction = 'Determine the emotional tonality of the comment: '

    prompted_texts = []
    for comment in examples['text']:
        prompted_texts.append(instruction + comment)

    encoded = tokenizer(
        prompted_texts,
        truncation=True,
        padding='max_length',
        max_length=64
    )
    encoded['labels'] = examples['label']
    return encoded


# Tokenize both splits in batches. The columns to drop are taken from the
# dataset itself rather than a hard-coded list, so the cell also works when
# the optional 'label_str' column was not added; only the tokenizer outputs
# and the new 'labels' column remain.
dataset_tokenized = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['train'].column_names
)

dataset_tokenized

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [18]:
sample_index = 0

# Index the row first and then pick the field: selecting the column first
# pulls every row of that column just to read a single example.
sample_input_ids = dataset_tokenized['train'][sample_index]['input_ids']
sample_label = dataset_tokenized['train'][sample_index]['labels']

print(f'IDs   : {sample_input_ids}')
print(f'Label : {sample_label}  -->  {id2class[sample_label]}\n')
print(f'Tokens: {tokenizer.decode(sample_input_ids)}')

IDs   : [2, 102752, 506, 13690, 7998, 2027, 529, 506, 5739, 236787, 858, 59568, 2597, 179753, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Label : 0  -->  sadness

Tokens: <bos>Determine the emotional tonality of the comment: i didnt feel humiliated<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [19]:
# Collator that pads a batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Compare sequence lengths of the first three examples before and after collation.
sample_batch_ids = dataset_tokenized['train']['input_ids'][:3]
sample_batch_ids_collator = data_collator(dataset_tokenized['train'][:3])['input_ids'][:3]
print(list(map(len, sample_batch_ids)))
print(list(map(len, sample_batch_ids_collator)))

[64, 64, 64]
[64, 64, 64]


In [20]:
# 4-bit NF4 quantization with double quantization; computations run in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16)


# Load the causal-LM checkpoint quantized onto the selected GPU.
model = Gemma3ForCausalLM.from_pretrained(hugging_face_model_id, 
                                          torch_dtype=torch.bfloat16, 
                                          device_map=gpu_device,
                                          attn_implementation='eager',
                                          quantization_config=bnb_config  )

# Replace the vocabulary projection with a bias-free linear layer that maps the
# hidden state to one score per emotion class, so `logits` become class scores.
# NOTE(review): this layer is created before prepare_model_for_kbit_training and
# is not listed in the LoRA target modules; check its `requires_grad` after the
# PEFT wrapping to confirm the classification head is actually trained.
model.lm_head = torch.nn.Linear(model.config.hidden_size, len(class2id.keys()), bias=False, device=gpu_device)
# Gradient checkpointing is switched on directly on the model here.
model.gradient_checkpointing_enable()
# Prepare the quantized model for parameter-efficient fine-tuning.
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
def find_all_linear_names(model):
    """Collect the leaf names of all 4-bit linear layers in ``model``.

    Walks ``model.named_modules()`` and, for every ``bnb.nn.Linear4bit``
    instance, records the last component of its dotted module name
    (e.g. ``'q_proj'``). The name ``'lm_head'`` is always excluded.

    Args:
        model: Any object exposing ``named_modules()``.

    Returns:
        A list of unique layer names in unspecified order.
    """
    lora_module_names = {
        name.rsplit('.', 1)[-1]
        for name, module in model.named_modules()
        if isinstance(module, bnb.nn.Linear4bit)
    }
    # Dropped once after the scan instead of being re-checked on every iteration.
    lora_module_names.discard('lm_head')
    return list(lora_module_names)

# LoRA adapters are attached to every attention and MLP projection. The list is
# explicit: the result of find_all_linear_names(model) used to be assigned to
# `modules` and then overwritten on the next line, so that call was dead code.
modules = ['gate_proj', 'down_proj', 'v_proj', 'k_proj', 'q_proj', 'o_proj', 'up_proj']

# NOTE(review): the replaced `lm_head` is not among the target modules and no
# `modules_to_save` is given — confirm whether the classification head is
# meant to stay frozen.
lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.1,
    bias='none',
    task_type='SEQ_CLS')

model = get_peft_model(model, lora_config)

In [22]:
class Gemma3ForSequenceClassification(PeftModelForSequenceClassification):
    """Sequence classifier on top of a PEFT-wrapped causal language model.

    The wrapped model's ``logits`` are read at the last attended token of each
    sequence and used as the per-class scores. This expects the model's output
    projection to produce ``num_labels`` values per position and the inputs to
    be right-padded (the last real token is located via the attention mask).
    """

    def __init__(self, peft_config: PeftConfig, model: AutoModelForCausalLM, adapter_name='default'):
        """Wrap ``model`` with the given PEFT configuration.

        Args:
            peft_config: PEFT configuration forwarded to the parent class.
            model: The causal language model to wrap.
            adapter_name: Name of the adapter, forwarded to the parent class.
        """
        super().__init__(model, peft_config, adapter_name)
        self.num_labels = model.config.num_labels
        # The loss in forward() is cross-entropy over class indices, which is
        # single-label classification (this used to say 'multi_label_classification').
        self.problem_type = 'single_label_classification'

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs):
        """Compute class scores (and optionally the loss) for a batch.

        Args:
            input_ids, attention_mask, inputs_embeds, output_attentions,
            output_hidden_states, **kwargs: Forwarded to the wrapped model.
            attention_mask: Also used to locate the last non-padding token of
                each sequence; when ``None`` the final position is used.
            labels: Optional targets. Class indices for classification, or
                values for ``problem_type == 'regression'``.
            return_dict: Whether to return a ``SequenceClassifierOutput``;
                defaults to ``self.config.return_dict``.

        Returns:
            ``SequenceClassifierOutput`` with ``loss`` and per-sequence
            ``logits``, or the equivalent tuple when ``return_dict`` is false.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        # Labels are deliberately not forwarded: the loss is computed here on
        # the pooled logits, not per token by the language model.
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs)

        # Plain tuple outputs have no `.logits` attribute; fall back to the
        # first element in that case.
        logits = outputs.logits if hasattr(outputs, 'logits') else outputs[0]

        batch_size, seq_len = logits.shape[0], logits.shape[1]
        if attention_mask is None:
            # Without a mask there is no padding information: use the final position.
            last_token_indices = torch.full(
                (batch_size,), seq_len - 1, dtype=torch.long, device=logits.device)
        else:
            sequence_lengths = torch.sum(attention_mask, dim=1)
            # Keep the index tensor on the same device as the logits it indexes.
            last_token_indices = (sequence_lengths - 1).to(logits.device)

        # Keep only the scores at the last attended token of every sequence.
        logits = logits[torch.arange(batch_size, device=logits.device), last_token_indices, :]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.problem_type == 'regression':
                loss_fct = torch.nn.MSELoss()
                loss = loss_fct(logits.squeeze(), labels.squeeze())
            else:
                loss_fct = torch.nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions)

In [25]:
class CustomTrainer(Trainer):
    """Trainer that computes a plain cross-entropy loss from the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: The model being trained; called with ``inputs`` minus labels.
            inputs: Batch dictionary; its ``'labels'`` entry is removed and
                used as the target.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted so the override matches the signature
                used by current ``Trainer`` versions, which pass this keyword;
                it is not used by this loss.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop('labels').to(model.device)
        outputs = model(**inputs)
        logits = outputs.logits

        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        return (loss, outputs) if return_outputs else loss
    
# Combined `evaluate` metric bundle.
# NOTE(review): not referenced by any other cell visible here (compute_metrics
# uses the scikit-learn scorers) — confirm whether it is still needed.
clf_metrics = evaluate.combine(['accuracy', 'f1', 'precision', 'recall'])

def compute_metrics(eval_pred):
    """Compute classification metrics from evaluation predictions.

    Args:
        eval_pred: Pair ``(logits, labels)`` where ``logits`` has one column
            per class and ``labels`` holds integer class ids.

    Returns:
        Dict with accuracy, macro-averaged F1 / precision / recall and the
        macro one-vs-rest ROC AUC (``nan`` when the AUC cannot be computed).
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels)
    preds = np.argmax(logits, axis=1)

    # Derive the class ids from the width of the logits instead of a
    # hard-coded list, so the function follows the model's number of classes.
    num_classes = logits.shape[1]
    labels_binarized = label_binarize(labels, classes=list(range(num_classes)))
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()

    try:
        auc = roc_auc_score(labels_binarized, probs, average='macro', multi_class='ovr')
    except ValueError:
        # Raised e.g. when a class is absent from `labels`; report NaN and
        # keep the remaining metrics.
        auc = float('nan') 

    # zero_division=0 is the value scikit-learn already uses by default for
    # undefined scores; stating it explicitly avoids the accompanying warning.
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1_macro': f1_score(labels, preds, average='macro', zero_division=0),
        'precision_macro': precision_score(labels, preds, average='macro', zero_division=0),
        'recall_macro': recall_score(labels, preds, average='macro', zero_division=0),
        'roc_auc_ovr': auc,
    }

In [26]:
# Early stopping with a patience of 3 evaluations and a minimum improvement
# threshold of 0.001 on the tracked metric.
early_stop = EarlyStoppingCallback(early_stopping_patience=3, 
                                   early_stopping_threshold=0.001) 
# Directory that receives the training checkpoints.
checkpoints_dir = 'results/gemma_emotions_classification' 

# Turn off tokenizer-level parallelism via the environment variable.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [27]:
# Training configuration: evaluate and checkpoint every 100 steps, keep the
# checkpoint with the best macro F1 and reload it when training ends.
training_args = TrainingArguments(
    # NOTE(review): checkpointing is disabled here although kwargs for it are
    # supplied and the model cell calls gradient_checkpointing_enable()
    # directly; the training log also reports that gradient checkpointing is
    # active. Confirm which setting is intended.
    gradient_checkpointing=False,  
    gradient_checkpointing_kwargs={'use_reentrant': False},
    logging_strategy='steps',
    logging_steps=100,
    dataloader_num_workers=4,
    output_dir= checkpoints_dir ,  
    learning_rate=5e-5,  
    per_device_train_batch_size=32,  
    per_device_eval_batch_size=32,  
    num_train_epochs=10,  
    weight_decay=0.01,  
    # Evaluation and saving use the same 100-step cadence.
    eval_strategy='steps', 
    eval_steps=100,     
    save_strategy='steps',
    save_steps=100,  
    report_to='none',
    load_best_model_at_end=True,  
    push_to_hub=False,  
    bf16=True,
    warmup_ratio=0.05, 
    # 'eval_f1_macro' is the 'f1_macro' key returned by compute_metrics with
    # the 'eval_' prefix; higher is better.
    metric_for_best_model='eval_f1_macro',
    greater_is_better=True)  

In [29]:
# Build a generic PeftConfig and copy every attribute of the LoRA configuration
# onto it, so it carries the same LoRA settings.
peft_config = PeftConfig(peft_type='LORA', task_type='SEQ_CLS', inference_mode=False)
for key, value in lora_config.__dict__.items():
    setattr(peft_config, key, value)

# NOTE(review): `model` is already the object returned by get_peft_model(...);
# wrapping it again in a PeftModel subclass applies PEFT a second time —
# confirm this double wrapping is intended.
wrapped_model = Gemma3ForSequenceClassification(peft_config, model)
# Override the label count taken from the model config with the real number of classes.
wrapped_model.num_labels = len(class2id.keys())
wrapped_model.config.id2label = id2class
wrapped_model.config.label2id = class2id
wrapped_model.config.problem_type = 'single_label_classification'

# The test split doubles as the evaluation set for early stopping and
# best-checkpoint selection.
trainer = Trainer(
    model=wrapped_model,
    args=training_args,
    train_dataset=dataset_tokenized['train'],
    eval_dataset=dataset_tokenized['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stop],
)

# Set after construction, which is why the "No label_names provided" message
# still appears in the cell output.
# NOTE(review): passing label_names through TrainingArguments would avoid it.
trainer.label_names = ['labels']

trainer.train()
# Final evaluation on the test split; its result is the cell's displayed value.
trainer.evaluate()

No label_names provided for model class `Gemma3ForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,Roc Auc Ovr
100,1.744,1.487775,0.4415,0.181611,0.18776,0.227137,0.639179
200,1.0947,0.664278,0.7775,0.696831,0.746317,0.695992,0.939403
300,0.4754,0.381475,0.862,0.793263,0.848513,0.760413,0.983574
400,0.2805,0.229384,0.9245,0.881519,0.896017,0.872764,0.992274
500,0.2066,0.195701,0.926,0.888316,0.884768,0.898797,0.995196
600,0.1352,0.170509,0.927,0.885694,0.886946,0.885009,0.996383
700,0.1354,0.169741,0.9275,0.880569,0.898763,0.866293,0.996406
800,0.129,0.164837,0.9305,0.875048,0.909236,0.852725,0.996519


{'eval_loss': 0.19570104777812958,
 'eval_accuracy': 0.926,
 'eval_f1_macro': 0.8883164035049403,
 'eval_precision_macro': 0.884768331457693,
 'eval_recall_macro': 0.8987973603412537,
 'eval_roc_auc_ovr': 0.995195725986641,
 'eval_runtime': 33.8311,
 'eval_samples_per_second': 59.117,
 'eval_steps_per_second': 1.862,
 'epoch': 1.6}

In [None]:
# Put the fine-tuned model in evaluation mode and on the selected device
# before running single-text inference.
wrapped_model.eval().to(device)

def predict_emotion(text: str):
    """Classify the emotion of a single text with the fine-tuned model.

    The text gets the same instruction prefix and the same tokenization
    settings (truncation, padding to 64 tokens) as the training data.

    Returns:
        Dict with the predicted class id (``'label_id'``), its name
        (``'label_str'``) and the softmax probability of every class
        (``'probabilities'``, keyed by class name).
    """
    instruction = 'Determine the emotional tonality of the comment: '
    encoded = tokenizer(
        instruction + text,
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=64
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        scores = wrapped_model(**encoded).logits
        class_probs = torch.softmax(scores, dim=-1).cpu().numpy()[0]

    best_id = int(np.argmax(class_probs))

    per_class = {}
    for idx in range(len(class_probs)):
        per_class[id2class[idx]] = float(class_probs[idx])

    return {
        'label_id': best_id,
        'label_str': id2class[best_id],
        'probabilities': per_class
    }

res = predict_emotion("I can't wait for my vacation next week!")
# The f-string is delimited with double quotes: reusing the same quote type
# inside the replacement fields is a SyntaxError before Python 3.12.
print(f"Predicted: {res['label_str']} (id={res['label_id']})")
print('All probs:', res['probabilities'])

Predicted: joy (id=1)
All probs: {'sadness': 0.04197107255458832, 'joy': 0.8283224701881409, 'love': 0.0260604340583086, 'anger': 0.06693872064352036, 'fear': 0.026835216209292412, 'surprise': 0.009872124530375004}


In [None]:
# Raw texts and integer labels of both splits.
train_texts, train_labels = dataset['train']['text'], dataset['train']['label']
test_texts, test_labels = dataset['test']['text'], dataset['test']['label']

# TF-IDF features with at most 5000 terms; the vocabulary is fitted on the
# training split only and reused for the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)
y_train, y_test = np.array(train_labels), np.array(test_labels)

# One-hot encode the test labels for the one-vs-rest ROC AUC.
n_classes = len(np.unique(y_train))
y_test_binarized = label_binarize(y_test, classes=list(range(n_classes)))

# Classical baselines trained on the TF-IDF features.
models = dict([
    ('SVM (linear kernel)', SVC(kernel='linear', probability=True, random_state=42)),
    ('MultinomialNB', MultinomialNB()),
    ('KNN (k=5)', KNeighborsClassifier(n_neighbors=5)),
])

# Fit each baseline and collect the same metrics that are reported for the
# fine-tuned model.
results = {}
for name, clf in models.items():
    clf.fit(X_train, y_train)
    preds, probs = clf.predict(X_test), clf.predict_proba(X_test)

    results[name] = dict(
        accuracy=accuracy_score(y_test, preds),
        precision_macro=precision_score(y_test, preds, average='macro'),
        recall_macro=recall_score(y_test, preds, average='macro'),
        f1_macro=f1_score(y_test, preds, average='macro'),
        roc_auc_ovr=roc_auc_score(y_test_binarized, probs,
                                  average='macro',
                                  multi_class='ovr'),
    )

# Print one header per model followed by its metrics, four decimals each.
for name, metrics in results.items():
    report_lines = [f'\n=== {name} ===']
    report_lines.extend(
        f'{metric_name:15s}: {value:.4f}' for metric_name, value in metrics.items()
    )
    print('\n'.join(report_lines))

== SVM (linear kernel) ===
accuracy       : 0.8895
precision_macro: 0.8552
recall_macro   : 0.8118
f1_macro       : 0.8302
roc_auc_ovr    : 0.9879

=== MultinomialNB ===
accuracy       : 0.7145
precision_macro: 0.7039
recall_macro   : 0.4634
f1_macro       : 0.4780
roc_auc_ovr    : 0.9519

=== KNN (k=5) ===
accuracy       : 0.7255
precision_macro: 0.7345
recall_macro   : 0.6183
f1_macro       : 0.6589
roc_auc_ovr    : 0.8937
