In [4]:
# Path to the CSV file that holds the text/action rows (raw string keeps the backslashes readable).
data_path = r'C:\Users\12390\Documents\projects\yolo\data\actions.csv'

# 一、加载数据

In [3]:
import pandas as pd

In [10]:
# Read the CSV and trim surrounding whitespace from every action label in one chained expression.
df = pd.read_csv(data_path).assign(
    action=lambda frame: frame['action'].str.strip()
)

## 1.1 观察

In [11]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,text,action
0,你想不想去吃午饭？,其他
1,哦！我被选中了！,其他
2,我几天身体好像有点不太舒服，肚子好痛。,其他
3,我的小组成员一个都没干活！真后悔跟他一起组队。,摇头
4,他们是不是吵架了？不会打起来吧？,思考


In [12]:
# (rows, columns) of the loaded frame.
df.shape

(4526, 2)

In [13]:
# Number of rows per action label, to check class balance.
df['action'].value_counts()

action
其他    2790
摇头     571
思考     498
挥手     353
点头     314
Name: count, dtype: int64

## 1.2 采样

In [15]:
# Keep every row whose label is not "其他" and down-sample the "其他" class to 600 rows
# with a fixed seed (presumably to bring it close to the other class sizes - confirm).
df_ac = df.loc[df['action'].ne('其他')]
df_b_sample = df.loc[df['action'].eq('其他')].sample(n=600, random_state=42)
# Stack both parts and renumber the index from 0.
df = pd.concat([df_ac, df_b_sample], ignore_index=True)

In [16]:
# (rows, columns) after down-sampling.
df.shape

(2336, 2)

In [17]:
# Per-label row counts after down-sampling.
df['action'].value_counts()

action
其他    600
摇头    571
思考    498
挥手    353
点头    314
Name: count, dtype: int64

## 1.3 划分训练集、验证集、测试集

In [18]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
# Hold out 10% of the rows as the test split, stratified on the action label.
train_df, test_df = train_test_split(df, test_size=0.1, stratify=df['action'], random_state=42)
# Split a further 10% of the remaining rows off as the validation split (different seed).
train_df, validation_df = train_test_split(train_df, test_size=0.1, stratify=train_df['action'], random_state=21)

In [23]:
# Shapes of the train, test and validation frames, in that order.
train_df.shape, test_df.shape, validation_df.shape

((1891, 2), (234, 2), (211, 2))

## 1.4 转为dataset格式

In [24]:
# Convert each pandas split into a datasets.Dataset. The pandas index is carried along
# and shows up as the `__index_level_0__` feature when the result is printed.
train_dataset, test_dataset, validation_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df, validation_df)
)

# Bundle the three splits under their conventional names.
dataset_dict = DatasetDict(
    {'train': train_dataset, 'test': test_dataset, 'validation': validation_dataset}
)

In [25]:
# Show the splits with their features and row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'action', '__index_level_0__'],
        num_rows: 1891
    })
    test: Dataset({
        features: ['text', 'action', '__index_level_0__'],
        num_rows: 234
    })
    validation: Dataset({
        features: ['text', 'action', '__index_level_0__'],
        num_rows: 211
    })
})

## 1.5 标签映射

In [26]:
# Action name -> integer class id; the position of a name in the tuple is its id.
label2id = {name: idx for idx, name in enumerate(('点头', '挥手', '其他', '思考', '摇头'))}
# Inverse lookup: class id -> action name.
id2label = dict(zip(label2id.values(), label2id.keys()))

In [27]:
def convert_labels(examples):
    """Add an integer ``labels`` column to a batch.

    Every value of the batch's ``action`` column is looked up in the
    module-level ``label2id`` mapping; an action missing from the mapping
    raises ``KeyError``. The batch is updated in place and returned.
    """
    examples["labels"] = list(map(label2id.__getitem__, examples["action"]))
    return examples


# Apply the label conversion batch-wise to every split and drop the string "action" column.
dataset = dataset_dict.map(convert_labels, remove_columns=["action"], batched=True)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 1891/1891 [00:00<00:00, 235745.72 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 234/234 [00:00<00:00, 78254.44 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 211/211 [00:00<00:00, 41696.03 examples/s]


In [28]:
# Confirm that "action" has been replaced by the integer "labels" column.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', '__index_level_0__', 'labels'],
        num_rows: 1891
    })
    test: Dataset({
        features: ['text', '__index_level_0__', 'labels'],
        num_rows: 234
    })
    validation: Dataset({
        features: ['text', '__index_level_0__', 'labels'],
        num_rows: 211
    })
})

In [29]:
# Inspect the first training example.
dataset['train'][0]

{'text': '你怎么这么烦，真是无止境，让我受不了！', '__index_level_0__': 772, 'labels': 4}

# 一、训练

In [37]:
# Local directory from which the pretrained tokenizer and model are loaded.
model_path = r'C:\Users\12390\Documents\projects\yolo\data\bert-base-chinese'
# Number of output classes: one per action label.
num_labels = 5

In [32]:
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)

## 1.1 加载模型

In [38]:
# Tokenizer loaded from the same directory as the pretrained weights.
tokenizer = BertTokenizer.from_pretrained(model_path)

# Load the pretrained encoder with a classification head of `num_labels` outputs.
# Passing the label maps records the action names in the model config, so saved
# checkpoints carry them instead of generic LABEL_<n> names.
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at C:\Users\12390\Documents\projects\yolo\data\bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 1.2 对数据进行tokenize

In [34]:
def preprocess_function(examples):
    """Tokenize the ``text`` column of a batch.

    Sequences longer than 128 tokens are truncated. No padding is applied
    here: the ``DataCollatorWithPadding`` given to the trainer pads each batch
    to its own longest sequence, which avoids running every example at the
    full 128-token length.

    Args:
        examples: Batch mapping that contains a ``text`` list of strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=128,
    )

# Tokenize every split, handing 16 examples at a time to preprocess_function.
tokenized_dataset = dataset.map(preprocess_function, batched=True, batch_size=16)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 1891/1891 [00:00<00:00, 4533.73 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 234/234 [00:00<00:00, 3961.14 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 211/211 [00:00<00:00, 5042.24 examples/s]


In [35]:
# Check which columns tokenization added to each split.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', '__index_level_0__', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1891
    })
    test: Dataset({
        features: ['text', '__index_level_0__', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 234
    })
    validation: Dataset({
        features: ['text', '__index_level_0__', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 211
    })
})

In [36]:
# Drop the raw text column now that it has been tokenized.
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
# Return only the listed columns, as torch tensors; set_format changes the object in place.
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [39]:
# Batch collator built on the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## 1.3 训练

In [41]:
# Output directory for the fine-tuned model (used as the Trainer's output_dir).
trained_model_path = r'C:\Users\12390\Documents\projects\yolo\data\action_model'

In [43]:
import os

In [46]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores; the argmax over axis 1 is taken as the
            predicted class id.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 scores an undefined precision/recall as 0, exactly like the
    # default, but without emitting UndefinedMetricWarning whenever a class is
    # never predicted (common in the first epochs).
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="weighted", zero_division=0
    )
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [47]:
# Fine-tuning configuration. Evaluation and checkpointing both run once per epoch,
# and the best checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    output_dir=trained_model_path,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    # data_path points at the CSV file itself, so the log folder is created
    # next to it (in its parent directory) rather than "inside" the file path.
    logging_dir=os.path.join(os.path.dirname(data_path), "logs"),
    logging_steps=100,
    learning_rate=3e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

In [None]:
# Wire the model, training arguments, train/validation splits, tokenizer,
# collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()

# 二、评估

In [None]:
# Evaluate on the held-out test split and print the returned metrics
# (the printed prefix reads "test set evaluation results").
test_results = trainer.evaluate(tokenized_dataset["test"])
print("测试集评估结果：", test_results)

# 三、加载

In [None]:
# Reload the tokenizer and the fine-tuned weights from a saved checkpoint.
# Checkpoints are written under the Trainer's output_dir (trained_model_path);
# data_path cannot be used as the parent because it points at the CSV file.
# NOTE(review): with 1891 training rows and a batch size of 16, single-device epoch
# checkpoints fall on multiples of 119 steps, so "checkpoint-318" may come from an
# earlier run - confirm the directory exists before loading.
checkpoint_dir = os.path.join(trained_model_path, "checkpoint-318")
tokenizer = BertTokenizer.from_pretrained(checkpoint_dir)
model = BertForSequenceClassification.from_pretrained(checkpoint_dir)

# 四、预测

In [None]:
import torch

In [None]:
def predict_sentiment(text):
    """Predict the class id for a single piece of text.

    Args:
        text: The input string to classify.

    Returns:
        int: Index of the highest-scoring class. Look it up in ``id2label``
        to get the action name.
    """
    # A single sequence has nothing to be padded against, so it is only
    # truncated to 128 tokens instead of being padded up to that length.
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    ).to(model.device)

    # Inference mode: disable dropout and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=1).item()
    return predictions

In [None]:

# Alternative sample inputs kept for manual spot checks:
# sample_text = "这部电影太棒了！特效非常震撼！"
# sample_text = "我不太明白你的意思，你能再说一遍"
sample_text = "抱歉，没有查到相关信息呢。不过不管是不是第一批，来参观就有新发现，好好享受科技馆之旅吧！"
pred_label = predict_sentiment(sample_text)
# Print the numeric class id, then show its action name as the cell output.
print(f"预测动作标签：{pred_label}")
id2label[pred_label]