# 题目说明
This is an explanation of my code.
 - 题目1：完成上述 SwanLab 教程，可以通过 SwanLab 网站查看实验数据  
 实验数据和折线图如下图所示：  
 ![swablab_1.png](https://raw.githubusercontent.com/EmilyLiu03/inifigence_internship_swanlab-BERT/main/swablab_1.png)
 ![swanlab_2.png](https://raw.githubusercontent.com/EmilyLiu03/inifigence_internship_swanlab-BERT/main/swanlab_2.png)
 
 - 特殊说明  
 由于通过 SSH 远程访问 Hugging Face 的 datasets 失败，代码中的数据集和模型均从本地加载。题目1、2的情况相同，后续不再赘述。  
 Github上，因为文件过大等原因，暂无数据集和本地模型，下载链接可直接访问：  
 [imdb/data](http://ai.stanford.edu/~amaas/data/sentiment/)  
 [BERT/model](https://huggingface.co/google-bert/bert-base-uncased)

 # 代码详情

In [None]:
import torch
from datasets import load_dataset, Dataset, DatasetDict, Features, Value, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from swanlab.integration.transformers import SwanLabCallback
import swanlab
import pandas as pd
import os

# Helpers for loading the local copy of the IMDB dataset
def _read_imdb_split(split_dir):
    """Read every ``.txt`` review under ``split_dir/pos`` and ``split_dir/neg``.

    Args:
        split_dir: Directory that contains the ``pos`` and ``neg`` sub-folders.

    Returns:
        A list of ``{'text': str, 'label': int}`` records, positive reviews
        (label 1) first, then negative reviews (label 0).
    """
    records = []
    for label_name, label_id in (('pos', 1), ('neg', 0)):
        folder_path = os.path.join(split_dir, label_name)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # resulting dataset identical from run to run and machine to machine.
        for file_name in sorted(os.listdir(folder_path)):
            if file_name.endswith('.txt'):
                with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
                    records.append({'text': file.read(), 'label': label_id})
    return records


def load_imdb_dataset(data_dir):
    """Build a ``DatasetDict`` with ``train`` and ``test`` splits from local files.

    Args:
        data_dir: Root directory containing ``train/{pos,neg}`` and
            ``test/{pos,neg}`` folders of ``.txt`` review files.

    Returns:
        A ``DatasetDict`` whose ``train`` and ``test`` entries each hold a
        ``text`` column and an integer ``label`` column (1 = pos, 0 = neg).
    """
    return DatasetDict({
        split: Dataset.from_pandas(
            pd.DataFrame(_read_imdb_split(os.path.join(data_dir, split)))
        )
        for split in ('train', 'test')
    })

# Load the IMDB dataset from the local directory (train/test splits are built
# by load_imdb_dataset from the pos/neg review folders).
data_dir = '/root/aclImdb' 
dataset = load_imdb_dataset(data_dir)

# Load the pretrained BERT tokenizer and sequence-classification model from a
# local checkpoint directory instead of downloading them.
model_path = '/root/bert-base-uncased' 
tokenizer = AutoTokenizer.from_pretrained(model_path)
# num_labels=2 matches the two label values (0 and 1) produced by load_imdb_dataset.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

# Tokenization function, applied to the dataset batch by batch
def tokenize(batch):
    """Encode the ``text`` entries of ``batch`` with the module-level tokenizer.

    Both padding and truncation are enabled; the tokenizer output is returned
    unchanged.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

# Tokenize the dataset; batched=True passes batches of examples to tokenize()
tokenized_datasets = dataset.map(tokenize, batched=True)

# Prepare the model input format: rename the "label" column to "labels" and
# expose only the listed columns, formatted as torch tensors.
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Training hyperparameters
training_args = TrainingArguments(
    output_dir='./results',
    # Evaluate and checkpoint once at the end of every epoch.
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # logging_first_step is a boolean flag; the original passed 100 to it,
    # which only acted as "True" and left the logging interval at its default.
    # Keep first-step logging and set the intended interval explicitly.
    logging_first_step=True,
    logging_steps=100,
    num_train_epochs=1,
    weight_decay=0.01,
    # Disable the built-in reporting integrations; metrics are sent to SwanLab
    # through the callback attached to the Trainer instead.
    report_to="none",
)

# Maps the integer class ids used as labels to human-readable names.
CLASS_NAME = {0: "negative", 1: "positive"}

# Configure the SwanLab callback (project/experiment names plus run config)
swanlab_callback = SwanLabCallback(project='BERT',
                                   experiment_name='BERT-IMDB',
                                   config={'dataset': 'IMDB', "CLASS_NAME": CLASS_NAME})

# Build the Trainer: train on the 'train' split, evaluate on the 'test' split,
# and attach the SwanLab callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    callbacks=[swanlab_callback],
)

# Train the model
trainer.train()

# Save the fine-tuned model and its tokenizer side by side in one directory
model.save_pretrained('./sentiment_model')
tokenizer.save_pretrained('./sentiment_model')

# Try the fine-tuned model on a few hand-written reviews
def predict(text, model, tokenizer, CLASS_NAME):
    """Classify a single review and return the predicted class id.

    This helper was referenced by the loop below but never defined, which made
    the script fail with ``NameError`` after training had already finished.

    Args:
        text: The review text to classify.
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        CLASS_NAME: Mapping from class id to a human-readable class name.

    Returns:
        The predicted class id as an ``int`` (a key of ``CLASS_NAME``).
    """
    # Truncate so an over-long review cannot exceed the model's input limit.
    inputs = tokenizer(text, return_tensors='pt', truncation=True)

    # Inference only: disable dropout and gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class = int(torch.argmax(logits, dim=-1).item())
    print(f"Input Text: {text}")
    print(f"Predicted class: {predicted_class} {CLASS_NAME[predicted_class]}")
    return predicted_class


test_reviews = [
    "I absolutely loved this movie! The storyline was captivating and the acting was top-notch. A must-watch for everyone.",
    "This movie was a complete waste of time. The plot was predictable and the characters were poorly developed.",
    "An excellent film with a heartwarming story. The performances were outstanding, especially the lead actor.",
    "I found the movie to be quite boring. It dragged on and didn't really go anywhere. Not recommended.",
    "A masterpiece! The director did an amazing job bringing this story to life. The visuals were stunning.",
    "Terrible movie. The script was awful and the acting was even worse. I can't believe I sat through the whole thing.",
    "A delightful film with a perfect mix of humor and drama. The cast was great and the dialogue was witty.",
    "I was very disappointed with this movie. It had so much potential, but it just fell flat. The ending was particularly bad.",
    "One of the best movies I've seen this year. The story was original and the performances were incredibly moving.",
    "I didn't enjoy this movie at all. It was confusing and the pacing was off. Definitely not worth watching."
]

# Run inference on the CPU so the freshly tokenized inputs (CPU tensors) and
# the model live on the same device.
model.to('cpu')
text_list = []
for review in test_reviews:
    label = predict(review, model, tokenizer, CLASS_NAME)
    # Log each review with a caption of the form "<class id>-<class name>".
    text_list.append(swanlab.Text(review, caption=f"{label}-{CLASS_NAME[label]}"))

if text_list:
    swanlab.log({"predict": text_list})

# Close the SwanLab run
swanlab.finish()