## 一 数据处理
###### 本节整理从本地和远端加载数据的方法

### 准备工作
- 先安装依赖：`%pip install datasets transformers torch`（使用 `%pip` 而不是 `!pip`，确保安装到当前内核所用的环境中）

In [None]:
from datasets import load_dataset
from torch.utils.data import Dataset
from transformers import AutoTokenizer
import torch


### 1 下面是加载公开数据集


In [21]:
# Load the public emotion dataset by its repository id. No split is requested,
# so the result displayed in the next cells contains train/validation/test.
data_public = load_dataset('nanaaaa/emotion_chinese_english')

In [None]:
# {0:'joy', 1:'sadness', 2:'anger', 3:'fear', 4:'love'}

In [22]:
data_public

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence', 'label'],
        num_rows: 416
    })
    validation: Dataset({
        features: ['id', 'sentence', 'label'],
        num_rows: 54
    })
    test: Dataset({
        features: ['id', 'sentence', 'label'],
        num_rows: 46
    })
})

In [23]:
data_public['train'][:2]

{'id': [1, 2],
 'sentence': ['Here and there over the grass stood beautiful flowers like stars, and there were twelve peach-trees that in the spring- time broke out into delicate blossoms of pink and pearl, and in the autumn bore rich fruit. ',
  '他顿时感到一阵巨大的恐惧，他跟织工说：“你在织什么样的长袍？”'],
 'label': [0, 3]}

In [24]:
data_public['validation'][2]

{'id': 3,
 'sentence': '“尽管我不认识他，我天天夜里都给他唱歌,天天夜里我把他的故事讲给星星听，现在我看见他了。',
 'label': 4}

In [25]:
# A single split can also be loaded on its own.
# NOTE(review): despite the name `data_public_test`, this holds the "train"
# split (split="train"); later cells rely on this name, so it is left as is.
data_public_test = load_dataset("nanaaaa/emotion_chinese_english", split="train")
data_public_test

Dataset({
    features: ['id', 'sentence', 'label'],
    num_rows: 416
})

In [None]:
# Split the dataset, holding out 10% of the rows as the test part.
# The result is bound to a name so it can be reused by later cells, and the
# shuffle is seeded so that re-running the notebook yields the same split.
data_public_splits = data_public_test.train_test_split(test_size=0.1, seed=42)
data_public_splits

#### 1.1 高级的情感分析数据处理方法

In [26]:
# Load the pretrained tokenizer for the "bert-base-chinese" checkpoint.
# NOTE(review): the dataset shown above mixes Chinese and English sentences;
# confirm this vocabulary is adequate for the English ones.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

In [27]:
def process_sample(sample, max_length=128):
    """Tokenize a sample's sentence and attach model-ready fields to it.

    Written to be passed to ``Dataset.map``. Uses the module-level
    ``tokenizer`` defined in an earlier cell.

    Args:
        sample: Mapping with a "sentence" field (text) and a "label" field
            (integer class id).
        max_length: Length every encoded sequence is padded or truncated to.
            Defaults to 128.

    Returns:
        The same ``sample`` mapping with three added keys: "input_ids",
        "attention_mask" and "label_id".
    """
    # Fetch the text and its emotion label.
    text = sample["sentence"]
    label = sample["label"]

    # Encode the text with the BERT tokenizer.
    encoded_text = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = encoded_text["input_ids"]
    attention_mask = encoded_text["attention_mask"]

    # For a single sentence the tokenizer returns tensors of shape
    # (1, max_length). Drop that leading dimension so each stored row is a flat
    # sequence; otherwise every row carries an extra axis once the dataset is
    # batched. A list of sentences is left untouched.
    if isinstance(text, str):
        input_ids = input_ids.squeeze(0)
        attention_mask = attention_mask.squeeze(0)

    # Attach the encoded text to the sample.
    sample["input_ids"] = input_ids
    sample["attention_mask"] = attention_mask

    # Attach the emotion label; it is assumed to be an integer class id.
    sample["label_id"] = torch.tensor(label)

    return sample

In [28]:
# Apply process_sample to every row of the dataset. The result displayed
# below has the extra columns that process_sample adds.
processed_datasets = data_public_test.map(process_sample)

Map:   0%|          | 0/416 [00:00<?, ? examples/s]

In [29]:
processed_datasets

Dataset({
    features: ['id', 'sentence', 'label', 'input_ids', 'attention_mask', 'label_id'],
    num_rows: 416
})

In [None]:
processed_datasets[1]

#### 1.2 情感分析代码

#### 1.3 下面是封装方法
- 目前封装得不好，有待改进

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that eagerly loads one split of a dataset into memory.

    The base class is spelled out as ``torch.utils.data.Dataset`` because this
    class reuses the name ``Dataset``: with ``class Dataset(Dataset)`` the class
    would inherit from its own previous definition every time the cell is re-run.

    Attributes:
        data: Dict mapping a 0-based row position to that row's sample.
    """

    def __init__(self, path, data_type):
        """Load the split ``data_type`` of the dataset identified by ``path``."""
        self.data = self.load_data(path, data_type)

    def load_data(self, path, data_type):
        """Return ``{position: sample}`` for every row of the requested split.

        Args:
            path: Dataset identifier passed to ``load_dataset``.
            data_type: Split name passed as ``split=`` to ``load_dataset``.
        """
        tmp_dataset = load_dataset(path, split=data_type)
        return dict(enumerate(tmp_dataset))

    def __len__(self):
        """Number of samples held in memory."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at ``idx``, or a list of samples for a slice.

        Args:
            idx: Integer position (negative values count from the end) or a
                ``slice``.

        Raises:
            IndexError: If an integer ``idx`` is out of range. Raising
                ``IndexError`` rather than the dict's ``KeyError`` is what lets
                ``for sample in dataset`` stop cleanly at the end.
        """
        size = len(self.data)
        if isinstance(idx, slice):
            # The backing store is a dict, so a slice must be expanded into
            # explicit positions instead of being used as a key.
            return [self.data[i] for i in range(*idx.indices(size))]
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError("dataset index out of range")
        return self.data[idx]


In [None]:
# Wrap the "train" split in the Dataset class defined above.
train_data = Dataset(path='nanaaaa/emotion_chinese_english', data_type='train')

In [None]:
# Wrap the "validation" and "test" splits the same way.
valid_data = Dataset(path='nanaaaa/emotion_chinese_english', data_type='validation')
test_data = Dataset(path='nanaaaa/emotion_chinese_english', data_type='test')

In [None]:
train_data[0:1]

In [33]:
tokenizer('尽管我不认识他，我天天夜里都给他唱歌,天天夜里我把他的故事讲给星星听，现在我看见他', padding="max_length", truncation=True, max_length=128, return_tensors="pt")

{'input_ids': tensor([[ 101, 2226, 5052, 2769,  679, 6371, 6399,  800, 8024, 2769, 1921, 1921,
         1915, 7027, 6963, 5314,  800, 1548, 3625,  117, 1921, 1921, 1915, 7027,
         2769, 2828,  800, 4638, 3125,  752, 6382, 5314, 3215, 3215, 1420, 8024,
         4385, 1762, 2769, 4692, 6224,  800,  102,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,

In [32]:
torch.tensor(3)

tensor(3)

In [None]:
!pip install datasets