In [None]:
import torch
import numpy as np
from datasets import load_dataset

### 加载数据集

In [None]:
# Load only the 'train' split of 'seamew/ChnSentiCorp'.
dataset = load_dataset(path='seamew/ChnSentiCorp', split='train')

# Bare last expression so the notebook displays the dataset summary.
dataset

In [None]:
# Inspect a single example (the first row of the dataset)
dataset[0]

### 排序(sort)

In [None]:
# Before sorting, the labels are in no particular order.
# Slice the rows first so only the 10 requested rows are read, instead of
# pulling the entire 'label' column and slicing it afterwards.
print(dataset[:10]['label'])

# After sorting by 'label', the values are ordered.
sorted_dataset = dataset.sort('label')
print(sorted_dataset[:10]['label'])
print(sorted_dataset[-10:]['label'])

### 打乱(shuffle)

In [None]:
# Shuffle the row order; the fixed seed keeps the result reproducible.
shuffled_dataset = sorted_dataset.shuffle(seed=42)
# Slice the rows before selecting the column so only 10 rows are read.
shuffled_dataset[:10]['label']

### 选择

In [None]:
# Pick the rows at positions 0, 10, 20, 30, 40 and 50.
dataset.select(range(0, 60, 10))

### 过滤

In [None]:
def f(data):
    """Return True when the example's 'text' field begins with '选择'."""
    text = data['text']
    # The predicate must yield a boolean.
    return text.startswith('选择')

# Keep only the rows for which f returns True.
start_with_ar = dataset.filter(function=f)
# Show the number of matching rows together with their texts.
(start_with_ar.num_rows, start_with_ar['text'])

### 切分

In [None]:
# train_test_split: split into a training set and a test set,
# with test_size=0.1 of the rows going to the test set.
# A fixed seed keeps the split identical across Restart & Run All,
# consistent with the seeded shuffle used elsewhere in this notebook.
dataset.train_test_split(test_size=0.1, seed=42)

### 分桶(shard)

In [None]:
# Distribute the data evenly across 4 shards and return one of them;
# index selects which shard is returned.
dataset.shard(index=0, num_shards=4)

### 列操作

In [None]:
# Rename the 'text' column to 'textA'
print(dataset.rename_column(original_column_name='text', new_column_name='textA'))
# Drop the 'text' column
print(dataset.remove_columns('text'))

### 类型转换

In [None]:
# with_format returns a new torch-formatted dataset instead of changing
# shuffled_dataset in place, so re-running earlier cells that read
# shuffled_dataset still produces the same output.
# Only the 'label' column is requested in the formatted view.
torch_dataset = shuffled_dataset.with_format(type='torch', columns=['label'])
torch_dataset['label']

### map

In [None]:
def m(data):
    """Prefix the example's 'text' field with 'My sentence: ' and return the example."""
    prefixed_text = 'My sentence: ' + data['text']
    data['text'] = prefixed_text
    # The example must be returned; without a return the change is not applied.
    return data

# Apply m to every example; map returns a new dataset.
dataset_map = dataset.map(m)

# Preview the first 5 texts. Slice the rows first so only those rows are
# read, instead of pulling the entire 'text' column and slicing it afterwards.
dataset_map[:5]['text']

### 保存和加载

In [None]:
import os

from datasets import load_from_disk

# Directory holding the on-disk copy of the dataset.
dataset_dir = './dataset/ChnSentiCorp'

# Save once if the copy does not exist yet, so that loading below also works
# on a fresh checkout (Restart & Run All) instead of failing on a missing path.
if not os.path.isdir(dataset_dir):
    dataset.save_to_disk(dataset_dir)

dataset = load_from_disk(dataset_dir)

# Export to other formats
dataset.to_csv('./dataset/ChnSentiCorp.csv')
dataset.to_json('./dataset/ChnSentiCorp.json')

dataset