In [2]:
from datasets import *

# datasets 基本使用

## 加载在线数据集

In [1]:
# Load the hub dataset "madao33/new-title-chinese" without naming a split and display it.
# NOTE(review): the recorded NameError below appears to come from running this cell (In [1])
# before the import cell (In [2]); run the import cell first.
# NOTE(review): the variable name `datasets` is the same as the `datasets` package name.
datasets = load_dataset("madao33/new-title-chinese")
datasets

NameError: name 'load_dataset' is not defined

## 加载数据集合集中的某一项任务

In [None]:
# Load only the "boolq" task from the "super_glue" collection and display it.
boolq_dataset = load_dataset("super_glue", "boolq")
boolq_dataset

## 按照数据集划分进行加载

In [None]:
# Request only the "train" split of the dataset and display it.
dataset = load_dataset("madao33/new-title-chinese", split="train")
dataset

In [None]:
# Request a row range of the train split using slice notation inside the split string.
dataset = load_dataset("madao33/new-title-chinese", split="train[10:100]")
dataset

In [None]:
# Request the first 50% of the train split using percent notation inside the split string.
dataset = load_dataset("madao33/new-title-chinese", split="train[:50%]")
dataset

In [None]:
# Request two split expressions at once: the first and the second half of train.
# NOTE(review): with a list of splits the result is presumably a list of datasets
# in the same order — confirm against the load_dataset documentation.
dataset = load_dataset("madao33/new-title-chinese", split=["train[:50%]", "train[50%:]"])
dataset

## 查看数据集

In [None]:
# Load the full dataset again; the inspection cells that follow index into `datasets`.
datasets = load_dataset("madao33/new-title-chinese")
datasets

In [None]:
# Show the example at index 0 of the train split.
datasets["train"][0]

In [None]:
# Show the first two examples of the train split via a slice.
datasets["train"][:2]

In [None]:
# Show the first five titles of the train split.
# Rows are sliced first and the column is picked second, so only five rows are read
# instead of materializing the whole "title" column and slicing it afterwards.
datasets["train"][:5]["title"]

In [None]:
# List the column names of the train split.
datasets["train"].column_names

In [None]:
# Show the `features` attribute of the train split.
datasets["train"].features

## 数据集划分

In [None]:
# Hold out 10% of the train split as a test set.
# A fixed seed keeps the shuffle, and therefore the resulting split, identical
# every time the notebook is re-run from a fresh kernel.
dataset = datasets["train"]
dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Hold out 10% of the boolq train split as a test set.
# For a classification dataset, stratifying on "label" splits each class proportionally.
# A fixed seed keeps the split identical every time the notebook is re-run.
dataset = boolq_dataset["train"]
dataset.train_test_split(test_size=0.1, stratify_by_column="label", seed=42)

## 数据选取与过滤

In [None]:
# Select: pick rows of the train split by index (here indices 0 and 1).
datasets["train"].select([0, 1])

In [None]:
# Filter: keep only the train examples whose title contains the substring "中国".
filter_dataset = datasets["train"].filter(lambda example: "中国" in example["title"])

In [None]:
# Show the first five titles of the filtered dataset.
# Rows are sliced first and the column is picked second, so only five rows are read
# instead of materializing the whole "title" column and slicing it afterwards.
filter_dataset[:5]["title"]

## 数据映射

In [None]:
# Prepend "Prefix: " to the title of every example.
def add_prefix(example):
    """Prefix the "title" value of one example with 'Prefix: '.

    The mapping passed in is updated in place and that same object is returned.
    """
    prefixed_title = 'Prefix: ' + example["title"]
    example["title"] = prefixed_title
    return example

`.map()` 的作用是：将传入的函数（本例中是 `add_prefix`）依次应用到数据集（`datasets`）中的每一个样本上，然后返回一个经过处理后的新数据集。

In [None]:
# Apply add_prefix through map.
prefix_dataset = datasets.map(add_prefix)
# Show the first 10 titles of the train split.
prefix_dataset["train"][:10]["title"]

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer for the "bert-base-chinese" checkpoint; later cells reuse this `tokenizer` name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
def preprocess_function(example, tokenizer=tokenizer):
    """Tokenize the "content" field as model input and the "title" field as labels.

    Args:
        example: mapping with "content" and "title" entries.
        tokenizer: callable used for encoding; defaults to the notebook-level tokenizer.

    Returns:
        The encoding of "content" (truncated to 512 tokens) with an extra "labels"
        entry holding the input ids of the encoded "title" (truncated to 32 tokens).
    """
    encoded_content = tokenizer(example["content"], max_length=512, truncation=True)
    encoded_title = tokenizer(example["title"], max_length=32, truncation=True)
    # The labels are simply the token ids of the encoded title.
    title_ids = encoded_title["input_ids"]
    encoded_content["labels"] = title_ids
    return encoded_content

In [None]:
# Run preprocess_function over the dataset with map's default settings and display the result.
processed_datasets = datasets.map(preprocess_function)
processed_datasets

In [None]:
# Same mapping, this time passing num_proc=4 to map.
processed_datasets = datasets.map(preprocess_function, num_proc=4)
processed_datasets

In [None]:
# Same mapping, this time passing batched=True to map.
processed_datasets = datasets.map(preprocess_function, batched=True)
processed_datasets

In [None]:
# Batched mapping that also passes the train split's original column names as remove_columns.
# NOTE(review): presumably this leaves only the fields returned by preprocess_function — confirm.
processed_datasets = datasets.map(preprocess_function, batched=True, remove_columns=datasets["train"].column_names)
processed_datasets

## 保存与加载

In [None]:
# Save the processed dataset under ./processed_data.
processed_datasets.save_to_disk("./processed_data")

In [None]:
# Load the dataset back from ./processed_data and display it.
processed_datasets = load_from_disk("./processed_data")
processed_datasets

# 加载本地数据集

## 直接加载文件作为数据集

In [None]:
# Load a local CSV file through load_dataset's "csv" loader, requesting the train split.
dataset = load_dataset("csv", data_files="./ChnSentiCorp_htl_all.csv", split="train")
dataset

In [None]:
# Alternative: build the dataset from the same CSV file with Dataset.from_csv.
dataset = Dataset.from_csv("./ChnSentiCorp_htl_all.csv")
dataset

## 加载文件夹内全部文件作为数据集

In [None]:
# Pass a list of CSV paths as data_files to load several files into one train split.
dataset = load_dataset("csv", data_files=["./all_data/ChnSentiCorp_htl_all.csv", "./all_data/ChnSentiCorp_htl_all copy.csv"], split='train')
dataset

## 通过预先加载的其他格式转换加载数据集

In [3]:
import pandas as pd

# Read the CSV into a pandas DataFrame and preview its first rows.
data = pd.read_csv("./ChnSentiCorp_htl_all.csv")
data.head()

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"


- 根据文档参数说明，`features` 参数应为 `Features` 对象
    def from_pandas(
        cls,
        df: pd.DataFrame,
        features: Optional[Features] = None,
        info: Optional[DatasetInfo] = None,
        split: Optional[NamedSplit] = None,
        preserve_index: Optional[bool] = None,
    ) -> "Dataset":
-
-
- `Features` 对象如何构建
    class Features(dict):       # Features 继承自 dict 类型
        def __init__(*args, **kwargs):      # 任意数量不带关键字参数，任意数量带关键字参数
            # self not in the signature to allow passing self as a kwarg
            if not args:
                raise TypeError("descriptor '__init__' of 'Features' object needs an argument")

-
-
- `*args`、`**kwargs` 参数解析
def master_function(p1, p2, *args, **kwargs):
    """Print the two named parameters, then the extra positional and keyword arguments.

    Surplus positional arguments arrive in the tuple ``args`` and surplus keyword
    arguments arrive in the dict ``kwargs``; each of the four values is printed on
    its own line.
    """
    print(
        f"标准参数 p1: {p1}",
        f"标准参数 p2: {p2}",
        f"位置参数 *args: {args}",
        f"关键字参数 **kwargs: {kwargs}",
        sep="\n",
    )

master_function(1, 2, 'a', 'b', 'c', status="OK", user_id=123)
输出:
标准参数 p1: 1
标准参数 p2: 2
位置参数 *args: ('a', 'b', 'c')   # 不带关键字参数
关键字参数 **kwargs: {'status': 'OK', 'user_id': 123}

In [9]:
from datasets.features import Features


# Convert the pandas DataFrame directly into a Dataset.
# The keys of the Features mapping must match the DataFrame column names.
# `label` holds integer class ids in the CSV, so it is declared as int64 instead of being
# cast to string; `features` is passed by keyword to make the argument's role explicit.
dataset = Dataset.from_pandas(
    data,
    features=Features({
        'label': Value('int64'),
        'review': Value('string'),
    }),
)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7766
})

In [10]:
# List input must be a list of dicts so that every field has an explicit name
# (a bare list such as ["abc", "def"] does not name its field).
# A separate variable is used here so the DataFrame bound to `data` by the pandas cell
# is not overwritten, which keeps the from_pandas cell re-runnable.
records = [{"text": "abc"}, {"text": "def"}]
Dataset.from_list(records)

Dataset({
    features: ['text'],
    num_rows: 2
})

## 通过自定义加载脚本加载数据集

In [None]:
# Load a local JSON file through the "json" loader, passing field="data".
# NOTE(review): presumably `field` selects the top-level "data" key that holds the records — confirm.
load_dataset("json", data_files="./cmrc2018_trial.json", field="data")

In [None]:
# Load the train split by pointing load_dataset at the local loading script ./load_script.py.
# NOTE(review): script-based loading may need extra flags or be unavailable depending on the
# installed `datasets` version — confirm against the version in use.
dataset = load_dataset("./load_script.py", split="train")
dataset

In [None]:
# Show the example at index 0 of the script-loaded dataset.
dataset[0]

# Dataset with DataCollator

In [None]:
from transformers import  DataCollatorWithPadding

In [None]:
# Load the hotel-review CSV as the train split, then keep only rows whose "review" is not None.
dataset = load_dataset("csv", data_files="./ChnSentiCorp_htl_all.csv", split='train')
dataset = dataset.filter(lambda x: x["review"] is not None)
dataset

In [None]:
def process_function(examples):
    """Tokenize the "review" field and carry the "label" field over as "labels".

    Uses the notebook-level `tokenizer`; reviews are truncated to at most 128 tokens.

    Args:
        examples: mapping with "review" and "label" entries.

    Returns:
        The tokenizer output with an added "labels" entry copied from examples["label"].
    """
    encoded = tokenizer(examples["review"], truncation=True, max_length=128)
    encoded["labels"] = examples["label"]
    return encoded

In [None]:
# Tokenize the dataset in batches, passing its original column names as remove_columns.
tokenized_dataset = dataset.map(process_function, batched=True, remove_columns=dataset.column_names)
tokenized_dataset

In [None]:
# Print the first three tokenized examples.
print(tokenized_dataset[:3])

In [None]:
# Build a DataCollatorWithPadding around the tokenizer; it is used as the DataLoader's collate_fn.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Batch the tokenized dataset four examples at a time, shuffled, collating each batch with `collator`.
dl = DataLoader(tokenized_dataset, batch_size=4, collate_fn=collator, shuffle=True)

In [None]:
# Print the size of the "input_ids" tensor of each batch from `dl`, stopping after the 11th batch.
num = 0
for num, batch in enumerate(dl, start=1):
    print(batch["input_ids"].size())
    if num > 10:
        break