# 环境安装

## 查看torch版本

In [1]:
import torch
import torchvision
import torchaudio

# 打印版本信息
print("torch 版本:", torch.__version__)
print("torchvision 版本:", torchvision.__version__)
print("torchaudio 版本:", torchaudio.__version__)

# 检查 CUDA 是否可用
print("CUDA 是否可用:", torch.cuda.is_available())

# 如果 CUDA 可用，打印当前 CUDA 版本
if torch.cuda.is_available():
    print("CUDA 版本:", torch.version.cuda)

torch 版本: 2.3.1+cu118
torchvision 版本: 0.18.1+cu118
torchaudio 版本: 2.3.1+cu118
CUDA 是否可用: True
CUDA 版本: 11.8


## 导入必要库

In [2]:
import transformers
import torchtext
import pandas
import sklearn
import logging
import random
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import os

print(torchtext.__version__)
print(transformers.__version__)
print(pandas.__version__)
print(sklearn.__version__)

  from .autonotebook import tqdm as notebook_tqdm


0.18.0+cpu
4.49.0
2.2.3
1.6.1


# 数据预处理和加载

## 数据读取函数

In [3]:
def read_wiki2(file_path=None, seps='.'):
    """
    本函数的作用是格式化原始的wikitext-2数据集
    下载地址为：https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
    :param filepath:
    :return: 最终的返回形式为一个二维list，外层list中的每个元素为一个段落；内层list中每个元素为一个段落所有句子的集合。
            [ [sentence 1, sentence 2, ...], [sentence 1, sentence 2,...],...,[] ]
    该返回结果也是一个标准的格式，后续若需要载入其它数据集（包括中文），只需要首先将数据集处理成这样的格式；
    并在类LoadBertPretrainingDataset的get_format_data()方法中加入所实现的预处理函数即可完成整个预训练数据集的构造。
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()   # 一次读取所有行，每一行为一个段落
    # 1. 大写字母转换为小写字母
    # 2. 只取每一个段落中有至少两句话的段，因为后续要构造next sentence
    paragraphs = []
    for line in tqdm(lines, ncols=80, desc=" ## 正在读取原始数据"):
        if len(line.split(' . ')) < 2:
            continue
        line = line.strip() # 去掉首尾空格换行符
        paragraphs.append([line[0]])   # 将每一行的第一个字符作为列表添加
        for w in line[1:]:
            if paragraphs[-1][-1][-1] in seps:  # 查看每一句话的最后一个字符是否为'.', 是句号则新加一句话
                paragraphs[-1].append(w)
            else:
                paragraphs[-1][-1] += w     # 不是句号，则在末尾继续添加字符
    random.shuffle(paragraphs)  # 将所有段落打乱
    return paragraphs

In [7]:
with open('./data/WikiText/wiki.train.tokens') as f:
    lines = f.readlines()

paragraphs = []
line = lines[4]
line = line.strip()
print(line)

wiki_texts = read_wiki2('./data/WikiText/wiki.train.tokens')
wiki_texts[0]   # 包含不同句的段落

The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more <unk> for series newcomers . Character designer <unk> Honjou and composer Hitoshi Sakimoto both returned from previous entries , along with Valkyria Chronicles II director Takeshi Ozawa . A large team of writers handled the script . The game 's opening theme was sung by May 'n .


 ## 正在读取原始数据: 100%|████████████| 36718/36718 [00:01<00:00, 27994.40it/s]


['Cultural attitudes toward gender roles , contraception , and sexual activity vary greatly around the world , and range from extremely conservative to extremely liberal .',
 ' But in places where condoms are misunderstood , <unk> , <unk> , or looked upon with overall cultural disapproval , the prevalence of condom use is directly affected .',
 ' In less @-@ developed countries and among less @-@ educated populations , <unk> about how disease transmission and conception work negatively affect the use of condoms ; additionally , in cultures with more traditional gender roles , women may feel uncomfortable demanding that their partners use condoms .']

In [10]:
def read_songci(file_path=None, seps='。'):
    """
    本函数的作用是格式化原始的ci.song.xxx.json数据集
    下载地址为：https://github.com/chinese-poetry/chinese-poetry
    掌柜在此感谢该仓库的作者维护与整理
    :param filepath:
    :return: 返回和 read_wiki2() 一样形式的结果
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()   # 一次性读取所有行，每一行为一首词
    paragraphs = []
    for line in tqdm(lines, ncols=80, desc="## 正在读取原始数据"):
        if "□" in line or "……" in line or len(line.split('。')) < 2:
            continue
        paragraphs.append([line[0]])
        line = line.strip()  # 去掉换行符和两边的空格
        for w in line[1:]:
            if paragraphs[-1][-1][-1] in seps:
                paragraphs[-1].append(w)
            else:
                paragraphs[-1][-1] += w
    random.shuffle(paragraphs)  # 将所有段落打乱
    return paragraphs

In [13]:
with open('./data/SongCi/ci.song.0.json') as f:
    lines = f.readlines()

paragraphs = []
line = lines[4]
line = line.strip()
print(line)

songci_texts = read_songci('./data/SongCi/ci.song.0.json')
songci_texts[1]   # 包含不同句的段落

"气和玉烛，睿化著鸿明。",


## 正在读取原始数据: 100%|████████████| 14601/14601 [00:00<00:00, 585999.34it/s]


[' 免使年少，光阴虚过。', '"']