# 實作

## 管線方式

### 文本分類

In [None]:
from transformers import pipeline
nlp_sentence_classif= pipeline("sentiment-analysis") 	#自動載入模型
print(nlp_sentence_classif ("I like this book!"))		#呼叫模型進行處理

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9998674392700195}]


### 特徵提取

In [None]:
import numpy as np
nlp_features = pipeline('feature-extraction')
output = nlp_features(
           'Code Doctor Studio is a Chinese company based in BeiJing.')
print(np.array(output).shape)   #輸出特征形狀

No model was supplied, defaulted to distilbert-base-cased and revision 935ac13 (https://huggingface.co/distilbert-base-cased).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

(1, 16, 768)


### 完形填空

In [None]:
nlp_fill = pipeline("fill-mask")
print(nlp_fill.tokenizer.mask_token) #輸出遮蔽字元：'[MASK]'
#呼叫模型進行處理
print(nlp_fill(f"Li Jinhong wrote many {nlp_fill.tokenizer.mask_token} about artificial intelligence technology and helped many people."))	

No model was supplied, defaulted to distilroberta-base and revision ec58a5b (https://huggingface.co/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

<mask>
[{'score': 0.5444343090057373, 'token': 2799, 'token_str': ' books', 'sequence': 'Li Jinhong wrote many books about artificial intelligence technology and helped many people.'}, {'score': 0.3202725946903229, 'token': 7201, 'token_str': ' articles', 'sequence': 'Li Jinhong wrote many articles about artificial intelligence technology and helped many people.'}, {'score': 0.02494569681584835, 'token': 27616, 'token_str': ' essays', 'sequence': 'Li Jinhong wrote many essays about artificial intelligence technology and helped many people.'}, {'score': 0.021165847778320312, 'token': 6665, 'token_str': ' papers', 'sequence': 'Li Jinhong wrote many papers about artificial intelligence technology and helped many people.'}, {'score': 0.018288157880306244, 'token': 22064, 'token_str': ' blogs', 'sequence': 'Li Jinhong wrote many blogs about artificial intelligence technology and helped many people.'}]


### 閱讀理解

In [None]:
nlp_qa = pipeline("question-answering") 		#案例化模型
print(										#輸出模型處理結果
  nlp_qa(context='Code Doctor Studio is a Chinese company based in BeiJing.',
           question='Where is Code Doctor Studio?') )

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'score': 0.9549620151519775, 'start': 49, 'end': 56, 'answer': 'BeiJing'}


### 摘要生成

該管線的預設模型是"bert-large-cnn"，但Transformers函數庫中，還沒有TensorFlow版的BERT預先編譯模型，所以需要手動指定一個支援TensorFlow架構的摘要產生模型。這裡使用的[T5模型](https://cloud.tencent.com/developer/article/1537682)，是Text-to-Text模型

In [None]:
TEXT_TO_SUMMARIZE = '''
In this notebook we will be using the transformer model, first introduced in this paper. Specifically, we will be using the BERT (Bidirectional Encoder Representations from Transformers) model from this paper.
Transformer models are considerably larger than anything else covered in these tutorials. As such we are going to use the transformers library to get pre-trained transformers and use them as our embedding layers. We will freeze (not train) the transformer and only train the remainder of the model which learns from the representations produced by the transformer. In this case we will be using a multi-layer bi-directional GRU, however any model can learn from these representations.
'''
summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small",
                     framework="tf") # 使用TensorFlow框架
print(summarizer(TEXT_TO_SUMMARIZE,min_length=5, max_length=150))

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Your max_length is set to 150, but you input_length is only 149. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=74)


[{'summary_text': 'in this notebook we will be using the transformer model, first introduced in this paper . we will use the transformers library to get pre-trained transformers .'}]


### 手動加載模型

在pipeline類的初始化接口中，可以直接指定加載模型的路徑，從本地預訓練模型文件進行載入。

但有一個前提條件: 所要載入的預訓練模型文件，必須使用固定的文件名稱。

在pipeline類接口中，預訓練模型文件，是以套為單位的。每套預訓練模型文件的組成，及其固定的文件名稱如下:
* 詞表文件: 以.txt、.model或.json為擴展名，存放模型中使用的詞表文件
* 詞表擴展文件(可選): 以.txt為擴展名，補充原有的詞表文件
* 配置文件: 以.json為擴展名，存放模型的超參數配置
* 權重文件: 以.h5為擴展名，存放模型中，各個參數的具體值

In [None]:
from transformers.models.auto.processing_auto import AutoTokenizer, AutoConfig
# 指定NLP任務對應的字符串
config = AutoConfig.from_pretrained(r'./t5-small/t5-small-config.json')
tokenizer = AutoTokenizer.from_pretrained(r'./t5-small', config=config) # 指定資料夾，會自動從資料夾載入所需的檔案

# 指定本地模型
nlp_sentence_classif = pipeline('summarization', # 任務
                model = r'./t5-small/t5-small-tf_model.h5', # 加載的模型
                config = config,
                tokenizer = tokenizer 
                                )

### 實體詞識別

In [None]:
nlp_token_class = pipeline("ner")
print(nlp_token_class(
        'Code Doctor Studio is a Chinese company based in BeiJing.'))

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/998 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

[{'entity': 'I-ORG', 'score': 0.99824834, 'index': 1, 'word': 'Code', 'start': 0, 'end': 4}, {'entity': 'I-ORG', 'score': 0.9986155, 'index': 2, 'word': 'Doctor', 'start': 5, 'end': 11}, {'entity': 'I-ORG', 'score': 0.99831945, 'index': 3, 'word': 'Studio', 'start': 12, 'end': 18}, {'entity': 'I-MISC', 'score': 0.99704957, 'index': 6, 'word': 'Chinese', 'start': 24, 'end': 31}, {'entity': 'I-LOC', 'score': 0.9869552, 'index': 10, 'word': 'Be', 'start': 49, 'end': 51}, {'entity': 'I-LOC', 'score': 0.96116525, 'index': 11, 'word': '##i', 'start': 51, 'end': 52}, {'entity': 'I-LOC', 'score': 0.9435933, 'index': 12, 'word': '##J', 'start': 52, 'end': 53}, {'entity': 'I-LOC', 'score': 0.95518416, 'index': 13, 'word': '##ing', 'start': 53, 'end': 56}]


## 用BERT實現完形填空

### 載入詞表

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, BertForMaskedLM, TFAutoModelWithLMHead

#載入預訓練模型 tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
#輸入文字
text = "[CLS] Who is Li Jinhong ? [SEP] Li Jinhong is a programmer [SEP]"
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

['[CLS]', 'Who', 'is', 'Li', 'Jin', '##hong', '?', '[SEP]', 'Li', 'Jin', '##hong', 'is', 'a', 'programmer', '[SEP]']


### 遮蔽單詞

In [None]:
masked_index = 8 #遮罩一個標示，用' BertForMaskedLM '預測回來
tokenized_text[masked_index] = '[MASK]'
print(tokenized_text)

['[CLS]', 'Who', 'is', 'Li', 'Jin', '##hong', '?', '[SEP]', '[MASK]', 'Jin', '##hong', 'is', 'a', 'programmer', '[SEP]']


In [None]:
# 將標示轉為詞彙表索引
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# 將輸入轉為張量
tokens_tensor = tf.constant([indexed_tokens])
print(tokens_tensor)

tf.Tensor(
[[  101  2627  1110  5255 10922 15564   136   102   103 10922 15564  1110
    170 23981   102]], shape=(1, 15), dtype=int32)


### 加載預訓練模型

In [None]:
# 載入預訓練模型 (weights)
model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased')

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


### 進行預測

In [None]:
# 段標記索引，標記輸入文本中的第一句和第二句。
# 0對應第一句，1對應第二句
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] # 第一句有8個單詞，第二句有7個單詞
segments_tensors = tf.constant([segments_ids])

In [None]:
# 預測所有的tokens
# output = model(tokens_tensor)
outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    
predictions = outputs[0]  #[1, 19, 30522]，30522是詞表中，詞的個數。輸出的結果，表示詞表中，每個單字在句子中，可能出現的機率

predicted_index = tf.argmax(predictions[0, masked_index]) # 取出[MASK]對應的預測索引值
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] #轉成單字
print('Predicted token is:',predicted_token)

Predicted token is: Li


## 使用GPT2進行文本生成

我們使用Transformers庫中的[GPT-2](https://www.gushiciku.cn/pl/gade/zh-tw)模型，實現下一詞的預測功能，通過循環生成下一詞，實現將一句話補充完整。

### 載入詞表

In [None]:
# 安裝transformers

!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 5.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 60.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 51.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0


In [None]:
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

# 載入預訓練模型（權重）
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
#解碼輸入
indexed_tokens = tokenizer.encode("Who is Li Jinhong ? Li Jinhong is a")

print( tokenizer.decode(indexed_tokens))

Who is Li Jinhong? Li Jinhong is a


In [None]:
indexed_tokens

[8241, 318, 7455, 17297, 71, 506, 5633, 7455, 17297, 71, 506, 318, 257]

In [None]:
tokens_tensor = tf.constant([indexed_tokens])#轉為張量
print(tokens_tensor)

tf.Tensor(
[[ 8241   318  7455 17297    71   506  5633  7455 17297    71   506   318
    257]], shape=(1, 13), dtype=int32)


### 加載預訓練模型

In [None]:
# 載入預訓練模型（權重）
model = TFGPT2LMHeadModel.from_pretrained('gpt2')

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
model

<transformers.models.gpt2.modeling_tf_gpt2.TFGPT2LMHeadModel at 0x7f1d05bc2890>

In [None]:
# 預測所有標示

outputs = model(tokens_tensor)
predictions = outputs[0]#(1, 13, 50257)
predictions.shape

TensorShape([1, 13, 50257])

In [None]:
# 得到預測的下一詞
predicted_index = tf.argmax(predictions[0, -1, :])
predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
print(predicted_text)

Who is Li Jinhong? Li Jinhong is a young


### 生成句子

In [None]:
#產生一段完整的話
stopids = tokenizer.convert_tokens_to_ids(["."])[0]  # 定義結束字符
past_key_values = None # 定義模型參數
for i in range(100):

    output = model(tokens_tensor, past_key_values=past_key_values) # 預測下一詞。模型開啟連續預測狀態
    token = tf.argmax(output.logits[..., -1, :],axis= -1)

    past_key_values = output.past_key_values
    indexed_tokens += token.numpy().tolist()

    if stopids== token.numpy()[0]:
        break
    tokens_tensor = token[None,:] #增加一個維度
    
sequence = tokenizer.decode(indexed_tokens) # 進行字符串解碼

print(sequence)

Who is Li Jinhong? Li Jinhong is a young man who is a member of the Li Clan.


GPT模型的使用範例，也可以參考[Kaggle](https://www.kaggle.com/code/vimalpillai/finetuning-gpt2-model-tensorflow)。

## 遷移BERT對中文分類

Transformers函數庫中，提供了大量的預訓練模型，這些模型都是在通用資料集中，訓練出來的。它們並不能適用於實際工作中的NLP任務。

如果要根據自己的文字資料，來訓練模型，則還需要用"遷移學習"的方式，對預訓練模型，進行微調。

### 載入套件

In [None]:
import tensorflow as tf
from transformers import (
        BertTokenizer,
        TFAutoModelForSequenceClassification,
        AutoConfig
        )
import os

### 樣本資料

本例的數據集包含從[THUCNews](https://github.com/649453932/Chinese-Text-Classification-Pytorch/tree/master/THUCNews/data)數據集中，隨機抽取的20萬條新聞標題，每個樣本長度為20-30，一共10個類別，每類2萬條新聞標題。

* 類別被放在class.txt中
* 訓練數據集: 18萬條，放在train.txt中
* 測試數據集: 1萬條，放在test.txt中
* 驗證數據集: 1萬條，放在dec.txt中

每條樣本，分為兩個部分:
* 文本字符串
* 所屬的類別標籤索引

其中的類別標籤索引，對應於class.txt中的類別順序。

In [None]:
# 加載數據集
!git clone https://github.com/649453932/Chinese-Text-Classification-Pytorch.git

Cloning into 'Chinese-Text-Classification-Pytorch'...
remote: Enumerating objects: 215, done.[K
remote: Total 215 (delta 0), reused 0 (delta 0), pack-reused 215[K
Receiving objects: 100% (215/215), 42.09 MiB | 40.05 MiB/s, done.
Resolving deltas: 100% (118/118), done.


In [None]:
# 變更預設路徑
%cd ./Chinese-Text-Classification-Pytorch

/content/Chinese-Text-Classification-Pytorch


In [None]:
# 加載類別名稱
data_dir='./THUCNews/data' #定義資料集根目錄

class_list = [x.strip() for x in open(
        os.path.join(data_dir, "class.txt")).readlines()]
len(class_list)

10

### 加載預訓練模型

In [None]:
# tokenizer = BertTokenizer.from_pretrained(r'./bert-base-chinese/bert-base-chinese-vocab.txt')
pretrained_weights = 'bert-base-chinese' # 建立模型
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)

#定義指定分類別的組態檔
# config = AutoConfig.from_pretrained(
#         r'./bert-base-chinese/bert-base-chinese-config.json',num_labels=len(class_list)) 

config = AutoConfig.from_pretrained(pretrained_weights,num_labels=len(class_list)) 
#起始化模型，單獨指定config，在config中指定分類別個數
# model = TFAutoModelForSequenceClassification.from_pretrained(
#        r'./bert-base-chinese/bert-base-chinese-tf_model.h5',
#         config=config)
model = TFAutoModelForSequenceClassification.from_pretrained(pretrained_weights,config=config)

Downloading:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 建立資料集

In [None]:
def read_file(path): #讀取資料集檔案內容
    with open(path, 'r', encoding="UTF-8") as file:
        docus = file.readlines()
        newDocus = []
        labs = []
        for data in docus:
            content, label = data.split('\t')
            label = int(label)
            newDocus.append(content)
            labs.append(label)
            
    ids = tokenizer.batch_encode_plus( newDocus,
                #！！！！！模型的組態檔中就是512，當有超過這個長度的會顯示出錯
                max_length=model.config.max_position_embeddings,  
                pad_to_max_length=True)#,return_tensors='tf')#沒有return_tensors會傳回list！！！！
  
    return (ids["input_ids"],ids["attention_mask"],labs)

在使用詞表工具時，指定模型的設定檔中的最大長度model.config.max_position_embeddings。在本實例中，該長度為512，當輸入文字大於這個長度，則會被自動截斷。

In [None]:
#獲得訓練集和測試集

trainContent = read_file(os.path.join(data_dir, "train.txt")) 
testContent = read_file(os.path.join(data_dir, "test.txt"))
len(trainContent)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


3

In [None]:
def getdataset(features): #定義函數，封裝資料集
    
    def gen():              #定義產生器
        for ex in zip(features[0],features[1],features[2]):
            yield (
                {
                    "input_ids": ex[0],
                    "attention_mask": ex[1],
                },
                ex[2],
            )  
      
    return tf.data.Dataset.from_generator( #傳回資料集
                gen,
                ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
                (
                    {
                        "input_ids": tf.TensorShape([None]),
                        "attention_mask": tf.TensorShape([None]),
                    },
                    tf.TensorShape([]),
                ),
            ) 

In [None]:
#製作資料集    
valid_dataset = getdataset(testContent) 
train_dataset = getdataset(trainContent) 
#設定批次
train_dataset = train_dataset.shuffle(100).batch(8).repeat(2)
valid_dataset = valid_dataset.batch(16)

In [None]:
for count_batch in train_dataset.take(2):
    print(count_batch)

({'input_ids': <tf.Tensor: shape=(8, 512), dtype=int32, numpy=
array([[ 101, 1957, 2094, ...,    0,    0,    0],
       [ 101, 2126, 5661, ...,    0,    0,    0],
       [ 101, 2548, 2054, ...,    0,    0,    0],
       ...,
       [ 101,  915, 5384, ...,    0,    0,    0],
       [ 101, 7032, 6395, ...,    0,    0,    0],
       [ 101, 1101, 2548, ...,    0,    0,    0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(8, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}, <tf.Tensor: shape=(8,), dtype=int64, numpy=array([5, 4, 7, 9, 6, 6, 2, 9])>)
({'input_ids': <tf.Tensor: shape=(8, 512), dtype=int32, numpy=
array([[ 101, 3119, 4669, ...,    0,    0,    0],
       [ 101,  517, 1506, ...,    0,    0,    0],
       [ 101, 3862, 3895, ...,    0,    0,    0],
       ...,
       [ 101,  517, 

### 訓練模型

In [None]:
#定義改善器
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) # clipnorm限制傳播過程中，梯度的變化範圍
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

請開啟大量RAM。

In [None]:
#訓練模型
history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
                    validation_data=valid_dataset, validation_steps=7)

#儲存模型
savedir = r'./myfinetun-bert_chinese/'
os.makedirs(savedir, exist_ok=True)
model.save_pretrained(savedir)

Epoch 1/2
Epoch 2/2
