In [76]:
# Transformers installation
! pip install transformers datasets accelerate
#
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'



# Fine-tune a pretrained model

## 取得資料集 ChnSentiCorp_htl_all.csv

In [77]:
import os
import urllib.request

url="https://raw.githubusercontent.com/SophonPlus/ChineseNlpCorpus/master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv"
#設定儲存的檔案路徑及名稱
filepath="ChnSentiCorp_htl_all.csv"
# 判斷檔案是否存在，若不存在才下載
if not os.path.isfile(filepath):
    # 下載檔案
    result=urllib.request.urlretrieve(url,filepath)
    print('downloaded:',result)

### 讀取ChnSentiCorp_htl_all.csv資料集

In [78]:
import pandas as pd
pd_all = pd.read_csv('ChnSentiCorp_htl_all.csv')

print('評論數目（全部）：%d' % pd_all.shape[0])
print('評論數目（正面）：%d' % pd_all[pd_all.label==1].shape[0])
print('評論數目（負面）：%d' % pd_all[pd_all.label==0].shape[0])

評論數目（全部）：7766
評論數目（正面）：5322
評論數目（負面）：2444


### 修改欄位名稱

In [79]:
pd_all.rename(columns={"label":"labels", "review":"text"}, inplace=True)

In [80]:
pd_all.head()

Unnamed: 0,labels,text
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"


In [81]:
print(pd_all["text"].apply(type).value_counts())
print(pd_all["text"].isna().sum())
pd_all = pd_all.dropna(subset=["text"])

text
<class 'str'>      7765
<class 'float'>       1
Name: count, dtype: int64
1


In [82]:
print('評論數目（全部）：%d' % pd_all.shape[0])
print('評論數目（正面）：%d' % pd_all[pd_all.labels==1].shape[0])
print('評論數目（負面）：%d' % pd_all[pd_all.labels==0].shape[0])

評論數目（全部）：7765
評論數目（正面）：5322
評論數目（負面）：2443


### pandas 轉成 pytorch Dataset

In [83]:
from datasets import Dataset
#del ds_all
ds_all=Dataset.from_pandas(pd_all)

print(len(ds_all))
print(ds_all[100])

7765
{'labels': 1, 'text': '意外，入住时给了早餐券和一张50元餐券。用完自助早餐后，中午在旋转餐厅用了个行政套餐，还余3块钱，可惜不退给我。不错，酒店包两餐，社会主义啊。', '__index_level_0__': 100}


### 將資料分成訓練集和測試集

In [84]:
import torch
import datasets
ds_train, ds_test = torch.utils.data.random_split(ds_all, [4000, 3765])
ds_all1 = datasets.DatasetDict({
    "train": datasets.Dataset.from_pandas(pd.DataFrame(ds_all[ds_train.indices])),
    "test": datasets.Dataset.from_pandas(pd.DataFrame(ds_all[ds_test.indices]))
})

### 轉換資料集輸入資料

In [85]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

def tokenize_function(examples):
    return tokenizer(list(examples["text"]), padding="max_length", truncation=True)

tokenized_datasets = {x:ds_all1[x].map(tokenize_function, batched=True) for x in["train","test"]}

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

### 訓練集及驗證集個取出2000筆資料進行測試

In [86]:
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(200))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(2000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(2000))

In [87]:
s = small_train_dataset[0]
print(s.keys())
for key in s:
  print(key, s[key])

dict_keys(['labels', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'])
labels 1
text 总体还可以！但是网络太次了，对于商务出差的很受影响。公众网都很慢，别提公司的VPN帐户了，根本就是时断时续，而且只能收不能发！在大连头一次碰到这种事情！
__index_level_0__ 1719
input_ids [101, 2600, 860, 6820, 1377, 809, 8013, 852, 3221, 5381, 5317, 1922, 3613, 749, 8024, 2190, 754, 1555, 1218, 1139, 2345, 4638, 2523, 1358, 2512, 1510, 511, 1062, 830, 5381, 6963, 2523, 2714, 8024, 1166, 2990, 1062, 1385, 4638, 100, 2362, 2787, 749, 8024, 3418, 3315, 2218, 3221, 3198, 3171, 3198, 5330, 8024, 5445, 684, 1372, 5543, 3119, 679, 5543, 1355, 8013, 1762, 1920, 6825, 1928, 671, 3613, 4821, 1168, 6821, 4905, 752, 2658, 8013, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

<a id='trainer'></a>

## 訓練

In [88]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 訓練超參數

In [89]:
#from transformers import TrainingArguments

#training_args = TrainingArguments(output_dir="test_trainer")

In [90]:
! pip install datasets==2.0



### 評量標準

In [91]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

In [92]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [93]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")



### Trainer

In [94]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

使用train()函式訓練模型

In [95]:
import wandb
wandb.init(mode='disabled')

In [96]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.331306,0.8735
2,No log,0.362743,0.8775
3,No log,0.440067,0.8905


TrainOutput(global_step=75, training_loss=0.30368807474772136, metrics={'train_runtime': 227.1968, 'train_samples_per_second': 2.641, 'train_steps_per_second': 0.33, 'total_flos': 157866633216000.0, 'train_loss': 0.30368807474772136, 'epoch': 3.0})

### 儲存模型權重

In [97]:
torch.save(model.state_dict(), './my_model1')

### 還原模型

In [98]:
import torch
import transformers
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

# 選擇 GPU 或 CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 準備 tokenizer 及 模型架構
tokenizer1 = AutoTokenizer.from_pretrained('bert-base-chinese')
model1 = AutoModelForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=2)

# 載入模型權重
model1.load_state_dict(torch.load('./my_model1'))
model1.eval()
# 將模型搬到 GPU/CPU
model1.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model1.load_state_dict(torch.load('./my_model1'))


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [99]:
test_inputs = ["自然語言處理太難 眾多學生無奈 深夜崩潰暴哭 」"]

In [100]:
test_embeddings = tokenizer1(test_inputs, padding="max_length", truncation=True, return_tensors='pt').to(device)

In [101]:
out = model1(**test_embeddings)

In [102]:
output = '負面評論' if torch.argmax(out.logits) == 0 else '正面評論'

In [103]:
output, out

('負面評論',
 SequenceClassifierOutput(loss=None, logits=tensor([[ 1.0236, -1.9761]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None))

In [104]:
!pip install gradio



In [105]:
def test_model(test_inputs):
  test_embeddings = tokenizer1(test_inputs, padding="max_length", truncation=True, return_tensors='pt').to(device)
  out = model1(**test_embeddings)
  output = '這是負面評論' if torch.argmax(out.logits) == 0 else '這是正面評論'
  return output

In [106]:
import gradio as gr

iface = gr.Interface(
    fn=test_model,
    inputs="text",
    outputs="text"
)

iface.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://7317b024cacc1bd56d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://7317b024cacc1bd56d.gradio.live


