## 雲端硬碟設定指向

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so the files stored there are reachable from this runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# List the contents of the mounted drive folder to confirm the mount is visible.
!ls 'drive'

'My Drive'  'Shared drives'


In [3]:
import os
# Work from the project folder on Drive: later cells use paths relative to it
# (e.g. 'trained_model4/...', './data', 'albert/...').
path = "/content/drive/My Drive/Colab Notebooks/Bert/AML/"
os.chdir(path)

In [4]:
# List the current working directory (the project folder after the os.chdir call).
!ls

albert	data.zip   trained_model   trained_model4  名字分類.ipynb
data	NER.ipynb  trained_model2  分類250.ipynb   名字預測.ipynb


In [5]:
import os
# Restrict CUDA to device "0"; the ckiptagger GPU instructions further down ask for this variable to be set.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

## 下載Albert並放在albert目錄

In [6]:
# Clone the ALBERT-zh PyTorch port into ./albert; use_model() imports `albert.albert_zh` from this folder.
# When ./albert already exists and is not empty, git prints "fatal: destination path 'albert' already exists"
# and does not clone again.
!git clone https://github.com/harry83528/albert-zh-for-pytorch-transformers.git albert

fatal: destination path 'albert' already exists and is not an empty directory.


In [7]:
import torch
from torch.utils.data import TensorDataset
import pickle

In [8]:
import sys

# Make the current directory importable (needed for `albert.albert_zh`).
# Guarded so that re-running this cell does not pile up duplicate '.' entries.
if '.' not in sys.path:
    sys.path.append('.')
sys.path

['',
 '/env/python',
 '/usr/lib/python36.zip',
 '/usr/lib/python3.6',
 '/usr/lib/python3.6/lib-dynload',
 '/usr/local/lib/python3.6/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.6/dist-packages/IPython/extensions',
 '/root/.ipython',
 '.']

## 定義函式-選擇模型並加載設定

In [9]:
def use_model(model_name, config_file_path, model_file_path, vocab_file_path, num_labels):
    """Load a sequence-classification model together with its tokenizer.

    Args:
        model_name: Which model family to load; ``'bert'`` or ``'albert'``.
        config_file_path: Path (or name) passed to ``<Config>.from_pretrained``.
        model_file_path: Path (or name) of the weights passed to
            ``<Model>.from_pretrained``.
        vocab_file_path: Vocabulary file used to build the tokenizer.
        num_labels: Number of output classes of the classification head.

    Returns:
        tuple: ``(model, tokenizer)``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported families.
    """
    # The heavy third-party imports stay inside the branches so that only the
    # library for the requested family has to be installed.
    if model_name == 'bert':
        from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
        config = BertConfig.from_pretrained(config_file_path, num_labels=num_labels)
        model = BertForSequenceClassification.from_pretrained(
            model_file_path,
            # Treat the weights as a TensorFlow checkpoint only when the path says so.
            # (The previous code tested the constant 'bert-base-chinese', which is always False.)
            from_tf='.ckpt' in str(model_file_path),
            config=config,
        )
        tokenizer = BertTokenizer(vocab_file=vocab_file_path)
        return model, tokenizer

    if model_name == 'albert':
        from albert.albert_zh import AlbertConfig, AlbertTokenizer, AlbertForSequenceClassification
        config = AlbertConfig.from_pretrained(config_file_path, num_labels=num_labels)
        model = AlbertForSequenceClassification.from_pretrained(model_file_path, config=config)
        tokenizer = AlbertTokenizer.from_pretrained(vocab_file_path)
        return model, tokenizer

    # Fail loudly instead of returning None, which would only surface later as a
    # confusing "cannot unpack" error at the call site.
    raise ValueError(
        "Unsupported model_name {!r}; expected 'bert' or 'albert'".format(model_name)
    )

## 定義函式-準確率計算

In [10]:
def compute_accuracy(y_pred, y_target):
    """Return the percentage (0-100) of predictions that match the targets.

    Args:
        y_pred: Score tensor; the predicted class of each row is the index of
            its maximum along dim 1 (presumably shaped (batch, num_classes)).
        y_target: Tensor of expected class indices, one per row of ``y_pred``.

    Returns:
        float: ``100 * correct / total``.
    """
    predicted_labels = y_pred.max(dim=1)[1]  # [1] -> indices of the row maxima
    hits = predicted_labels.eq(y_target).sum().item()
    total = len(predicted_labels)
    return hits / total * 100

## 定義函式-快速建立BERT INPUT( tokenizer.build_inputs_with_special_tokens() )

In [11]:
def to_bert_ids(tokenizer,q_input):
    """Turn raw text into the id sequence the model expects.

    The text is tokenized, each token is mapped to its vocabulary id, and the
    tokenizer then wraps the ids with whatever special tokens it adds in
    ``build_inputs_with_special_tokens``.

    Args:
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids`` and
            ``build_inputs_with_special_tokens``.
        q_input: The text to encode.

    Returns:
        The value returned by ``tokenizer.build_inputs_with_special_tokens``.
    """
    tokens = tokenizer.tokenize(q_input)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return tokenizer.build_inputs_with_special_tokens(token_ids)

## 定義函式-轉換成tensor格式,並且建立dataset

In [12]:
# Convert the feature lists to tensors and bundle them into a dataset
def make_dataset(input_ids, input_masks, input_segment_ids, answer_lables):
    """Build a ``TensorDataset`` from four parallel feature collections.

    Each argument is materialised as a ``torch.long`` tensor, in the order
    ids, masks, segment ids, labels; that is also the order of the fields in
    every dataset item.

    Args:
        input_ids: Iterable of token-id sequences.
        input_masks: Iterable of mask sequences.
        input_segment_ids: Iterable of segment-id sequences.
        answer_lables: Iterable of integer class labels.

    Returns:
        TensorDataset: Items are ``(ids, mask, segment_ids, label)`` tuples.
    """
    columns = (input_ids, input_masks, input_segment_ids, answer_lables)
    as_tensors = [torch.tensor(list(column), dtype=torch.long) for column in columns]
    return TensorDataset(*as_tensors)

In [13]:
from torch.utils.data import DataLoader
import torch

In [14]:
class DataDic(object):
    """Two-way lookup between answer labels and integer class ids.

    Ids are the positions of the distinct labels in sorted order. ``None``
    entries in ``answers`` are ignored when the lookup is built.

    The lookup is stored in ``ans_list`` and the query methods read only that
    attribute. Unpickling restores attributes without calling ``__init__``,
    so this keeps the methods working on instances that were pickled earlier
    (a later cell loads an ``answer_dic`` from a pickle and calls
    ``to_text`` on it — presumably an instance of this class).
    """

    def __init__(self, answers):
        self.answers = answers  # every answer, duplicates included
        # Drop None *before* sorting: sorting None together with real labels
        # raises TypeError on Python 3, so a guard applied after the sort
        # could never take effect.
        self.answers_norepeat = sorted(a for a in set(answers) if a is not None)  # distinct labels
        self.answers_types = len(self.answers_norepeat)  # number of classes
        self.ans_list = []  # (id, label) pairs used by to_id / to_text
        self._make_dic()  # fill ans_list

    def _make_dic(self):
        """(Re)build ``ans_list`` as ``(id, label)`` pairs from the sorted distinct labels."""
        self.ans_list = [
            (index_a, a)
            for index_a, a in enumerate(self.answers_norepeat)
            if a is not None
        ]

    def to_id(self,text):
        """Return the id of label ``text``, or ``None`` if the label is unknown."""
        return next((ans_id for ans_id, ans_text in self.ans_list if ans_text == text), None)

    def to_text(self,id):
        """Return the label with class id ``id``, or ``None`` if the id is unknown."""
        return next((ans_text for ans_id, ans_text in self.ans_list if ans_id == id), None)

    @property
    def types(self):
        """Number of distinct classes."""
        return self.answers_types

    @property
    def data(self):
        """The answers exactly as passed to the constructor."""
        return self.answers

    def __len__(self):
        """Number of answers passed to the constructor (duplicates included)."""
        return len(self.answers)

In [15]:
# Load the pickled feature bundle and pull out its answer dictionary.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you created yourself.
# NOTE(review): 'answer_dic' is presumably a DataDic instance, in which case the DataDic class
# must already be defined when this cell runs — confirm.
with open('trained_model4/data_features.pkl', 'rb') as pkl_file:  # `with` closes the file handle
    data_features = pickle.load(pkl_file)
answer_dic = data_features['answer_dic']

In [16]:
# Install ckiptagger with its `tfgpu` and `gdown` extras. The version is not pinned;
# the log below shows this run resolved ckiptagger 0.1.1 with tensorflow-gpu 1.15.3.
!pip install -U ckiptagger[tfgpu,gdown]

Collecting ckiptagger[gdown,tfgpu]
  Downloading https://files.pythonhosted.org/packages/5d/24/9ee7289b423345bc1705453437d9b0d9e93a015fbc00885c6033c2f50fab/ckiptagger-0.1.1-py3-none-any.whl
Collecting tensorflow-gpu<2,>=1.13.1; extra == "tfgpu"
[?25l  Downloading https://files.pythonhosted.org/packages/98/ab/19aba3629427c2d96790f73838639136ce02b6e7e1c4f2dd60149174c794/tensorflow_gpu-1.15.3-cp36-cp36m-manylinux2010_x86_64.whl (411.0MB)
[K     |████████████████████████████████| 411.0MB 38kB/s 
Collecting gast==0.2.2
  Downloading https://files.pythonhosted.org/packages/4e/35/11749bf99b2d4e3cceb4d55ca22590b0d7c2c62b9de38ac4a4a7f4687421/gast-0.2.2.tar.gz
Collecting tensorflow-estimator==1.15.1
[?25l  Downloading https://files.pythonhosted.org/packages/de/62/2ee9cd74c9fa2fa450877847ba560b260f5d0fb70ee0595203082dafcc9d/tensorflow_estimator-1.15.1-py2.py3-none-any.whl (503kB)
[K     |████████████████████████████████| 512kB 35.0MB/s 
Collecting tensorboard<1.16.0,>=1.15.0
[?25l  Downloadi

In [17]:
from ckiptagger import data_utils, construct_dictionary, WS, POS, NER

In [18]:
# To run on GPU:
#    1. Install tensorflow-gpu (see the installation notes)
#    2. Set the CUDA_VISIBLE_DEVICES environment variable, e.g. os.environ["CUDA_VISIBLE_DEVICES"] = "0"
#    3. Pass disable_cuda=False, e.g. ws = WS("./data", disable_cuda=False)
# NOTE(review): the heading at this point originally read "To run on CPU:", but every call below
# passes disable_cuda=False, which per step 3 is the GPU setting — confirm which one is intended.
# Create the WS, POS and NER objects from the "./data" folder; get_person_str() uses all three.
ws = WS("./data", disable_cuda=False)
pos = POS("./data", disable_cuda=False)
ner = NER("./data", disable_cuda=False)

In [19]:
def get_person_str(context):
    """Extract the distinct person names mentioned in ``context``.

    Runs the module-level ckiptagger objects in sequence (``ws`` -> ``pos`` ->
    ``ner``) and keeps the entities whose type field is ``'PERSON'``.

    Args:
        context: The text to analyse, passed to the pipeline as one sentence.

    Returns:
        str: The names joined with ``","`` (spaces inside a name removed), each
        name once, in the sorted order of the entity tuples so the result is
        the same on every run. An empty string when no person is found.
    """
    sentence_list = [context]

    # ws() is called with its defaults. Optional arguments deliberately left unset:
    # sentence_segmentation, segment_delimiter_set, recommend_dictionary, coerce_dictionary.
    word_sentence_list = ws(sentence_list)
    pos_sentence_list = pos(word_sentence_list)
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)

    names = []
    seen = set()
    # One entity collection per input sentence is expected — TODO confirm against ckiptagger docs.
    for entities in entity_sentence_list:
        # Each entity is indexed as entity[2] = type and entity[3] = text.
        for entity in sorted(entities):
            if entity[2] != 'PERSON':
                continue
            name = entity[3].replace(" ", "")
            # De-duplicate with a list + seen-set instead of joining a bare set,
            # whose iteration order for strings changes between interpreter runs.
            if name not in seen:
                seen.add(name)
                names.append(name)
    return ','.join(names)

In [22]:
context="〔記者楊國文／台北報導〕「台灣第一家」有限公司 ，被查出將砷含量超標的工業用碳酸鎂摻入胡椒粉、椒鹽粉等產品出售給超市、雜貨店，並供旗下「台灣第一家鹽酥雞」店面使用，一審依違反食安法的「食品添加物有毒而加工罪」，輕判創辦人陳廷智之子、總經理陳星佑2年徒刑，陳廷智長女即副總陳鏡如1年，僅沒收1588萬犯罪所得；高等法院則痛斥陳星佑、陳鏡如兩人犯罪近兩年，無悔意，加重分別改判為2年半、2年，並將銷售額1億1595萬元均視為犯罪所得須沒收，陳廷智仍無罪。可上訴。判決指出，陳廷智2001年間就已將經營權交給子女，未實際管事，只將胡椒粉配方傳給兒子陳星佑，兒子掌握產品生產業務、長女陳鏡如負責財務及會計事務。請繼續往下閱讀...2007年間，陳星佑得知碳酸鎂可避免胡椒粉受潮，一公斤食用碳酸鎂約220元、工業用碳酸鎂約46至53元，成本有3到4倍落差，於是向純佳公司購買允成化工生產的「允成鹽基性碳酸鎂A-102碳酸鎂」，貨品上明明註明「ForIndustrialUseOnly（僅限工業使用）」，仍然添加。2014年食安法修法後，陳星佑仍以1比20比例，把工業用碳酸鎂加入胡椒粉、椒鹽粉等產品出售給超市、雜貨店，另供旗下「台灣第一家鹽酥雞」店面使用。新北市政府衛生局將查扣的碳酸鎂送驗，發現砷含量達「7.69ppm、7.83ppm」超過4ppm標準，屬有毒食品添加物。新北檢方起訴認定，台灣第一家不法所得達1億多元，但新北地院認定有落差，從食安法2014年修法後算起，共1588萬餘元。高等法院則認定，陳星佑、陳鏡如犯罪時間近兩年，所販售椒盬粉等16項食品總售額高達1億1595萬多元都是犯罪所得，因此均須沒收， 或追徵。"
return_value=get_person_str(context)
# get_person_str() returns "" when no person is found, and "".split(",") yields [''];
# drop empty pieces so the list only ever contains real names.
return_value_list=[name for name in return_value.split(",") if name]
print(type(return_value_list),return_value_list)

<class 'list'> ['陳星佑', '陳廷智', '陳鏡如', '楊國文']


In [26]:
# Settings for the fine-tuned ALBERT classifier stored in trained_model4/.
model_setting2 = {
    "model_name":"albert",
    "config_file_path":"trained_model4/config.json",
    "model_file_path":"trained_model4/pytorch_model.bin",
    "vocab_file_path":"albert/albert_tiny/vocab.txt",
    "num_labels":2 # number of classes
}
model, tokenizer = use_model(**model_setting2)
model.eval()  # switch the model to evaluation mode before predicting

# To classify every name extracted earlier instead, use: q_inputs = return_value_list
q_inputs = ['徐金龍']
for q_input in q_inputs:
    bert_ids = to_bert_ids(tokenizer,q_input)
    print('len(bert_ids)',len(bert_ids))
    # NOTE(review): 512 is presumably the model's maximum sequence length — confirm against the config.
    assert len(bert_ids) <= 512
    input_ids = torch.LongTensor(bert_ids).unsqueeze(0)  # add a leading batch dimension of size 1

    # predict
    # Inference only: no_grad() avoids building the autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(input_ids)
    predicts = outputs[0]  # first element of the model output holds the class scores
    # Index of the highest score for the single item in the batch
    # (on ties argmax returns the first maximum, like the previous nonzero()-based lookup).
    label = torch.argmax(predicts, dim=1).item()
    ans_label = answer_dic.to_text(label)

    print(q_input)
    print(ans_label)
    print()

len(bert_ids) 5
徐金龍
0

