In [None]:
# Mount Google Drive at /content/drive; the later cells read data and the
# model checkpoint from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import codecs
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

max_length=128 # maximum sentence length, used for padding/truncation (max=512)

In [None]:
!pip install transformers

In [None]:
from transformers import (
  AutoTokenizer,
  TFAutoModelForSequenceClassification,
)

In [None]:
# Convert the comment spreadsheet on Drive into a JSON file stored next to it.
df = pd.read_excel('/content/drive/MyDrive/comment_6k.xlsx')
# orient='records' serialises the frame as a list with one object per row.
json_data = df.to_json(orient='records')
with open('/content/drive/MyDrive/comment_6k.json', 'w') as json_file:
    json_file.write(json_data)

In [None]:
# Load the sequence-classification model from the checkpoint directory on Drive.
trained_model = TFAutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/JJ')

In [None]:
# Tokenizer for the stock 'bert-base-chinese' checkpoint.
# NOTE(review): assumes the model loaded from MyDrive/JJ shares this vocabulary — confirm.
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

In [None]:
!pip install iNLP
from inlp.convert import chinese # Simplified-to-Traditional Chinese conversion package

In [None]:
def get_data(pos_path='/content/drive/MyDrive/02NLP/pos.txt',
             neg_path='/content/drive/MyDrive/02NLP/neg.txt',
             convert=None):
    '''
    Load the positive and negative corpora into one labelled DataFrame.

    Every line of a corpus file is one example. Leading/trailing whitespace is
    stripped and the remaining text is passed through ``convert``.

    :param pos_path: UTF-8 text file holding the positive examples (label 1).
    :param neg_path: UTF-8 text file holding the negative examples (label 0).
    :param convert: callable applied to each stripped line. When omitted,
        ``chinese.s2t`` (Simplified -> Traditional) is used, matching the
        default corpora, which are written in Simplified Chinese. To train a
        Simplified-Chinese model pass an identity function such as
        ``lambda s: s`` so the text is left untouched.
    :return: DataFrame with columns ``text`` and ``label``; positive rows come
        first and the index runs 0..n-1 without duplicates.
    '''
    if convert is None:
        # Resolved lazily so callers supplying their own converter do not
        # need the iNLP package to be importable.
        convert = chinese.s2t

    def _read_examples(path):
        # One example per line; strip() removes the newline and padding.
        with codecs.open(path, 'r', 'utf-8') as reader:
            return [convert(line.strip()) for line in reader]

    df_pos = pd.DataFrame(_read_examples(pos_path), columns=["text"])
    df_pos['label'] = 1
    df_neg = pd.DataFrame(_read_examples(neg_path), columns=["text"])
    df_neg['label'] = 0

    # ignore_index=True: without it both halves keep their own 0..k labels,
    # so a label lookup such as all_data.loc[0] would return two rows.
    all_data = pd.concat([df_pos, df_neg], ignore_index=True)
    print(all_data) # all_data has columns=["text", 'label']
    return all_data

In [None]:
def split_dataset(df):
    """Split ``df`` into train, validation and test subsets.

    Both splits are stratified on the ``label`` column and use fixed random
    seeds, so repeated calls on the same frame give the same partition.

    :param df: DataFrame containing a ``label`` column.
    :return: tuple ``(train_set, val_set, test_set)``.
    """
    # Step 1: set aside 10% of the rows as a hold-out pool.
    train_part, holdout = train_test_split(
        df,
        test_size=0.1,
        random_state=42,
        stratify=df['label'],
    )

    # Step 2: cut the hold-out pool in half -> validation and test.
    val_part, test_part = train_test_split(
        holdout,
        test_size=0.5,
        random_state=43,
        stratify=holdout['label'],
    )

    return train_part, val_part, test_part

def convert_example_to_feature(review):
    """Tokenize ``review`` with the module-level ``tokenizer``.

    The text is padded or truncated to the module-level ``max_length``.
    No ``return_tensors`` is requested; passing "np", "pt" or "tf" there
    would yield NumPy, PyTorch or TensorFlow tensors respectively.

    :param review: text to encode.
    :return: whatever ``tokenizer`` returns for these options.
    """
    encoding_options = dict(
        add_special_tokens=True,      # add [CLS], [SEP]
        padding="max_length",
        truncation=True,
        max_length=max_length,        # 128
        return_attention_mask=True,   # mask so the model does not focus on pad tokens
    )
    return tokenizer(review, **encoding_options)

# Regroup one dataset element into the (features, label) pair expected as
# input by TFBertForSequenceClassification; background on tf.data.Dataset:
# https://yang10001.yia.app/wp/2021/05/23/tensorflow%EF%BC%9Atf-data-dataset-%E7%9A%84%E7%94%A8%E6%B3%95-%E4%B8%80/
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Pack the three encoder inputs into a dict and pair it with the label.

    :return: ``(features, label)`` where ``features`` has the keys
        ``input_ids``, ``token_type_ids`` and ``attention_mask``.
    """
    features = dict(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_masks,
    )
    return features, label

def encode_examples(ds):
    """Tokenize every row of ``ds`` and build a ``tf.data.Dataset`` from it.

    :param ds: DataFrame whose first column holds the text and which has a
        ``label`` column.
    :return: dataset of ``(features_dict, label)`` elements, where the dict
        has the keys ``input_ids``, ``token_type_ids`` and ``attention_mask``
        (see ``map_example_to_dict``). The dataset is not batched.
    """
    # Accumulate per-example lists so the dataset can be built from slices.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

    for _, row in ds.iterrows():
        # The text is taken by position. Use .iloc for that: plain row[0] on a
        # Series with string labels relies on a positional fallback that
        # pandas has deprecated.
        review = row.iloc[0]
        label = row["label"]
        bert_input = convert_example_to_feature(review) # a dict with 3 keys

        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])  # wrapped so each label has shape (1,)

    # Tuple order here must match map_example_to_dict's parameter order.
    return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

In [None]:
# Build the batched train / validation / test datasets from the raw corpora.
batch_size = 32
all_data = get_data()
train_data, val_data, test_data = split_dataset(all_data)
# train dataset (only this one is shuffled; the shuffle buffer holds 2000 examples)
ds_train_encoded = encode_examples(train_data).shuffle(2000).batch(batch_size)
# val dataset
ds_val_encoded = encode_examples(val_data).batch(batch_size)
# test dataset
ds_test_encoded = encode_examples(test_data).batch(batch_size)

In [None]:
import os
import keras
model_dir = 'lab-logs2/models/'
# exist_ok=True keeps this cell re-runnable: without it a second run raises
# FileExistsError because the directory was created by the first run.
os.makedirs(model_dir, exist_ok=True)


# TensorBoard event files go to lab-logs/model-1.
log_dir = os.path.join('lab-logs', 'model-1')
model_cbk = keras.callbacks.TensorBoard(log_dir=log_dir)
# Keep only the weights of the epoch with the best validation accuracy.
# - monitor: the model is compiled with a metric named 'accuracy', so the
#   validation value is logged as 'val_accuracy'. Monitoring a name that is
#   never logged makes the callback skip every save.
# - save_weights_only: the Transformers TF model is a subclassed Keras model,
#   and saving a whole subclassed model to an .h5 file is not supported.
# - os.path.join avoids the doubled slash that model_dir + '/...' produced.
model_mckp = keras.callbacks.ModelCheckpoint(os.path.join(model_dir, 'BertFirstEdition.h5'),
                                            monitor='val_accuracy',
                                            save_best_only=True,
                                            save_weights_only=True,
                                            mode='max')

In [None]:
# Adam optimizer (recommended), with gradient-norm clipping
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1)

# Labels are not one-hot vectors, so use the sparse categorical
# cross-entropy loss and accuracy; from_logits=True because the loss is fed logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
trained_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Fit the model for 3 epochs, validating on the validation set after each one
bert_history = trained_model.fit(ds_train_encoded, epochs=3, validation_data=ds_val_encoded, callbacks=[model_cbk, model_mckp])

# evaluate test set
trained_model.evaluate(ds_test_encoded)

In [None]:
# Evaluate on the held-out test set with a batch size of 16.
# The original call used x_test / y_test, which are never defined in this
# notebook, so it raised NameError. The test split lives in ds_test_encoded,
# which is already batched, so it is un-batched and re-batched here instead of
# passing batch_size (not accepted together with a Dataset input).
trained_model.evaluate(ds_test_encoded.unbatch().batch(16))

In [None]:
# Save the model to local disk
model_save_path='./bert_hotel_longcomment'
# saved_model=True is passed through to save_pretrained
trained_model.save_pretrained(model_save_path, saved_model=True)
# del model

# Load the model back from local disk
trained_model = TFAutoModelForSequenceClassification.from_pretrained(model_save_path)
# Compile the reloaded model with the same optimizer / loss / metric settings as for training
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
trained_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# Interactive demo: read sentences from the user and print the predicted
# sentiment with its probability. A blank line ends the session; without an
# exit condition the loop never finished and later cells could not run.
print("直接按 Enter 可結束")
while True:
  text = input("輸入欲判斷之中文句子:")
  if not text.strip():
    break

  # Reject input that is nothing but an integer. Only ValueError is caught:
  # a bare `except:` would also swallow unrelated errors.
  try:
    int(text)
  except ValueError:
    pass  # not a plain number -> go on and classify it
  else:
    print("請輸入中文句子")
    continue

  bert_input = tokenizer(text,
              add_special_tokens = True, # add [CLS], [SEP]
              padding="max_length", truncation=True, max_length=max_length, # 128
              return_attention_mask = True, # add attention mask to not focus on pad tokens
              return_tensors = 'tf') # the model is called directly, so tensors are needed
  prediction = trained_model(bert_input, training=False)
  # Turn the two logits into probabilities; index 0 = negative, 1 = positive
  # (the labels assigned in get_data).
  prediction_probs = tf.nn.softmax(prediction.logits, axis=1).numpy()
  if prediction_probs[0][0] >= prediction_probs[0][1]:
    print('此言論有%f機率為負面言論:' % prediction_probs[0][0])
  else:
    print('此言論有%f機率為正面言論::' % prediction_probs[0][1])

In [None]:
# Save the model to local disk
model_save_path='./bert_model_chi2'
# The original called `model.save_pretrained`, but no variable named `model`
# exists in this notebook (NameError); the model is held in `trained_model`.
trained_model.save_pretrained(model_save_path, saved_model=True)

# Load the model back from local disk
trained_model = TFAutoModelForSequenceClassification.from_pretrained(model_save_path)
# Compile the reloaded model with the same optimizer / loss / metric settings as for training
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
trained_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])