In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam

In [None]:
import pandas as pd

# Read the labelled training spreadsheet into a DataFrame.
dataset_file = r"Sentiment_classification_training_dataset.xlsx"
df = pd.read_excel(io=dataset_file)

# Count the rows per label value; as the cell's last expression it is displayed.
label_counts = df["label"].value_counts()
label_counts

In [3]:
# Pull out the input column ("text") and the target column ("label") in one step.
text, label = df["text"], df["label"]

In [None]:
# Vocabulary cap passed to the Tokenizer: only the most frequent words are kept.
max_words = 20_000
tokenizer = Tokenizer(num_words=max_words)

# Scan the corpus and build the token dictionary.
tokenizer.fit_on_texts(text)

# Use that dictionary to turn every text into a list of token ids.
sequences = tokenizer.texts_to_sequences(text)

# Sanity check: show the id list produced for the first text.
print(sequences[0])

In [None]:
# Keep at most 128 tokens per text: longer sequences are truncated,
# shorter ones are filled up with 0.
max_len = 128

# Both padding and truncation are applied at the end of the sequence.
X_text = pad_sequences(
    sequences,
    maxlen=max_len,
    padding='post',
    truncating='post',
)

# Sanity check: show the first padded sequence.
print(X_text[0])

In [None]:
# One-hot encode the labels into 3 classes.
# NOTE(review): assumes the label column holds integer class ids 0..2 - confirm against the dataset.
y_label = to_categorical(label, num_classes=3)

# Show a few "original label => one-hot vector" pairs as a spot check.
# NOTE(review): label[i] is a label-based lookup; presumably df has the default
# integer index and more than 3400 rows - confirm.
for sample_idx in (0, 500, 3400):
    print(f"{label[sample_idx]} => {y_label[sample_idx]}")

In [None]:
# Hold out 25% of the samples for validation. The split is stratified on the
# original labels and uses a fixed random_state so it is repeatable.
split_parts = train_test_split(
    X_text,
    y_label,
    test_size=0.25,
    random_state=42,
    stratify=label,
)
train_text, validation_text, train_label, validation_label = split_parts

# Spot check: first training sequence together with its one-hot label.
print(train_text[0], train_label[0])

In [21]:
# Assemble the Bi-LSTM classifier layer by layer.
model = Sequential()

# input_dim: size of the vocabulary to embed; output_dim: embedding dimension.
model.add(Embedding(input_dim=max_words, output_dim=128))

# Bidirectional wrapper around a 128-unit LSTM.
model.add(Bidirectional(LSTM(128)))

# Dropout with rate 0.2 before the classification head.
model.add(Dropout(0.2))

# Softmax over the 3 output classes.
model.add(Dense(3, activation='softmax'))

In [22]:
# Configure training: categorical cross-entropy loss (the targets are one-hot
# vectors), Adam with its default settings, and accuracy as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs with mini-batches of 256 samples, evaluating on the
# held-out validation split; the returned object is kept in `history`.
history = model.fit(
    x=train_text,
    y=train_label,
    batch_size=256,
    epochs=10,
    validation_data=(validation_text, validation_label),
)

In [None]:
# Persist the trained model to an HDF5 (.h5) file in the working directory.
model.save(filepath="Bi-LSTM_SENTIMENT_TASK.h5")

In [27]:
# Serialize the fitted tokenizer to a JSON string and write that string
# to disk unchanged, as UTF-8 text.
token_json = tokenizer.to_json()

with open('bi_lstm_sentiment_task_tokenizer.json', 'w', encoding='utf-8') as json_file:
    json_file.write(token_json)

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json

# Reload the model that this notebook saved with model.save(...).
# The file name must match the one used when saving ("Bi-LSTM_SENTIMENT_TASK.h5").
model = load_model("Bi-LSTM_SENTIMENT_TASK.h5")

# Reload the tokenizer that this notebook wrote with f.write(tokenizer.to_json()).
# tokenizer_from_json() expects the raw JSON *string*, so the file is read as
# text instead of being parsed with json.load() first.
with open('bi_lstm_sentiment_task_tokenizer.json', 'r', encoding='utf-8') as f:
    token_json = f.read()

# Parsed form of the same JSON, kept so the `token_dict` name stays available.
token_dict = json.loads(token_json)

tokenizer = tokenizer_from_json(token_json)

In [None]:
# Prepare the text to classify.
test_text = ["This is testing text"]

# Map the text to token ids with the loaded tokenizer.
seqence = tokenizer.texts_to_sequences(test_text)

# Pad / truncate at the end to 128 tokens, the same settings used for the training data.
X_new = pad_sequences(
    seqence,
    maxlen=128,
    padding='post',
    truncating='post',
)

# Inference: per-class scores, then the index of the highest score for each input.
# NOTE(review): this rebinds `label`, which earlier in the notebook held the
# label column taken from df.
pred = model.predict(X_new)
label = pred.argmax(axis=1)