<a href="https://colab.research.google.com/github/GaeunHome/BiLSTML_Sentiment-Analysis/blob/main/BiLSTM_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from google.colab import drive

# Mount Google Drive so the dataset CSVs stored under MyDrive can be read.
drive.mount('/content/drive')

# Load the training and validation splits from Drive.
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DataSet/train_filtered_file.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DataSet/validation_filtered_file.csv')

# Replace missing 'text' values with the empty string and force the column
# to str in both splits, so the tokenizer only ever receives strings.
for _frame in (train_df, val_df):
    _frame['text'] = _frame['text'].fillna('').astype(str)

# Label encoding: map each 'sentiment' value to an integer class id.
# The encoder is fitted on the training split only; the validation split is
# transformed with that same mapping.
label_encoder = LabelEncoder().fit(train_df['sentiment'])
train_labels = label_encoder.transform(train_df['sentiment'])
val_labels = label_encoder.transform(val_df['sentiment'])

# Text-to-sequence preprocessing.
max_words = 10000  # vocabulary cap given to the Tokenizer (num_words)
max_len = 100      # fixed length every sequence is padded/truncated to

# Build the word index from the training texts only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_df['text'])

# Convert each split's texts into lists of integer token ids.
train_sequences, val_sequences = (
    tokenizer.texts_to_sequences(_split['text']) for _split in (train_df, val_df)
)

# Pad short sequences and truncate long ones at the end ('post') so that
# both splits become arrays with max_len columns.
_pad_options = dict(maxlen=max_len, padding='post', truncating='post')
train_padded = pad_sequences(train_sequences, **_pad_options)
val_padded = pad_sequences(val_sequences, **_pad_options)

# Build the model.
# The output width is taken from the fitted label encoder instead of being
# hard-coded, so the softmax layer always has one unit per distinct
# 'sentiment' value seen in the training data.
num_classes = len(label_encoder.classes_)

model = Sequential([
    Embedding(input_dim=max_words, output_dim=128),  # token id -> 128-d embedding
    Bidirectional(LSTM(64)),                         # bidirectional LSTM encoder
    Dropout(0.5),                                    # dropout to reduce overfitting
    Dense(num_classes, activation='softmax'),        # one probability per class
])

# Compile the model. The labels are integer class ids (from LabelEncoder),
# hence the sparse variant of categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train the model on the padded training sequences.
history = model.fit(
    x=train_padded,
    y=train_labels,
    batch_size=32,
    epochs=5,
)

# Print the model summary.
model.summary()

# After training, predict class probabilities for the validation set and
# take the highest-probability class of each row as the predicted label.
val_predictions = model.predict(val_padded)
val_pred_classes = val_predictions.argmax(axis=1)

# Compute and report the validation accuracy as a percentage.
accuracy = accuracy_score(y_true=val_labels, y_pred=val_pred_classes)
print(f'驗證集準確率: {accuracy * 100:.2f}%')

Mounted at /content/drive




Epoch 1/5
[1m1928/1928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 138ms/step - accuracy: 0.6450 - loss: 0.7808
Epoch 2/5
[1m1928/1928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m330s[0m 142ms/step - accuracy: 0.8610 - loss: 0.3520
Epoch 3/5
[1m1928/1928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 140ms/step - accuracy: 0.9052 - loss: 0.2392
Epoch 4/5
[1m1928/1928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 140ms/step - accuracy: 0.9257 - loss: 0.1862
Epoch 5/5
[1m1928/1928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 141ms/step - accuracy: 0.9384 - loss: 0.1523
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step
驗證集準確率: 96.74%
