<a href="https://colab.research.google.com/github/GaeunHome/Bert_Sentiment-Analysis/blob/main/Bert_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch
!pip install scikit-learn

In [None]:
# 匯入必要的庫
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
import torch.nn.init as init
from sklearn.model_selection import train_test_split
import pandas as pd
from google.colab import drive
from sklearn.metrics import accuracy_score

# Select the compute device: the CUDA GPU when one is available, else the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'使用裝置: {device}')

# Mount Google Drive so files under /content/drive can be read and written.
drive.mount('/content/drive')

# 1. Load the pretrained BERT tokenizer and a sequence-classification model
#    configured with three output labels (3-class sentiment analysis).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)
model.to(device)

# 2. 顯式初始化分類層的權重
def init_classifier_weights(model):
    """Re-initialise the classification head of ``model`` in place.

    When ``model`` exposes a ``classifier`` attribute, its ``weight`` is
    filled with Xavier-normal values and its ``bias`` (if it has one) is set
    to zero. A model without a ``classifier`` attribute is left untouched.

    Args:
        model: Object that may carry a ``classifier`` layer with ``weight``
            and ``bias`` attributes.

    Returns:
        None. The layer is modified in place.
    """
    # Nothing to do for models that have no classification head.
    if not hasattr(model, 'classifier'):
        return

    head = model.classifier
    init.xavier_normal_(head.weight)
    if head.bias is not None:
        init.zeros_(head.bias)

init_classifier_weights(model)

# 3. 定義資料集類別
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per lookup.

    Args:
        texts: Indexable collection of raw texts.
        labels: Indexable collection of integer class ids, aligned with
            ``texts`` by position.
        tokenizer: Callable tokenizer. It is invoked as
            ``tokenizer(text, ...)`` and must return a mapping holding
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (each flattened
            to one dimension) and ``'label'`` (a ``torch.long`` scalar tensor).
        """
        text = self.texts[item]
        label = self.labels[item]

        # Call the tokenizer directly: ``encode_plus`` is deprecated in
        # Transformers in favour of ``__call__``, which takes the same options.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,     # include [CLS] and [SEP]
            max_length=self.max_len,
            padding='max_length',        # pad every sample to max_len
            truncation=True,             # cut anything longer than max_len
            return_attention_mask=True,
            return_tensors='pt',         # return PyTorch tensors
        )

        # return_tensors='pt' adds a leading batch axis of size 1; flatten it
        # away so the DataLoader can stack samples into (batch, max_len).
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }

'''
訓練資料預處理
'''

# Training-data preprocessing: load the raw CSV, keep the three sentiment
# classes, persist the filtered rows, and derive integer labels.

# Alternative dataset (training01), kept for reference:
# df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DataSet/training01.csv', encoding='latin-1')

# Load the training02 dataset. The file is read with header=None, so the
# column names are supplied explicitly.
column_names = ['time', 'name', 'sentiment', 'text']
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/DataSet/training02.csv',
    encoding='latin-1',
    header=None,
    names=column_names,
)

# Inspect the column names
# print(df.columns)

# Keep only the columns that are needed.
# df = df[['text', 'sentiment']] # training01
df = df[['sentiment', 'text']] # training02

# Single source of truth for the label vocabulary
# (Positive: 1, Neutral: 2, Negative: 0); used for both filtering and mapping.
# training01.csv uses lowercase names: {'positive': 1, 'neutral': 2, 'negative': 0}
label_map = {'Positive': 1, 'Neutral': 2, 'Negative': 0}

# Keep only rows labelled Positive, Negative or Neutral.
df_filtered = df[df['sentiment'].isin(list(label_map))]

# Save the filtered rows to a new CSV file.
df_filtered.to_csv('/content/drive/MyDrive/Colab Notebooks/DataSet/train_filtered_file.csv', index=False)

# Reload the filtered data from that file.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DataSet/train_filtered_file.csv')

# Map sentiment names to integer labels. Series.map is used instead of
# Series.replace: replacing strings with ints relies on pandas' deprecated
# silent downcasting and emits a FutureWarning.
df['label'] = df['sentiment'].map(label_map)

# Replace missing text with an empty string.
df['text'] = df['text'].fillna('')

# Look at the first five rows
# print(df[['text', 'label']].head())

# Look at the distinct label values
# print(df['label'].unique())

# Extract texts and labels as arrays.
texts = df['text'].values
labels = df['label'].values

# 5. Hold out 10% of the samples for validation; the fixed random_state keeps
#    the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.1,
    random_state=42,
)

# Wrap each split in a tokenizing Dataset.
train_dataset = SentimentDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = SentimentDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Batch both splits; only the training loader shuffles.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)

# 6. Optimizer and loss function (the learning rate can be tuned as needed).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# 7. 定義訓練和評估函數
def train(model, data_loader, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` whose
            output exposes ``.loss``. It must have at least one parameter.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Unused. The loss is taken from the model's own output;
            the parameter is kept so existing call sites keep working.

    Returns:
        float: Sum of the batch losses divided by the number of batches, or
        ``nan`` when ``data_loader`` yields no batches.
    """
    model.train()

    # Move batches to wherever the model's parameters live instead of relying
    # on a module-level ``device`` global.
    target_device = next(model.parameters()).device

    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(target_device)
        attention_mask = batch['attention_mask'].to(target_device)
        labels = batch['label'].to(target_device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()      # back-propagate
        optimizer.step()     # update parameters

        total_loss += loss.item()
        num_batches += 1

    # An empty loader has no meaningful mean; report nan rather than raising
    # ZeroDivisionError.
    if num_batches == 0:
        return float('nan')
    return total_loss / num_batches

def evaluate(model, data_loader):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` whose
            output exposes ``.loss`` and ``.logits``. It must have at least
            one parameter.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.

    Returns:
        tuple[float, float]: ``(mean per-batch loss, accuracy)``. Either value
        is ``nan`` when there was nothing to average over.
    """
    model.eval()

    # Move batches to wherever the model's parameters live instead of relying
    # on a module-level ``device`` global.
    target_device = next(model.parameters()).device

    total_loss = 0.0
    num_batches = 0
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            labels = batch['label'].to(target_device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            total_loss += outputs.loss.item()
            num_batches += 1

            # The predicted class is the index of the largest logit.
            preds = torch.argmax(outputs.logits, dim=1)
            total_correct += (preds == labels).sum().item()
            total_samples += labels.size(0)

    # Guard both averages so an empty loader yields nan instead of raising
    # ZeroDivisionError.
    mean_loss = total_loss / num_batches if num_batches else float('nan')
    accuracy = total_correct / total_samples if total_samples else float('nan')
    return mean_loss, accuracy

# 8. Learning-rate schedule: multiply the learning rate by 0.1 every 2 epochs.
scheduler = StepLR(optimizer, gamma=0.1, step_size=2)

# 9. Train for a fixed number of epochs, validating after each one.
num_epochs = 3
for epoch in range(num_epochs):
    train_loss = train(
        model=model,
        data_loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
    )
    val_loss, val_accuracy = evaluate(model=model, data_loader=val_loader)

    epoch_summary = f'Epoch {epoch+1}/{num_epochs}, 訓練損失: {train_loss:.4f}, 驗證損失: {val_loss:.4f}, 驗證準確率: {val_accuracy:.4f}'
    print(epoch_summary)

    # Advance the schedule once per epoch, after the optimizer updates.
    scheduler.step()

# 10. 測試模型
def predict(model, text):
    """Classify a single text and return the predicted class id.

    The text is tokenized with the module-level ``tokenizer``, padded or
    truncated to 128 tokens, and run through ``model`` with gradients
    disabled.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``. It must have at least one
            parameter.
        text: The text to classify.

    Returns:
        int: Index of the largest logit. With the label mapping used for
        training in this file: 0 = Negative, 1 = Positive, 2 = Neutral.
    """
    model.eval()

    # Call the tokenizer directly: ``encode_plus`` is deprecated in
    # Transformers in favour of ``__call__``, which takes the same options.
    encoding = tokenizer(
        text,
        add_special_tokens=True,     # include [CLS] and [SEP]
        max_length=128,
        padding='max_length',        # pad to the maximum length
        truncation=True,             # cut anything longer than the maximum
        return_attention_mask=True,
        return_tensors='pt',         # return PyTorch tensors
    )

    # Send the inputs to wherever the model's parameters live instead of
    # relying on a module-level ``device`` global.
    target_device = next(model.parameters()).device
    input_ids = encoding['input_ids'].to(target_device)
    attention_mask = encoding['attention_mask'].to(target_device)

    with torch.no_grad():
        output = model(input_ids=input_ids, attention_mask=attention_mask)

    return torch.argmax(output.logits, dim=1).item()

'''
測試資料預處理
'''

# Test-data preprocessing: load the raw validation CSV, keep the three
# sentiment classes, and persist the filtered rows.

# Load the validation02 dataset. The file is read with header=None, so the
# column names are supplied explicitly.
column_names = ['time', 'name', 'sentiment', 'text']
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/DataSet/validation02.csv',
    encoding='latin-1',
    header=None,
    names=column_names,
)

# Keep only the columns that are needed.
# df = df[['text', 'sentiment']] # column order used by training01
df = df[['sentiment', 'text']] # validation02

# Keep only rows labelled Positive, Negative or Neutral.
df_filtered = df.loc[df['sentiment'].isin(['Positive', 'Negative', 'Neutral'])]

# Save the filtered rows to a new CSV file.
df_filtered.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/DataSet/validation_filtered_file.csv',
    index=False,
)

def evaluate_model(model, df, tokenizer, device, batch_size=32, max_len=128):
    """Return the classification accuracy of ``model`` on a labelled DataFrame.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``. It is expected to already be on
            ``device``.
        df: DataFrame with ``'text'`` and ``'sentiment'`` columns. Rows whose
            sentiment is not ``'Positive'``, ``'Negative'`` or ``'Neutral'``
            are skipped.
        tokenizer: Callable tokenizer invoked on a list of texts; must return
            a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: Device the tokenized inputs are moved to.
        batch_size: Number of texts per forward pass.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        The value of ``accuracy_score(true_labels, predictions)``.
    """
    # Same mapping as used for training: Positive=1, Negative=0, Neutral=2.
    label_to_id = {'Positive': 1, 'Negative': 0, 'Neutral': 2}

    # Select labelled rows BEFORE predicting so predictions and true labels
    # always have the same length and order.
    known = df[df['sentiment'].isin(list(label_to_id))]
    true_labels = known['sentiment'].map(label_to_id).tolist()

    # Missing text (NaN) is not a string and cannot be tokenized; treat it as
    # an empty text, as the training preprocessing does.
    texts = known['text'].fillna('').astype(str).tolist()

    model.eval()
    predictions = []
    with torch.no_grad():
        # Batched inference using the tokenizer and device passed in, rather
        # than one forward pass per row through module-level globals.
        for start in range(0, len(texts), batch_size):
            encoding = tokenizer(
                texts[start:start + batch_size],
                add_special_tokens=True,     # include [CLS] and [SEP]
                max_length=max_len,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            logits = model(
                input_ids=encoding['input_ids'].to(device),
                attention_mask=encoding['attention_mask'].to(device),
            ).logits
            predictions.extend(torch.argmax(logits, dim=1).tolist())

    return accuracy_score(true_labels, predictions)

# Score the model on the filtered test rows with evaluate_model and print the
# accuracy as a percentage.
accuracy = evaluate_model(
    model=model,
    df=df_filtered,
    tokenizer=tokenizer,
    device=device,
)
print(f"準確率: {accuracy * 100:.2f}%")

'''
此段為輸入內容，判斷該段話為Positive/Negative/Neutral
'''
# test_text = "Despite the challenges we faced in the project, the team's efforts were commendable, and we managed to complete everything on time, which was truly impressive."
# pred = predict(model, test_text)
# label_dict = {0: "負面", 1: "正面", 2: "中立"}
# print(f"輸入文本: {test_text}")
# print(f"預測結果: {pred} ({label_dict[pred]})")

使用裝置: cuda
Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  df['label'] = df['sentiment'].replace({'Positive': 1, 'Neutral': 2, 'Negative': 0}) # train_filtered_file.csv


                                                text  label
0  im getting on borderlands and i will murder yo...      1
1  I am coming to the borders and I will kill you...      1
2  im getting on borderlands and i will kill you ...      1
3  im coming on borderlands and i will murder you...      1
4  im getting on borderlands 2 and i will murder ...      1
[1 2 0]
Epoch 1/3, 訓練損失: 0.6096, 驗證損失: 0.3771, 驗證準確率: 0.8538
Epoch 2/3, 訓練損失: 0.2448, 驗證損失: 0.2363, 驗證準確率: 0.9109
Epoch 3/3, 訓練損失: 0.0965, 驗證損失: 0.2059, 驗證準確率: 0.9266
模型準確率: 97.58%
