In [1]:
import os
import sys
import pandas as pd
import numpy as np
import time
import warnings

from tqdm import tqdm

# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Add the directory two levels up to the import search path so that the
# `utils` import below can be resolved.
# NOTE(review): the path is relative to the process working directory —
# presumably the notebook's own folder; confirm before running elsewhere.
sys.path.append("../..")

from utils import DATA_DIR  # noqa

In [4]:
# Import the BERT model and its tokenizer (used for preprocessing)
from transformers import BertTokenizer, BertModel

In [5]:
class BertClf(nn.Module):
    """Classification head: dropout followed by a single linear layer.

    In this notebook it is trained on pre-computed BERT ``pooler_output``
    vectors; the BERT encoder itself is not part of this module.

    Args:
        input_size: Size of the last dimension of the incoming feature
            vectors (default 768).
        num_classes: Number of classes, i.e. the width of the output.
        dropout_rate: Drop probability handed to ``nn.Dropout``.
    """

    def __init__(self, input_size=768, num_classes=3, dropout_rate=0.3):
        super().__init__()
        self.linear = nn.Linear(input_size, num_classes)

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Compute class scores for ``x``.

        Args:
            x: Float tensor whose last dimension equals ``input_size``.
                Any number of leading dimensions is accepted, including
                none (a single unbatched vector).

        Returns:
            A ``(output, proba)`` tuple: ``output`` holds the raw linear
            scores (logits, suitable for ``nn.CrossEntropyLoss``) and
            ``proba`` holds the softmax of ``output`` over the class axis.
        """
        output = self.dropout(x)
        output = self.linear(output)
        # The class axis is always the last one. Using -1 instead of a
        # hard-coded 1 gives the same result for (batch, features) input and
        # also works for unbatched or higher-rank input.
        proba = F.softmax(output, dim=-1)

        return output, proba

In [7]:
# Load the tweets and encode the sentiment label as an integer class id
# (negative=0, neutral=1, positive=2).
tweet_df = pd.read_csv(os.path.join(DATA_DIR, "cleaned_airline_tweets.csv"))
# Cast explicitly: relying on `replace` to downcast the object column to
# integers is deprecated in pandas 2.2, and the labels are later handed to
# `torch.tensor`, which needs a numeric dtype. The cast also fails right here
# if a label outside the mapping is still a string.
tweet_df["sentiment"] = (
    tweet_df["sentiment"]
    .replace({"negative": 0, "neutral": 1, "positive": 2})
    .astype("int64")
)

# Stratified 80/20 split with a fixed seed; indices are reset so that both
# frames are addressed positionally from 0.
train, test = train_test_split(
    tweet_df,
    test_size=0.2,
    random_state=0,
    stratify=tweet_df["sentiment"],
)
train, test = train.reset_index(drop=True), test.reset_index(drop=True)

In [8]:
# Pick the compute device in order of preference: CUDA first, then Apple's
# MPS backend, and the CPU as the fallback.
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"

device = torch.device(device_name)
device

device(type='mps')

In [9]:
# Load the tokenizer and the encoder from the same "bert-base-uncased"
# checkpoint, then move the encoder onto the selected device.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model = bert_model.to(device)

In [10]:
# Tokenize one example sentence and print every field the tokenizer returns.
A = ["hello my name is chu. Nice to meet you"]
C = tokenizer(A)
for field, values in C.items():
    print(f"{field}: {values}")

input_ids: [[101, 7592, 2026, 2171, 2003, 14684, 1012, 3835, 2000, 3113, 2017, 102]]
token_type_ids: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]


In [11]:
def collate_tweets(batch, max_length=128):
    """Tokenize a batch of raw tweet strings into padded tensors.

    Args:
        batch: List of strings, as assembled by the ``DataLoader``.
        max_length: Token limit; longer tweets are truncated to it.

    Returns:
        The tokenizer output with PyTorch tensors, padded to the longest
        sequence in this batch.
    """
    return tokenizer(
        text=batch,
        padding="longest",
        truncation=True,
        return_tensors="pt",
        max_length=max_length,
    )


def make_text_loader(texts, batch_size=256):
    """Build a fixed-order ``DataLoader`` that yields tokenized text batches.

    Args:
        texts: Iterable of raw strings (e.g. a DataFrame column).
        batch_size: Number of texts tokenized together.

    Returns:
        A ``DataLoader`` whose batches are produced by ``collate_tweets``.
    """
    return DataLoader(
        # Materialise as a list so items are fetched by position and do not
        # depend on the DataFrame index being 0..n-1.
        list(texts),
        batch_size=batch_size,
        # Order must be preserved: the embeddings computed from these batches
        # are paired row-by-row with the `sentiment` column afterwards.
        shuffle=False,
        collate_fn=collate_tweets,
    )


train_dataloader = make_text_loader(train["text"])
test_dataloader = make_text_loader(test["text"])

In [12]:
def embed_texts(dataloader):
    """Encode every batch of ``dataloader`` with BERT and stack the results.

    Args:
        dataloader: Yields tokenizer outputs providing ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` tensors.

    Returns:
        A tensor on ``device`` holding the ``pooler_output`` of every batch,
        stacked vertically in iteration order.
    """
    pooled_batches = []
    bert_model.eval()
    # Gradients are not needed: BERT is only used as a fixed feature extractor.
    with torch.no_grad():
        for batch in tqdm(dataloader):
            outputs = bert_model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                token_type_ids=batch["token_type_ids"].to(device),
            )
            pooled_batches.append(outputs.pooler_output)
            # Only touch the MPS allocator when MPS is the active device;
            # the device cell may have selected CUDA or the CPU instead.
            if device.type == "mps":
                torch.mps.empty_cache()
    return torch.vstack(pooled_batches)


train_emb = embed_texts(train_dataloader)
train_emb_label = torch.tensor(train["sentiment"].to_numpy()).to(device)
train_dataset = TensorDataset(train_emb, train_emb_label)
# A batch size of len // 10 gives ten full batches plus one remainder batch
# when the size is not divisible by ten; max(1, ...) keeps the loader valid
# for datasets with fewer than ten rows.
train_loader = DataLoader(
    train_dataset,
    batch_size=max(1, len(train_dataset) // 10),
    shuffle=True,
)


test_emb = embed_texts(test_dataloader)
test_emb_label = torch.tensor(test["sentiment"].to_numpy())

100%|██████████| 13/13 [00:08<00:00,  1.60it/s]
100%|██████████| 4/4 [00:01<00:00,  2.11it/s]


In [15]:
# Sanity check: print, for every training batch, whether the embedding tensor
# tracks gradients (the embeddings were computed under torch.no_grad()).
for embeddings, _labels in train_loader:
    tracks_grad = embeddings.requires_grad
    print(tracks_grad)

False
False
False
False
False
False
False
False
False
False
False


In [27]:
# Seed torch before the model is built: weight initialisation, dropout masks
# and the shuffling order of `train_loader` all draw from torch's RNG, so
# without this every run of the cell trains a different model.
torch.manual_seed(0)

bert_clf = BertClf().to(device)
optimizer = optim.AdamW(bert_clf.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

# With 1001 epochs and a report every 100th epoch, losses are printed for
# epochs 100, 200, ..., 1000; the final epoch is trained but not reported.
num_epochs = 1001
bert_clf.train()
for epoch in tqdm(range(num_epochs), "Traning Progress"):
    total_loss = 0
    num_batches = 0

    for batch_emb, batch_labels in train_loader:
        batch_emb = batch_emb.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()

        # The loss is computed on the raw logits; the probabilities returned
        # alongside them are not needed during training.
        output, proba = bert_clf(batch_emb)
        loss = criterion(output, batch_labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Report the mean batch loss of this epoch; skip the report instead of
    # dividing by zero if the loader produced no batches.
    if (epoch + 1) % 100 == 0 and num_batches > 0:
        avg_loss = total_loss / num_batches

        print(avg_loss)
        
        

Traning Progress:  10%|█         | 102/1001 [00:05<00:45, 19.59it/s]

0.9239274155009877


Traning Progress:  20%|██        | 202/1001 [00:10<00:40, 19.58it/s]

0.847103785384785


Traning Progress:  30%|███       | 302/1001 [00:15<00:34, 20.28it/s]

0.8041879317977212


Traning Progress:  40%|████      | 403/1001 [00:20<00:29, 20.20it/s]

0.7668921405618842


Traning Progress:  50%|█████     | 503/1001 [00:26<00:27, 17.94it/s]

0.742383295839483


Traning Progress:  60%|██████    | 603/1001 [00:32<00:22, 17.87it/s]

0.7469585808840665


Traning Progress:  70%|███████   | 703/1001 [00:37<00:16, 18.21it/s]

0.6985193436796014


Traning Progress:  80%|████████  | 803/1001 [00:43<00:10, 19.58it/s]

0.7074138630520214


Traning Progress:  90%|█████████ | 904/1001 [00:48<00:04, 20.00it/s]

0.705519134348089


Traning Progress: 100%|██████████| 1001/1001 [00:53<00:00, 18.83it/s]

0.6799466962164099





In [28]:
# Evaluate the trained head on both splits. Dropout is switched off via
# eval() and no gradients are recorded for the forward passes.
label_names = ["negative", "neutral", "positive"]

bert_clf.eval()
with torch.no_grad():
    train_output, train_proba = bert_clf(train_emb)
    test_output, test_proba = bert_clf(test_emb)

# The predicted class is the one with the highest probability.
train_pred = train_proba.argmax(dim=1).cpu().numpy()
test_pred = test_proba.argmax(dim=1).cpu().numpy()

reports = (
    ("\n=== Training Set Results ===", train["sentiment"].values, train_pred),
    ("\n=== Test Set Results ===", test["sentiment"].values, test_pred),
)
for heading, y_true, y_pred in reports:
    print(heading)
    print(classification_report(y_true, y_pred, target_names=label_names))


=== Training Set Results ===
              precision    recall  f1-score   support

    negative       0.76      0.74      0.75       972
     neutral       0.73      0.70      0.71      1039
    positive       0.81      0.85      0.83      1077

    accuracy                           0.77      3088
   macro avg       0.77      0.77      0.76      3088
weighted avg       0.77      0.77      0.77      3088


=== Test Set Results ===
              precision    recall  f1-score   support

    negative       0.76      0.75      0.76       243
     neutral       0.76      0.71      0.73       260
    positive       0.81      0.86      0.83       269

    accuracy                           0.78       772
   macro avg       0.78      0.78      0.77       772
weighted avg       0.78      0.78      0.78       772



In [18]:
from sklearn.linear_model import LogisticRegression

# For comparison: scikit-learn's LogisticRegression fitted on the same
# embeddings, converted to NumPy arrays on the CPU first.
train_emb_cpu = train_emb.cpu().numpy()
test_emb_cpu = test_emb.cpu().numpy()

logreg = LogisticRegression(random_state=42, max_iter=1000)
logreg.fit(train_emb_cpu, train["sentiment"])

print("\n=== Comparison: Scikit-learn LogisticRegression ===")
logreg_test_pred = logreg.predict(test_emb_cpu)
print(
    classification_report(
        test["sentiment"].values,
        logreg_test_pred,
        target_names=["negative", "neutral", "positive"],
    )
)


=== Comparison: Scikit-learn LogisticRegression ===
              precision    recall  f1-score   support

    negative       0.85      0.86      0.85       243
     neutral       0.81      0.82      0.82       260
    positive       0.88      0.86      0.87       269

    accuracy                           0.85       772
   macro avg       0.85      0.85      0.85       772
weighted avg       0.85      0.85      0.85       772

