In [1]:
import os
import sys
import pandas as pd
import numpy as np
import time
import warnings

from tqdm import tqdm

warnings.filterwarnings("ignore")

In [None]:
import torch
from torch.utils.data import DataLoader

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# Put the directory two levels up on the import path so the `utils` import
# below resolves. Guarded so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if "../../" not in sys.path:
    sys.path.append("../../")

from utils import DATA_DIR  # noqa

In [3]:
# Import the BERT model and its tokenizer (used for preprocessing)
from transformers import BertTokenizer, BertModel

In [5]:
# Select the compute device for BERT inference: prefer a CUDA GPU when one is
# available, then Apple's MPS backend, and fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [6]:
# Read the cleaned airline-tweet table from the project data directory.
tweet_csv_path = os.path.join(DATA_DIR, "cleaned_airline_tweets.csv")
tweet_df = pd.read_csv(tweet_csv_path)

# Hold out 20% of the rows as a test set, stratified on the "sentiment" column
# with a fixed seed, and give each part a fresh 0..n-1 index.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        tweet_df,
        test_size=0.2,
        random_state=0,
        stratify=tweet_df["sentiment"],
    )
)

In [7]:
# The encoder weights and the tokenizer are loaded from the same pretrained
# checkpoint so that vocabulary and model agree.
BERT_CHECKPOINT = "bert-base-uncased"

bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)
bert_model = bert_model.to(device)  # move the weights onto the selected device

In [None]:
# Number of texts tokenized and fed through the model per step.
BATCH_SIZE = 512


def tokenize_batch(batch):
    """Tokenize a list of raw texts into one padded batch of PyTorch tensors.

    Used as the ``collate_fn`` of both dataloaders so that train and test
    texts go through exactly the same preprocessing. Every batch is padded to
    the length of its own longest sequence, with truncation enabled.
    """
    return bert_tokenizer(
        text=batch, padding="longest", truncation=True,
        return_tensors="pt"
    )


def make_text_dataloader(texts, batch_size=BATCH_SIZE):
    """Wrap an indexable collection of texts in a tokenizing DataLoader.

    ``shuffle=False`` keeps the batches in the same order as ``texts``; the
    later cells pair the resulting embeddings with the label column by row
    position, so the order must not change.
    """
    return DataLoader(
        texts,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=tokenize_batch,
    )


train_dataloader = make_text_dataloader(train["text"])
test_dataloader = make_text_dataloader(test["text"])

In [9]:
def embed_texts(dataloader):
    """Compute BERT pooled-output embeddings for every batch of ``dataloader``.

    Each tokenized batch is moved to ``device`` and passed through
    ``bert_model`` with gradient tracking disabled. The ``pooler_output`` of
    each batch is copied back to the CPU as a NumPy array, and the per-batch
    arrays are stacked row-wise in dataloader order.

    Returns:
        A single NumPy array holding one embedding row per input text.
    """
    # from_pretrained() already returns the model in evaluation mode; calling
    # eval() again is a cheap safeguard against dropout being active if the
    # mode was changed elsewhere in the session.
    bert_model.eval()

    batch_embeddings = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            outputs = bert_model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                token_type_ids=batch["token_type_ids"].to(device),
            )
            batch_embeddings.append(outputs.pooler_output.cpu().numpy())
    return np.vstack(batch_embeddings)


train_emb = embed_texts(train_dataloader)
test_emb = embed_texts(test_dataloader)

  0%|          | 0/7 [00:00<?, ?it/s]

100%|██████████| 7/7 [00:08<00:00,  1.25s/it]
100%|██████████| 2/2 [00:01<00:00,  1.11it/s]


In [10]:
# Start the timer read by the later `print(time.time() - start)` cell. `start`
# is not assigned in any other visible cell, so that cell raised NameError on a
# fresh kernel.
# NOTE(review): presumably the timing is meant to cover the grid search - confirm.
start = time.time()

pipe = Pipeline([
    # random_state pins the data shuffling performed by the "saga" solver so
    # that repeated runs select the same model.
    ("clf", LogisticRegression(max_iter=10000, random_state=0))
])

# Hyper-parameter grid: 3 regularization strengths x 2 solvers.
params = {
    "clf__C": [.1, 1, 10],
    "clf__solver": ["lbfgs", "saga"]
}

grid = GridSearchCV(pipe, params, cv=5, verbose=2)

# Fit on the training embeddings, then report metrics on the held-out test set
# using the estimator selected by the search.
grid.fit(train_emb, train["sentiment"])
print(classification_report(test["sentiment"], grid.predict(test_emb)))

              precision    recall  f1-score   support

    negative       0.85      0.86      0.85       243
     neutral       0.81      0.82      0.82       260
    positive       0.88      0.86      0.87       269

    accuracy                           0.85       772
   macro avg       0.85      0.85      0.85       772
weighted avg       0.85      0.85      0.85       772



In [13]:
# Fit a standalone logistic regression with fixed hyper-parameters
# (C=1, "saga" solver) on the training embeddings and report its metrics on
# the held-out test embeddings.
logreg = LogisticRegression(solver="saga", C=1, max_iter=10000)
logreg.fit(train_emb, train["sentiment"])

test_pred = logreg.predict(test_emb)
print(classification_report(test["sentiment"], test_pred))

              precision    recall  f1-score   support

    negative       0.85      0.86      0.85       243
     neutral       0.81      0.82      0.82       260
    positive       0.88      0.86      0.87       269

    accuracy                           0.85       772
   macro avg       0.85      0.85      0.85       772
weighted avg       0.85      0.85      0.85       772



In [11]:
# Seconds elapsed since `start`, which must have been set from time.time() in
# an earlier cell.
elapsed_seconds = time.time() - start
print(elapsed_seconds)

269.13468408584595


In [12]:
# Display the hyper-parameter combination selected by the grid search.
grid.best_params_

{'clf__C': 1, 'clf__solver': 'saga'}