In [1]:
import os
import sys
import pandas as pd
import numpy as np
import time

from tqdm import tqdm

In [2]:
import torch
from torch.utils.data import DataLoader

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

sys.path.append("..")

from utils import DATA_DIR  # noqa

In [3]:
# BertのモデルとTokenizer(前処理用)をimport
from transformers import BertTokenizer, BertModel

In [5]:
start = time.time()

In [6]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [7]:
tweet_df = pd.read_csv(os.path.join(DATA_DIR, "cleaned_airline_tweets.csv"))


train, test = train_test_split(tweet_df, test_size=0.2,
                               random_state=0, stratify=tweet_df["sentiment"])
train, test = train.reset_index(drop=True), test.reset_index(drop=True)

In [8]:
bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [9]:
train_dataloader = DataLoader(
    train["text"],
    batch_size=512,
    shuffle=False,
    collate_fn=lambda batch: bert_tokenizer(
        text=batch, padding="longest", truncation=True,
        return_tensors="pt"
    )
)

test_dataloader = DataLoader(
    test["text"],
    batch_size=512,
    shuffle=False,
    collate_fn=lambda batch: bert_tokenizer(
        text=batch, padding="longest", truncation=True,
        return_tensors="pt"
    )
)

In [10]:
train_emb_list = []

with torch.no_grad():
    for batch in tqdm(train_dataloader):
        outputs = bert_model(input_ids=batch["input_ids"].to(device),
                             attention_mask=batch["attention_mask"].to(device),
                             token_type_ids=batch["token_type_ids"].to(device))
        embeddings = outputs.pooler_output
        train_emb_list.append(embeddings.cpu().numpy())

train_emb = np.vstack(train_emb_list)


test_emb_list = []
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        outputs = bert_model(input_ids=batch["input_ids"].to(device),
                             attention_mask=batch["attention_mask"].to(device),
                             token_type_ids=batch["token_type_ids"].to(device))
        embeddings = outputs.pooler_output
        test_emb_list.append(embeddings.cpu().numpy())

test_emb = np.vstack(test_emb_list)

  return forward_call(*args, **kwargs)
100%|██████████| 7/7 [00:14<00:00,  2.01s/it]
100%|██████████| 2/2 [00:03<00:00,  1.53s/it]


In [11]:
pipe = Pipeline([
    ("clf", RandomForestClassifier(n_estimators=10000, max_depth=3, min_samples_split=3))
])

params = {
    "clf__n_estimators": [10000],
    "clf__max_depth": [5, 6],
    "clf__min_samples_split": [3]
}

pipe.fit(train_emb, train["sentiment"])
print(classification_report(test["sentiment"], pipe.predict(test_emb)))

              precision    recall  f1-score   support

    negative       0.63      0.57      0.60       243
     neutral       0.60      0.53      0.57       260
    positive       0.69      0.82      0.75       269

    accuracy                           0.65       772
   macro avg       0.64      0.64      0.64       772
weighted avg       0.64      0.65      0.64       772



In [13]:
print(time.time() - start)

211.3505790233612
