In [1]:
# # カーネルリスタートの時はこのセルを実行しなくてもOK
# !wget https://bootstrap.pypa.io/get-pip.py
# !python get-pip.py
# %pip install tokenizers fugashi ipadic accelerate==0.20.3 seaborn
# %pip install transformers datasets scikit-learn
# !wget https://github.com/ids-cv/wrime/raw/master/wrime-ver1.tsv

# %pip install -U imbalanced-learn

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import nn
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from transformers import get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig, AdamW, get_linear_schedule_with_warmup

In [30]:
df_wrime = pd.read_table('wrime-ver1.tsv')
emotion_names = ['Joy', 'Sadness', 'Anticipation', 'Surprise', 'Anger', 'Fear', 'Disgust', 'Trust']
emotion_names_jp = ['喜び', '悲しみ', '期待', '驚き', '怒り', '恐れ', '嫌悪', '信頼']
num_labels = len(emotion_names)

df_wrime['readers_emotion_intensities'] = df_wrime.apply(lambda x: [x['Avg. Readers_' + name] for name in emotion_names], axis=1)

# removing samples with less emotion intensities
# (max.readers_emotion_intensities must be 2 or more)
is_target = df_wrime['readers_emotion_intensities'].map(lambda x: max(x) >= 2)
df_wrime_target = df_wrime[is_target]

In [32]:
def update_dataframe(df):
    if 'readers_emotion_intensities' not in df.columns:
        raise ValueError("DataFrame does not contain 'readers_emotion_intensities' column.")
    
    # DataFrameをコピーして新しいDataFrameを作成
    updated_df = df.copy()
    
    # 'readers_emotion_intensities'列の各要素を更新
    for index, row in updated_df.iterrows():
        max_value = max(row['readers_emotion_intensities'])
        updated_df.at[index, 'readers_emotion_intensities'] = [int(value == max_value) for value in row['readers_emotion_intensities']]
    
    return updated_df


In [33]:
df_wrime_target_updated = update_dataframe(df_wrime_target)

In [34]:
# readers_emotion_intensities 列に1が2回以上登場する行を削除する
df_wrime_target_updated_filtered = df_wrime_target_updated[df_wrime_target_updated['readers_emotion_intensities'].apply(lambda x: x.count(1) < 2)]

In [35]:
# 感情ラベルを保存するための空のリストを作成
emotion_labels = []

# 各行に対して処理を行う
for index, row in df_wrime_target_updated_filtered.iterrows():
    # 1が格納されているインデックスを取得し、対応する感情ラベルを取得する
    emotions = [emotion_names_jp[i] for i, val in enumerate(row['readers_emotion_intensities']) if val == 1]
    # 感情ラベルがない場合は空文字列を追加する
    if len(emotions) == 0:
        emotion_labels.append('')
    else:
        # 複数の感情がある場合はカンマ区切りの文字列に変換して追加する
        emotion_labels.append(', '.join(emotions))

# 新しい感情カラムをデータフレームに追加する
df_wrime_target_updated_filtered['emotion'] = emotion_labels


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wrime_target_updated_filtered['emotion'] = emotion_labels


In [36]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

# sm = SMOTE()
# x_resampled, y_resampled = sm.fit_sample(x_train, y_train)

# 各感情ラベルごとのサンプル数をカウント
emotion_counts = df_wrime_target_updated_filtered['emotion'].value_counts()

# アンダーサンプリングとオーバーサンプリングのためのサンプル数の計算
max_samples = emotion_counts.min()  # 最小のサンプル数を基準にする
min_samples = emotion_counts.min()

# アンダーサンプリング
rus = RandomUnderSampler(sampling_strategy={label: min_samples for label in emotion_counts.index})
undersampled_X, undersampled_y = rus.fit_resample(df_wrime_target_updated_filtered['Sentence'].values.reshape(-1, 1), df_wrime_target_updated_filtered['emotion'])

# オーバーサンプリング
ros = RandomOverSampler(sampling_strategy={label: min_samples for label in emotion_counts.index})
oversampled_X, oversampled_y = ros.fit_resample(undersampled_X, undersampled_y)

# データフレームに変換
df_wrime_target_updated_filtered_oversampled = pd.DataFrame({'Sentence': oversampled_X.flatten(), 'emotion': oversampled_y})

# シャッフル
df_wrime_target_updated_filtered_oversampled = df_wrime_target_updated_filtered_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)

# 結果の表示
print(df_wrime_target_updated_filtered_oversampled['emotion'].value_counts())


emotion
恐れ     45
信頼     45
期待     45
喜び     45
悲しみ    45
嫌悪     45
驚き     45
怒り     45
Name: count, dtype: int64


In [37]:
# df_wrime_target_updated_filtered_oversampled の Sentence カラムに存在する Sentence データの集合を作成
existing_sentences = set(df_wrime_target_updated_filtered_oversampled['Sentence'])

# df_wrime_target の Sentence カラムに含まれるエントリが df_wrime_target_updated_filtered_oversampled の Sentence カラムに存在しない行を削除
df_wrime_target_updated_filtered_processed = df_wrime_target_updated_filtered[df_wrime_target_updated_filtered['Sentence'].isin(existing_sentences)]


In [47]:
# emotion 列を削除する
df_wrime_target_updated_filtered_processed.drop(columns=['emotion'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wrime_target_updated_filtered_processed.drop(columns=['emotion'], inplace=True)


In [48]:
# 使用するモデルを指定して、Tokenizerを読み込む
checkpoint = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [49]:
# Divide into train, validation, and test sets
train_data, test_valid_data = train_test_split(df_wrime_target_updated_filtered_processed, test_size=0.4, random_state=42)
valid_data, test_data = train_test_split(test_valid_data, test_size=0.5, random_state=42)

In [50]:
def tokenize_function(batch):
    tokenized_batch = tokenizer(batch['Sentence'], truncation=True, padding='max_length', return_tensors="pt")
    tokenized_batch['labels'] = [x / np.sum(x) for x in batch['readers_emotion_intensities']]
    return tokenized_batch

In [52]:
# Transformers用のデータセット形式に変換
# pandas.DataFrame -> datasets.Dataset
target_columns = ['Sentence', 'readers_emotion_intensities']
train_dataset = Dataset.from_pandas(train_data[target_columns])
valid_dataset = Dataset.from_pandas(valid_data[target_columns])
test_dataset = Dataset.from_pandas(test_data[target_columns])

# 前処理（tokenize_function） を適用
train_tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
valid_tokenized_dataset = valid_dataset.map(tokenize_function, batched=True)
test_tokenized_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

In [53]:
# https://huggingface.co/docs/transformers/training
metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    label_ids = np.argmax(labels, axis=-1)
    return metric.compute(predictions=predictions, references=label_ids)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [54]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# %pip install accelerate transformers[torch] -U


In [74]:
# Transformers の Trainer を用いる
# https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/trainer#transformers.TrainingArguments

# 訓練時にerror → 上のコードセルをコメントアウト → 実行 → (仮想環境をdeactivate + restart vscode)もしくは(カーネルリスタート) → 上のコードセルをコメントアウト → run all the cells againで解決

# 訓練時の設定を修正
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=8, # originally 8
    num_train_epochs=1.0, # originally 1
    evaluation_strategy="steps", eval_steps=10)  # 10ステップ毎に検証データで評価する

# Trainerのインスタンス化
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=valid_tokenized_dataset,
    compute_metrics=compute_metrics,
)

# 訓練を実行
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
10,No log,0.379911,0.16
20,No log,0.369718,0.146667


TrainOutput(global_step=28, training_loss=0.36614915302821566, metrics={'train_runtime': 3.8327, 'train_samples_per_second': 58.444, 'train_steps_per_second': 7.306, 'total_flos': 58940051423232.0, 'train_loss': 0.36614915302821566, 'epoch': 1.0})

In [69]:
# https://www.delftstack.com/ja/howto/numpy/numpy-softmax/
def np_softmax(x):
    f_x = np.exp(x) / np.sum(np.exp(x))
    return f_x

In [70]:
# テキストを感情解析する関数
def analyze_emotion(text):
    # 推論モード
    model.eval()

    # 入力データ変換 + 推論
    tokens = tokenizer(text, truncation=True, return_tensors="pt")
    tokens.to(model.device)
    preds = model(**tokens)
    prob = np_softmax(preds.logits.cpu().detach().numpy()[0])
    out_dict = {n: p for n, p in zip(emotion_names_jp, prob)}
    out_list = list(out_dict.values())
    return out_list

In [71]:
# 結果を保存する空のリストを作成
predicted_labels = []

# test_tokenized_datasetからSentenceカラムのデータを取得
sentences = test_tokenized_dataset['Sentence']

# 各テキストにanalyze_emotion関数を適用し、結果をリストに保存
for text in sentences:
    result = analyze_emotion(text)
    predicted_labels.append(result)

true_labels = test_tokenized_dataset['labels']

In [72]:
# 予測結果と真のラベルをDataFrameに変換
predicted_df = pd.DataFrame(predicted_labels, columns=emotion_names_jp)
true_df = pd.DataFrame(true_labels, columns=emotion_names_jp)

In [73]:
predicted_df

Unnamed: 0,喜び,悲しみ,期待,驚き,怒り,恐れ,嫌悪,信頼
0,0.123901,0.134909,0.133190,0.122655,0.115807,0.137655,0.111521,0.120361
1,0.123715,0.134854,0.129580,0.131150,0.102251,0.128790,0.113987,0.135671
2,0.119426,0.133836,0.130413,0.121691,0.106651,0.133787,0.117834,0.136362
3,0.136457,0.128723,0.157675,0.122087,0.109651,0.124035,0.108002,0.113371
4,0.136299,0.104186,0.188281,0.116786,0.085835,0.116474,0.109730,0.142409
...,...,...,...,...,...,...,...,...
70,0.121955,0.138370,0.143971,0.124927,0.121022,0.136234,0.118408,0.095111
71,0.124824,0.139044,0.130428,0.122932,0.123088,0.120874,0.119115,0.119695
72,0.115283,0.143093,0.133416,0.117486,0.108497,0.134517,0.119981,0.127727
73,0.123758,0.130461,0.131758,0.126298,0.107859,0.127195,0.130747,0.121924


In [62]:
# DataFrameの各行を更新して、最大値に1、それ以外に0を持つようにする
def update_dataframe(df):
    for index, row in df.iterrows():
        max_value = row.max()
        df.loc[index] = (row == max_value).astype(int)
    return df

In [63]:
predicted_process_values = update_dataframe(predicted_df)
true_process_values = update_dataframe(true_df)

In [64]:
# 各DataFrameから最大の感情を抽出
def get_max_emotions(df):
    max_emotions = []
    for index, row in df.iterrows():
        max_emotions.append(row.index[row == 1].tolist())
    return pd.DataFrame({'Emotions': max_emotions})

predicted_emotions = get_max_emotions(predicted_df)
true_emotions = get_max_emotions(true_df)

In [65]:
# true_emotionsのリストサイズが2以上の場合、以下の2つを実行
# 1:predリストの感情がtrueリストにある場合は、predリストと一致する感情を除いてtrueリスト内の感情を削除 
# 2:predリストの感情がtrueリストにない場合は、両リストの感情をすべて削除して空リストにする→混同行列・F1スコア計算には含まないエントリとして扱う
def remove_extra_emotions(predicted_emotions, true_emotions):
    for idx, (pred, true) in zip(predicted_emotions.index, zip(predicted_emotions['Emotions'], true_emotions['Emotions'])):
        if len(true) >= 2:
            true_emotions.at[idx, 'Emotions'] = [emotion for emotion in true if emotion in pred] if any(emotion in pred for emotion in true) else []

remove_extra_emotions(predicted_emotions, true_emotions)

In [66]:
# 混同行列の作成
confusion_matrix_data = pd.DataFrame(0, index=emotion_labels, columns=emotion_labels)
for pred, true in zip(predicted_emotions['Emotions'], true_emotions['Emotions']):
    for pred_label in pred:
        if pred_label in emotion_labels:
            for true_label in true:
                if true_label in emotion_labels:
                    confusion_matrix_data.at[true_label, pred_label] += 1

# 各列と各行に合計値を追加
confusion_matrix_data['合計'] = confusion_matrix_data.sum(axis=1)
confusion_matrix_data.loc['合計'] = confusion_matrix_data.sum()

# 混同行列に明記
confusion_matrix_data.index.name = '予測値'
confusion_matrix_data.columns.name = '正解値'

In [67]:
confusion_matrix_data

正解値,悲しみ,驚き,喜び,期待,期待,悲しみ,恐れ,信頼,期待,期待,...,悲しみ,恐れ,喜び,恐れ,恐れ,喜び,喜び,期待,喜び,合計
予測値,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
悲しみ,2,0,0,3,3,2,0,0,3,3,...,2,0,0,0,0,0,0,3,0,17959
驚き,4,0,0,3,3,4,2,0,3,3,...,4,2,0,2,2,0,0,3,0,26941
喜び,3,0,0,5,5,3,1,0,5,5,...,3,1,0,1,1,0,0,5,0,30556
期待,1,0,0,10,10,1,0,0,10,10,...,1,0,0,0,0,0,0,10,0,43430
期待,1,0,0,10,10,1,0,0,10,10,...,1,0,0,0,0,0,0,10,0,43430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
喜び,3,0,0,5,5,3,1,0,5,5,...,3,1,0,1,1,0,0,5,0,30556
喜び,3,0,0,5,5,3,1,0,5,5,...,3,1,0,1,1,0,0,5,0,30556
期待,1,0,0,10,10,1,0,0,10,10,...,1,0,0,0,0,0,0,10,0,43430
喜び,3,0,0,5,5,3,1,0,5,5,...,3,1,0,1,1,0,0,5,0,30556


In [68]:
# 各感情ラベルのPrecision、Recall、F1スコアを計算
f1_scores = {}
for emotion_label in emotion_names_jp:
    tp = confusion_matrix_data.at[emotion_label, emotion_label]
    fp = confusion_matrix_data.loc[emotion_label, '合計'] - tp
    fn = confusion_matrix_data.loc['合計', emotion_label] - tp
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    f1_scores[emotion_label] = {'Precision': precision, 'Recall': recall, 'F1 Score': f1_score}

# 結果を表示
f1_scores_df = pd.DataFrame.from_dict(f1_scores, orient='index')
print("各感情ラベルのPrecision、Recall、F1スコア:")
print(f1_scores_df)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().