In [1]:
# # カーネルリスタートの時はこのセルを実行しなくてもOK
# !wget https://bootstrap.pypa.io/get-pip.py
# !python get-pip.py
# %pip install tokenizers fugashi ipadic accelerate==0.20.3 seaborn
# %pip install transformers datasets scikit-learn
# !wget https://github.com/ids-cv/wrime/raw/master/wrime-ver1.tsv
# %pip install -U imbalanced-learn

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from torch import nn
from datasets import Dataset, load_metric
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig, AdamW, get_linear_schedule_with_warmup

In [3]:
df_wrime = pd.read_table('wrime-ver1.tsv')
emotion_names = ['Joy', 'Sadness', 'Anticipation', 'Surprise', 'Anger', 'Fear', 'Disgust', 'Trust']
emotion_names_jp = ['喜び', '悲しみ', '期待', '驚き', '怒り', '恐れ', '嫌悪', '信頼']
num_labels = len(emotion_names)

df_wrime['readers_emotion_intensities'] = df_wrime.apply(lambda x: [x['Avg. Readers_' + name] for name in emotion_names], axis=1)

# removing samples with less emotion intensities
# (max.readers_emotion_intensities must be 2 or more)
is_target = df_wrime['readers_emotion_intensities'].map(lambda x: max(x) >= 2)
df_wrime_target = df_wrime[is_target]

In [4]:
target_columns = ['Sentence', 'readers_emotion_intensities']


In [5]:
def update_dataframe(df):
    if 'readers_emotion_intensities' not in df.columns:
        raise ValueError("DataFrame does not contain 'readers_emotion_intensities' column.")
    
    # DataFrameをコピーして新しいDataFrameを作成
    updated_df = df.copy()
    
    # 'readers_emotion_intensities'列の各要素を更新
    for index, row in updated_df.iterrows():
        max_value = max(row['readers_emotion_intensities'])
        updated_df.at[index, 'readers_emotion_intensities'] = [int(value == max_value) for value in row['readers_emotion_intensities']]
    
    return updated_df


In [6]:
df_wrime_target_updated = update_dataframe(df_wrime_target)

In [7]:
# readers_emotion_intensities 列に1が2回以上登場する行を削除する
df_wrime_target_updated_filtered = df_wrime_target_updated[df_wrime_target_updated['readers_emotion_intensities'].apply(lambda x: x.count(1) < 2)]

In [8]:
# 感情ラベルを保存するための空のリストを作成
emotion_labels = []

# 各行に対して処理を行う
for index, row in df_wrime_target_updated_filtered.iterrows():
    # 1が格納されているインデックスを取得し、対応する感情ラベルを取得する
    emotions = [emotion_names_jp[i] for i, val in enumerate(row['readers_emotion_intensities']) if val == 1]
    # 感情ラベルがない場合は空文字列を追加する
    if len(emotions) == 0:
        emotion_labels.append('')
    else:
        # 複数の感情がある場合はカンマ区切りの文字列に変換して追加する
        emotion_labels.append(', '.join(emotions))

# 新しい感情カラムをデータフレームに追加する
df_wrime_target_updated_filtered['emotion'] = emotion_labels


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wrime_target_updated_filtered['emotion'] = emotion_labels


In [33]:
print(df_wrime_target_updated_filtered['emotion'].value_counts())


emotion
喜び     4441
期待     4053
悲しみ    2900
驚き     1955
恐れ     1591
嫌悪     1024
怒り      197
信頼       45
Name: count, dtype: int64


In [9]:
# 各感情ラベルごとのサンプル数をカウント
emotion_counts = df_wrime_target_updated_filtered['emotion'].value_counts()

# オーバーサンプリングのためのサンプル数の計算
max_samples = emotion_counts.max()  # 最大のサンプル数を基準にする
min_samples = emotion_counts.min()

# オーバーサンプリング
ros = RandomOverSampler(sampling_strategy={label: max_samples for label in emotion_counts.index})
oversampled_X, oversampled_y = ros.fit_resample(df_wrime_target_updated_filtered['Sentence'].values.reshape(-1, 1), df_wrime_target_updated_filtered['emotion'])

# データフレームに変換
df_wrime_target_updated_filtered_oversampled = pd.DataFrame({'Sentence': oversampled_X.flatten(), 'emotion': oversampled_y})


# 結果の表示
print(df_wrime_target_updated_filtered_oversampled['emotion'].value_counts())


emotion
悲しみ    4441
驚き     4441
喜び     4441
期待     4441
恐れ     4441
信頼     4441
怒り     4441
嫌悪     4441
Name: count, dtype: int64


In [10]:
def convert_emotion_to_binary(emotion_label):
    # ラベルが存在する場合は対応するバイナリデータを作成
    if emotion_label in emotion_names_jp:
        # インデックスを取得
        idx = emotion_names_jp.index(emotion_label)
        # インデックスに対応する位置を1、それ以外を0とするリストを作成
        binary_data = [1 if i == idx else 0 for i in range(len(emotion_names_jp))]
        return binary_data
    else:
        # ラベルが存在しない場合はエラーメッセージを出力してNoneを返す
        print("Invalid emotion label!")
        return [0] * len(emotion_names_jp)

# NaNを埋める
df_wrime_target_updated_filtered_oversampled['readers_emotion_intensities'] = df_wrime_target_updated_filtered_oversampled['emotion'].apply(convert_emotion_to_binary)


In [11]:
# 使用するモデルを指定して、Tokenizerを読み込む
checkpoint = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [12]:
# Divide into train, validation, and test sets
train_data, test_valid_data = train_test_split(df_wrime_target_updated_filtered_oversampled, test_size=0.4, random_state=42)
valid_data, test_data = train_test_split(test_valid_data, test_size=0.5, random_state=42)

In [13]:
def tokenize_function(batch):
    tokenized_batch = tokenizer(batch['Sentence'], truncation=True, padding='max_length', return_tensors="pt")
    tokenized_batch['labels'] = [x / np.sum(x) for x in batch['readers_emotion_intensities']]
    return tokenized_batch

In [14]:
# Transformers用のデータセット形式に変換
# pandas.DataFrame -> datasets.Dataset
target_columns = ['Sentence', 'readers_emotion_intensities']
train_dataset = Dataset.from_pandas(train_data[target_columns])
valid_dataset = Dataset.from_pandas(valid_data[target_columns])
test_dataset = Dataset.from_pandas(test_data[target_columns])

# 前処理（tokenize_function） を適用
train_tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
valid_tokenized_dataset = valid_dataset.map(tokenize_function, batched=True)
test_tokenized_dataset = test_dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/21316 [00:00<?, ? examples/s]

Map:   0%|          | 0/7106 [00:00<?, ? examples/s]

Map:   0%|          | 0/7106 [00:00<?, ? examples/s]

In [15]:
# https://huggingface.co/docs/transformers/training
metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    label_ids = np.argmax(labels, axis=-1)
    return metric.compute(predictions=predictions, references=label_ids)

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [16]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# %pip install accelerate transformers[torch] -U


訓練時にerror → 上のコードセルをコメントアウト → 実行 → (仮想環境をdeactivate + restart vscode)もしくは(カーネルリスタート) → 上のコードセルをコメントアウト → run all the cells againで解決


In [18]:
# Transformers の Trainer を用いる
# https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/trainer#transformers.TrainingArguments

# 訓練時の設定を修正
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=16, # originally 8
    num_train_epochs=3.0, # originally 1
    evaluation_strategy="steps", eval_steps=500)  # 500ステップ毎に検証データで評価する

# Trainerを生成
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=valid_tokenized_dataset,  # 検証データを使用する
    compute_metrics=compute_metrics,
)

# 訓練を実行
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
500,0.2444,0.163124,0.746834
1000,0.149,0.125771,0.808331
1500,0.1035,0.109874,0.842387
2000,0.0666,0.102253,0.853786
2500,0.0607,0.090913,0.876724
3000,0.0386,0.097098,0.878694
3500,0.0269,0.095092,0.885167


Checkpoint destination directory test_trainer/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/checkpoint-3000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/checkpoint-3500 already exists and is non-empty.Saving will proceed

TrainOutput(global_step=3999, training_loss=0.08923298539325994, metrics={'train_runtime': 1042.4949, 'train_samples_per_second': 61.341, 'train_steps_per_second': 3.836, 'total_flos': 1.6826332180414464e+16, 'train_loss': 0.08923298539325994, 'epoch': 3.0})

In [19]:
# https://www.delftstack.com/ja/howto/numpy/numpy-softmax/
def np_softmax(x):
    f_x = np.exp(x) / np.sum(np.exp(x))
    return f_x

In [20]:
# テキストを感情解析する関数
def analyze_emotion(text):
    # 推論モード
    model.eval()

    # 入力データ変換 + 推論
    tokens = tokenizer(text, truncation=True, return_tensors="pt")
    tokens.to(model.device)
    preds = model(**tokens)
    prob = np_softmax(preds.logits.cpu().detach().numpy()[0])
    out_dict = {n: p for n, p in zip(emotion_names_jp, prob)}
    out_list = list(out_dict.values())
    return out_list

In [21]:
# 結果を保存する空のリストを作成
predicted_labels = []

# test_tokenized_datasetからSentenceカラムのデータを取得
sentences = test_tokenized_dataset['Sentence']

# 各テキストにanalyze_emotion関数を適用し、結果をリストに保存
for text in sentences:
    result = analyze_emotion(text)
    predicted_labels.append(result)

true_labels = test_tokenized_dataset['labels']

In [22]:
# 予測結果と真のラベルをDataFrameに変換
predicted_df = pd.DataFrame(predicted_labels, columns=emotion_names_jp)
true_df = pd.DataFrame(true_labels, columns=emotion_names_jp)

In [23]:
# DataFrameの各行を更新して、最大値に1、それ以外に0を持つようにする
def update_dataframe(df):
    for index, row in df.iterrows():
        max_value = row.max()
        df.loc[index] = (row == max_value).astype(int)
    return df

In [24]:
predicted_process_values = update_dataframe(predicted_df)
true_process_values = update_dataframe(true_df)

In [25]:
# 各DataFrameから最大の感情を抽出
def get_max_emotions(df):
    max_emotions = []
    for index, row in df.iterrows():
        max_emotions.append(row.index[row == 1].tolist())
    return pd.DataFrame({'Emotions': max_emotions})

predicted_emotions = get_max_emotions(predicted_df)
true_emotions = get_max_emotions(true_df)

In [26]:
# true_emotionsのリストサイズが2以上の場合、以下の2つを実行
# 1:predリストの感情がtrueリストにある場合は、predリストと一致する感情を除いてtrueリスト内の感情を削除 
# 2:predリストの感情がtrueリストにない場合は、両リストの感情をすべて削除して空リストにする→混同行列・F1スコア計算には含まないエントリとして扱う
def remove_extra_emotions(predicted_emotions, true_emotions):
    for idx, (pred, true) in zip(predicted_emotions.index, zip(predicted_emotions['Emotions'], true_emotions['Emotions'])):
        if len(true) >= 2:
            true_emotions.at[idx, 'Emotions'] = [emotion for emotion in true if emotion in pred] if any(emotion in pred for emotion in true) else []

remove_extra_emotions(predicted_emotions, true_emotions)

In [27]:
# 感情ラベルを数値に変換する関数を定義
def label_to_index(label):
    return emotion_names_jp.index(label)

# 予測値と正解値の感情ラベルを数値に変換
predicted_indices = [label_to_index(label) for labels in predicted_emotions['Emotions'] for label in labels]
true_indices = [label_to_index(label) for labels in true_emotions['Emotions'] for label in labels]

# 混同行列を作成
confusion_matrix_data = confusion_matrix(true_indices, predicted_indices)

# 混同行列をDataFrameに変換
confusion_matrix_df = pd.DataFrame(confusion_matrix_data, index=[f'真: {label}' for label in emotion_names_jp], columns=[f'予: {label}' for label in emotion_names_jp])

# 各行と各列の合計を追加
confusion_matrix_df['合計'] = confusion_matrix_df.sum(axis=1)
confusion_matrix_df.loc['合計'] = confusion_matrix_df.sum()

In [28]:
confusion_matrix_df

Unnamed: 0,予: 喜び,予: 悲しみ,予: 期待,予: 驚き,予: 怒り,予: 恐れ,予: 嫌悪,予: 信頼,合計
真: 喜び,719,31,82,43,1,13,7,1,897
真: 悲しみ,12,734,36,36,1,60,23,0,902
真: 期待,75,28,718,22,0,25,6,0,874
真: 驚き,47,14,16,766,0,31,14,0,888
真: 怒り,0,0,0,0,886,0,0,0,886
真: 恐れ,7,32,22,20,0,776,17,0,874
真: 嫌悪,6,15,2,8,3,11,828,0,873
真: 信頼,0,0,0,0,0,0,0,912,912
合計,866,854,876,895,891,916,895,913,7106


In [29]:
# 各感情ラベルのPrecision、Recall、F1スコアを計算する関数を定義
def calculate_metrics(confusion_matrix):
    metrics = {}
    for i in range(len(emotion_names_jp)):
        true_positive = confusion_matrix[i, i]
        false_positive = confusion_matrix[:, i].sum() - true_positive
        false_negative = confusion_matrix[i, :].sum() - true_positive
        precision = true_positive / (true_positive + false_positive) if true_positive + false_positive > 0 else 0
        recall = true_positive / (true_positive + false_negative) if true_positive + false_negative > 0 else 0
        f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
        metrics[emotion_names_jp[i]] = {'Precision': precision, 'Recall': recall, 'F1 Score': f1_score}
    return metrics

# 混同行列からPrecision、Recall、F1スコアを計算
metrics = calculate_metrics(confusion_matrix_data)

metrics_df = pd.DataFrame(metrics).transpose()

In [30]:
metrics_df

Unnamed: 0,Precision,Recall,F1 Score
喜び,0.830254,0.801561,0.815655
悲しみ,0.859485,0.813747,0.835991
期待,0.819635,0.82151,0.820571
驚き,0.855866,0.862613,0.859226
怒り,0.994388,1.0,0.997186
恐れ,0.847162,0.887872,0.867039
嫌悪,0.92514,0.948454,0.936652
信頼,0.998905,1.0,0.999452


In [31]:
model.save_pretrained("/workspace/0212_omg_its_working_finally.bin")

In [32]:
!pwd

/workspace/0212_omg_its_working_finally
