### Empathetic Dialogues Classification (For Testing)
1. import 基本的package
2. 以下程式是個人環境的設置，可自行移除
'''
device = torch.device(1)
torch.cuda.set_device(device)
'''

In [1]:
import numpy as np
import pandas as pd
import torch
import yaml
import os
from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from everythingDataset import EverythingDataset
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve,confusion_matrix

# Report whether PyTorch can see a CUDA GPU. This is the cell's last expression,
# so the boolean result is what the cell displays.
torch.cuda.is_available() # check if GPU is ready
# Optional, machine-specific setup (disabled): pin the process to the CUDA device
# with index 1 and re-check availability. Uncomment only if that device exists.
#device = torch.device(1)
#torch.cuda.set_device(device)
#torch.cuda.is_available()

  from .autonotebook import tqdm as notebook_tqdm


False

### Data 前處理 - 將句子合成成完整對話
1. ed_data_extract.py會將"data"這個資料夾內的fixed_train.csv, fixed_valid.csv, fixed_test.csv進行前處理，並放置在"data/empathetic_dialogues"
2. 另有一個EDA.ipynb，是將處理完的"data/empathetic_dialogues/train.csv"再做EDA處理，輸出"data/empathetic_dialogues/EDA_train.csv"，但因為最終的best model並不是有使用EDA的，因此這邊可以忽略

In [2]:
# Preprocess the raw CSV files by running ed_data_extract.py as a shell command.
# The leading `!` is IPython's shell escape, so this line only works inside a
# notebook/IPython session, not in a plain Python script.
!python ed_data_extract.py

Datatype: train
Datatype: valid
Datatype: test


### 從config.yaml載入相關資訊，eval跟train共用一個config，因此其實有些參數並不會用到
- output_folder: submission.csv存放的地方
- random_seed:如字面的意思，但最後沒有用到xD
- pretrain_model_name: 從huggingface載入哪個pretrain model
- epoch: train的epoch數
- batch_size: GPU同時訓練的batch size，視GPU能力設定
- label_num: 32種情緒
- train_file_path: 前處理過的csv，應該設定為"data/empathetic_dialogues/train.csv"，除非有手動換地方
- valid_file_path: 前處理過的csv，應該設定為"data/empathetic_dialogues/valid.csv"，除非有手動換地方
- test_file_path: 前處理過的csv，應該設定為"data/empathetic_dialogues/test.csv"，除非有手動換地方
- original_test_file_path: 未處理過的fixed_test.csv，應該設定為"data/fixed_test.csv"，除非有手動換地方
- model_path: 存放訓練過model的資料夾

### 注意! 這邊會印出config裡面的資訊，因此先確認config資訊是否無誤

In [3]:
# Load the run configuration from config.yaml (the same file is shared by the
# training and evaluation notebooks, so not every key is used here).
with open('config.yaml', 'r', encoding='utf-8') as yaml_f:
    # safe_load only builds plain YAML types (dict/list/str/int/...), which is all
    # a config file needs. An empty file parses to None, so fall back to {}.
    cfg = yaml.safe_load(yaml_f) or {}

if not isinstance(cfg, dict):
    # Fail early with a readable message instead of an AttributeError on .get().
    raise ValueError(
        f'config.yaml must contain a top-level mapping, got {type(cfg).__name__}')

# Keep one explicit assignment per setting so later cells can use plain names.
# Missing keys become None, exactly as before.
output_folder = cfg.get('output_folder')
random_seed = cfg.get('random_seed')
pretrain_model_name = cfg.get('pretrain_model_name')
epoch = cfg.get('epoch')
batch_size = cfg.get('batch_size')
label_num = cfg.get('label_num')
train_file_path = cfg.get('train_file_path')
valid_file_path = cfg.get('valid_file_path')
test_file_path = cfg.get('test_file_path')
original_test_file_path = cfg.get('original_test_file_path')  # used for the output sample
model_path = cfg.get('model_path')

# Echo the effective settings so they can be checked before the long-running cells.
config_banner = '############## showing config ##############'
print(config_banner)
for config_key in (
        'output_folder', 'random_seed', 'pretrain_model_name', 'epoch',
        'batch_size', 'label_num', 'train_file_path', 'valid_file_path',
        'test_file_path', 'original_test_file_path', 'model_path'):
    print(f'{config_key}: {cfg.get(config_key)}')
print(config_banner)

############## showing config ##############
output_folder: output
random_seed: 777
pretrain_model_name: roberta-base
epoch: 10
batch_size: 1
label_num: 32
train_file_path: data/empathetic_dialogues/train.csv
valid_file_path: data/empathetic_dialogues/valid.csv
test_file_path: data/empathetic_dialogues/test.csv
original_test_file_path: data/fixed_test.csv
model_path: best_model_4500_6000
############## showing config ##############


### 載入data並顯示前五筆資料

In [4]:
# Read the three preprocessed splits from the paths given in the config.
train_df = pd.read_csv(train_file_path)
valid_df = pd.read_csv(valid_file_path)
test_df = pd.read_csv(test_file_path)
print(f'train num: {len(train_df)},  valid num: {len(valid_df)}, test num: {len(test_df)}\n')

# One constant drives both the message and the preview, so they cannot disagree
# (the message used to say 10 while only 5 rows were shown).
N_PREVIEW_ROWS = 5
print(f'first {N_PREVIEW_ROWS} rows in train_df:')
train_df.head(N_PREVIEW_ROWS)

train num: 19533,  valid num: 2770, test num: 2547

first 10 row in train_df:


Unnamed: 0,prompt,utterance_data,speaker_utterance,listener_utterance,emotion,emotion_label,speaker_info
0,I remember going to the fireworks with my best...,I remember going to see the fireworks with my ...,I remember going to see the fireworks with my ...,"Was this a friend you were in love with, or ju...",13,sentimental,"[0, 1, 0, 1, 0, 1]"
1,i used to scare for darkness,it feels like hitting to blank wall when i se...,it feels like hitting to blank wall when i se...,Oh ya? I don't really see howI do actually hit...,11,afraid,"[0, 1, 0, 1, 0, 1]"
2,I showed a guy how to run a good bead in weldi...,Hi how are you doing todaydoing good.. how abo...,"Hi how are you doing todayIm good, trying to u...",doing good.. how about youit's quite strange t...,29,proud,"[0, 1, 0, 1, 0]"
3,I have always been loyal to my wife.,I have never cheated on my wife.And thats some...,I have never cheated on my wife.Yea it hasn't ...,"And thats something you should never do, good ...",5,faithful,"[0, 1, 0, 1]"
4,A recent job interview that I had made me feel...,"Job interviews always make me sweat bullets, m...","Job interviews always make me sweat bullets, m...",Don't be nervous. Just be prepared.Yes but if ...,2,terrified,"[0, 1, 0, 1, 0]"


### 載入tokenizer和metric

In [5]:
# Tokenizer matching the pretrained checkpoint named in the config.
tokenizer = AutoTokenizer.from_pretrained(pretrain_model_name)

# Metric objects consumed by compute_metrics: accuracy first, then F1.
# NOTE(review): `datasets.load_metric` is reportedly deprecated in newer
# `datasets` releases in favour of the `evaluate` package - confirm against the
# pinned version before upgrading.
metric_acc, metric_f1 = (load_metric(metric_name) for metric_name in ('accuracy', 'f1'))

def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and macro-averaged F1.

    Args:
        eval_pred: A two-item sequence ``(logits, labels)``. The predicted class
            of each row is the argmax over the last axis of ``logits``.

    Returns:
        dict: ``{'accuracy': ..., 'F1': ...}`` where the values are whatever the
        module-level ``metric_acc`` / ``metric_f1`` objects report under their
        ``'accuracy'`` and ``'f1'`` keys.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)
    # Both metrics receive the same prediction/reference pair.
    shared_kwargs = dict(predictions=predicted, references=gold)
    return {
        'accuracy': metric_acc.compute(**shared_kwargs)['accuracy'],
        'F1': metric_f1.compute(**shared_kwargs, average='macro')['f1'],
    }

Downloading: 100%|██████████| 481/481 [00:00<00:00, 240kB/s]
Downloading: 100%|██████████| 878k/878k [00:01<00:00, 534kB/s]  
Downloading: 100%|██████████| 446k/446k [00:01<00:00, 364kB/s]  
Downloading: 100%|██████████| 1.29M/1.29M [00:01<00:00, 810kB/s] 


### Testing trained model
1. 從config.yaml取得model_path並載入trained model
2. 輸出submission.csv
#### 注意!!! "model_path"資料夾需含有config.json和pytorch_model.bin兩個檔案才能運作!!! 

In [7]:
# Test-set inference: load the fine-tuned model, predict one emotion id per
# conversation, and write submission.csv into the configured output folder.

# Load the trained model from the folder named in the config
# (a folder path such as 'best_model' can also be passed in by hand).
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# The Trainer is only used for trainer.predict(...) in this notebook; the
# training-related arguments mirror the shared train/eval configuration.
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir='model',
        logging_dir='log',
        logging_strategy='epoch',
        evaluation_strategy='epoch',
        num_train_epochs=epoch,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size),
    train_dataset=EverythingDataset(train_df, tokenizer),
    eval_dataset=EverythingDataset(valid_df, tokenizer),
    compute_metrics=compute_metrics
)

# Predicted class id = argmax over the class axis of the returned predictions.
test_preds = trainer.predict(EverythingDataset(test_df, tokenizer))
test_ans = np.argmax(test_preds.predictions, axis=-1)

# Re-read the preprocessed test file so the predictions are attached to a fresh
# frame (one row per prediction, in file order).
new_test = pd.read_csv(test_file_path)
new_test['pred'] = test_ans

# Broadcast each conversation-level prediction to every row of the original,
# unprocessed test file that shares its conv_id. A single dict lookup per row
# replaces the previous row-by-row .loc scan. Rows whose conv_id has no
# prediction keep the sentinel -1, as before.
submission = pd.read_csv(original_test_file_path)
pred_by_conv_id = dict(zip(new_test['conv_id'], new_test['pred']))
submission['pred'] = (
    submission['conv_id'].map(pred_by_conv_id).fillna(-1).astype('int64'))

# Write to the folder named in the config (previously hard-coded to 'output'),
# creating it if needed.
os.makedirs(output_folder, exist_ok=True)
submission[['pred']].to_csv(os.path.join(output_folder, 'submission.csv'), encoding='utf8')

submission

***** Running Prediction *****
  Num examples = 2547
  Batch size = 1
100%|██████████| 2547/2547 [1:08:46<00:00,  1.74s/it]

Unnamed: 0,conv_id,utterance_idx,prompt,utterance,pred
0,hit:0_conv:0,1,I felt guilty when I was driving home one nigh...,Yeah about 10 years ago I had a horrifying exp...,25
1,hit:0_conv:0,2,I felt guilty when I was driving home one nigh...,Did you suffer any injuries?,25
2,hit:0_conv:0,3,I felt guilty when I was driving home one nigh...,No I wasn't hit. It turned out they were drunk...,25
3,hit:0_conv:0,4,I felt guilty when I was driving home one nigh...,Why did you feel guilty? People really shouldn...,25
4,hit:0_conv:0,5,I felt guilty when I was driving home one nigh...,I don't know I was new to driving and hadn't e...,25
...,...,...,...,...,...
10968,hit:12416_conv:24832,4,I saw a huge cockroach outside my house today....,I live in Texas to so i know those feels,8
10969,hit:12423_conv:24847,1,I have a big test on Monday. I am so nervous_c...,I have a big test on Monday_comma_ I am so ner...,18
10970,hit:12423_conv:24847,2,I have a big test on Monday. I am so nervous_c...,What is the test on?,18
10971,hit:12423_conv:24847,3,I have a big test on Monday. I am so nervous_c...,It's for my Chemistry class. I haven't slept m...,18


### 顯示valid data的confusion matrix

In [None]:
# Emotion name -> integer class id (32 classes).
ed_label_dict = {'sad': 0, 'trusting': 1, 'terrified': 2, 'caring': 3, 'disappointed': 4,
         'faithful': 5, 'joyful': 6, 'jealous': 7, 'disgusted': 8, 'surprised': 9,
         'ashamed': 10, 'afraid': 11, 'impressed': 12, 'sentimental': 13, 
         'devastated': 14, 'excited': 15, 'anticipating': 16, 'annoyed': 17, 'anxious': 18,
         'furious': 19, 'content': 20, 'lonely': 21, 'angry': 22, 'confident': 23,
         'apprehensive': 24, 'guilty': 25, 'embarrassed': 26, 'grateful': 27,
         'hopeful': 28, 'proud': 29, 'prepared': 30, 'nostalgic': 31}

# Inverse lookup: integer class id -> emotion name.
ed_emo_dict =  {v: k for k, v in ed_label_dict.items()}

# Predict on the validation split; class id = argmax over the class axis.
valid_preds = trainer.predict(EverythingDataset(valid_df, tokenizer))
valid_ans = np.argmax(valid_preds.predictions, axis=-1)

# Always build the matrix over the full, fixed set of class ids. Deriving the
# label list from the validation data would shrink the matrix whenever a class
# is absent from it (breaking the 32-name index below) and would silently drop
# samples predicted as such a class.
class_ids = sorted(ed_emo_dict)
labels = [ed_emo_dict[i] for i in class_ids]
cm = confusion_matrix(valid_df['emotion'], valid_ans, labels=class_ids)
pd_annot = pd.DataFrame(cm, index=labels, columns=labels)

font = {'family': 'sans-serif',
            'color': 'k',
            'weight': 'normal',
            'size': 20,}

fig, ax = plt.subplots(figsize=(20, 20))
cmap = 'YlGnBu'
ax = sns.heatmap(pd_annot, annot=True, ax=ax, fmt='d', cmap=cmap)  # draw the heatmap

ax.set_xlabel('Predict', fontsize=20, color='k')
ax.set_ylabel('Truth', fontsize=20, color='k')
ax.tick_params(axis='both', labelsize=20)
ax.set_title('Confusion Matrix', fontsize=20)

# Colorbar: enlarge the tick labels and label it with what the cells contain.
# The cells are raw sample counts (fmt='d'), so the previous 'NMI' label was wrong.
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=20)
cbar.set_label('Count', fontdict=font)

# Make sure the target folder exists before saving the figure.
os.makedirs(output_folder, exist_ok=True)
plt.savefig(os.path.join(output_folder, 'confusion matrix.png'))
plt.show()