In [1]:
import pandas as pd
# 讀取資料集
train = pd.read_csv("train/fixed_train.csv")#.sample(n=5000)
valid = pd.read_csv("train/fixed_valid.csv")#.sample(n=1000)
test = pd.read_csv("train/fixed_test.csv")#.sample(n=1000)
train.head()

Unnamed: 0,conv_id,utterance_idx,prompt,utterance,label
0,hit:0_conv:1,1,I remember going to the fireworks with my best...,I remember going to see the fireworks with my ...,13
1,hit:0_conv:1,2,I remember going to the fireworks with my best...,Was this a friend you were in love with_comma_...,13
2,hit:0_conv:1,3,I remember going to the fireworks with my best...,This was a best friend. I miss her.,13
3,hit:0_conv:1,4,I remember going to the fireworks with my best...,Where has she gone?,13
4,hit:0_conv:1,5,I remember going to the fireworks with my best...,We no longer talk.,13


In [2]:
# data cleaning about _comma_
train['prompt'] = train['prompt'].map(lambda x:x.replace('_comma_',','))
train['utterance'] = train['utterance'].map(lambda x:x.replace('_comma_',','))
valid['prompt'] = valid['prompt'].map(lambda x:x.replace('_comma_',','))
valid['utterance'] = valid['utterance'].map(lambda x:x.replace('_comma_',','))
test['prompt'] = test['prompt'].map(lambda x:x.replace('_comma_',','))
test['utterance'] = test['utterance'].map(lambda x:x.replace('_comma_',','))

In [3]:
print("訓練樣本數：", len(train))
print("驗證樣本數：", len(valid))
print("測試樣本數：", len(test))

訓練樣本數： 84169
驗證樣本數： 12078
測試樣本數： 10973


In [4]:
emotion_label = {'sad': 0, 'trusting': 1, 'terrified': 2, 'caring': 3, 'disappointed': 4,
         'faithful': 5, 'joyful': 6, 'jealous': 7, 'disgusted': 8, 'surprised': 9,
         'ashamed': 10, 'afraid': 11, 'impressed': 12, 'sentimental': 13, 
         'devastated': 14, 'excited': 15, 'anticipating': 16, 'annoyed': 17, 'anxious': 18,
         'furious': 19, 'content': 20, 'lonely': 21, 'angry': 22, 'confident': 23,
         'apprehensive': 24, 'guilty': 25, 'embarrassed': 26, 'grateful': 27,
         'hopeful': 28, 'proud': 29, 'prepared': 30, 'nostalgic': 31}
len(emotion_label)

32

In [5]:
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset
from IPython.display import clear_output

# 取得此預訓練模型所使用的 tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class EmotionDataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, df, tokenizer):
        assert mode in ["train", "valid", "test"]
        self.mode = mode
        self.df = df.fillna("")
        self.len = len(self.df)
        self.label_map = emotion_label
#         {'sad': 0, 'trusting': 1, 'terrified': 2, 'caring': 3, 'disappointed': 4,
#          'faithful': 5, 'joyful': 6, 'jealous': 7, 'disgusted': 8, 'surprised': 9,
#          'ashamed': 10, 'afraid': 11, 'impressed': 12, 'sentimental': 13, 
#          'devastated': 14, 'excited': 15, 'anticipating': 16, 'annoyed': 17, 'anxious': 18,
#          'furious': 19, 'content': 20, 'lonely': 21, 'angry': 22, 'confident': 23,
#          'apprehensive': 24, 'guilty': 25, 'embarrassed': 26, 'grateful': 27,
#          'hopeful': 28, 'proud': 29, 'prepared': 30, 'nostalgic': 31}
        self.tokenizer = tokenizer
    
    # 這裡需要定義回傳一筆訓練 / 測試數據的函式，
    # 也就是當以 [idx] 來 index Dataset 時，要回傳的東西
    def __getitem__(self, idx):
        if self.mode == "test":
            prompt, utterance = self.df.iloc[idx, 2:].values
            label_tensor = None
        else:
            prompt, utterance, label = self.df.iloc[idx, 2:].values
            label_tensor = torch.tensor(label)
            
        #text = '[CLS]' + prompt + ' [SEP]' + utterance + ' [SEP]'
        text = '[CLS]' + utterance + ' [SEP]'
#         tensor_arr = []
#         i = 0
#         for seg in utterance.split(". "):
#             text = text + seg + ' [SEP]'
#             tensor_arr.extend([i] * len(seg.split()))
#             i+=1
        word_pieces = tokenizer.tokenize(text)
        
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        #segments_tensor = torch.tensor([0] * len(prompt) + [1] * len(utterance),dtype=torch.long)
        segments_tensor = torch.tensor([0] * len(utterance.split()),dtype=torch.long)
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
# 初始化 Dataset
trainset = EmotionDataset("train", train, tokenizer=tokenizer)
validset = EmotionDataset("valid", valid, tokenizer=tokenizer)

In [6]:
sample_idx = 0

# 將原始文本拿出做比較
prompt, utterance, label = trainset.df.iloc[sample_idx,2:].values

# 利用剛剛建立的 Dataset 取出轉換後的 id tensors，
# 經過我們自定義的 Dataset 後，trainset 現在已經是個 iterable 的 object，
# 可以用編號來索引你想要去得的位置的 id tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = " ".join(tokens)

print(f"""[原始文本]
句子 1：{prompt}
句子 2：{utterance}
分類  ：{label}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[還原 tokens_tensors]
{combined_text}
""")

[原始文本]
句子 1：I remember going to the fireworks with my best friend. There was a lot of people, but it only felt like us in the world.
句子 2：I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people, we felt like the only people in the world.
分類  ：13

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：tensor([  101,  1045,  3342,  2183,  2000,  2156,  1996, 16080,  2007,  2026,
         2190,  2767,  1012,  2009,  2001,  1996,  2034,  2051,  2057,  2412,
         2985,  2051,  2894,  2362,  1012,  2348,  2045,  2001,  1037,  2843,
         1997,  2111,  1010,  2057,  2371,  2066,  1996,  2069,  2111,  1999,
         1996,  2088,  1012,   102])

segments_tensor：tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

label_tensor   ：13

--------------------

[還原 tokens_tensors]
[CLS] i remember going to see the fire

In [7]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# create_mini_batch 的參數 `samples` 是一個 list，裡頭的每個 element 都是
# 剛剛定義的 `FakeNewsDataset` 回傳的一個樣本，每個樣本都包含 3 tensors：
# - tokens_tensor
# - segments_tensor
# - label_tensor
# 它會對前兩個 tensors 作 zero padding，並產生前面說明過的 masks_tensors
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # [TODO3]: 將 token_tensors 及 segments_tensors zero padding 到同樣長度，
    # hint: 可以使用 import的 pad_sequence，記得 batch_first 要設為 True
    #================================================
    max_tok = 0
    for i in range(1,len(tokens_tensors)):
        if tokens_tensors[i].shape > tokens_tensors[max_tok].shape:
            max_tok = i
    temp = pad_sequence([segments_tensors[0],tokens_tensors[max_tok]],batch_first=True)
    segments_tensors[0] = temp[0]
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    #================================================

    masks_tensors = torch.zeros(tokens_tensors.shape,
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0,1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

trainloader = DataLoader(trainset, batch_size=128, 
                         collate_fn=create_mini_batch)

validloader = DataLoader(validset, batch_size=256, 
                         collate_fn=create_mini_batch)

In [8]:
data = next(iter(trainloader))

tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([128, 49]) 
tensor([[ 101, 1045, 3342,  ...,    0,    0,    0],
        [ 101, 2001, 2023,  ...,    0,    0,    0],
        [ 101, 2023, 2001,  ...,    0,    0,    0],
        ...,
        [ 101, 1037, 2095,  ...,    0,    0,    0],
        [ 101, 2054, 2001,  ...,    0,    0,    0],
        [ 101, 2009, 2001,  ...,    0,    0,    0]])
------------------------
segments_tensors.shape = torch.Size([128, 49])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([128, 49])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------------------
label_ids.shape     

In [9]:
# 載入一個預訓練好可以做多分類任務的模型，n_class = 32
from transformers import BertForSequenceClassification
from IPython.display import clear_output

NUM_LABELS = 32
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS)

clear_output()

In [10]:
%%time
from tqdm.notebook import tqdm
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
    running_loss = 0
    with torch.no_grad():
        for data in tqdm(dataloader):
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
                
            if not compute_acc:
                # 只是單純要回傳預測值的話，不用計算準確度也不用紀錄 loss
                tokens_tensors, segments_tensors, masks_tensors = data[:3]
                outputs = model(input_ids=tokens_tensors, 
                                token_type_ids=segments_tensors, 
                                attention_mask=masks_tensors)
                logits = outputs[0]
                _, pred = torch.max(logits.data, 1)
            else:
                # 否則就要計算 loss，這邊有一個小細節是 model 如果有吃 label 的話，
                # output[0]會變成是 loss，沒有吃 label 時 output[0] 會是 logits
                tokens_tensors, segments_tensors, masks_tensors, labels = data[:4]
                outputs = model(input_ids=tokens_tensors, 
                                token_type_ids=segments_tensors, 
                                attention_mask=masks_tensors,
                                labels=labels)
                loss = outputs[0]
                logits = outputs[1]
                _, pred = torch.max(logits.data, 1)
                running_loss += loss.item()
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
                
    
    if compute_acc:
        acc = correct / total
        loss = running_loss / total
        return predictions, acc, loss
    
    return predictions
    
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = 'cpu'
# 這邊要記得確認 model 在 GPU 上運行 (投影片有說明)，否則會跑很久！
print("device:", device)
model = model.to(device)
_, train_acc, train_loss = get_predictions(model, trainloader, compute_acc=True)
print("train acc:", train_acc)
print("train loss:", train_loss)

device: cuda:0


  0%|          | 0/658 [00:00<?, ?it/s]

train acc: 0.03710392187147287
train loss: 0.027305214338408327
CPU times: user 1min 32s, sys: 26.3 s, total: 1min 58s
Wall time: 1min 57s


In [14]:
%%time
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

EPOCHS = 3  # 由於時間有限，訓練 3 輪看看表現如何就好
for epoch in range(EPOCHS):
    running_loss = 0.0
    for data in tqdm(trainloader):
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        optimizer.zero_grad()
        
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        loss.backward()
        optimizer.step()

        # 紀錄當前 batch loss
        running_loss += loss.item()

    # 計算分類準確率
    _, train_acc, train_loss = get_predictions(model, trainloader, compute_acc=True)
    _, valid_acc, valid_loss = get_predictions(model, validloader, compute_acc=True)
    
    print('[epoch %d] train loss: %.3f, train acc: %.3f, valid loss: %.3f, valid acc: %.3f' %
          (epoch + 1, train_loss, train_acc, valid_loss, valid_acc))

  0%|          | 0/658 [00:00<?, ?it/s]

  0%|          | 0/658 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

[epoch 1] train loss: 0.021, train acc: 0.245, valid loss: 0.011, valid acc: 0.243


  0%|          | 0/658 [00:00<?, ?it/s]

  0%|          | 0/658 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

[epoch 2] train loss: 0.019, train acc: 0.294, valid loss: 0.010, valid acc: 0.278


  0%|          | 0/658 [00:00<?, ?it/s]

  0%|          | 0/658 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

[epoch 3] train loss: 0.018, train acc: 0.329, valid loss: 0.010, valid acc: 0.292
CPU times: user 15min 20s, sys: 5min 18s, total: 20min 38s
Wall time: 20min 27s


In [18]:
torch.save(model.state_dict(), 'checkpoint.pt')

In [20]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=NUM_LABELS)
model.load_state_dict(torch.load("checkpoint.pt"))
model.to(device)
model.eval()
clear_output()

In [17]:
test

Unnamed: 0,conv_id,utterance_idx,prompt,utterance
0,hit:0_conv:0,1,I felt guilty when I was driving home one nigh...,Yeah about 10 years ago I had a horrifying exp...
1,hit:0_conv:0,2,I felt guilty when I was driving home one nigh...,Did you suffer any injuries?
2,hit:0_conv:0,3,I felt guilty when I was driving home one nigh...,No I wasn't hit. It turned out they were drunk...
3,hit:0_conv:0,4,I felt guilty when I was driving home one nigh...,Why did you feel guilty? People really shouldn...
4,hit:0_conv:0,5,I felt guilty when I was driving home one nigh...,I don't know I was new to driving and hadn't e...
...,...,...,...,...
10968,hit:12416_conv:24832,4,I saw a huge cockroach outside my house today....,I live in Texas to so i know those feels
10969,hit:12423_conv:24847,1,"I have a big test on Monday. I am so nervous, ...","I have a big test on Monday, I am so nervous."
10970,hit:12423_conv:24847,2,"I have a big test on Monday. I am so nervous, ...",What is the test on?
10971,hit:12423_conv:24847,3,"I have a big test on Monday. I am so nervous, ...",It's for my Chemistry class. I haven't slept m...


In [22]:
label2idx

{'sad': 0,
 'trusting': 1,
 'terrified': 2,
 'caring': 3,
 'disappointed': 4,
 'faithful': 5,
 'joyful': 6,
 'jealous': 7,
 'disgusted': 8,
 'surprised': 9,
 'ashamed': 10,
 'afraid': 11,
 'impressed': 12,
 'sentimental': 13,
 'devastated': 14,
 'excited': 15,
 'anticipating': 16,
 'annoyed': 17,
 'anxious': 18,
 'furious': 19,
 'content': 20,
 'lonely': 21,
 'angry': 22,
 'confident': 23,
 'apprehensive': 24,
 'guilty': 25,
 'embarrassed': 26,
 'grateful': 27,
 'hopeful': 28,
 'proud': 29,
 'prepared': 30,
 'nostalgic': 31}

In [21]:
testset = EmotionDataset("test", test, tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, 
                        collate_fn=create_mini_batch)

# label 與 id 互相轉換的兩個 dictionary
label2idx = testset.label_map
idx2label = {v:k for k, v in label2idx.items()}

# 將測試集的 label 轉成 id，用來計算測試集上的準確度
test_y = test.label.apply(lambda x: label2idx[x]).values
print(test_y)

AttributeError: 'DataFrame' object has no attribute 'label'

In [23]:
# 請我們的模型給出它的預測！
predictions = get_predictions(model, testloader)
# 要和在 cpu 上的 test_y 算準確度，還要把它從 GPU 上搬回來才行
predictions = predictions.cpu().numpy()

  0%|          | 0/43 [00:00<?, ?it/s]

In [51]:
test_dict = dict(enumerate(predictions.flatten()))

In [52]:
import csv
with open('output.csv', 'w', newline='') as csvfile:
    # 定義欄位
    fieldnames = ['', 'pred']

    # 將 dictionary 寫入 CSV 檔
    writer = csv.writer(csvfile)

    # 寫入第一列的欄位名稱
    #writer.writeheader()

    # 寫入資料
    writer.writerow(fieldnames)
    for k, v in test_dict.items():
        writer.writerow([k, v])

In [24]:
from sklearn.metrics import accuracy_score
print("測試集上的準確度：", accuracy_score(test_y, predictions))

NameError: name 'test_y' is not defined