In [1]:
import pandas as pd
# 讀取資料集
train = pd.read_csv("train/fixed_train.csv")
valid = pd.read_csv("train/fixed_valid.csv")
test = pd.read_csv("train/fixed_test.csv")

# data cleaning about _comma_
train['prompt'] = train['prompt'].map(lambda x:x.replace('_comma_',','))
train['utterance'] = train['utterance'].map(lambda x:x.replace('_comma_',','))
valid['prompt'] = valid['prompt'].map(lambda x:x.replace('_comma_',','))
valid['utterance'] = valid['utterance'].map(lambda x:x.replace('_comma_',','))
test['prompt'] = test['prompt'].map(lambda x:x.replace('_comma_',','))
test['utterance'] = test['utterance'].map(lambda x:x.replace('_comma_',','))

print("訓練樣本數：", len(train))
print("驗證樣本數：", len(valid))
print("測試樣本數：", len(test))

訓練樣本數： 84169
驗證樣本數： 12078
測試樣本數： 10973


In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer 
from nltk.corpus import wordnet
from nltk import pos_tag

from nltk.corpus import stopwords
stoplist = stopwords.words("english")

#get pos
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

#lemmatizer
def lemmas(text):
    #token
    token = nltk.tokenize.word_tokenize(text.lower())
    
    #lemmatize        
    tagged_sent = pos_tag(token)#get pos     
    wnl = WordNetLemmatizer()#init lemmatizer
    lemmas_sent = []
    for tag in tagged_sent:
        wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
        lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos)) 
    
    #delete stopword
    lemmas_sent = [e for e in lemmas_sent if e not in stoplist]

    return ' '.join(lemmas_sent)

def del_stopword(text):
    #token
    token = nltk.tokenize.word_tokenize(text.lower())
    #delete stopword
    token = [e for e in token if e not in stoplist]
    return text

In [4]:
train['prompt'] = train['prompt'].apply(lemmas)
train['utterance'] = train['utterance'].apply(lemmas)
valid['prompt'] = valid['prompt'].apply(lemmas)
valid['utterance'] = valid['utterance'].apply(lemmas)
test['prompt'] = test['prompt'].apply(lemmas)
test['utterance'] = test['utterance'].apply(lemmas)

In [2]:
emotion_label = {'sad': 0, 'trusting': 1, 'terrified': 2, 'caring': 3, 'disappointed': 4,
         'faithful': 5, 'joyful': 6, 'jealous': 7, 'disgusted': 8, 'surprised': 9,
         'ashamed': 10, 'afraid': 11, 'impressed': 12, 'sentimental': 13, 
         'devastated': 14, 'excited': 15, 'anticipating': 16, 'annoyed': 17, 'anxious': 18,
         'furious': 19, 'content': 20, 'lonely': 21, 'angry': 22, 'confident': 23,
         'apprehensive': 24, 'guilty': 25, 'embarrassed': 26, 'grateful': 27,
         'hopeful': 28, 'proud': 29, 'prepared': 30, 'nostalgic': 31}
len(emotion_label)

32

In [3]:
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset
from IPython.display import clear_output

# 取得此預訓練模型所使用的 tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class EmotionDataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, df, tokenizer):
        assert mode in ["train", "valid", "test"]
        self.mode = mode
        self.df = df.fillna("")
        self.len = len(self.df)
        self.label_map = emotion_label
        self.tokenizer = tokenizer
    def __getitem__(self, idx):
        if self.mode == "test":
            prompt, utterance = self.df.iloc[idx, 2:].values
            label_tensor = None
        else:
            prompt, utterance, label = self.df.iloc[idx, 2:].values
            label_tensor = torch.tensor(label)
        
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ["[CLS]"]
        tokens_prompt = self.tokenizer.tokenize(prompt)
        word_pieces += tokens_prompt + ["[SEP]"]
        len_prompt = len(word_pieces)
        
        # 第二個句子的 BERT tokens
        tokens_utterance = self.tokenizer.tokenize(utterance)
        word_pieces += tokens_utterance + ["[SEP]"]
        len_utterance = len(word_pieces) - len_prompt
        
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        segments_tensor = torch.tensor([0] * len_prompt + [1] * len_utterance, 
                                        dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
# 初始化 Dataset
trainset = EmotionDataset("train", train, tokenizer=tokenizer)
validset = EmotionDataset("valid", valid, tokenizer=tokenizer)

In [4]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)

    masks_tensors = torch.zeros(tokens_tensors.shape,
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0,1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

trainloader = DataLoader(trainset, batch_size=128, 
                         collate_fn=create_mini_batch)

validloader = DataLoader(validset, batch_size=128, 
                         collate_fn=create_mini_batch)

In [5]:
# 載入一個預訓練好可以做多分類任務的模型，n_class = 32
from transformers import BertForSequenceClassification
from IPython.display import clear_output

NUM_LABELS = 32
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS)

clear_output()

In [6]:
%%time
from tqdm.notebook import tqdm
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
    running_loss = 0
    with torch.no_grad():
        for data in tqdm(dataloader):
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
                
            if not compute_acc:
                # 只是單純要回傳預測值的話，不用計算準確度也不用紀錄 loss
                tokens_tensors, segments_tensors, masks_tensors = data[:3]
                outputs = model(input_ids=tokens_tensors, 
                                token_type_ids=segments_tensors, 
                                attention_mask=masks_tensors)
                logits = outputs[0]
                _, pred = torch.max(logits.data, 1)
            else:
                # 否則就要計算 loss，這邊有一個小細節是 model 如果有吃 label 的話，
                # output[0]會變成是 loss，沒有吃 label 時 output[0] 會是 logits
                tokens_tensors, segments_tensors, masks_tensors, labels = data[:4]
                outputs = model(input_ids=tokens_tensors, 
                                token_type_ids=segments_tensors, 
                                attention_mask=masks_tensors,
                                labels=labels)
                loss = outputs[0]
                logits = outputs[1]
                _, pred = torch.max(logits.data, 1)
                running_loss += loss.item()
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
                
    
    if compute_acc:
        acc = correct / total
        loss = running_loss / total
        return predictions, acc, loss
    
    return predictions
    
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

device: cuda:0
CPU times: user 1.24 s, sys: 778 ms, total: 2.02 s
Wall time: 2.4 s


In [7]:
%%time
from tqdm.notebook import tqdm
from transformers import get_linear_schedule_with_warmup

EPOCHS = 3

model.train()
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
# Create the learning rate scheduler.
total_steps = len(trainloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

training_stats = []

for epoch in range(EPOCHS):
    running_loss = 0.0
    for data in tqdm(trainloader):
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        optimizer.zero_grad()
        
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()

        # 紀錄當前 batch loss
        running_loss += loss.item()

    # 計算分類準確率
    _, train_acc, train_loss = get_predictions(model, trainloader, compute_acc=True)
    _, valid_acc, valid_loss = get_predictions(model, validloader, compute_acc=True)
    
    print('[epoch %d] train loss: %.3f, train acc: %.3f, valid loss: %.3f, valid acc: %.3f' %
          (epoch + 1, train_loss, train_acc, valid_loss, valid_acc))
    
    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': train_loss,
            'Training Accur.': train_acc,
            'Valid. Loss': valid_loss,
            'Valid. Accur.': valid_acc
        }
    )

  0%|          | 0/658 [00:00<?, ?it/s]

  0%|          | 0/658 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

[epoch 1] train loss: 0.011, train acc: 0.593, valid loss: 0.012, valid acc: 0.542


  0%|          | 0/658 [00:00<?, ?it/s]

  0%|          | 0/658 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

[epoch 2] train loss: 0.008, train acc: 0.668, valid loss: 0.011, valid acc: 0.562


  0%|          | 0/658 [00:00<?, ?it/s]

  0%|          | 0/658 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

[epoch 3] train loss: 0.007, train acc: 0.711, valid loss: 0.011, valid acc: 0.570
CPU times: user 25min 35s, sys: 9min 43s, total: 35min 18s
Wall time: 35min 5s


In [8]:
import pandas as pd
pd.set_option('precision', 3)
df_stats = pd.DataFrame(data=training_stats)
df_stats = df_stats.set_index('epoch')
df_stats = df_stats.style.set_table_styles([dict(selector="td",props=[('max-width', '70px')])])
df_stats

Unnamed: 0_level_0,Training Loss,Training Accur.,Valid. Loss,Valid. Accur.
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.011,0.593,0.012,0.542
2,0.008,0.668,0.011,0.562
3,0.007,0.711,0.011,0.57


In [9]:
torch.save(model.state_dict(), 'checkpoint.pt')

In [10]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=NUM_LABELS)
model.load_state_dict(torch.load("checkpoint.pt"))
model.to(device)
model.eval()
clear_output()

In [11]:
testset = EmotionDataset("test", test, tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=128, 
                        collate_fn=create_mini_batch)
predictions = get_predictions(model, testloader)
predictions = predictions.cpu().numpy()

  0%|          | 0/86 [00:00<?, ?it/s]

In [12]:
def most_common(lst):
    return max(set(lst), key=lst.count)

conv_id_clean_test_list = list(dict.fromkeys(test.conv_id.tolist()))
conv_id_test_list = test.conv_id.tolist()
output = []
index = 0
for ids in conv_id_clean_test_list:
    temp = []
    for i in range(conv_id_test_list.count(ids)):
        temp.append(predictions[index])
        index+=1
    for i in range(conv_id_test_list.count(ids)):
        output.append(most_common(temp))

In [13]:
test_dict = dict(enumerate(output))

In [14]:
import csv
with open('output.csv', 'w', newline='') as csvfile:
    fieldnames = ['', 'pred']
    writer = csv.writer(csvfile)
    writer.writerow(fieldnames)
    for k, v in test_dict.items():
        writer.writerow([k, v])