In [1]:
!nvidia-smi

Thu May  5 07:17:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:18:00.0 Off |                    0 |
| N/A   42C    P0    45W / 300W |      0MiB / 16160MiB |      0%   E. Process |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:3B:00.0 Off |                    0 |
| N/A   39C    P0    44W / 300W |      0MiB / 16160MiB |      0%   E. Process |
|       

In [2]:
!pip install transformers SentencePiece torch tqdm

Defaulting to user installation because normal site-packages is not writeable
Collecting SentencePiece
  Downloading sentencepiece-0.1.96-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 8.8 MB/s eta 0:00:01
Installing collected packages: SentencePiece
Successfully installed SentencePiece-0.1.96
You should consider upgrading via the '/share/pkg.7/python3/3.8.6/install/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [3]:
import math

from tqdm import tqdm
import numpy as np
from transformers import MT5ForConditionalGeneration, T5Tokenizer
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score

In [4]:
class SoftEmbedding(nn.Module):
    def __init__(self, 
                wte: nn.Embedding,
                n_tokens: int = 10, 
                random_range: float = 0.5,
                initialize_from_vocab: bool = True):
        """appends learned embedding to 
        Args:
            wte (nn.Embedding): original transformer word embedding
            n_tokens (int, optional): number of tokens for task. Defaults to 10.
            random_range (float, optional): range to init embedding (if not initialize from vocab). Defaults to 0.5.
            initialize_from_vocab (bool, optional): initalizes from default vocab. Defaults to True.
        """
        super(SoftEmbedding, self).__init__()
        self.wte = wte
        self.n_tokens = n_tokens
        self.learned_embedding = nn.parameter.Parameter(self.initialize_embedding(wte,
                                                                                  n_tokens, 
                                                                                  random_range, 
                                                                                  initialize_from_vocab))
            
    def initialize_embedding(self, 
                             wte: nn.Embedding,
                             n_tokens: int = 10, 
                             random_range: float = 0.5, 
                             initialize_from_vocab: bool = True):
        """initializes learned embedding
        Args:
            same as __init__
        Returns:
            torch.float: initialized using original schemes
        """
        if initialize_from_vocab:
            return self.wte.weight[:n_tokens].clone().detach()
        return torch.FloatTensor(n_tokens, wte.weight.size(1)).uniform_(-random_range, random_range)
            
    def forward(self, tokens):
        """run forward pass
        Args:
            tokens (torch.long): input tokens before encoding
        Returns:
            torch.float: encoding of text concatenated with learned task specifc embedding
        """
        input_embedding = self.wte(tokens[:, self.n_tokens:])
        learned_embedding = self.learned_embedding.repeat(input_embedding.size(0), 1, 1)
        return torch.cat([learned_embedding, input_embedding], 1)

In [5]:
# !pip install zh-dataset-inews

In [6]:
# from zh_dataset_inews import title_train, label_train, title_dev, label_dev, title_test, label_test

In [7]:
# from google.colab import drive
# drive.mount('/content/drive')
# path='drive/MyDrive/CS505/CS505-Final/'
path=''

In [8]:
import os
eng_path=path+'data/2017_English_final/GOLD/Subtask_A/'
train_file=[]
val_file=[]

for root, dirs, files in os.walk(eng_path):
    for file_name in files:
        if 'train' in file_name and '.txt' in file_name:
            train_file.append(os.path.join(eng_path, file_name))
        if 'dev' in file_name and '.txt' in file_name:
            val_file.append(os.path.join(eng_path, file_name))
print(train_file)
print(val_file)


['data/2017_English_final/GOLD/Subtask_A/twitter-2015train-A.txt', 'data/2017_English_final/GOLD/Subtask_A/twitter-2013train-A.txt', 'data/2017_English_final/GOLD/Subtask_A/twitter-2016train-A.txt']
['data/2017_English_final/GOLD/Subtask_A/twitter-2016devtest-A.txt', 'data/2017_English_final/GOLD/Subtask_A/twitter-2013dev-A.txt', 'data/2017_English_final/GOLD/Subtask_A/twitter-2016dev-A.txt']


In [9]:
title_train = []
label_train = []
title_dev = []
label_dev = []


sentiment_to_label = {'positive': 2, 'neutral': 1, 'negative': 0}

for file_path in train_file:
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for l in lines:
            entries = l.split('\t')
            title_train.append(entries[2])
            label_train.append(sentiment_to_label[entries[1]])
            
    
for file_path in val_file:
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for l in lines:
            entries = l.split('\t')
            title_dev.append(entries[2])
            label_dev.append(sentiment_to_label[entries[1]])
            



In [10]:
arabic_path=path+'data/2017_Arabic_train_final/GOLD/SemEval2017-task4-train.subtask-A.arabic.txt'

title_test = []
label_test = []

sentiment_to_label = {'positive': 2, 'neutral': 1, 'negative': 0}

with open(arabic_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for l in lines:
        entries = l.split('\t')
        title_test.append(entries[2])
        label_test.append(sentiment_to_label[entries[1]])

In [11]:
def generate_data(batch_size, n_tokens, title_data, label_data):

    labels = [
        torch.tensor([[3]]),  # \x00
        torch.tensor([[4]]),  # \x01
        torch.tensor([[5]]),  # \x02
    ]

    def yield_data(x_batch, y_batch, l_batch):
        x = torch.nn.utils.rnn.pad_sequence(x_batch, batch_first=True)
        y = torch.cat(y_batch, dim=0)
        m = (x > 0).to(torch.float32)
        decoder_input_ids = torch.full((x.size(0), n_tokens), 1)
        if torch.cuda.is_available():
            x = x.cuda()
            y = y.cuda()
            m = m.cuda()
            decoder_input_ids = decoder_input_ids.cuda()
        return x, y, m, decoder_input_ids, l_batch

    x_batch, y_batch, l_batch = [], [], []
    for x, y in zip(title_data, label_data):
        context = x
        inputs = tokenizer(context, return_tensors="pt")
        inputs['input_ids'] = torch.cat([torch.full((1, n_tokens), 1), inputs['input_ids']], 1)
        l_batch.append(y)
        y = labels[y]
        y = torch.cat([torch.full((1, n_tokens - 1), -100), y], 1)
        x_batch.append(inputs['input_ids'][0])
        y_batch.append(y)
        if len(x_batch) >= batch_size:
            yield yield_data(x_batch, y_batch, l_batch)
            x_batch, y_batch, l_batch = [], [], []

    if len(x_batch) > 0:
        yield yield_data(x_batch, y_batch, l_batch)
        x_batch, y_batch, l_batch = [], [], []

In [12]:
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")
tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")
n_tokens = 512
s_wte = SoftEmbedding(model.get_input_embeddings(), 
                      n_tokens=n_tokens, 
                      initialize_from_vocab=True)
model.set_input_embeddings(s_wte)
if torch.cuda.is_available():
    model = model.cuda()

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=4309802.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=65.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=376.0), HTML(value='')))




In [13]:
parameters = list(model.parameters())
for x in parameters[1:]:
    x.requires_grad = False

In [14]:
parameters[0]

Parameter containing:
tensor([[ 1.7500e+00, -1.6719e+00,  2.4062e+00,  ...,  6.9580e-03,
         -9.8828e-01, -4.6875e-01],
        [ 8.5625e+00,  5.5625e+00, -1.7109e+00,  ...,  7.7812e+00,
         -5.2812e+00, -3.2188e+00],
        [ 6.8750e-01, -4.5312e-01,  5.7812e-01,  ...,  7.3828e-01,
         -3.0078e-01,  2.0312e-01],
        ...,
        [-3.1250e+00, -7.0938e+00,  2.7812e+00,  ..., -1.0688e+01,
          4.5312e+00, -3.0156e+00],
        [-7.0625e+00, -7.4688e+00,  1.3875e+01,  ..., -4.9062e+00,
         -4.5625e+00,  7.4375e+00],
        [-1.0400e-01,  9.0000e+00,  3.3281e+00,  ...,  6.1250e+00,
          1.0750e+01, -1.0107e-01]], device='cuda:0', requires_grad=True)

In [15]:
parameters[2]

Parameter containing:
tensor([[-1.3977e-02,  3.8818e-02,  5.7129e-02,  ...,  4.9316e-02,
         -8.1177e-03, -3.8147e-03],
        [ 6.3965e-02, -1.0193e-02, -2.0020e-02,  ..., -8.3618e-03,
         -1.1902e-02, -2.6978e-02],
        [-1.6357e-02, -4.4922e-02,  4.8584e-02,  ..., -1.6479e-02,
         -4.0039e-02,  6.3782e-03],
        ...,
        [ 7.7820e-03, -6.5918e-03, -3.9062e-03,  ...,  1.9165e-02,
          7.4863e-05, -2.6001e-02],
        [-1.4587e-02,  1.8433e-02, -2.6489e-02,  ..., -3.9062e-02,
         -4.0527e-02,  4.1992e-02],
        [ 7.8125e-02,  1.6602e-02,  6.4941e-02,  ...,  4.2152e-04,
          4.5166e-02, -1.1780e-02]], device='cuda:0')

In [16]:
for x, y, m, dii, true_labels in generate_data(2, n_tokens, title_train, label_train):
    assert dii.shape == y.shape
    outputs = model(input_ids=x, labels=y, attention_mask=m, decoder_input_ids=dii)
    assert outputs['logits'].shape[:2] == y.shape
    pred_labels = outputs['logits'][:, -1, 3:6].argmax(-1).detach().cpu().numpy().tolist()
    break

In [17]:
batch_size =2
n_epoch = 2
total_batch = math.ceil(len(title_train) / batch_size)
dev_total_batch = math.ceil(len(title_dev) / batch_size)
use_ce_loss = False
ce_loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(s_wte.parameters(), lr=0.5)

for epoch in range(n_epoch):
    print('epoch', epoch)

    all_true_labels = []
    all_pred_labels = []
    losses = []
    pbar = tqdm(enumerate(generate_data(batch_size, n_tokens, title_train, label_train)), total=total_batch)
    for i, (x, y, m, dii, true_labels) in pbar:
        all_true_labels += true_labels
        
        optimizer.zero_grad()
        outputs = model(input_ids=x, labels=y, attention_mask=m, decoder_input_ids=dii)
        pred_labels = outputs['logits'][:, -1, 3:6].argmax(-1).detach().cpu().numpy().tolist()
        all_pred_labels += pred_labels

        if use_ce_loss:
            logits = outputs['logits'][:, -1, 3:6]
            true_labels_tensor = torch.tensor(true_labels, dtype=torch.long).cuda()
            loss = ce_loss(logits, true_labels_tensor)
        else:
            loss = outputs.loss
        loss.backward()
        optimizer.step()
        loss_value = float(loss.detach().cpu().numpy().tolist()) / batch_size
        losses.append(loss_value)

        acc = accuracy_score(all_true_labels, all_pred_labels)
        pbar.set_description(f'train: loss={np.mean(losses):.4f}, acc={acc:.4f}')

    all_true_labels = []
    all_pred_labels = []
    losses = []
    with torch.no_grad():
        pbar = tqdm(enumerate(generate_data(batch_size, n_tokens, title_dev, label_dev)), total=dev_total_batch)
        for i, (x, y, m, dii, true_labels) in pbar:
            all_true_labels += true_labels
            outputs = model(input_ids=x, labels=y, attention_mask=m, decoder_input_ids=dii)
            loss = outputs.loss
            loss_value = float(loss.detach().cpu().numpy().tolist()) / batch_size
            losses.append(loss_value)
            pred_labels = outputs['logits'][:, -1, 3:6].argmax(-1).detach().cpu().numpy().tolist()
            all_pred_labels += pred_labels
            acc = accuracy_score(all_true_labels, all_pred_labels)
            pbar.set_description(f'dev: loss={np.mean(losses):.4f}, acc={acc:.4f}')

  0%|          | 0/8087 [00:00<?, ?it/s]

epoch 0


train: loss=2.8273, acc=0.3930: 100%|██████████| 8087/8087 [30:12<00:00,  4.46it/s] 
dev: loss=0.6558, acc=0.4267: 100%|██████████| 2827/2827 [05:03<00:00,  9.33it/s]
  0%|          | 0/8087 [00:00<?, ?it/s]

epoch 1


train: loss=2.6818, acc=0.4103: 100%|██████████| 8087/8087 [30:09<00:00,  4.47it/s]
dev: loss=1.2859, acc=0.4265: 100%|██████████| 2827/2827 [05:04<00:00,  9.29it/s]


In [18]:
parameters2 = list(model.parameters())

In [19]:
parameters2[0]

Parameter containing:
tensor([[ 28.8612, -22.3175, -50.1415,  ...,  -6.7055,   1.7269,  24.0909],
        [ 29.1193,  12.5699,  -8.7134,  ..., -23.2332,  -9.5085, -10.3435],
        [-29.0432,  30.6009, -16.5832,  ...,   2.1478,  15.7382,   2.1589],
        ...,
        [ 36.9844,  -1.8403,  11.0625,  ..., -44.9269,   5.5143,  22.6365],
        [-34.8258, -42.9783,  -4.9431,  ...,  23.6431,  25.1618,  60.8522],
        [ 50.4875, -21.9642, -32.0669,  ...,  -0.5277,  -6.1080,   0.5091]],
       device='cuda:0', requires_grad=True)

In [20]:
parameters2[2]

Parameter containing:
tensor([[-1.3977e-02,  3.8818e-02,  5.7129e-02,  ...,  4.9316e-02,
         -8.1177e-03, -3.8147e-03],
        [ 6.3965e-02, -1.0193e-02, -2.0020e-02,  ..., -8.3618e-03,
         -1.1902e-02, -2.6978e-02],
        [-1.6357e-02, -4.4922e-02,  4.8584e-02,  ..., -1.6479e-02,
         -4.0039e-02,  6.3782e-03],
        ...,
        [ 7.7820e-03, -6.5918e-03, -3.9062e-03,  ...,  1.9165e-02,
          7.4863e-05, -2.6001e-02],
        [-1.4587e-02,  1.8433e-02, -2.6489e-02,  ..., -3.9062e-02,
         -4.0527e-02,  4.1992e-02],
        [ 7.8125e-02,  1.6602e-02,  6.4941e-02,  ...,  4.2152e-04,
          4.5166e-02, -1.1780e-02]], device='cuda:0')

In [21]:
def predict(text):
    inputs = tokenizer(text, return_tensors='pt')
    inputs['input_ids'] = torch.cat([torch.full((1, n_tokens), 1), inputs['input_ids']], 1)

    decoder_input_ids = torch.full((1, n_tokens), 1)
    with torch.no_grad():
        outputs = model(input_ids=inputs['input_ids'].cuda(), decoder_input_ids=decoder_input_ids.cuda())
    logits = outputs['logits'][:, -1, 3:6]
    pred = logits.argmax(-1).detach().cpu().numpy()[0]
    # print(logits)
    return pred

In [22]:
# train_rets = []
# for i in tqdm(range(len(title_train))):
#     pred = predict(title_train[i])
#     train_rets.append((label_train[i], pred, title_train[i]))

100%|██████████| 16173/16173 [14:32<00:00, 18.54it/s]


In [23]:
rets = []
for i in tqdm(range(len(title_test))):
    pred = predict(title_test[i])
    rets.append((label_test[i], pred, title_test[i]))

100%|██████████| 3355/3355 [03:02<00:00, 18.38it/s]


In [24]:
print(
    accuracy_score(
        [x[0] for x in train_rets],
        [x[1] for x in train_rets],
    )
)

0.4254621900698695


In [25]:
print(
    accuracy_score(
        [x[0] for x in rets],
        [x[1] for x in rets],
    )
)

0.2360655737704918


In [26]:
print(
    accuracy_score(
        [x[0] for x in rets],
        [0] * len(rets),
    ),
    accuracy_score(
        [x[0] for x in rets],
        [1] * len(rets),
    ),
    accuracy_score(
        [x[0] for x in rets],
        [2] * len(rets),
    )
)

0.34038748137108793 0.43815201192250375 0.22146050670640835


In [30]:
print(sum([1 for x in rets if x[1]==1]))

0
