# HW4 RNN
Task: Sentiment classification on Twitter comments.

Goal of this homework:
*   Get familiar with the recurrent neural network.
*   Learn how to deal with text data

TA: Chih-Kai, Yang (b08202033@ntu.edu.tw)



Import packages.

In [1]:
import torch
import os
import csv
import random
import numpy as np
import pandas as pd
import torch.nn.functional as F
import keras

from torch.nn.utils.rnn import pad_sequence
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

Download the dataset and unzip it.

In [2]:
#!gdown --id "1cwPgbbAMNPZ9nCoyOW2WuavimYymCKKy" --output DATASET.zip
!wget https://www.dropbox.com/s/0abp8vvib4j1gjw/HW4_dataset.zip 
!mv HW4_dataset.zip DATASET.zip
!unzip DATASET.zip

--2022-11-25 12:44:40--  https://www.dropbox.com/s/0abp8vvib4j1gjw/HW4_dataset.zip
Resolving www.dropbox.com (www.dropbox.com)... 162.125.85.18, 2620:100:6035:18::a27d:5512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.85.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/0abp8vvib4j1gjw/HW4_dataset.zip [following]
--2022-11-25 12:44:40--  https://www.dropbox.com/s/raw/0abp8vvib4j1gjw/HW4_dataset.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucd16a5fa8b34dc49de2a06bf07f.dl.dropboxusercontent.com/cd/0/inline/BxbRJ_XChJAlkvGQfyw0jvFXOymifvvoxr-x2LaxoKrGTTkfqN-7Pm-Iq9OiZUiRN6qXXiTZ0G_mybj-wC1EIb5Jsyax6wiN5CKjuqNFBw2ypmEe45TXeN-Z3miF7sIENSUU9T4XmLjtbW3R3hJiW5e-AXqgdedhFvQkMAwDUXyYoA/file# [following]
--2022-11-25 12:44:40--  https://ucd16a5fa8b34dc49de2a06bf07f.dl.dropboxusercontent.com/cd/0/inline/BxbRJ_XChJAlkvGQfyw0jvFXOymifvvoxr-x2LaxoKrGTTkfqN-7Pm-Iq9OiZUiRN6qX

In [3]:
!pip list

Package                       Version
----------------------------- ----------------------
absl-py                       1.3.0
aeppl                         0.0.33
aesara                        2.7.9
aiohttp                       3.8.3
aiosignal                     1.3.1
alabaster                     0.7.12
albumentations                1.2.1
altair                        4.2.0
appdirs                       1.4.4
arviz                         0.12.1
astor                         0.8.1
astropy                       4.3.1
astunparse                    1.6.3
async-timeout                 4.0.2
asynctest                     0.13.0
atari-py                      0.2.9
atomicwrites                  1.4.1
attrs                         22.1.0
audioread                     3.0.0
autograd                      1.5
Babel                         2.11.0
backcall                      0.2.0
beautifulsoup4                4.6.3
bleach                        5.0.1
blis                          0.7.9
bokeh

In [5]:
!unzip DATASET.zip
#!unzip HW4_dataset.zip

Archive:  DATASET.zip
replace HW4_dataset/test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


Basic setup of hyperparameters

In [6]:
BATCH_SIZE = 256
EPOCH_NUM = 50
MAX_POSITIONS_LEN = 100
SEED = 97562246875 % (2**32-1) # Set your lucky number as the random seed
MODEL_DIR = 'model.pth'
lr = 5 * 1e-4

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
random.seed(SEED)
np.random.seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

w2v_config = {'path': 'w2v.model', 'dim': 128}
net_config = {'hidden_dim': 64, 'num_layers': 3, 'bidirectional': False, 'fix_embedding': True}
header_config = {'dropout': 0.5, 'hidden_dim': 64}
assert header_config['hidden_dim'] == net_config['hidden_dim'] or header_config['hidden_dim'] == net_config['hidden_dim'] * 2

Auxiliary functions and classes definition

In [7]:
def common_words(text):
    appear = {}
    common = set()
    COMMON_TIMES = 5
    for sentence in text:
        for word in sentence:
            if (len(word) == 0 or word[0] == '@'):
                continue
            if (word not in common):
                if (word not in appear):
                    appear[word] = 0
                else:
                    appear[word] += 1
                if (appear[word] == COMMON_TIMES):
                    common.add(word)
                    del appear[word]
    print(len(common), common)
    return common


def parsing_text(text, common):
    common_count = 0
    total_count = 0
    for i in range (len(text)):
        total_count += len(text[i])
        new_sentence = []
        for word in text[i]:
            if (word == ""):
                continue
            if (word[-1] in [',', '.', '!', '?']):
                temp = word[-1]
                if (word in common):
                    new_sentence.append(word.replace(temp, ""))
                else:
                    new_sentence.append("peppa_pig")
                new_sentence.append(temp)
            elif (word in common):
                new_sentence.append(word)
            elif (word[0] == '@'):
                new_sentence.append("@")
            else:
                new_sentence.append("peppa_pig")
        if (len(new_sentence) > 100):
            new_sentence = new_sentence[0:99]
        text[i] = new_sentence
        common_count += len(text[i])
    #print("total length before deleting: ", total_count)
    #print("total length after deleting: ", common_count)
    return text

def load_train_label(path='HW4_dataset/train.csv'):
    tra_lb_pd = pd.read_csv(path)
    label = torch.FloatTensor(tra_lb_pd['label'].values)
    idx = tra_lb_pd['id'].tolist()
    text = [s.split(' ') for s in tra_lb_pd['text'].tolist()]
    return idx, text, label
    text.replace("# ", ", ")

def load_train_nolabel(path='HW4_dataset/train_nolabel.csv'):
    tra_nlb_pd = pd.read_csv(path)
    text = [s.split(' ') for s in tra_nlb_pd['text'].tolist()]
    return text

def load_test(path='HW4_dataset/test.csv'):
    tst_pd = pd.read_csv(path)
    idx = tst_pd['id'].tolist()
    text = [s.split(' ') for s in tst_pd['text'].tolist()]
    return idx, text



train_idx, train_label_text, label = load_train_label('HW4_dataset/train.csv')
test_idx, test_text = load_test('HW4_dataset/test.csv')
train_nolabel_text = load_train_nolabel('HW4_dataset/train_nolabel.csv')

print("train data size:", len(train_idx))
print("test data size:", len(test_idx))
for i in [1234, 2206, 3998, 5561, 55988]:
  print(train_label_text[i], label[i])

common_word = common_words(train_label_text)
train_label_text = parsing_text(train_label_text, common_word)
train_nolabel_text = parsing_text(train_nolabel_text, common_word)
test_text = parsing_text(test_text, common_word)

for i in [1234, 2206, 3998, 5561, 55988]:
  print(train_label_text[i], label[i])

train data size: 170000
test data size: 40000
['@questlove', 'Great', 'live', 'show', 'last', 'time', 'I', 'seen', 'ya!', ''] tensor(1.)
['@xfallenxangelxx', 'you', 'take', 'it', 'in', 'the', 'eye', ''] tensor(1.)
['@doesthishurt', 'not', 'great', ''] tensor(0.)
['@alexm237', 'thanks', 'again', '', "I'll", 'txt', 'u', 'in', 'a', 'bit'] tensor(1.)
['convertible', 'O.o', 'what', 'kind?', 'can', 'we', 'have', 'some', 'sparks', 'to?', 'i', 'like', 'sparks', ''] tensor(1.)
['@', 'Great', 'live', 'show', 'last', 'time', 'I', 'seen', 'ya', '!'] tensor(1.)
['@', 'you', 'take', 'it', 'in', 'the', 'eye'] tensor(1.)
['@', 'not', 'great'] tensor(0.)
['@', 'thanks', 'again', "I'll", 'txt', 'u', 'in', 'a', 'bit'] tensor(1.)
['peppa_pig', 'O.o', 'what', 'kind', '?', 'can', 'we', 'have', 'some', 'peppa_pig', 'to', '?', 'i', 'like', 'peppa_pig'] tensor(1.)


In [8]:
class Preprocessor:
    def __init__(self, sentences, w2v_config):
        self.sentences = sentences
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []
        self.build_word2vec(sentences, **w2v_config)
        
    def build_word2vec(self, x, path, dim):
        if os.path.isfile(path):
            print("loading word2vec model ...")
            w2v_model = Word2Vec.load(path)
        else:
            print("training word2vec model ...")
            w2v_model = Word2Vec(x, size=dim, window=5, min_count=2, workers=12, iter=2, sg=1)
            print("saving word2vec model ...")
            w2v_model.save(path)
            
        self.embedding_dim = w2v_model.vector_size
        for i, word in enumerate(w2v_model.wv.vocab):
            #e.g. self.word2index['he'] = 1 
            #e.g. self.index2word[1] = 'he'
            #e.g. self.vectors[1] = 'he' vector
            
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(w2v_model.wv[word])
        
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        self.add_embedding('<PAD>')
        self.add_embedding('<UNK>')
        print("total words: {}".format(len(self.embedding_matrix)))
        
    def add_embedding(self, word):
        # 把 word 加進 embedding，並賦予他一個隨機生成的 representation vector
        # word 只會是 "<PAD>" 或 "<UNK>"
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)   
        
    def sentence2idx(self, sentence):
        sentence_idx = []
        for word in sentence:
            if word in self.word2idx.keys():
                sentence_idx.append(self.word2idx[word])
            else:
                sentence_idx.append(self.word2idx["<UNK>"])
        return torch.LongTensor(sentence_idx)

preprocessor = Preprocessor(train_label_text, w2v_config)
print(1000, len(train_label_text), train_label_text[3356])
print(2000, len(preprocessor.sentences), preprocessor.sentences[3356])
print(3000, preprocessor.sentence2idx(train_label_text[3356]))
print(4000, len(preprocessor.idx2word), preprocessor.idx2word[2340])
print(5000, len(preprocessor.word2idx), preprocessor.word2idx["seems"])
print(6000, preprocessor.embedding_matrix.size(), preprocessor.embedding_matrix)
print(7000, preprocessor.embedding_matrix[2340])



train_idx, valid_idx, train_label_text, valid_label_text, train_label, valid_label = train_test_split(train_idx, train_label_text, label, test_size=0.2)
print(8000, len(train_idx), train_label.size())



#TODO: add nolabel
#for i in range (len(train_nolabel_text)):
#    train_idx.append(None)
#    train_label_text.append(train_nolabel_text[i])

#temp = [train_label.unsqueeze_(0)]
#temp.append(torch.tensor([0 for i in range(len(train_nolabel_text))]).unsqueeze_(0))
#print(temp)
#train_label = torch.cat(temp, 0)
#print(len(train_idx), train_label.size())


training word2vec model ...
saving word2vec model ...
total words: 13602
1000 170000 ['I', 'would', 'be', 'ever', 'so', 'grateful', 'if', 'people', 'could', 'watch', 'my', 'video', ',', 'peppa_pig', 'peppa_pig', 'thank', 'you']
2000 170000 ['I', 'would', 'be', 'ever', 'so', 'grateful', 'if', 'people', 'could', 'watch', 'my', 'video', ',', 'peppa_pig', 'peppa_pig', 'thank', 'you']
3000 tensor([  34,  316,   83,  428,  272, 5070,  758,  388,  591,  398,    3, 1320,
          56,    5,    5,  179,   20])
4000 13602 supporting
5000 13602 1984
6000 torch.Size([13602, 128]) tensor([[ 8.3140e-02,  2.2528e-01,  3.4635e-02,  ...,  4.4835e-02,
         -5.3910e-02,  1.0514e-02],
        [-5.8186e-02,  1.5624e-01, -1.4113e-02,  ...,  1.1690e-01,
          6.5398e-02, -1.0914e-01],
        [-3.5234e-01,  6.3453e-01, -3.9953e-01,  ..., -7.6583e-04,
         -7.1426e-02, -2.5929e-01],
        ...,
        [-2.2114e-02, -8.8370e-03,  9.2222e-02,  ...,  5.5194e-02,
          5.5352e-02, -7.1455e-03],




In [9]:
class TwitterDataset(torch.utils.data.Dataset):
    def __init__(self, id_list, sentences, labels, preprocessor):
        self.id_list = id_list
        self.sentences = sentences
        self.labels = labels
        self.preprocessor = preprocessor
    
    def __getitem__(self, idx):
        if self.labels is None: return self.id_list[idx], self.preprocessor.sentence2idx(self.sentences[idx])
        return self.id_list[idx], self.preprocessor.sentence2idx(self.sentences[idx]), self.labels[idx]
    
    def __len__(self):
        return len(self.sentences)
    
    def collate_fn(self, data):
        id_list = torch.LongTensor([d[0] for d in data])
        lengths = torch.LongTensor([len(d[1]) for d in data])
        texts = pad_sequence(
            [d[1] for d in data], batch_first=True).contiguous()
     
        if self.labels == None: 
            # print("none, ", id_list, lengths, texts)
            return id_list, lengths, texts
        else:
          labels = torch.FloatTensor([d[2] for d in data])
          if (random.randint(1,1000000) == 662837):
              print("not none")
              print(id_list)
              print(lengths)
              print(texts)
              print(labels)
          return id_list, lengths, texts, labels


train_dataset = TwitterDataset(train_idx, train_label_text, train_label, preprocessor)
#train_nodataset = TwitterDataset(None, train_nolabel_text, None, preprocessor)
valid_dataset = TwitterDataset(valid_idx, valid_label_text, valid_label, preprocessor)
test_dataset = TwitterDataset(test_idx, test_text, None, preprocessor)

print(len(train_dataset.id_list))
print(train_dataset.id_list[96112], train_dataset.sentences[96112], train_dataset.labels[96112])
print(train_dataset.__len__())
print(train_dataset.__getitem__(96112))
# print(train_dataset.collate_fn())

train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                            batch_size = BATCH_SIZE,
                                            shuffle = True,
                                            collate_fn = train_dataset.collate_fn,
                                            num_workers = 8)
valid_loader = torch.utils.data.DataLoader(dataset = valid_dataset,
                                            batch_size = BATCH_SIZE,
                                            shuffle = False,
                                            collate_fn = valid_dataset.collate_fn,
                                            num_workers = 8)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                            batch_size = BATCH_SIZE,
                                            shuffle = False,
                                            collate_fn = test_dataset.collate_fn,
                                            num_workers = 8)

136000
169833 ["I'm", 'out', 'of', 'peppa_pig', '!', "Can't", 'install', 'Windows', '7'] tensor(0.)
136000
(169833, tensor([ 116,  274,   12,    5,   18,  704, 8822, 3515,  865]), tensor(0.))


  cpuset_checked))


Definition of RNN network

In [10]:
class Backbone(torch.nn.Module):
    def __init__(self, embedding, hidden_dim, num_layers, bidirectional, fix_embedding=True):
        super(Backbone, self).__init__()
        #self.embedding = torch.nn.Embedding(embedding.size(0),embedding.size(1))
        #self.embedding.weight = torch.nn.Parameter(embedding)
        #self.embedding = torch.nn.Embedding.from_pretrained(torch.FloatTensor(Word2Vec.load("w2v.model").wv.vectors))
        #self.embedding.weight = torch.nn.Parameter(torch.cat([self.embedding.weight, torch.zeros(1,64), torch.zeros(1,64)], 0))
        self.embedding = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding))
        #self.embedding = keras.layers.Embedding(input_dim=embedding.size(0),
        #                    output_dim=embedding.size(1),
        #                    weights=[embedding],
        #                    trainable = False)
        #self.embedding.weight.requires_grad = False if fix_embedding else True
        self.net = torch.nn.RNN(embedding.size(1), hidden_dim, num_layers=num_layers, \
                                 bidirectional=bidirectional, batch_first=True)
        self.count = 0
        
    def forward(self, inputs): # torch.Size([256, length])
            
        inputs2 = self.embedding(inputs) # torch.Size([256, length, 64])
        x, _ = self.net(inputs2) # torch.Size([256, length, 32])

        self.count += 1
        if (self.count == -1):
            print(self.embedding.weight, self.embedding.weight.size())
        #if (self.count == -1):
        #    print(inputs.size(), inputs) 
        #    print(inputs2.size(), inputs2) 
        #    print(x.size(), x) 
        return x
    
class Header(torch.nn.Module):
    def __init__(self, dropout, hidden_dim):
        super(Header, self).__init__()
        # TODO: you should design your classifier module
        self.classifier = torch.nn.Sequential(
                            torch.nn.Linear(hidden_dim,1),
                            torch.nn.Sigmoid(),
                            )
        self.count = 0
        
    @ torch.no_grad()
    def _get_length_masks(self, lengths):
        # lengths: (batch_size, ) in cuda
        ascending = torch.arange(MAX_POSITIONS_LEN)[:lengths.max().item()].unsqueeze(
            0).expand(len(lengths), -1).to(lengths.device)
        length_masks = (ascending < lengths.unsqueeze(-1)).unsqueeze(-1)
        return length_masks
    
    def forward(self, inputs, lengths):
        # the input shape should be (N, L, D∗H)
        pad_mask = self._get_length_masks(lengths) # 256, 27, 1 (bool)
        inputs2 = inputs * pad_mask # 256, 27, 32
        inputs3 = inputs2.sum(dim=1) # 256, 32
        out = self.classifier(inputs3).squeeze() # 256

        self.count += 1
        if (self.count == -1):
            print(pad_mask.size(), pad_mask) # 256, 27, 1 (bool)
            print(inputs.size(), inputs) # 256, 27, 32
            print(inputs2.size(), inputs2) # 256, 32
            print(inputs3.size(), inputs3) # 256, 32
            print(out.size(), out) # 256
        return out



backbone = Backbone(preprocessor.embedding_matrix, **net_config)
header = Header(**header_config)
#print(preprocessor.embedding_matrix, preprocessor.embedding_matrix.size())
#print(backbone.embedding.weight, backbone.embedding.weight.size())
#tempmodel = torch.load("model.pth")
#backbone = tempmodel["backbone"]
#header = tempmodel["header"]

Trainer

In [20]:
def train(train_loader, backbone, header, optimizer, criterion, device, epoch):

    total_loss = []
    total_acc = []
    
    for i, (idx_list, lengths, texts, labels) in enumerate(train_loader):
        lengths, inputs, labels = lengths.to(device), texts.to(device), labels.to(device)
        
        optimizer.zero_grad()
        if not backbone is None:
            inputs = backbone(inputs)
        soft_predicted = header(inputs, lengths)
        loss = criterion(soft_predicted, labels)
        loss.backward()
        optimizer.step()
        
        with torch.no_grad():
            hard_predicted = (soft_predicted >= 0.5).int()
            correct = sum(hard_predicted == labels).item()
            batch_size = len(labels)
        
            print('[Validation in epoch {:}] loss:{:.3f} acc:{:.3f}'.format(epoch+1, np.mean(total_loss), np.mean(total_acc)), end='\r')
    backbone.train()
    header.train()
    return np.mean(total_loss), np.mean(total_acc)

def valid(valid_loader, backbone, header, criterion, device, epoch):
    backbone.eval()
    header.eval()
    with torch.no_grad():
        total_loss = []
        total_acc = []
        
        for i, (idx_list, lengths, texts, labels) in enumerate(valid_loader):
            lengths, inputs, labels = lengths.to(device), texts.to(device), labels.to(device)

            if not backbone is None:
                inputs = backbone(inputs)
            soft_predicted = header(inputs, lengths)
            loss = criterion(soft_predicted, labels)
            total_loss.append(loss.item())
            
            hard_predicted = (soft_predicted >= 0.5).int()
            correct = sum(hard_predicted == labels).item()
            acc = correct * 100 / len(labels)
            total_acc.append(acc)
            
            print('[Validation in epoch {:}] loss:{:.3f} acc:{:.3f}'.format(epoch+1, np.mean(total_loss), np.mean(total_acc)), end='\r')
    backbone.train()
    header.train()
    return np.mean(total_loss), np.mean(total_acc)

            
def run_training(train_loader, valid_loader, backbone, header, epoch_num, lr, device, model_dir): 
    def check_point(backbone, header, loss, acc, model_dir):
        for i in range (len(acc)-1):
            if (acc[i] > acc[-1]):
                print("target: ", acc[i], "now: ", acc[-1])
                return
        print("model saved")
        torch.save({'backbone': backbone, 'header': header}, model_dir)

    def is_stop(loss, acc):
        if (len(loss) < 5):
            return False
        if ((loss[-1] > loss[-2]) and (loss[-2] > loss[-3]) and (loss[-3] > loss[-4]) and (loss[-4] > loss[-5])):
            return True
        return False

    
    if backbone is None:
        trainable_paras = header.parameters()
    else:
        trainable_paras = list(backbone.parameters()) + list(header.parameters())
        
    optimizer = torch.optim.SGD(trainable_paras, lr=lr)
    
    backbone.train()
    header.train()
    backbone = backbone.to(device)
    header = header.to(device)
    criterion = torch.nn.BCELoss()
    loss_record = []
    acc_record = []
    for epoch in range(100):
        train(train_loader, backbone, header, optimizer, criterion, device, epoch)
        loss, acc = valid(valid_loader, backbone, header, criterion, device, epoch)
        loss_record.append(loss)
        acc_record.append(acc)
        print('[Validation in epoch {:}] loss:{:.3f} acc:{:.3f} '.format(epoch+1, loss, acc))
        check_point(backbone, header, loss_record, acc_record, model_dir)
        if is_stop(loss_record, acc_record):
            break


run_training(train_loader, valid_loader, backbone, header, EPOCH_NUM, lr, device, MODEL_DIR)

[Validation in epoch 1] loss:0.653 acc:62.376 
model saved
[Validation in epoch 2] loss:0.636 acc:64.727 
model saved
[Validation in epoch 3] loss:0.628 acc:65.813 
model saved
[Validation in epoch 4] loss:0.623 acc:66.380 
model saved
[Validation in epoch 5] loss:0.620 acc:66.698 
model saved
[Validation in epoch 6] loss:0.617 acc:66.874 
model saved
[Validation in epoch 7] loss:0.615 acc:67.132 
model saved
[Validation in epoch 8] loss:0.613 acc:67.342 
model saved
[Validation in epoch 9] loss:0.611 acc:67.577 
model saved
[Validation in epoch 10] loss:0.610 acc:67.579 
model saved
[Validation in epoch 11] loss:0.609 acc:67.734 
model saved
[Validation in epoch 12] loss:0.607 acc:67.951 
model saved
[Validation in epoch 13] loss:0.608 acc:67.810 
target:  67.95090189415848 now:  67.81015037593986
[Validation in epoch 14] loss:0.605 acc:68.199 
model saved
[Validation in epoch 15] loss:0.604 acc:68.201 
model saved
[Validation in epoch 16] loss:0.603 acc:68.421 
model saved
[Validatio

Testing

In [10]:
def run_testing(test_loader, backbone, header, device, output_path):
  with open(output_path, 'w') as f:
    backbone.eval()
    header.eval()
    writer = csv.writer(f)
    writer.writerow(['id', 'label'])
    with torch.no_grad():
      for i, (idx_list, lengths, texts) in enumerate(test_loader):
        lengths, inputs = lengths.to(device), texts.to(device)
        if not backbone is None:
          inputs = backbone(inputs)
        soft_predicted = header(inputs, lengths)
        hard_predicted = (soft_predicted >= 0.5).int()
        for i, p in zip(idx_list, hard_predicted):
          writer.writerow([str(i.item()), str(p.item())])

Make a submission file

(Note: In principle, you don't need to modify this part, and please make sure that you follow the correct format of the produced files.)

In [11]:
EXP_name = "SampleCode"
pred_file = f'{EXP_name}-pred.csv'
run_testing(test_loader, backbone, header, device, pred_file)

In [12]:
from google.colab import files
files.download(pred_file)
files.download(MODEL_DIR)
files.download("w2v.model")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [30]:
tempmodel = torch.load("model.pth")
backbone = tempmodel["backbone"]
header = tempmodel["header"]

texts = ["You know the rules, and so do I.", "I know the rules, and so do You.", "Looks great, Zuck"]
temp_idx = [i for i in range (3)]
temp_text = [text.split(' ') for text in texts]
temp_text = parsing_text(temp_text, common_word)

temp_dataset = TwitterDataset(temp_idx, temp_text, None, preprocessor)
temp_loader = torch.utils.data.DataLoader(dataset = temp_dataset,
                                            batch_size = BATCH_SIZE,
                                            shuffle = False,
                                            collate_fn = test_dataset.collate_fn,
                                            num_workers = 8)
tempfile = "a.csv"
run_testing(temp_loader, backbone, header, device, tempfile)

print("----------Input sentences----------")
for text in texts:
  print(text)

print("----------Outputs of my model----------")
with open(tempfile, newline='') as csvfile:
  rows = csv.reader(csvfile)
  for row in rows:
    print(row)

----------Input sentences----------
You know the rules, and so do I.
I know the rules, and so do You.
Looks great, Zuck
----------Outputs of my model----------
['id', 'label']
['0', '0']
['1', '1']
['2', '1']


# Good luck for your programming assignments! 
If you have any questions, feel free to send e-mails to ntueemlta2022@gmail.com / b08202033@ntu.edu.tw. Of course, welcome to make use of the TA hours as well. 