In [1]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from utils.utils import getDevice
from datasets.newsGroupDataset import NewsGroupDataset
import numpy as np
from tqdm import tqdm
import spacy
import string
import re
import torch
from torch.utils.data import DataLoader
from model.textClassificationModel import TextClassificationModel
import time
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torch import nn



  from .autonotebook import tqdm as notebook_tqdm


(7, "From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n")


In [2]:
en = spacy.load('en_core_web_trf')

stopwords = en.Defaults.stop_words

print(stopwords)
print(len(stopwords))

{'our', 'then', 'throughout', 'so', 'six', 'latter', 'one', 'name', 'four', 'every', 'further', 'off', 'say', 'without', 'other', 'sixty', 'sometimes', 'elsewhere', 'ever', 'thereby', 'will', 'up', 'nowhere', 'but', 'seem', 'thru', 'can', 'everything', 'well', 'are', 'what', 'empty', 'put', 'each', 'move', 'used', 'when', 'also', '’ll', 'due', 'of', 'being', 'whereby', 'was', 'nor', 'once', 'become', 'whereupon', 'ourselves', 'does', 'why', 'hereupon', 'since', 'moreover', 'neither', 'more', '‘m', 'herself', '’re', 'an', 'yourselves', 'done', 'except', 'someone', 'yours', 'otherwise', "'d", 'very', 'own', 'that', 'here', 'around', 'mine', 'already', 'eight', 'their', 'all', 'amongst', 'she', 'were', 'above', 'anyone', 'fifty', 'seeming', 'take', 'thus', 'whereas', 'regarding', 'while', 'meanwhile', 'within', 'behind', 'i', 'about', 'still', 'something', 'again', 'as', 'how', 'among', 'during', 'hers', 'would', 'twelve', 'n‘t', 'toward', 'whoever', 'for', 'whereafter', 'noone', 'her', '

In [3]:




device = getDevice()

data_iter = NewsGroupDataset(subset='train')
# print(data_iter[0])

#tokenization

def tokenizer(text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    tokens= [token.text for token in en.tokenizer(nopunct)]
    return [x for x in tokens if x not in stopwords]


def yield_tokens(data_iter):
    for _, text in tqdm(data_iter):
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(data_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

print(vocab.__getitem__('i'.lower()))

100%|██████████| 11314/11314 [00:24<00:00, 457.01it/s]


0


In [4]:
text_pipeline = lambda x: vocab(tokenizer(x))

print(text_pipeline("enhanced"))

[4543]


In [5]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
         label_list.append(_label)
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

train_iter = NewsGroupDataset(subset='train')
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

# a = 0
# for label, text, offset in tqdm(dataloader):
#     a = a+1

In [6]:
num_class = len(list(train_iter.target_names))
vocab_size = len(vocab)
emsize = 300
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)
print(vocab_size)

# Hyperparameters
EPOCHS = 15 # epoch
LR = 5  # learning rate
BATCH_SIZE = 32 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)

89000


In [7]:

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(tqdm(dataloader)):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(tqdm(dataloader)):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [8]:
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None
train_dataset = NewsGroupDataset(subset='train')
test_dataset = NewsGroupDataset(subset='test')
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

100%|██████████| 336/336 [00:22<00:00, 14.70it/s]
100%|██████████| 18/18 [00:01<00:00, 15.83it/s]


-----------------------------------------------------------
| end of epoch   1 | time: 24.00s | valid accuracy    0.401 
-----------------------------------------------------------


100%|██████████| 336/336 [00:21<00:00, 15.89it/s]
100%|██████████| 18/18 [00:01<00:00, 15.65it/s]


-----------------------------------------------------------
| end of epoch   2 | time: 22.30s | valid accuracy    0.468 
-----------------------------------------------------------


100%|██████████| 336/336 [00:21<00:00, 15.90it/s]
100%|██████████| 18/18 [00:01<00:00, 15.49it/s]


-----------------------------------------------------------
| end of epoch   3 | time: 22.30s | valid accuracy    0.712 
-----------------------------------------------------------


100%|██████████| 336/336 [00:21<00:00, 15.90it/s]
100%|██████████| 18/18 [00:01<00:00, 15.50it/s]


-----------------------------------------------------------
| end of epoch   4 | time: 22.30s | valid accuracy    0.763 
-----------------------------------------------------------


100%|██████████| 336/336 [00:21<00:00, 15.95it/s]
100%|██████████| 18/18 [00:01<00:00, 15.84it/s]


-----------------------------------------------------------
| end of epoch   5 | time: 22.21s | valid accuracy    0.786 
-----------------------------------------------------------


100%|██████████| 336/336 [00:21<00:00, 15.84it/s]
100%|██████████| 18/18 [00:01<00:00, 16.00it/s]


-----------------------------------------------------------
| end of epoch   6 | time: 22.34s | valid accuracy    0.813 
-----------------------------------------------------------


100%|██████████| 336/336 [00:21<00:00, 15.95it/s]
100%|██████████| 18/18 [00:01<00:00, 16.11it/s]


-----------------------------------------------------------
| end of epoch   7 | time: 22.18s | valid accuracy    0.820 
-----------------------------------------------------------


100%|██████████| 336/336 [00:21<00:00, 15.86it/s]
100%|██████████| 18/18 [00:01<00:00, 15.90it/s]


-----------------------------------------------------------
| end of epoch   8 | time: 22.33s | valid accuracy    0.855 
-----------------------------------------------------------


100%|██████████| 336/336 [00:21<00:00, 15.95it/s]
100%|██████████| 18/18 [00:01<00:00, 15.94it/s]


-----------------------------------------------------------
| end of epoch   9 | time: 22.20s | valid accuracy    0.857 
-----------------------------------------------------------


100%|██████████| 336/336 [00:20<00:00, 16.04it/s]
100%|██████████| 18/18 [00:01<00:00, 15.92it/s]


-----------------------------------------------------------
| end of epoch  10 | time: 22.08s | valid accuracy    0.848 
-----------------------------------------------------------


100%|██████████| 336/336 [00:21<00:00, 15.96it/s]
100%|██████████| 18/18 [00:01<00:00, 15.94it/s]


-----------------------------------------------------------
| end of epoch  11 | time: 22.19s | valid accuracy    0.882 
-----------------------------------------------------------


100%|██████████| 336/336 [00:21<00:00, 15.96it/s]
100%|██████████| 18/18 [00:01<00:00, 15.89it/s]


-----------------------------------------------------------
| end of epoch  12 | time: 22.19s | valid accuracy    0.880 
-----------------------------------------------------------


100%|██████████| 336/336 [00:21<00:00, 15.93it/s]
100%|██████████| 18/18 [00:01<00:00, 16.15it/s]


-----------------------------------------------------------
| end of epoch  13 | time: 22.21s | valid accuracy    0.882 
-----------------------------------------------------------


100%|██████████| 336/336 [00:21<00:00, 15.98it/s]
100%|██████████| 18/18 [00:01<00:00, 15.96it/s]


-----------------------------------------------------------
| end of epoch  14 | time: 22.16s | valid accuracy    0.883 
-----------------------------------------------------------


100%|██████████| 336/336 [00:21<00:00, 15.99it/s]
100%|██████████| 18/18 [00:01<00:00, 15.73it/s]

-----------------------------------------------------------
| end of epoch  15 | time: 22.17s | valid accuracy    0.883 
-----------------------------------------------------------





In [9]:
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.


100%|██████████| 236/236 [00:14<00:00, 16.45it/s]

test accuracy    0.790



