In [1]:
# !pip install --upgrade pip

In [2]:
# pip install -q transformers==3

In [3]:
import pandas as pd
import numpy as np
import random
import json, re
from tqdm.notebook import tqdm
from uuid import uuid4
import tarfile
import io
import os
import sys
import datetime
import argparse

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torchtext.data as data
import torchtext.datasets as datasets
import torch.autograd as autograd
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator #Provides several easy to use Swiss army knife iterators.
from torchtext.vocab import GloVe

## PyTorch Transformer
from transformers import MobileBertTokenizer, MobileBertModel, AdamW, get_linear_schedule_with_warmup, MobileBertForSequenceClassification, MobileBertConfig

import gc
import warnings
warnings.filterwarnings('ignore')

import logging
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

## Mount Drive into Colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Root of the mounted Google Drive; the preprocessed train.csv is written under it.
data_path = "/content/drive/MyDrive"
# Output folder below the Drive root (not referenced by the other visible cells).
output_path = f"{data_path}/outputs"

In [5]:
## Importing Datasets

In [6]:
# Location of the raw quotes CSV inside the mounted Drive root.
dataset_path = f"{data_path}/BalancedQuotesClean.csv"

In [7]:
# Load the quotes CSV; later cells read its `label` and `quote` columns.
dataset = pd.read_csv(dataset_path)

In [169]:
# dataset.head()

In [9]:
# Give every distinct whitespace-separated label token a consecutive integer id,
# numbered in order of first appearance.
label_to_ix = {}
for raw_label in dataset.label:
    for token in raw_label.split():
        # setdefault only inserts when the token is new, so existing ids are kept.
        label_to_ix.setdefault(token, len(label_to_ix))
# label_to_ix

In [10]:
# Label names in id order (dict insertion order), and how many there are.
labels = [*label_to_ix]
num_labels = len(label_to_ix)

In [11]:
# Work on a copy so the raw `dataset` frame stays untouched.
df = dataset.copy()
# Replace the `quote` column with a `text` column holding at most the first 512
# whitespace-separated words of each quote (runs of whitespace collapse to one space).
df['text'] = df.pop('quote').apply(lambda quote: " ".join(quote.split()[:512]))
# df['label'] = df['label'].map(label_to_ix)
df.head()

Unnamed: 0,label,text
0,aspirations,To sin offers repentance and forgiveness not t...
1,aspirations,Be calm in arguing for fierceness makes error ...
2,aspirations,For all of its uncertainty we cannot flee the ...
3,aspirations,The way that a handful of corporations in Los ...
4,aspirations,Dreams have only one owner at a time. That's w...


In [12]:
# Persist the preprocessed frame; `customdata` reads this file back by column name.
new_path = f"{data_path}/train.csv"
# index=False: the row index is not data. Writing it adds one more unnamed column on
# every save/load round trip (the source CSV already carries an "Unnamed: 0" column).
df.to_csv(new_path, index=False)

In [159]:
# Run a full garbage-collection pass; the cell displays the value gc.collect() returns.
gc.collect()

126252

In [160]:
## Check if CUDA is available
# Set the random seed and select the device (GPU when available).
# Use a fixed seed so that the model's results are reproducible.
seed_value = 876

def seed_all(seed_value):
    """Seed Python, NumPy and PyTorch RNGs and report the device to compute on.

    Args:
        seed_value: Integer seed handed to every random number generator.

    Returns:
        ``torch.device('cuda:0')`` when CUDA is available (the CUDA RNGs are
        seeded and cuDNN is switched to deterministic, non-benchmark mode in
        that case), otherwise ``torch.device('cpu')``.
    """
    # CPU-side generators: Python, NumPy, PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return torch.device('cpu')

    # GPU-side generators (current device, then all devices).
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Trade cuDNN speed for run-to-run determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    return torch.device('cuda:0')

In [161]:
def parse_arguments(argv=None):
    """Build the hyper-parameter namespace for classifier training/evaluation.

    Args:
        argv: Optional sequence of command-line style tokens, e.g.
            ``['-lr', '0.01', '-model', 'BasicLSTM']``. When omitted, nothing is
            parsed and every option keeps its default. ``sys.argv`` is
            deliberately never read, because inside a notebook it holds the
            kernel's own arguments.

    Returns:
        argparse.Namespace: one attribute per option; dashes in option names
        become underscores (``-batch-size`` -> ``batch_size``).
    """
    p = argparse.ArgumentParser(description='Hyperparams for Classifier Training')
    # learning
    p.add_argument('-lr', type=float, default=0.001,
                   help='initial learning rate')
    p.add_argument('-epochs', type=int, default=5,
                   help='number of epochs for train')
    p.add_argument('-batch-size', type=int, default=32,
                   help='batch size for training')
    p.add_argument('-log-interval',  type=int, default=25,
                   help='how many steps to wait before logging')
    p.add_argument('-test-interval', type=int, default=200,
                   help='how many steps to wait before testing')
    p.add_argument('-save-interval', type=int, default=500,
                   help='how many steps to wait before saving')
    p.add_argument('-save-dir', type=str, default='snapshot',
                   help='where to save the snapshot')
    # model
    p.add_argument('-model', type=str, default='ConvText',
                   help='model name')
    p.add_argument('-embed-dim', type=int, default=128,
                   help='word embedding dimensions')
    p.add_argument('-dropout', type=float, default=0.5,
                   help='the probability for dropout')
    # model - LSTM
    p.add_argument('-hidden-dim', type=int, default=128,
                   help='hidden state size')
    p.add_argument('-n-layers', type=int, default=3,
                   help='LSTM layer num')
    p.add_argument('-attention-dim', type=int, default=10,
                   help='attention dimensions')
    # model - CNN
    p.add_argument('-max-norm', type=float, default=3.0,
                   help='l2 constraint of parameters [default: 3.0]')
    p.add_argument('-n-kernel', type=int, default=100,
                   help='number of each kind of kernel')
    p.add_argument('-kernel-sizes', type=str, default='1',
                   help='comma-separated kernel size to use for convolution')
    p.add_argument('-static', action='store_true', default=False,
                   help='fix the embedding')
    # device
    p.add_argument('-device', type=int, default=-1,
                   help='device to use for iterate data, -1 mean cpu')
    # option
    # The help text used to claim "[default: None]" although a concrete path is the
    # default; let argparse interpolate the real value instead.
    p.add_argument('-snapshot', type=str, default='/content/snapshot/2020-12-21_02-52-26/snapshot_steps13000.pt',
                   help='filename of model snapshot [default: %(default)s]')
    p.add_argument('-predict', type=str, default="I saw the best minds of my generation destroyed by madness, starving hysterical naked.",
                   help='predict the sentence given')
    # NOTE(review): store_true combined with default=True means this flag can never
    # be turned off from the token list; kept as-is to preserve current behaviour.
    p.add_argument('-test', action='store_true', default=True,
                   help='train or test')
    # An explicit (possibly empty) list keeps argparse away from sys.argv.
    return p.parse_args([] if argv is None else list(argv))

In [162]:
# load SST dataset
def sst(TEXT, LABEL, batch_size):
    """Return (train, val, test) bucket iterators over the fine-grained SST dataset.

    Both fields get their vocabulary built from all three splits. Only the
    training iterator uses ``batch_size``; the validation and test iterators
    are each configured with a batch size equal to the size of their split.
    """
    train_split, val_split, test_split = datasets.SST.splits(
        TEXT, LABEL, fine_grained=True)

    for field in (TEXT, LABEL):
        field.build_vocab(train_split, val_split, test_split)

    per_split_sizes = (batch_size, len(val_split), len(test_split))
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train_split, val_split, test_split), batch_sizes=per_split_sizes)
    return train_iter, val_iter, test_iter


# load imdb dataset
def imdb(TEXT, LABEL, batch_size):
    """Return (train, test) bucket iterators over the IMDB dataset.

    Vocabularies are built from the training split only; the text vocabulary is
    loaded with 300-dimensional GloVe 6B vectors. The sizes of both splits are
    printed before returning.
    """
    train_split, test_split = datasets.IMDB.splits(TEXT, LABEL)

    glove_vectors = GloVe(name='6B', dim=300)
    TEXT.build_vocab(train_split, vectors=glove_vectors)
    LABEL.build_vocab(train_split)

    iterator_options = dict(batch_size=batch_size, shuffle=True, repeat=False)
    train_iter, test_iter = data.BucketIterator.splits(
        (train_split, test_split), **iterator_options)

    print('len(train):', len(train_split))
    print('len(test):', len(test_split))
    return train_iter, test_iter




# load custom dataset
def customdata(TEXT, LABEL, batch_size):
    """Build train/validation/test iterators from the CSV at the global ``new_path``.

    The CSV's ``text`` and ``label`` columns are bound to ``TEXT`` and ``LABEL``.
    The rows are divided with ``split_ratio=[0.50, 0.25, 0.25]``, stratified on
    ``label``. Vocabularies are built from the training split only, the text
    vocabulary with 300-dimensional GloVe 6B vectors.

    Args:
        TEXT: torchtext field used for the ``text`` column.
        LABEL: torchtext field used for the ``label`` column.
        batch_size: Batch size for all three iterators.

    Returns:
        Tuple ``(train_iter, val_iter, test_iter)``.
    """
    # Seeds every RNG with 456 and yields the device the batches are placed on.
    device = seed_all(456)

    # CSV column name -> (example attribute name, field).
    column_fields = {'text': ('text', TEXT), 'label': ('label', LABEL)}

    whole_dataset = TabularDataset(path=new_path,
                                   format='CSV',
                                   fields=column_fields,
                                   skip_header=False)
    train_data, val_data, test_data = whole_dataset.split(
        split_ratio=[0.50, 0.25, 0.25],
        stratified=True,
        strata_field='label')

    glove_vectors = GloVe(name='6B', dim=300)
    TEXT.build_vocab(train_data, vectors=glove_vectors)
    LABEL.build_vocab(train_data)

    # NOTE(review): shuffle=True and sort=True are both requested; presumably the
    # sort wins and batches arrive in length order -- confirm against torchtext.
    bucket_options = dict(batch_size=batch_size,
                          device=device,
                          shuffle=True,
                          sort_key=lambda example: len(example.text),
                          sort=True,
                          sort_within_batch=False)
    train_iter, val_iter = BucketIterator.splits((train_data, val_data),
                                                 **bucket_options)

    # The test split is iterated in file order: no shuffling, no sorting.
    test_iter = Iterator(test_data,
                         batch_size=batch_size,
                         device=device,
                         train=False,
                         shuffle=False,
                         sort=False)

    print('len(train):', len(train_data))
    print('len(test):', len(test_data))
    return train_iter, val_iter, test_iter

In [163]:
class AttentionLSTM(nn.Module):
    """LSTM classifier with additive attention over the per-step LSTM outputs.

    The attention score of step ``i`` is ``v^T tanh(m1(out_i) + m2(out_last))``;
    the softmax-weighted sum of the step outputs (the context) is concatenated
    with the last step output, projected, passed through dropout and classified.

    Args:
        args: Namespace providing ``n_layers``, ``hidden_dim`` and ``attention_dim``.
        n_vocab: Number of rows in the embedding table.
        embed_dim: Embedding width.
        n_classes: Number of output logits.
        dropout: Dropout probability used between LSTM layers and before the
            output layer.
    """
    def __init__(self, args, n_vocab, embed_dim, n_classes, dropout=0.2):
        super(AttentionLSTM, self).__init__()
        print("Building Attention LSTM model...")
        self.n_layers = args.n_layers
        self.hidden_dim = args.hidden_dim
        self.attention_dim = args.attention_dim
        # torch.Tensor(rows, cols) hands back uninitialised memory (possibly
        # NaN/inf), so the scoring vector is initialised explicitly.
        self.v = nn.Parameter(torch.empty(self.attention_dim, 1))
        nn.init.xavier_uniform_(self.v)
        self.m1 = nn.Linear(self.hidden_dim, self.attention_dim)
        self.m2 = nn.Linear(self.hidden_dim, self.attention_dim)

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.lstm = nn.LSTM(embed_dim, self.hidden_dim,
                            num_layers=self.n_layers,
                            dropout=dropout,
                            batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.n = nn.Linear(self.hidden_dim + self.hidden_dim, self.hidden_dim)
        self.output = nn.Linear(self.hidden_dim, n_classes)

    def attention(self, h, h_t, i_size, b_size):
        """Return the attention context vector, shape ``[b, hidden_dim]``.

        Args:
            h: LSTM outputs for every step, ``[b, i, hidden_dim]``.
            h_t: Query vector (the last step output), ``[b, hidden_dim]``.
            i_size: Sequence length; kept for interface compatibility, unused.
            b_size: Batch size; kept for interface compatibility, unused.
        """
        step_proj = self.m1(h)                  # [b, i, a]
        query_proj = self.m2(h_t).unsqueeze(1)  # [b, 1, a], broadcast over steps
        scores = torch.matmul(torch.tanh(step_proj + query_proj), self.v)  # [b, i, 1]
        # Normalise over the sequence axis, stated explicitly instead of relying
        # on softmax's deprecated implicit-dim choice.
        weights = F.softmax(scores, dim=1)
        context = torch.bmm(h.transpose(1, 2), weights)  # [b, h, 1]
        # Squeeze only the trailing axis so a batch of one keeps its batch axis.
        return context.squeeze(2)

    def forward(self, x):
        """Map token ids ``x`` of shape ``[b, i]`` to logits of shape ``[b, n_classes]``."""
        b_size, i_size = x.size(0), x.size(1)
        state = self._init_state(b_size)
        x = self.embed(x)  # [b, i, e]
        out, _ = self.lstm(x, state)  # out: [b, i, h]
        last = out[:, -1, :]
        c = self.attention(out, last, i_size, b_size)
        n = torch.tanh(self.n(torch.cat([c, last], 1)))
        # The dropout result must be kept; the module does not work in place.
        n = self.dropout(n)
        logit = self.output(n)
        return logit

    def _init_state(self, b_size=1):
        """Return zero ``(h_0, c_0)`` on the same device/dtype as the parameters."""
        weight = next(self.parameters())
        shape = (self.n_layers, b_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [164]:
class BasicLSTM(nn.Module):
    """Plain multi-layer LSTM classifier.

    Token ids are embedded, run through a batch-first LSTM, and the output of
    the last time step is passed through dropout and a linear layer.

    Args:
        args: Namespace providing ``n_layers`` and ``hidden_dim``.
        n_vocab: Number of rows in the embedding table.
        embed_dim: Embedding width.
        n_classes: Number of output logits.
        dropout: Dropout probability applied to the last hidden output.
    """
    def __init__(self, args, n_vocab, embed_dim, n_classes, dropout=0.5):
        super(BasicLSTM, self).__init__()
        print("Building Basic LSTM model...")
        self.n_layers = args.n_layers
        self.hidden_dim = args.hidden_dim

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(embed_dim, self.hidden_dim,
                            num_layers=self.n_layers, batch_first=True)
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        """Map token ids ``x`` of shape ``[b, i]`` to logits of shape ``[b, n_classes]``."""
        b_size = x.size(0)
        h_0 = self._init_state(b_size=b_size)
        x = self.embed(x)  #  [b, i] -> [b, i, e]
        x, _ = self.lstm(x, h_0)  # [b, i, h] because the LSTM is batch_first
        h_t = x[:, -1, :]
        # Keep the dropout result; calling the module does not modify h_t in place.
        h_t = self.dropout(h_t)
        logit = self.out(h_t)  # [b, h] -> [b, o]
        return logit

    def _init_state(self, b_size=1):
        """Return zero ``(h_0, c_0)`` on the same device/dtype as the parameters."""
        weight = next(self.parameters())
        shape = (self.n_layers, b_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [165]:
class  ConvText(nn.Module):
    """
        Convolutional Neural Networks for Sentence Classification
        https://arxiv.org/abs/1408.5882

        Args:
            args: Namespace providing ``n_kernel`` (filters per kernel size),
                ``kernel_sizes`` (list of ints, or a comma-separated string such
                as ``'3,4,5'``) and ``static`` (when true, no gradient reaches
                the embedding table).
            n_vocab: Number of rows in the embedding table.
            embed_dim: Embedding width.
            n_classes: Number of output logits.
            dropout: Dropout probability applied to the pooled features.
    """
    def __init__(self, args, n_vocab, embed_dim, n_classes, dropout=0.5):
        super(ConvText,self).__init__()
        print("Building Conv model...")
        self.args = args
        c_out = args.n_kernel
        kernels = args.kernel_sizes
        # Accept the raw comma-separated option value as well as a parsed list;
        # iterating the string directly would yield single characters.
        if isinstance(kernels, str):
            kernels = [int(k) for k in kernels.split(',')]

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, c_out, (k, embed_dim))
                                   for k in kernels])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernels) * c_out, n_classes)

    def forward(self, x):
        """Map token ids ``x`` of shape ``[b, i]`` to logits of shape ``[b, n_classes]``."""
        x = self.embed(x)   #  [b, i] -> [b, i, e]
        if self.args.static:
            # Variable(x) is a no-op wrapper in current PyTorch and keeps the
            # autograd graph; detach() is what actually stops gradients from
            # reaching the embedding weights.
            x = x.detach()
        x = x.unsqueeze(1)  #  [b, c_in, i, e]
        #  [(b, c_out, i), ...] * len(kernels)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        #  [(b, c_out), ...] * len(kernels)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)
        x = self.dropout(x)  # (b, len(kernels) * c_out)
        logit = self.fc(x)   # (b, o)
        return logit

In [166]:
# from classifiers import BasicLSTM, AttentionLSTM, ConvText
# from args import parse_arguments
# from utils import imdb

# Registry used by main2 to turn the `-model` option value into a model class;
# each class is registered under its own name.
classifiers = {
    model_cls.__name__: model_cls
    for model_cls in (BasicLSTM, AttentionLSTM, ConvText)
}

def train(model, train_iter, val_iter, args):
    """Train ``model`` with Adam and cross-entropy loss.

    Args:
        model: Module mapping ``batch.text`` to per-class logits.
        train_iter: Iterable of batches exposing ``text``, ``label`` and
            ``batch_size``; iterated once per epoch.
        val_iter: Iterator handed to ``evaluate`` every ``args.test_interval`` steps.
        args: Namespace providing ``cuda``, ``lr``, ``epochs``, ``log_interval``,
            ``test_interval``, ``save_interval`` and ``save_dir``.

    Side effects:
        Writes progress to stdout and pickles the whole model to
        ``<save_dir>/snapshot_steps<N>.pt`` every ``args.save_interval`` steps.
    """
    if args.cuda:
        model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    steps = 0
    model.train()
    for epoch in range(1, args.epochs+1):
        print("\n\nEpoch: ", epoch)
        for batch in train_iter:
            # index align: labels are shifted down by one (main1 sizes the model
            # with len(LABEL.vocab) - 1 classes). The shift is done out of place:
            # mutating batch.label would shift the same tensor again whenever the
            # iterator hands out the same batch object in a later epoch.
            x, y = batch.text, batch.label - 1
            if args.cuda:
                x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            logit = model(x)
            loss = F.cross_entropy(logit, y)
            loss.backward()
            optimizer.step()
            steps += 1
            if steps % args.log_interval == 0:
                _, predicted = torch.max(logit, 1)
                corrects = (predicted.view(y.size()) == y).sum().item()
                accuracy = 100.0 * corrects / batch.batch_size
                sys.stdout.write(
                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(
                    steps, loss.item(), accuracy, corrects, batch.batch_size))
            if steps % args.test_interval == 0:
                evaluate(model, val_iter, args)
            if steps % args.save_interval == 0:
                os.makedirs(args.save_dir, exist_ok=True)
                save_prefix = os.path.join(args.save_dir, 'snapshot')
                save_path = '{}_steps{}.pt'.format(save_prefix, steps)
                # The whole module is pickled (not just a state_dict) because
                # main2 expects torch.load to return a ready-to-use model.
                torch.save(model, save_path)

def evaluate(model, val_iter, args):
    """Compute and print the mean loss and accuracy of ``model`` over ``val_iter``.

    Args:
        model: Module mapping ``batch.text`` to per-class logits.
        val_iter: Iterable of batches exposing ``text`` and ``label``; it must
            also expose ``dataset`` whose length is the number of examples.
        args: Namespace providing ``cuda``.

    Returns:
        Tuple ``(avg_loss, accuracy)``: per-example mean cross-entropy and
        accuracy in percent, both as Python floats.

    Raises:
        ValueError: if ``val_iter.dataset`` is empty.

    The model is put in eval mode for the pass and switched back to training
    mode afterwards, also when the pass raises.
    """
    size = len(val_iter.dataset)
    if size == 0:
        raise ValueError("cannot evaluate on an empty dataset")

    model.eval()
    corrects, total_loss = 0, 0.0
    try:
        # No gradients are needed for evaluation; skipping the graph saves memory.
        with torch.no_grad():
            for batch in val_iter:
                # index align, done out of place so the iterator's label tensor
                # is not shifted again on a later pass over the same batch.
                x, y = batch.text, batch.label - 1
                if args.cuda:
                    x, y = x.cuda(), y.cuda()
                logit = model(x)
                # reduction='sum' replaces the deprecated size_average=False.
                total_loss += F.cross_entropy(logit, y, reduction='sum').item()
                _, predicted = torch.max(logit, 1)
                corrects += (predicted.view(y.size()) == y).sum().item()
    finally:
        model.train() # return to training mode
    avg_loss = total_loss / size
    accuracy = 100.0 * corrects / size
    print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) \n'.format(
          avg_loss, accuracy, corrects, size))
    return avg_loss, accuracy

def predict(model, text, TEXT, LABEL):
    """Classify a single sentence and return its label name.

    Args:
        model: Trained classifier mapping a ``[1, n_tokens]`` LongTensor of
            vocabulary ids to per-class logits.
        text: The sentence to classify.
        TEXT: Text field; its ``preprocess`` turns the string into tokens and its
            ``vocab.stoi`` maps tokens to the ids the model was trained on.
        LABEL: Label field; ``vocab.itos`` maps label ids back to names.

    Returns:
        The predicted label name.

    Raises:
        ValueError: if ``text`` yields no tokens.
    """
    assert isinstance(text, str)
    model.eval()

    # The model's embedding table is indexed by TEXT's vocabulary, so the input
    # must be encoded with that vocabulary. Ids from an unrelated tokenizer
    # (MobileBERT wordpieces were used here before) address the wrong rows or
    # fall outside the table.
    tokens = TEXT.preprocess(text)
    if not tokens:
        raise ValueError("predict() needs text containing at least one token")
    token_ids = [[TEXT.vocab.stoi[token] for token in tokens]]

    # Run on whatever device the model lives on; no global RNG reseeding needed.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    x = torch.tensor(token_ids, dtype=torch.long, device=device)

    with torch.no_grad():
        output = model(x)
    _, predicted = torch.max(output, 1)
    # train/evaluate subtract 1 from every label id before computing the loss,
    # so model class k corresponds to LABEL.vocab.itos[k + 1].
    return LABEL.vocab.itos[predicted.item() + 1]

def main1():
    """Prepare everything ``main2`` needs: arguments, fields and data iterators.

    Parses the default hyper-parameters, builds the text/label fields and the
    iterators via ``customdata``, stores the derived values (vocabulary size,
    class count, CUDA availability, parsed kernel sizes, timestamped save
    directory) on ``args`` and prints all parameters.

    Returns:
        Tuple ``(args, train_iter, val_iter, test_iter, n_vocab, n_classes,
        TEXT, LABEL)``.
    """
    # get hyper parameters
    args = parse_arguments()

    # load data
    print("\nLoading data...")
    TEXT = data.Field(lower=True, batch_first=True)
    LABEL = data.Field(sequential=False)
    iterators = customdata(TEXT, LABEL, args.batch_size)
    train_iter, val_iter, test_iter = iterators

    # update args
    n_vocab = len(TEXT.vocab)
    # One entry of the label vocabulary is not counted as a class.
    n_classes = len(LABEL.vocab) - 1
    args.n_vocab = n_vocab
    args.n_classes = n_classes
    args.cuda = torch.cuda.is_available()
    args.kernel_sizes = list(map(int, args.kernel_sizes.split(',')))
    run_stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    args.save_dir = os.path.join(args.save_dir, run_stamp)

    # print args
    print("\nParameters:")
    for attr in sorted(vars(args)):
        print("\t{}={}".format(attr.upper(), getattr(args, attr)))

    return args, train_iter, val_iter, test_iter, n_vocab, n_classes, TEXT, LABEL

def main2(args, train_iter, val_iter, test_iter, n_vocab, n_classes, TEXT, LABEL):
    """Create or load the classifier, then predict, test or train according to ``args``.

    Args:
        args: Namespace from ``main1`` (reads ``snapshot``, ``model``,
            ``embed_dim``, ``dropout``, ``cuda``, ``predict`` and ``test``).
        train_iter, val_iter, test_iter: Data iterators from ``main1``.
        n_vocab: Vocabulary size used to build a fresh model.
        n_classes: Number of classes used to build a fresh model.
        TEXT, LABEL: Fields passed on to ``predict``.

    Returns:
        None. When ``args.snapshot`` names a file that does not exist, a message
        is printed and the function returns early without doing any work.

    Mode selection: ``args.predict`` (when not None) takes precedence over
    ``args.test``, which takes precedence over training.
    """
    # initialize/load the model
    if args.snapshot is None:
        classifier = classifiers[args.model]
        classifier = classifier(args, n_vocab, args.embed_dim, n_classes, args.dropout)
    else :
        print('\nLoading model from [%s]...' % args.snapshot)
        # Only a missing file is reported as "doesn't exist". Any other load
        # failure propagates with its real traceback instead of being masked by
        # a bare except. Returning (rather than exit()) keeps the kernel alive.
        if not os.path.isfile(args.snapshot):
            print("Sorry, This snapshot doesn't exist.")
            return
        # NOTE: torch.load unpickles arbitrary objects; only load trusted snapshots.
        # map_location lets a GPU-trained snapshot load in a CPU-only session.
        classifier = torch.load(args.snapshot,
                                map_location='cuda' if args.cuda else 'cpu')
    if args.cuda:
        classifier = classifier.cuda()

    # train, test, or predict
    if args.predict is not None:
        label = predict(classifier, args.predict, TEXT, LABEL)
        print('\n[Text]  {}[Label] {}\n'.format(args.predict, label))
    elif args.test :
        try:
            evaluate(classifier, test_iter, args)
        except Exception as e:
            print("\nSorry. The test dataset doesn't  exist.\n")
            print(e)
    else :
        print()
        train(classifier, train_iter, val_iter, args)

In [167]:
# Parse the default hyper-parameters and build the fields and data iterators.
args, train_iter, val_iter, test_iter, n_vocab, n_classes, TEXT, LABEL = main1()


Loading data...
len(train): 21015
len(test): 10510

Parameters:
	ATTENTION_DIM=10
	BATCH_SIZE=32
	CUDA=True
	DEVICE=-1
	DROPOUT=0.5
	EMBED_DIM=128
	EPOCHS=5
	HIDDEN_DIM=128
	KERNEL_SIZES=[1]
	LOG_INTERVAL=25
	LR=0.001
	MAX_NORM=3.0
	MODEL=ConvText
	N_CLASSES=5
	N_KERNEL=100
	N_LAYERS=3
	N_VOCAB=24294
	PREDICT=I saw the best minds of my generation destroyed by madness, starving hysterical naked.
	SAVE_DIR=snapshot/2020-12-21_03-22-00
	SAVE_INTERVAL=500
	SNAPSHOT=/content/snapshot/2020-12-21_02-52-26/snapshot_steps13000.pt
	STATIC=False
	TEST=True
	TEST_INTERVAL=200


In [168]:
# Build or load the classifier, then predict / test / train as selected by `args`.
main2(args, train_iter, val_iter, test_iter, n_vocab, n_classes, TEXT, LABEL)


Loading model from [/content/snapshot/2020-12-21_02-52-26/snapshot_steps13000.pt]...

[Text]  I saw the best minds of my generation destroyed by madness, starving hysterical naked.[Label] aspirations



Epoch:  20
Batch[12600] - loss: 0.098719  acc: 100.0000%(32/32)
Evaluation - loss: 0.393841  acc: 86.9396%(9133/10505) 

Batch[12800] - loss: 0.400348  acc: 84.3750%(27/32)
Evaluation - loss: 0.401343  acc: 86.6159%(9099/10505) 

Batch[13000] - loss: 0.271296  acc: 90.6250%(29/32)
Evaluation - loss: 0.407512  acc: 86.7301%(9111/10505) 

Batch[13125] - loss: 0.096005  acc: 100.0000%(32/32)