In [11]:
# !pip install jupyter ipywidgets allennlp_models textattack[tensorflow]





In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]=""
import os.path
import torch
import torch.optim as optim
# from allennlp.data.dataset_readers.stanford_sentiment_tree_bank import \
#     StanfordSentimentTreeBankDatasetReader
from reader_new import StanfordSentimentTreeBankDatasetReader_NEW
# from allennlp.data.iterators import BucketIterator, BasicIterator
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.embedding import _read_pretrained_embeddings_file
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.training.trainer import Trainer
from allennlp.common.util import lazy_groups_of
from allennlp.data.token_indexers import SingleIdTokenIndexer
import pandas as pd

2022-05-12 14:19:03.577130: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [3]:
class LstmClassifier(Model):
    """Text classifier: embed tokens, encode the sequence to one vector, project to labels.

    Args:
        word_embeddings: Callable module mapping the ``tokens`` field to embeddings.
        encoder: Sequence-to-vector module; must expose ``get_output_dim()`` and be
            callable as ``encoder(embeddings, mask)``.
        vocab: Vocabulary; the size of its ``'labels'`` namespace sets the number
            of output classes.
    """

    def __init__(self, word_embeddings, encoder, vocab):
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        # Final projection from the encoder output to one logit per label.
        self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()
        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self, tokens, label=None):
        """Compute class logits and, when labels are supplied, the loss.

        Args:
            tokens: Text-field input accepted by ``get_text_field_mask`` and by
                ``self.word_embeddings``.
            label: Optional gold labels. Defaults to ``None`` so the model can be
                called for inference without labels; when given, the accuracy
                metric is updated and a ``"loss"`` entry is added to the output.

        Returns:
            dict with ``"logits"`` and, only if ``label`` is not ``None``, ``"loss"``.
        """
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.linear(encoder_out)
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            output["loss"] = self.loss_function(logits, label)
        return output

    def get_metrics(self, reset=False):
        """Return the running accuracy as ``{'accuracy': value}``.

        Args:
            reset: Forwarded to ``CategoricalAccuracy.get_metric``.
        """
        return {'accuracy': self.accuracy.get_metric(reset)}

In [4]:
# Load the binary (2-class) SST dev split.
# Tokens are indexed as single ids, with lowercase_tokens=True.
single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)

# Training-set loading is currently disabled. Per the original note, use_subtrees
# gives a bit of extra data by breaking down each example into sub sentences:
# reader = StanfordSentimentTreeBankDatasetReader_NEW(granularity="2-class",
#                                                 token_indexers={"tokens": single_id_indexer},
#                                                 use_subtrees=True)
# train_data = reader.read('./data/train.txt')
reader = StanfordSentimentTreeBankDatasetReader_NEW(
    granularity="2-class",
    token_indexers={"tokens": single_id_indexer},
)
dev_data = reader.read('./data/dev.txt')

In [5]:
# Restore the vocabulary saved next to the trained model instead of rebuilding it
# from training instances.
vocab_path = "./lstm_main_sst_model/w2v_" + "vocab"
vocab = Vocabulary.from_files(vocab_path)

In [6]:
# vocab = Vocabulary.from_instances(train_data)
# print(vocab)

In [7]:
# Build the frozen embedding layer from the pretrained vectors, wrap a 2-layer
# LSTM as the sequence encoder, then restore the trained weights on CPU.
word_embedding_dim = 300
embedding_path = "./data/crawl-300d-2M.vec.zip"
weight = _read_pretrained_embeddings_file(
    embedding_path,
    embedding_dim=word_embedding_dim,
    vocab=vocab,
    namespace="tokens",
)
token_embedding = Embedding(
    num_embeddings=vocab.get_vocab_size('tokens'),
    embedding_dim=word_embedding_dim,
    weight=weight,
    trainable=False,  # keep the pretrained vectors fixed
)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

encoder = PytorchSeq2VecWrapper(
    torch.nn.LSTM(
        word_embedding_dim,
        hidden_size=512,
        num_layers=2,
        batch_first=True,
    )
)

model = LstmClassifier(word_embeddings, encoder, vocab)
model_path = "./lstm_main_sst_model/w2v_model.th"

# map_location='cpu' lets the checkpoint load when no GPU is visible.
with open(model_path, 'rb') as f:
    model.load_state_dict(torch.load(f, map_location='cpu'))

  0%|          | 0/1999995 [00:00<?, ?it/s]

In [7]:
# embedding_path = "./data/crawl-300d-2M.vec.zip"
# weight = _read_pretrained_embeddings_file(embedding_path,
#                                           embedding_dim=300,
#                                           vocab=vocab,
#                                           namespace="tokens")
# token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
#                             embedding_dim=300,
#                             weight=weight,
#                             trainable=False)
# word_embedding_dim = 300
# word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})


# encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
#                                               hidden_size=512,
#                                               num_layers=2,
#                                               batch_first=True))

# # Initialize model, cuda(), and optimizer
# model = LstmClassifier(word_embeddings, encoder, vocab)
# model.cuda()

In [9]:
# NOTE(review): this cell cannot run on a fresh kernel as the notebook stands:
# the `BucketIterator` import is commented out in the import cell and
# `train_data` is only assigned in commented-out code in the data-loading cell.
# Restore both (and confirm the installed AllenNLP still provides the
# iterator-based Trainer API) before training.
iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])

iterator.index_with(vocab)

optimizer = optim.Adam(model.parameters())

# The import cell sets CUDA_VISIBLE_DEVICES="", which hides every GPU; a
# hard-coded cuda_device=0 would then fail. Use GPU 0 only when CUDA is
# actually available, otherwise train on CPU (-1).
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_data,
                  validation_dataset=dev_data,
                  num_epochs=4,
                  patience=1,
                  cuda_device=0 if torch.cuda.is_available() else -1)

In [None]:
# Run the training loop on the `trainer` object; as the last expression of the
# cell, the call's return value is displayed as the cell output.
trainer.train()

In [None]:
# Where to save the model weights and the vocabulary.
model_path = "lstm_main_sst_model/w2v_model.th"
vocab_path = "lstm_main_sst_model/w2v_vocab"

# open(..., 'wb') does not create missing parent directories, so make sure the
# output directory exists before writing the checkpoint.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as f:
    torch.save(model.state_dict(), f)
vocab.save_to_files(vocab_path)

In [10]:
# Print the module tree of the fp32 model (embedder, encoder, linear head).
print(model)

LstmClassifier(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (encoder): PytorchSeq2VecWrapper(
    (_module): LSTM(300, 512, num_layers=2, batch_first=True)
  )
  (linear): Linear(in_features=512, out_features=2, bias=True)
  (loss_function): CrossEntropyLoss()
)


In [19]:
# Reload the trained weights. map_location='cpu' matches the earlier load cell
# and keeps this working when the checkpoint was saved from a GPU model but no
# GPU is visible in this session (CUDA_VISIBLE_DEVICES is set to "").
state = torch.load('lstm_main_sst_model/w2v_model.th', map_location='cpu')
model.load_state_dict(state)

<All keys matched successfully>

In [8]:
import torch.quantization
import torch.nn as nn

In [9]:
# Move the model to CPU, then dynamically quantize its Linear and LSTM
# submodules to int8.
model.cpu()
quantized_model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec={nn.Linear, nn.LSTM},
    dtype=torch.qint8,
)

In [10]:
def print_size_of_model(model, label=""):
    """Print and return the on-disk size of ``model``'s serialized state dict.

    The state dict is saved into a private temporary directory, so an existing
    ``temp.p`` in the working directory is never overwritten or deleted, and the
    scratch file is removed even if saving fails.

    Args:
        model: Object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
        label: Free-form tag included in the printed line.

    Returns:
        Size of the serialized state dict in bytes.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Keep the original file name so the serialized archive is unchanged.
        checkpoint_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), checkpoint_path)
        size = os.path.getsize(checkpoint_path)
    print("model: ",label,' \t','Size (MB):', size/1e6)
    return size

In [11]:
# Measure both serialized state dicts, then report the compression ratio.
model_size = print_size_of_model(model, label="fp32")
quantized_model_size = print_size_of_model(quantized_model, label="int8")

print("{0:.2f} times smaller".format(model_size / quantized_model_size))

model:  fp32  	 Size (MB): 34.727044
model:  int8  	 Size (MB): 23.44499
1.48 times smaller


In [12]:
# Display the quantized module tree to check which submodules were replaced by
# their dynamically quantized counterparts.
quantized_model

LstmClassifier(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (encoder): PytorchSeq2VecWrapper(
    (_module): DynamicQuantizedLSTM(300, 512, num_layers=2, batch_first=True)
  )
  (linear): DynamicQuantizedLinear(in_features=512, out_features=2, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
  (loss_function): CrossEntropyLoss()
)