In [1]:
import torch
from torch import nn
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import MultiStepLR
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from transformers import AutoTokenizer
from utils.data_utils import AG_NEWS_DATASET
from utils.constants import *
from utils.training import Learner
from training_ema import Learner as ema_learner

# from quantization.fully_quantize import Model
from quantization.transformer import Transformer
from quantization.pytorch_api import ModelQuant
from quantization.quantize import quantizer
from quantization.fully_quantize import Model as fullyQuantModel

%load_ext autoreload
%autoreload 2

In [None]:
# Experiment "quant_all": build a Transformer and a fullyQuantModel with the
# same constructor arguments, give each its own optimizer and LR scheduler,
# and hand both to the learner from `training_ema`.

# load dataset
# NOTE(review): no batch_size is passed here, while the config below records
# BATCH_SIZE — confirm the loader default matches.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_dl, test_dl = AG_NEWS_DATASET(tokenizer).load_data()

# create models: both receive identical constructor arguments
# (first argument 4 is presumably the number of AG News classes — TODO confirm).
original_model = Transformer(4,
                tokenizer.vocab_size,
                BASELINE_MODEL_NUMBER_OF_LAYERS,
                BASELINE_MODEL_NUMBER_OF_HEADS,
                BASELINE_MODEL_DIM)

model = fullyQuantModel(4,
                tokenizer.vocab_size,
                BASELINE_MODEL_NUMBER_OF_LAYERS,
                BASELINE_MODEL_NUMBER_OF_HEADS,
                BASELINE_MODEL_DIM)

# loss func
loss_fn = nn.CrossEntropyLoss()

# One optimizer + scheduler pair per model. Each scheduler must wrap the
# optimizer of the model it belongs to; otherwise the learning rate of the
# other optimizer is decayed twice and this one is never decayed.
optim_original = Adam(original_model.parameters(), lr=1e-4)
scheduler_original = MultiStepLR(optim_original, milestones=[10, 15], gamma=0.1)
optim = Adam(model.parameters(), lr=1e-4)
scheduler = MultiStepLR(optim, milestones=[10, 15], gamma=0.1)

# Configuration consumed by the learner; keys are read by `training_ema.Learner`.
train_config = {'model_original': original_model,
                'model': model,
                'loss_fn': loss_fn,
                'optim_original': optim_original,
                'optim': optim,
                'datasets': [train_dl, test_dl],
                'epochs': 10,
                'batch_size': BATCH_SIZE,
                'scheduler_original': scheduler_original,
                'scheduler': scheduler,
                'exp_name': "quant_all",
                # presumably the epoch at which quantization is switched on — verify in training_ema
                'epoch_start_quantization': 1
                }

# training
learner_ag_news = ema_learner(train_config)

In [None]:
# Release unoccupied cached GPU memory held by PyTorch's CUDA allocator
# before starting the run.
torch.cuda.empty_cache()
# Start training with the `learner_ag_news` object built by an earlier cell
# (whichever setup cell was executed last defines it).
learner_ag_news.train()

In [9]:
# --- data: BERT tokenizer + AG News loaders built with the shared batch size ---
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_dl, test_dl = AG_NEWS_DATASET(tokenizer, batch_size=BATCH_SIZE).load_data()

# --- model: baseline Transformer passed through `quantizer` with arguments (2, True) ---
original_model = Transformer(
    4,
    tokenizer.vocab_size,
    BASELINE_MODEL_NUMBER_OF_LAYERS,
    BASELINE_MODEL_NUMBER_OF_HEADS,
    BASELINE_MODEL_DIM,
)
model = quantizer(original_model, 2, True)

# --- objective ---
loss_fn = nn.CrossEntropyLoss()

# --- optimisation: baseline training config -> do not change! ---
optim = Adam(model.parameters(), lr=1e-4)
scheduler = MultiStepLR(optim, milestones=[10, 15], gamma=0.1)

# --- learner configuration (one entry per line, same key order as before) ---
# For the PyTorch-API 8-bit run, the experiment name used instead was
# 'transformer_quantization_PYTORCHAPI_8bit_ALL'.
train_config = {
    'model': model,
    'loss_fn': loss_fn,
    'optim': optim,
    'scheduler': scheduler,
    'datasets': [train_dl, test_dl],
    'epochs': 10,
    'batch_size': BATCH_SIZE,
    'exp_name': 'transformer_quantization_BASELINE_2bit_ALL',
}

# --- build the learner that the next cell trains ---
learner_ag_news = Learner(train_config)

  "Lambda function is not supported for pickle, please use "


key bert.embeddings.word_embeddings.weight
shape torch.Size([30522, 512])
key bert.embeddings.position_embeddings.weight
shape torch.Size([512, 512])
key bert.embeddings.token_type_embeddings.weight
shape torch.Size([2, 512])
key bert.embeddings.LayerNorm.weight
shape torch.Size([512])
key bert.embeddings.LayerNorm.bias
shape torch.Size([512])
key bert.encoder.layer.0.attention.self.query.weight
shape torch.Size([512, 512])
key bert.encoder.layer.0.attention.self.query.bias
shape torch.Size([512])
key bert.encoder.layer.0.attention.self.key.weight
shape torch.Size([512, 512])
key bert.encoder.layer.0.attention.self.key.bias
shape torch.Size([512])
key bert.encoder.layer.0.attention.self.value.weight
shape torch.Size([512, 512])
key bert.encoder.layer.0.attention.self.value.bias
shape torch.Size([512])
key bert.encoder.layer.0.attention.output.dense.weight
shape torch.Size([512, 512])
key bert.encoder.layer.0.attention.output.dense.bias
shape torch.Size([512])
key bert.encoder.layer.0.a

In [None]:
# Release unoccupied cached GPU memory held by PyTorch's CUDA allocator
# before starting the run.
torch.cuda.empty_cache()
# Start training with the `learner_ag_news` object built by an earlier cell
# (whichever setup cell was executed last defines it).
learner_ag_news.train()