# Load the dataset

In [None]:
!pip install transformers
!pip install evaluate
from transformers import AutoModel, AutoTokenizer, AutoConfig
import torch
from torch.utils.data import DataLoader

## The dataset statistics
* 5269 samples
* Number of labels: 1:115; 2:64; 3:109; 4:332; 5:4649. We should add weights to the crossentropy of the training set.

In [None]:
#load dataset
import pandas as pd
df=pd.read_json('All_Beauty_5.json', lines=True)
texts = [' '.join([str(i),str(j)]) for i,j in zip(df['reviewText'],df['summary'])]
original_labels = [i for i in df['overall']]

#Next, we need to determine the number of labels in our data. We'll map each of these labels to an index.
target_names = list(set(original_labels))
label2idx = {label: idx for idx, label in enumerate(target_names)}

#We need to feed that data into the tokenizer, and the transform them into tensors
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

texts_tensor=[]
for i in texts:
  i=tokenizer(i,return_tensors="pt", max_length=512, truncation=True)['input_ids']
  texts_tensor.append(i)
labels=[label2idx[i] for i in original_labels]

In [4]:
# Divide the dataset
#divide into train, val, test
from sklearn.model_selection import train_test_split
train_tensor, test_tensor, train_labels, test_labels = train_test_split(texts_tensor, labels , test_size=0.15, random_state=1)
train_data=[[i,torch.tensor(j)] for i,j in zip(train_tensor,train_labels)]
test_data=[[i,torch.tensor(j)] for i,j in zip(test_tensor,test_labels)]
data=train_data+test_data

## Train data statistics
* number of samples: 4478
* number of labels: 97, 54, 92, 285, 3950
* proportion of labels: 2.1, 1.2, 2, 6, 88
* weights for category: 46, 83, 49, 16, 1

# Define student model for distillation
* Thoughts on designing student model:
*1. To save time, I do not want to finetune the BERT model
*2. I want the student model to mimic the behavior of the BERT model 
*3. There are several values to mimic: (1) logits of the final layer; (2) logits of several skip layers; (3) attention matrix; (4) value-value relation of the final layer; (5) cosine similarity of the softmax; (6) while distilling, consider add MLM task to the student model (trivial); (7) output softmax/logits of a specific task;
*4. For simplicity, I distill (1) the logits of the final layer and (2) several layers, and (3) without distillation and then compare their behavior.



In [None]:
#load the teacher model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
teacher=AutoModel.from_pretrained('bert-base-uncased',output_hidden_states=True).to(device)

#now, we want the layers to be 4
config=AutoConfig.from_pretrained('bert-base-uncased', output_hidden_states=True)
config.num_hidden_layers=4

#config the model with layer=4
model = AutoModel.from_pretrained('bert-base-uncased', config=config).to(device)  # auto skip unused layers

# now we want to initialize the parameters to be original layers [0, 4, 7, 10], "skip idea"
layers=[0, 4, 7, 10]
for i in range(4):
    model.base_model.encoder.layer[i] = teacher.base_model.encoder.layer[layers[i]]

# Student model training strategy 1 training process and hyperparameter initialization

In [182]:
# train our model, mimic the final layer logits
# In this training, we want to the student model to even overfit the teacher model, so we do not set the model.train()
def train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer,criterion, num_batch):
    for epoch in range(epochs):
        print(f"Training epoch {epoch+1}")
        loss_accumulate=0
        for i, batch in enumerate(train_dataloader):
            inputs = batch[0].squeeze(0).to(device) #[m,512]
            logits = model(inputs)[0] #[m,5] 
            targets= teacher(inputs)[0] #m
            loss = criterion(logits,targets)/batchsize_grad
            loss_accumulate+=loss.item()
            loss.backward() #The gradients are computed when we call loss. backward() and are stored by PyTorch until we call optimizer.
            
            if (i+1) % batchsize_grad == 0: #when accumulated batch=16, we do optimizer after 16 batches of gradients are accumulated
                optimizer.step()
                #scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()
                print (loss_accumulate)
                loss_accumulate=0

In [135]:
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange
batch_size=1
batchsize_grad=8
epochs=1
lr=5e-5
num_batch=int(len(data)/batch_size)+1
torch.manual_seed(0)
train_dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)
criterion=torch.nn.MSELoss(reduction='mean')
torch.manual_seed(0)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*epochs)
scheduler=None

In [None]:
# Train 316 steps
train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer,criterion, num_batch)

# Student model training strategy 2 training process and hyperparameter initialization

* We force the outputs of every layer to mimic the teacher model. 

In [6]:
# train our model, mimic the final layer logits
# In this training, we want to the student model to even overfit the teacher model, so we do not set the model.train()
def train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer,criterion, num_batch):
    for epoch in range(epochs):
        print(f"Training epoch {epoch+1}")
        loss_accumulate=0
        for i, batch in enumerate(train_dataloader):
            inputs = batch[0].squeeze(0).to(device) #[m,512]
            logits = model(inputs) #[m,5] 
            targets = teacher(inputs) #m
            loss1 = criterion(logits[0],targets[0]) #output layer
            loss2 = criterion(logits[2][1],targets[2][1]) #1st layer
            loss3 = criterion(logits[2][2],targets[2][5]) #2nd layer
            loss4 = criterion(logits[2][3],targets[2][8]) #3nd layer
            loss=(loss1+loss2+loss3+loss4)/4/batchsize_grad
            loss_accumulate+=loss.item()
            loss.backward() #The gradients are computed when we call loss. backward() and are stored by PyTorch until we call optimizer.
            if (i+1) % batchsize_grad == 0: #when accumulated batch=16, we do optimizer after 16 batches of gradients are accumulated
                optimizer.step()
                #scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()
                print (i+1, loss_accumulate)
                loss_accumulate=0
            if i+2 == 2529: return 'finish training' 

In [7]:
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange
batch_size=1
batchsize_grad=8
epochs=1
lr=5e-5
num_batch=int(len(data)/batch_size)+1
torch.manual_seed(0)
train_dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)
criterion=torch.nn.MSELoss(reduction='mean')
torch.manual_seed(0)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*epochs)
scheduler=None

In [8]:
# Train 316 steps
train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer,criterion, num_batch)

Training epoch 1
8 0.27703865990042686
16 0.2574931662529707
24 0.23059661500155926
32 0.1960786897689104
40 0.1836695373058319
48 0.18020579777657986
56 0.17036115378141403
64 0.1645518783479929
72 0.15668603591620922
80 0.14586523547768593
88 0.140483851544559
96 0.1345058148726821
104 0.14169701095670462
112 0.13230726029723883
120 0.10801821760833263
128 0.11074337922036648
136 0.10426778253167868
144 0.08749044500291348
152 0.10336858965456486
160 0.08897718321532011
168 0.08065301645547152
176 0.08372785802930593
184 0.07825937308371067
192 0.07347073871642351
200 0.06771460920572281
208 0.06302279140800238
216 0.06019698875024915
224 0.056106350384652615
232 0.05015338538214564
240 0.04993952717632055
248 0.04918390139937401
256 0.04726535640656948
264 0.048852693289518356
272 0.04090201808139682
280 0.040910341776907444
288 0.04241461306810379
296 0.035019478760659695
304 0.03433313174173236
312 0.03345872391946614
320 0.029013865860179067
328 0.02918864064849913
336 0.02684640

'finish training'

# Student model classification finetune

## Define classification model

In [18]:
#create a student+Linear model
class BaseLinear(torch.nn.Module):
    def __init__(self, basemodel): 
        super().__init__()
        self.basemodel=basemodel
        self.linear = torch.nn.Linear(in_features=768, out_features=5)

    def forward(self, x): 
        x=self.basemodel(x)[1] 
        #for name, param in self.basemodel.state_dict().items():
        #  if name!="0": param.requires_grad=False
        logits = self.linear(x) # [m,5]
        return logits #(m,5)
basemodel=model.to(device)
torch.manual_seed(0)
final_model=BaseLinear(basemodel=basemodel).to(device)

## Define training process and hyperparameter init

In [19]:
#train our model
def train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer,criterion, val_dataloader,len_val):
    for epoch in range(epochs):
        print(f"Training epoch {epoch+1}")
        model.train()
        loss_accumulate=0
        for i, batch in enumerate(train_dataloader):
            model.train()
            inputs=batch[0].squeeze(0).to(device) #[m,512]
            logits = model(inputs).view(1,-1) #[m,5]
            targets=batch[1].to(device) #m
            loss = criterion(logits,targets)/batchsize_grad
            loss_accumulate+=loss
            loss.backward() #The gradients are computed when we call loss. backward() and are stored by PyTorch until we call optimizer.
            
            if (i+1) % batchsize_grad == 0: #when accumulated batch=16, we do optimizer after 16 batches of gradients are accumulated
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()
                print (i+1, loss_accumulate)
                loss_accumulate=0
                

        #for evaluate the model after an epoch
        model.eval()
        accuracy=0
        for i, batch in enumerate(val_dataloader):
            inputs=batch[0].squeeze(0).to(device) #[m,512]
            with torch.no_grad():
              logits = model(inputs) #[m,5]
            softmaxed=torch.softmax(logits,-1) #[m,5]
            predict_label=torch.argmax(softmaxed,-1).to('cpu')
            targets=batch[1].to('cpu') #m
            from sklearn.metrics import accuracy_score
            accuracy+=accuracy_score(targets,predict_label)*batch[0].shape[0]
        print ("accuracy",accuracy/len_val)

In [None]:
#path="state_dict_model_1.pt"
#model.load_state_dict(torch.load(path))
from transformers import AdamW, get_linear_schedule_with_warmup

batch_size=1
batchsize_grad=5
epochs=1
lr=5e-5
len_val=len(test_data)
torch.manual_seed(0)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(test_data, batch_size=1, shuffle=True)
criterion=torch.nn.CrossEntropyLoss(weight=torch.tensor([46, 83, 49, 16, 1],dtype=torch.float))
torch.manual_seed(0)
optimizer = AdamW(final_model.parameters(), lr=lr)
torch.manual_seed(0)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*epochs/batchsize_grad)

In [None]:
train(train_dataloader,final_model,batchsize_grad,epochs,scheduler,optimizer,criterion,val_dataloader, len_val)

* For training strategy 1, after training 1 epoch, the test accuracy is 92.67, the train accuracy is 94.5. There might be improvement space if it is trained with more epochs.
* For training strategy 2, after training 1 epoch, the test accuracy is 92.79, the train accuracy is 93.39
* For no distillation, after training 1 epoch, the test accuracy is 94.31, the train accuracy is 97. 

In [None]:
#for evaluate the final_model after an epoch
model.eval()
accuracy=0
val_dataloader = DataLoader(train_data, batch_size=1, shuffle=True)

for i, batch in enumerate(val_dataloader):
    inputs=batch[0].squeeze(0).to(device) #[m,512]
    with torch.no_grad():
      logits = final_model(inputs) #[m,5]
    softmaxed=torch.softmax(logits,-1) #[m,5]
    predict_label=torch.argmax(softmaxed,-1).to('cpu')
    targets=batch[1].to('cpu') #m
    from sklearn.metrics import accuracy_score
    accuracy+=accuracy_score(targets,predict_label)*batch[0].shape[0]
print ("accuracy",accuracy/len(val_dataloader))