# 1.0 Install and import libraries

In [None]:
!pip install sentencepiece
!pip install transformers
!pip install fire

In [None]:
import torch
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange

# 2.0 Prepare the dataset and split it into train, val and test sets of `[text_tensor, label_tensor]` pairs

## Read original texts, labels

In [None]:
# Load the raw reviews (one JSON object per line).
df = pd.read_json('All_Beauty_5.json', lines=True)

# A review's text is its body followed by its summary. Missing fields are
# blanked first: calling str() on a missing value would otherwise put the
# literal text "nan" into the review.
review_body = df['reviewText'].fillna('').astype(str)
review_summary = df['summary'].fillna('').astype(str)
all_texts = (review_body + ' ' + review_summary).str.strip()

# Classes are imbalanced, so rating 5 is capped: only its first
# MAX_FIVE_STAR_SAMPLES reviews (in file order) are kept, while every review
# with another rating is kept.
MAX_FIVE_STAR_SAMPLES = 155
is_five_star = df['overall'] == 5
# For a 5-star row, the running sum is that row's 1-based rank among 5-star rows.
keep = ~is_five_star | (is_five_star.cumsum() <= MAX_FIVE_STAR_SAMPLES)

# `texts` and `labels` are the dataset used by the rest of the notebook.
texts = all_texts[keep].tolist()
labels = df.loc[keep, 'overall'].tolist()

## Create a list: `[data_tensor, label_tensor]`

In [None]:
# Map every distinct rating to a class index. Sorting the ratings makes the
# mapping deterministic (lowest rating -> index 0) instead of relying on the
# iteration order of a set.
target_names = sorted(set(labels))
label2idx = {label: idx for idx, label in enumerate(target_names)}
print(label2idx)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4}


In [None]:
# Create the text tensors.
# Only the linear head is trained, so BERT is used as a frozen feature
# extractor: every text is tokenized once, passed through BERT once, and the
# resulting pooled outputs are stored as a tensor.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded = tokenizer(texts, padding=True, max_length=512, truncation=True, return_tensors="pt")
texts_tensor = encoded['input_ids']
# The mask marks padding positions so that BERT does not attend to them.
attention_mask = encoded['attention_mask']

# Set up BERT for inference only: eval() mode and no gradient tracking.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bertmodel = BertModel.from_pretrained('bert-base-uncased').to(device)
bertmodel.eval()

FEATURE_BATCH_SIZE = 16  # forward-pass batch size; lower it if memory is tight
dataloader = DataLoader(
    torch.utils.data.TensorDataset(texts_tensor, attention_mask),
    batch_size=FEATURE_BATCH_SIZE,
    shuffle=False,  # row i of logits_bert must stay aligned with labels[i]
)

# Feed the model with our data. Despite its name, `logits_bert` stores the
# second element of BERT's output (the pooled output), one row per text.
# It is sized from the data instead of a hard-coded row count.
logits_bert = torch.zeros(len(texts), bertmodel.config.hidden_size)
next_row = 0
with torch.no_grad():
    for input_ids, mask in dataloader:
        pooled = bertmodel(input_ids=input_ids.to(device), attention_mask=mask.to(device))[1]
        rows = pooled.shape[0]
        logits_bert[next_row:next_row + rows] = pooled.cpu()
        next_row += rows

In [None]:
# Build the sample list consumed by the DataLoaders: each entry pairs one
# feature row with its class index wrapped in a 0-dim tensor.
labels_tensor = list(map(lambda rating: torch.tensor(label2idx[rating]), labels))
data = [list(pair) for pair in zip(logits_bert, labels_tensor)]
# Alternative kept from the original: restore a previously saved list instead.
#path="data.pt"
#data=torch.load(path)

## Divide into train, val, test

In [None]:
# Hold out 10% of the samples as the test set, then 10% of the remainder as
# the validation set. Both splits use the same fixed seed.
from sklearn.model_selection import train_test_split

split_options = {"test_size": 0.1, "random_state": 0}
rest_data, test_data = train_test_split(data, **split_options)
train_data, val_data = train_test_split(rest_data, **split_options)

# 3.0 Create the model and divide the training process

## Create our model using Class

In [None]:
# Classification head that is trained on top of the precomputed BERT features.
class BertLinear(torch.nn.Module):
    """A single linear layer mapping a 768-dim feature vector to 5 class scores."""

    def __init__(self):
        super().__init__()
        # 768 inputs -> 5 outputs; the bias term is on (torch.nn.Linear default).
        self.linear1 = torch.nn.Linear(768, 5)

    def forward(self, x):
        """Return raw class scores of shape [m, 5] for features `x` of shape [m, 768]."""
        return self.linear1(x)


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BertLinear().to(device)

## Define the training process

In [None]:
#train our model
def _count_correct(model, dataloader, run_device):
    """Return how many samples of `dataloader` the model classifies correctly."""
    model.eval()
    correct = 0
    with torch.no_grad():
        for batch in dataloader:
            scores = model(batch[0].to(run_device))  # [m, n_classes]
            # argmax over raw scores equals argmax over their softmax.
            predicted = torch.argmax(scores, dim=-1).cpu()
            correct += (predicted == batch[1].cpu()).sum().item()
    return correct


def train(train_dataloader, model, batchsize_grad, epochs, scheduler, optimizer, criterion,
          num_batch, val_dataloader, len_val, save_threshold=0.82, save_path="best_model.pt"):
    """Train `model` with gradient accumulation and validate after every epoch.

    Args:
        train_dataloader: sized iterable of batches; batch[0] holds the inputs
            and batch[1] the integer class targets.
        model: torch.nn.Module returning class scores for a batch of inputs.
        batchsize_grad: number of batches whose gradients are accumulated
            before each optimizer step (must be >= 1).
        epochs: number of passes over `train_dataloader`.
        scheduler: learning-rate scheduler stepped after every optimizer step,
            or None to train without one.
        optimizer: optimizer updating the model parameters.
        criterion: loss called as criterion(scores, targets).
        num_batch: unused; kept so existing calls keep working.
        val_dataloader: iterable of validation batches (same layout as training).
        len_val: number of validation samples, used as the accuracy denominator.
        save_threshold: minimum validation accuracy required to save a checkpoint.
        save_path: file the best state_dict is written to; None disables saving.

    Returns:
        A list with one (last_batch_loss, validation_accuracy) tuple per epoch.

    Raises:
        ValueError: if `batchsize_grad` < 1 or `train_dataloader` has no batches.
    """
    if batchsize_grad < 1:
        raise ValueError("batchsize_grad must be >= 1")
    if len(train_dataloader) == 0:
        raise ValueError("train_dataloader has no batches")

    # Run on whatever device the model already lives on.
    first_param = next(iter(model.parameters()), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    history = []
    best_accuracy = float('-inf')
    accumulating_batch_count = 0  # batches seen so far, across all epochs
    optimizer.zero_grad()

    for epoch in range(epochs):
        print(f"Training epoch {epoch}")
        model.train()
        for batch in train_dataloader:
            inputs = batch[0].to(run_device)
            targets = batch[1].to(run_device)
            loss = criterion(model(inputs), targets)
            # backward() adds to the stored gradients; they are only consumed
            # and cleared when the optimizer steps below.
            loss.backward()

            # Count the batch first, so a step happens after every
            # `batchsize_grad` accumulated batches (not after the very first one).
            accumulating_batch_count += 1
            if accumulating_batch_count % batchsize_grad == 0:
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()
                optimizer.zero_grad()

        # Validate once per epoch.
        correct = _count_correct(model, val_dataloader, run_device)
        val_accuracy = correct / len_val if len_val else 0.0

        #print the loss and accuracy of the validation set after each epoch
        last_loss = loss.item()
        print(last_loss, val_accuracy)
        history.append((last_loss, val_accuracy))

        #save the best model: only when this epoch beats every previous one
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            if save_path is not None and val_accuracy > save_threshold:
                torch.save(model.state_dict(), save_path)

    return history

# 4.0 Start training: remember to save the model

In [None]:
#path="state_dict_model_1.pt"
#model.load_state_dict(torch.load(path))

batch_size=627 # presumably the whole training split in one batch -- TODO confirm
batchsize_grad=1 # number of batches accumulated per optimizer step
epochs=4000 #a simple model uses more epochs
lr=0.008 #a simple model uses a larger lr
len_val=len(val_data)
num_batch=round(len(train_data)/batch_size)-1
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
# Validation order does not affect accuracy, so there is no need to shuffle.
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
criterion=torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# The linear schedule needs the real number of optimizer steps. With a
# non-positive num_training_steps the learning rate drops to 0 as soon as the
# warmup ends. The step count is rounded up so it is never underestimated.
total_steps = -(-(epochs * len(train_dataloader)) // batchsize_grad)
warmup_steps = total_steps // 10 # warm up over the first 10% of the steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer,criterion, num_batch,val_dataloader, len_val)

In [None]:
#Save our model parameters under a new numbered file name.
# N counts the checkpoints saved in this kernel session. It does not exist
# before the first save, so start at 1 instead of failing with a NameError.
try:
    N += 1
except NameError:
    N = 1
path = "state_dict_model_" + str(N) + ".pt"
torch.save(model.state_dict(), path)

# 5.0 Validate our model

In [None]:
#path="state_dict_model_3.pt" 
#model.load_state_dict(torch.load(path))

input_data = val_data
val_dataloader = DataLoader(input_data, batch_size=1000, shuffle=False)

# Always evaluate in eval mode, whatever state the model was left in.
model.eval()
n_correct = 0
with torch.no_grad():
    for batch in val_dataloader:
        logits = model(batch[0].to(device))  # [m,5]
        # argmax over raw scores equals argmax over their softmax.
        predict_label = torch.argmax(logits, dim=-1).cpu()
        targets = batch[1].cpu()  # m
        n_correct += (predict_label == targets).sum().item()

# Fraction of correctly classified samples.
accuracy = n_correct / len(input_data)
print(accuracy)

0.9952153110047847


# 6.0 Baseline model

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

# Bag-of-words baseline: token counts -> TF-IDF weights -> one-vs-rest
# logistic regression.
# NOTE(review): recent scikit-learn releases appear to deprecate `multi_class`;
# confirm against the installed version.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression(multi_class="ovr", solver="lbfgs"))
])

# Inverse regularisation strengths tried by the grid search.
parameters = {'lr__C': [0.1, 0.5, 1, 2, 5, 10, 100, 1000]}

# Use a shuffled split instead of cutting the list at a fixed position: the
# 5-star cap keeps only the earliest 5-star reviews, so a positional cut gives
# the two halves different class mixes. The split parameters mirror the ones
# used for the BERT features (10% test, then 10% validation, seed 0), and the
# baseline is scored on the validation part, like the BERT head.
baseline_rest_texts, baseline_test_texts, baseline_rest_labels, baseline_test_labels = train_test_split(
    texts, labels, test_size=0.1, random_state=0)
baseline_train_texts, baseline_val_texts, baseline_train_labels, baseline_val_labels = train_test_split(
    baseline_rest_texts, baseline_rest_labels, test_size=0.1, random_state=0)

best_classifier = GridSearchCV(pipeline, parameters, cv=5, verbose=1)
best_classifier.fit(baseline_train_texts, baseline_train_labels)
best_predictions = best_classifier.predict(baseline_val_texts)

baseline_accuracy = np.mean(best_predictions == np.asarray(baseline_val_labels))
print("Baseline accuracy:", baseline_accuracy)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Baseline accuracy: 0.6981818181818182
