[Reference website](https://towardsdatascience.com/multi-class-text-classification-with-deep-learning-using-bert-b59ca2f5c613)

# 1.0 Install and import

In [None]:
!pip install sentencepiece
!pip install transformers
!pip install fire

In [None]:
import torch
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange

# 2.0 Prepare the data 

## Create data_list, label_list

In [None]:
# The file holds one JSON record per line, hence lines=True.
reviews_path = 'All_Beauty_5.json'
df = pd.read_json(reviews_path, lines=True)
df.head(2)  # preview the first two rows

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,
1,5,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,,


In [None]:
# Build one input string per review ("<reviewText> <summary>") and cap the number
# of 5-star reviews so that class does not swamp the others.
MAX_FIVE_STAR = 155  # same cut-off as the former `N_5 < 156` test


def _as_text(value):
    """Return `value` as a string, mapping missing values (None/NaN) to ''.

    A plain str() would turn a missing field into the literal word "nan",
    which would then be fed to the model as if the reviewer had written it.
    """
    return '' if pd.isna(value) else str(value)


texts = []
labels = []
five_star_seen = 0
for review, summary, rating in zip(df['reviewText'], df['summary'], df['overall']):
    if rating == 5:
        five_star_seen += 1
        if five_star_seen > MAX_FIVE_STAR:
            continue  # already have enough 5-star examples
    texts.append(' '.join([_as_text(review), _as_text(summary)]))
    labels.append(rating)

## Create  [data_tensor,label_tensor]

In [None]:
# Map every distinct label to a class index.
# The labels are sorted because a set has no guaranteed iteration order, and the
# index assigned to each class must not change between runs (for example when
# weights saved by an earlier run are loaded again).
target_names = sorted(set(labels))
label2idx = {label: idx for idx, label in enumerate(target_names)}
print(label2idx)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4}


In [None]:
# Feed the texts to the tokenizer, then turn texts and labels into tensors.
# `data` ends up as a list of [input_ids_row, class_index] pairs.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
texts_tensor = encoded['input_ids']
labels_tensor = [torch.tensor(label2idx[rating]) for rating in labels]
data = [[ids, target] for ids, target in zip(texts_tensor, labels_tensor)]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Divide into train, val, test

In [None]:
# Hold out 10% of the examples for testing, then 10% of what is left for validation.
from sklearn.model_selection import train_test_split
split_options = dict(test_size=0.1, random_state=1)
rest_data, test_data = train_test_split(data, **split_options)
train_data, val_data = train_test_split(rest_data, **split_options)

# 3.0 Create the model we need and define the training process

In [None]:
#create a BERT+Linear model
class BertLinear(torch.nn.Module):
    """BERT encoder followed by a single linear classification layer.

    Args:
        bertmodel: encoder module. It is called as
            ``bertmodel(input_ids, attention_mask=...)`` and element ``[1]`` of
            its output is used as the pooled sentence representation.
        num_labels: number of output classes (defaults to 5).

    The input width of the linear layer and the padding token id are read from
    ``bertmodel.config`` when available; otherwise 768 and 0 are used.
    """

    def __init__(self, bertmodel, num_labels=5):
        super().__init__()
        self.bertmodel=bertmodel
        config = getattr(bertmodel, "config", None)
        hidden_size = getattr(config, "hidden_size", 768)
        pad_token_id = getattr(config, "pad_token_id", None)
        # Used to rebuild the attention mask when the caller passes only input ids.
        self.pad_token_id = 0 if pad_token_id is None else pad_token_id
        self.linear = torch.nn.Linear(in_features=hidden_size, out_features=num_labels)

    def forward(self, x, attention_mask=None): #input_ids,[m,seq]
        """Return classification logits of shape [m, num_labels].

        Args:
            x: batch of token ids, [m, seq].
            attention_mask: optional mask with 1 for real tokens and 0 for
                padding. When omitted it is derived from ``x`` by marking every
                position that holds the padding id as 0, so the encoder does
                not attend to padding.
        """
        if attention_mask is None:
            attention_mask = (x != self.pad_token_id).long()
        pooled = self.bertmodel(x, attention_mask=attention_mask)[1] #[m,hidden], pooled output
        logits = self.linear(pooled) #[m,num_labels]
        return logits
# Run on the GPU when one is available, otherwise on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
# Load the pretrained encoder and wrap it with the classification head.
bertmodel = BertModel.from_pretrained('bert-base-uncased')
bertmodel = bertmodel.to(device)
model = BertLinear(bertmodel=bertmodel)
model = model.to(device)

In [None]:
#train our model
def train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer,criterion, num_batch, val_dataloader,len_val):
    """Train `model` with gradient accumulation and report validation accuracy per epoch.

    Args:
        train_dataloader: iterable of batches; batch[0] are the inputs and
            batch[1] the integer class targets.
        model: module mapping a batch of inputs to logits [m, num_classes].
        batchsize_grad: number of batches whose gradients are accumulated
            before one optimizer step.
        epochs: number of passes over `train_dataloader`.
        scheduler: learning-rate scheduler, stepped once per optimizer step.
        optimizer: optimizer updating the model parameters.
        criterion: loss function called as criterion(logits, targets).
        num_batch: offset subtracted from the batch index in the progress print.
        val_dataloader: iterable of validation batches, same layout as training.
        len_val: number of validation examples (denominator of the accuracy).

    Returns:
        None. Progress and accuracy are printed.
    """
    # Run on whatever device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    def apply_update():
        """Apply the accumulated gradients, advance the schedule, clear the gradients."""
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        model.zero_grad()

    for epoch in range(epochs):
        print(f"Training epoch {epoch+1}")
        model.train()
        pending = False  # True while gradients are accumulated but not yet applied
        for i, batch in enumerate(train_dataloader):
            inputs=batch[0].to(run_device)
            targets=batch[1].to(run_device)
            logits = model(inputs)
            # Scale so the accumulated gradient is the mean over the accumulated batches.
            loss = criterion(logits,targets)/batchsize_grad
            loss.backward()
            pending = True

            if (i+1) % batchsize_grad == 0:
                apply_update()
                pending = False
                print (i-num_batch, loss.item())

        # Apply gradients left over when the number of batches is not a multiple
        # of batchsize_grad; otherwise they would leak into the next epoch.
        if pending:
            apply_update()

        # Evaluate the model after the epoch.
        model.eval()
        correct=0
        with torch.no_grad():
            for batch in val_dataloader:
                logits = model(batch[0].to(run_device))
                # argmax of the logits equals argmax of their softmax
                predict_label=torch.argmax(logits,-1).to('cpu')
                targets=batch[1].to('cpu')
                correct+=(predict_label == targets).sum().item()
        print ("accuracy",correct/len_val if len_val else float('nan'))

# 4.0 Start training

* This notebook: 12 epochs, 600 samples, batch_size=10 → 720 steps. Validation accuracy = 92%.
* Reference website: 5 epochs, 2500 samples, batch_size=3 → 4166 steps.

In [None]:
# Optional: resume from a previously saved checkpoint.
#path="state_dict_model_1.pt"
#model.load_state_dict(torch.load(path))

batch_size=10
batchsize_grad=1  # number of batches accumulated per optimizer step
epochs=12
lr=5e-5
len_val=len(val_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=10, shuffle=True)
# Batches per epoch, including a final partial batch. Taking it from the loader
# avoids over-counting by one when the dataset size is a multiple of batch_size.
num_batch=len(train_dataloader)
criterion=torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=lr)
# train() steps the scheduler once per optimizer step, i.e. once per
# `batchsize_grad` batches, so the schedule length is counted in optimizer steps
# (rounded up per epoch so the schedule cannot run out early).
steps_per_epoch = -(-num_batch // batchsize_grad)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=steps_per_epoch*epochs)
train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer,criterion, num_batch,val_dataloader, len_val)

In [None]:
# Persist the trained weights; N is the index used in the checkpoint file name.
N = 1
path = f"state_dict_model_{N}.pt"
torch.save(model.state_dict(), path)

# 5.0 Validate our model

In [None]:
# Optional: evaluate a previously saved checkpoint instead of the model in memory.
#path="state_dict_model_1.pt"
#model.load_state_dict(torch.load(path))

def dataset_accuracy(dataset, eval_batch_size=200):
    """Return the fraction of examples in `dataset` that `model` classifies correctly.

    `dataset` is a sequence of [input_ids, class_index] pairs. A private
    DataLoader is used so that `val_dataloader` is not rebound.
    """
    model.eval()
    correct = 0
    with torch.no_grad():
        for batch in DataLoader(dataset, batch_size=eval_batch_size, shuffle=False):
            logits = model(batch[0].to(device)) #[m,5]
            predict_label = torch.argmax(logits, -1).to('cpu')
            targets = batch[1].to('cpu') #m
            correct += (predict_label == targets).sum().item()
    return correct / len(dataset)

# Accuracy on the training data (what this cell reported before) and on the
# held-out test split, which was otherwise never evaluated.
print("train accuracy", dataset_accuracy(train_data))
print("test accuracy", dataset_accuracy(test_data))

# 6.0 Baseline model: logistic regression on TF-IDF features

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Word counts -> TF-IDF weights -> one-vs-rest logistic regression.
baseline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression(multi_class="ovr", solver="lbfgs")),
]
pipeline = Pipeline(baseline_steps)

# Values of the logistic-regression C parameter tried by the grid search.
parameters = {'lr__C': [0.1, 0.5, 1, 2, 5, 10, 100, 1000]}

# The first `i` examples are used for fitting, the remaining ones for scoring.
i = 600
fit_texts, fit_labels = texts[:i], labels[:i]
held_out_texts, held_out_labels = texts[i:], labels[i:]

best_classifier = GridSearchCV(pipeline, parameters, cv=5, verbose=1)
best_classifier.fit(fit_texts, fit_labels)
best_predictions = best_classifier.predict(held_out_texts)

# Fraction of held-out examples whose predicted label matches the true label.
baseline_accuracy = np.mean(best_predictions == held_out_labels)
print("Baseline accuracy:", baseline_accuracy)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Baseline accuracy: 0.6857142857142857


In [None]:
# Inspect the raw baseline predictions on the examples that were not used for fitting.
# NOTE(review): the former slice texts[2000:] appears to predate the 5-star
# down-sampling; with the shortened `texts` it is presumably empty, which makes
# predict() fail on a fresh run - confirm against len(texts).
best_classifier.predict(texts[i:])

array([5, 5, 5, ..., 5, 5, 5])