In this notebook:
* No pretrained embeddings are used.
* The best hidden size found is 90, with an accuracy of around 88%.

# 1.0 Install and import libraries

In [None]:
# Install the third-party packages into the runtime (no versions are pinned here).
# NOTE(review): only `transformers` is imported by the import cell of this notebook;
# sentencepiece and fire may be unused — confirm before keeping them.
!pip install sentencepiece
!pip install transformers
!pip install fire

Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.4.0-py2.py3-none-any.whl size=115942 sha256=987db4f4339a2ad6f964f0d2c1be17bdf8678d4dd86f57a219a8dc995b004bd8
  Stored in directory: /root/.cache/pip/wheels/8a/67/fb/2e8a12fa16661b9d5af1f654bd199366799740a85c64981226
Successfully built fire
Installing collected packages: fire
Successfully installed fire-0.4.0


In [None]:
import torch
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange

# 2.0 Prepare the dataset and split it into train and test sets of `[token_indices, label_index]` pairs (the test set doubles as the validation set)

## Read original texts, labels

In [None]:
# Read the raw reviews (JSON Lines: one review object per line).
df = pd.read_json('All_Beauty_5.json', lines=True)

# A sample's text is the review body followed by its summary, separated by one space.
texts = [' '.join((str(review), str(summary)))
         for review, summary in zip(df['reviewText'], df['summary'])]
labels = list(df['overall'])

# The classes are imbalanced, so rating 5 is down-sampled: only the first 155
# five-star reviews are kept; every review with another rating is kept.
new_texts = []
new_labels = []
N_5 = 0  # number of five-star reviews seen so far
for text, label in zip(texts, labels):
    if label == 5:
        N_5 += 1
        if not N_5 < 156:
            continue
    new_texts.append(text)
    new_labels.append(label)

# From here on the balanced subset is the dataset.
texts = new_texts
labels = new_labels

In [None]:
# Build the vocabulary: every distinct lower-cased, space-separated token in the corpus.
# The tokens are sorted so that the token -> index assignment is the same in every
# kernel session. Iteration order of a set of strings can differ between interpreter
# runs (string hashing is randomized), which would make indices — and therefore the
# embedding rows of a saved model — impossible to reproduce.
vocab = sorted({token for text in texts for token in text.lower().split(' ')})

In [None]:
# Encode every text as the list of vocabulary indices of its tokens.
# A token -> index dict makes each lookup O(1); list.index() rescans the whole
# vocabulary for every single token. setdefault keeps the first position of a token,
# which is exactly what list.index() returns.
token_to_index = {}
for position, token in enumerate(vocab):
    token_to_index.setdefault(token, position)

texts_index = [
    [token_to_index[token] for token in text.lower().split(' ')]
    for text in texts
]

## Create a list: `[data_tensor, label_tensor]`

In [None]:
# Determine the distinct labels in the data and map each one to a class index.
# sorted() makes the mapping follow the label order explicitly instead of relying
# on the iteration order of a set.
target_names = sorted(set(labels))
label2idx = {label: idx for idx, label in enumerate(target_names)}
print(label2idx)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4}


In [None]:
# Build the sample list consumed by the DataLoader: one [token_indices, class_index]
# pair per review.
# NOTE: `labels` is overwritten with class indices here, so re-running this cell
# would look up already-converted values in label2idx.
labels = [label2idx[rating] for rating in labels]
data = [[token_ids, class_idx] for token_ids, class_idx in zip(texts_index, labels)]
# Alternative (disabled): load a previously saved dataset instead.
#path="data.pt"
#data=torch.load(path)

In [None]:
# Inspect the first sample: [list of vocabulary indices, class index].
data[0]

[[4976, 4408, 122, 3952, 1900, 2628], 4]

## Divide into train and test sets

In [None]:
# Hold out 13% of the samples as the test set; the fixed random_state pins the shuffle.
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, random_state=0, test_size=0.13)

In [None]:
# Vocabulary size. The model classes in this notebook size their embedding table
# with the literal 5363, which is the value recorded for this cell.
len(vocab)

5363

# 3.0 Create the model and define the training process

## Create our model using Class

In [None]:
# Embedding + feed-forward classifier (no BERT and no pretrained weights are involved).
class NNmodel(torch.nn.Module):
    """Token-wise MLP over learned embeddings, mean-pooled over the sequence, then an MLP head.

    Args:
        features: Width of the embedding and of every hidden linear layer.
        vocab_size: Number of rows of the embedding table. Defaults to 5363,
            the value that used to be hard-coded.
        num_classes: Number of output logits. Defaults to 5, the value that
            used to be hard-coded.
    """

    def __init__(self, features, vocab_size=5363, num_classes=5):
        super().__init__()
        # Layers are created in the original order so that, for a given seed,
        # the initial weights are unchanged.
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=features)
        self.linear1 = torch.nn.Linear(in_features=features, out_features=features, bias=True)
        self.linear2 = torch.nn.Linear(in_features=features, out_features=features, bias=True)
        self.linear3 = torch.nn.Linear(in_features=features, out_features=features, bias=True)
        self.linear4 = torch.nn.Linear(in_features=features, out_features=features, bias=True)
        self.linear5 = torch.nn.Linear(in_features=features, out_features=num_classes, bias=True)
        # Not used by forward(). Kept with its original fixed size so the module's
        # state_dict keys stay the same and previously saved checkpoints still load.
        self.LSTM = torch.nn.LSTM(input_size=27, hidden_size=27, batch_first=True)

    def forward(self, x):
        """Return class logits for the token indices `x`.

        Args:
            x: Integer tensor of token indices, shape (seq,) or (batch, seq).

        Returns:
            Logits of shape (num_classes,) for a single sequence, or
            (batch, num_classes) for a batch.
        """
        x = self.embedding(x)            # (..., seq, features)
        x = torch.relu(self.linear1(x))  # (..., seq, features)
        x = torch.relu(self.linear2(x))  # (..., seq, features)
        x = torch.relu(self.linear3(x))  # (..., seq, features)
        # Mean-pool over the sequence axis. For a single (seq, features) input
        # dim=-2 is dim 0, as before; for a batch it still averages over tokens
        # instead of averaging over the batch.
        x = torch.mean(x, -2)            # (..., features)
        x = torch.relu(self.linear4(x))  # (..., features)
        return self.linear5(x)           # (..., num_classes)

In [None]:
# Embedding + LSTM + linear classifier (no BERT is involved).
class LSTMLinear(torch.nn.Module):
    """Classify a token sequence from the final hidden state of a single-layer LSTM.

    Args:
        features: Embedding width, also used as the LSTM hidden size.
        vocab_size: Number of rows of the embedding table. Defaults to 5363,
            the value that used to be hard-coded.
        num_classes: Number of output logits. Defaults to 5, the value that
            used to be hard-coded.
    """

    def __init__(self, features, vocab_size=5363, num_classes=5):
        super().__init__()
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=features)
        self.LSTM = torch.nn.LSTM(input_size=features, hidden_size=features, batch_first=True)
        self.linear = torch.nn.Linear(in_features=features, out_features=num_classes, bias=True)

    def forward(self, x):
        """Return class logits for the token indices `x`.

        Args:
            x: Integer tensor of token indices, shape (seq,) or (batch, seq).

        Returns:
            Logits of shape (num_classes,) for a single sequence, or
            (batch, num_classes) for a batch.
        """
        embedded = self.embedding(x)  # (seq, features) or (batch, seq, features)
        unbatched = embedded.dim() == 2
        if unbatched:
            # Give the LSTM an explicit batch axis so the layout is always
            # (batch, seq, features), as batch_first=True expects.
            embedded = embedded.unsqueeze(0)
        _, (hidden_state, _) = self.LSTM(embedded)
        # hidden_state[-1] is the hidden state after the last time step of every
        # sequence. Indexing the output with [-1] only gives that for a single
        # unbatched sequence; on a batch it would pick the last batch element.
        logits = self.linear(hidden_state[-1])  # (batch, num_classes)
        return logits.squeeze(0) if unbatched else logits

## Define the training process

In [None]:
#train our model
def _as_index_tensor(token_ids):
    """Return `token_ids` as a tensor, converting only when it is not one already."""
    if isinstance(token_ids, torch.Tensor):
        return token_ids
    return torch.tensor(token_ids)


def _validation_accuracy(model, dataloader, device):
    """Return the fraction of samples in `dataloader` whose arg-max logit equals the target."""
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = _as_index_tensor(batch[0]).to(device)
            logits = model(inputs).unsqueeze(0)
            # softmax is monotonic, so the arg-max of the raw logits is the same class.
            predict_label = torch.argmax(logits, -1).to('cpu')
            targets = batch[1].to('cpu')
            correct += int((predict_label == targets).sum())
            total += targets.numel()
    # An empty loader yields NaN instead of a ZeroDivisionError.
    return correct / total if total else float('nan')


def train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer,criterion, num_batch, val_dataloader,len_val):
    """Train `model` with gradient accumulation; print validation accuracy after each epoch.

    Args:
        train_dataloader: Iterable of `[token_ids, targets]` batches. `token_ids`
            is turned into a tensor and fed to the model; `targets` is a tensor.
        model: Module mapping the token-id tensor to a 1-D tensor of logits
            (a batch axis is added before the loss is computed).
        batchsize_grad: Number of batches whose gradients are accumulated
            before one optimizer step.
        epochs: Number of passes over `train_dataloader`.
        scheduler: Learning-rate scheduler, stepped once per optimizer step.
        optimizer: Optimizer holding the model parameters.
        criterion: Loss called as `criterion(logits, targets)`.
        num_batch: Unused; kept so existing calls keep working.
        val_dataloader: Iterable of batches in the same format, evaluated
            after every epoch.
        len_val: Unused; kept so existing calls keep working.

    Returns:
        None. Progress is reported with `print`.

    Note:
        The accumulation counter is not reset between epochs, so a window may
        span an epoch boundary; gradients of an incomplete window at the very
        end are never applied.
    """
    # Use the device the model already lives on instead of depending on a
    # notebook-level `device` variable being defined.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    accumulating_batch_count = 0

    for epoch in range(epochs):
        print(f"Training epoch {epoch}")
        model.train()
        for batch in train_dataloader:
            inputs = _as_index_tensor(batch[0]).to(device)
            logits = model(inputs).unsqueeze(0)  # add the batch axis expected by the loss
            targets = batch[1].to(device)
            # Scale so the accumulated gradient is the mean over the window.
            loss = criterion(logits, targets) / batchsize_grad
            loss.backward()  # gradients accumulate in .grad until they are zeroed

            accumulating_batch_count += 1
            if accumulating_batch_count % batchsize_grad == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

        print(_validation_accuracy(model, val_dataloader, device))

# 4.0 Start training (remember to save the model)

In [None]:
# Training configuration and objects.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed before building the model so its initial weights are reproducible.
torch.manual_seed(0)
model = NNmodel(27).to(device)

batch_size = 1        # one sample per batch; the samples are index lists and no padding is applied in this notebook
batchsize_grad = 20   # batches accumulated per optimizer step
epochs = 15 #simple model uses more epochs
lr = 0.008 #simple models uses larger lr
len_val = len(test_data)
num_batch = round(len(train_data)/batch_size)-1

train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
criterion = torch.nn.CrossEntropyLoss()

# Re-seed once after model construction (which consumed random numbers) so the
# shuffling during training starts from a known state. Nothing between the
# model and this point draws random numbers, so one call is enough.
torch.manual_seed(0)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# The scheduler is stepped once per optimizer step, i.e. once per complete
# accumulation window of `batchsize_grad` batches, so the total is an integer
# count of such windows over all epochs (not a float, and counted in batches
# rather than samples).
num_training_steps = (epochs * len(train_dataloader)) // batchsize_grad
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
# Run the training loop; it prints the validation accuracy after every epoch.
train(
    train_dataloader=train_dataloader,
    model=model,
    batchsize_grad=batchsize_grad,
    epochs=epochs,
    scheduler=scheduler,
    optimizer=optimizer,
    criterion=criterion,
    num_batch=num_batch,
    val_dataloader=val_dataloader,
    len_val=len_val,
)

Training epoch 0
0.44554455445544555
Training epoch 1
0.44554455445544555
Training epoch 2
0.6732673267326733
Training epoch 3
0.7821782178217822
Training epoch 4
0.8415841584158416
Training epoch 5
0.8415841584158416
Training epoch 6
0.8910891089108911
Training epoch 7
0.8910891089108911
Training epoch 8
0.8811881188118812
Training epoch 9
0.8811881188118812
Training epoch 10
0.8811881188118812
Training epoch 11
0.8811881188118812
Training epoch 12
0.8811881188118812
Training epoch 13
0.8811881188118812
Training epoch 14
0.8811881188118812


In [None]:
#Save our model parameters
# `N` numbers successive checkpoints. No cell visible in this notebook initialises
# it, so `N+=1` raises NameError on a fresh kernel; start from 0 when it does not
# exist yet and otherwise keep incrementing the existing counter.
N = globals().get("N", 0) + 1
path = "state_dict_model_" + str(N) + ".pt"
torch.save(model.state_dict(), path)

# 5.0 Validate our model

In [None]:
# Optionally restore a saved checkpoint before evaluating (disabled):
#path="state_dict_model_3.pt" 
#model.load_state_dict(torch.load(path))

# Accuracy of the current model on the held-out split, one sample per batch.
input_data = test_data
val_dataloader = DataLoader(input_data, batch_size=1, shuffle=False)
n_correct = 0
model.eval()
with torch.no_grad():
    for batch in val_dataloader:
        inputs = torch.tensor(batch[0]).to(device)  # token indices of one sample
        logits = model(inputs).unsqueeze(0)         # add a batch axis
        # softmax is monotonic, so the arg-max of the raw logits is the same class.
        predict_label = torch.argmax(logits, -1).to('cpu')
        targets = batch[1].to('cpu')
        n_correct += int((predict_label == targets).sum())
accuracy = n_correct / len(input_data)
print(accuracy)

0.8811881188118812


# 6.0 Baseline model

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

# Baseline: TF-IDF bag-of-words features fed to a one-vs-rest logistic regression.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression(multi_class="ovr", solver="lbfgs"))
])

# Candidate values for the regularisation parameter C, chosen by 5-fold CV.
parameters = {'lr__C': [0.1, 0.5, 1, 2, 5, 10, 100, 1000]}

# Score the baseline on the same held-out samples as the neural model: the split
# uses the same test_size and random_state as the split of `data`, and `texts`,
# `labels` and `data` have the same length, so the same rows are held out.
# The previous positional cut (first 500 rows for training, the rest for testing)
# evaluated on a different set of samples, so the two accuracies were not comparable.
(baseline_train_texts, baseline_test_texts,
 baseline_train_labels, baseline_test_labels) = train_test_split(
    texts, labels, test_size=0.13, random_state=0
)

best_classifier = GridSearchCV(pipeline, parameters, cv=5, verbose=1)
best_classifier.fit(baseline_train_texts, baseline_train_labels)
best_predictions = best_classifier.predict(baseline_test_texts)

baseline_accuracy = np.mean(best_predictions == np.asarray(baseline_test_labels))
print("Baseline accuracy:", baseline_accuracy)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Baseline accuracy: 0.6981818181818182


In [None]:
      # NOTE(review): this cell is a fragment, not a runnable unit. It starts indented
      # and its indentation levels do not line up, so running it raises the
      # IndentationError recorded below. It looks like it was meant to sit inside a
      # training loop (it reads accumulating_batch_count, loss and len_val) — confirm
      # before reusing it.
      # Evaluate the model after a certain number of batches.
      if accumulating_batch_count % len(train_dataloader)==0:
          model.eval()
          accuracy=0
          for i, batch in enumerate(val_dataloader):
              inputs=batch[0].to(device) #[m,512]
              with torch.no_grad():
                logits = model(inputs) #[m,5]
              softmaxed=torch.softmax(logits,-1) #[m,5]
              predict_label=torch.argmax(softmaxed,-1).to('cpu')
              targets=batch[1].to('cpu') #m
              from sklearn.metrics import accuracy_score
              # Weight each batch's accuracy by its size so the sum counts correct samples.
              accuracy+=accuracy_score(targets,predict_label)*batch[0].shape[0]

        # Print the loss and the validation accuracy after each epoch.
        print (loss.item(),accuracy/len_val)

        # Save the model whenever the validation accuracy exceeds 0.82.
        if accuracy/len_val>0.82: path="best_model.pt"; torch.save(model.state_dict(), path) 

IndentationError: ignored