# Reference
* Example: [here](https://chriskhanhtran.github.io/posts/cnn-sentence-classification/)

# Import

In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F # for use some functions like F.relu(), F.dropout()
import string


# Dataset preparation

## General processing

In [3]:
# General processing: load the reviews, build one text and one rating per review,
# cap the number of 5-star reviews, and map ratings to class indices.
df = pd.read_json('drive/MyDrive/Colab Notebooks/All_Beauty_5.json', lines=True)

# Each sample's text is the review body followed by its summary, separated by a space.
texts = [' '.join([str(review), str(summary)]) for review, summary in zip(df['reviewText'], df['summary'])]
labels = list(df['overall'])

# Keep only the first MAX_FIVE_STAR reviews rated 5; every other rating is kept in full.
# NOTE(review): presumably this is done to reduce class imbalance -- confirm.
MAX_FIVE_STAR = 155
new_texts = []
new_labels = []
N_5 = 0  # number of 5-star reviews seen so far
for text, label in zip(texts, labels):
    if label == 5:
        N_5 += 1
        if N_5 > MAX_FIVE_STAR:
            continue
    new_texts.append(text)
    new_labels.append(label)
texts = new_texts
labels = new_labels

# Map the rating values to contiguous class indices 0..K-1.
# Sorting makes the mapping explicit and reproducible (lowest rating -> index 0)
# instead of relying on the iteration order of a set.
target_names = sorted(set(labels))
label2idx = {label: idx for idx, label in enumerate(target_names)}
labels = [label2idx[label] for label in labels]

## Tokenization for text
* Use NLTK for tokenization

In [4]:
import nltk
# Download the 'punkt' package (the log below shows it unzipped under tokenizers/).
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# NOTE(review): defaultdict and stopwords are not used in the cells visible here.
from collections import defaultdict
from nltk.corpus import stopwords
# Download the 'stopwords' package (the log below shows it unzipped under corpora/).
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Lowercase, tokenize, and drop every token made up solely of punctuation characters.
# A per-character check is required: `token not in string.punctuation` is a substring
# test on a str, so multi-character tokens such as "..." or "''" would slip through.
punctuation_chars = set(string.punctuation)
new_texts = []
for text in texts:
    tokens = word_tokenize(text.lower())
    kept_tokens = [tok for tok in tokens if not all(ch in punctuation_chars for ch in tok)]
    # Store each review as a single space-joined string of the surviving tokens.
    new_texts.append(' '.join(kept_tokens))
texts = new_texts

## Embeddings for text

In [6]:
# Collect the distinct space-separated tokens of the whole corpus; this vocabulary
# is used to restrict which pretrained vectors are kept, which speeds up computation.
text_combine = ' '.join(texts)
vocab = list(set(text_combine.split(' ')))

In [None]:
# download fasttext/word2vec pretrained embeddings
import os
URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
FILE = "fastText"

if os.path.isdir(FILE):
    print("fastText exists.")
else:
    !wget -P $FILE $URL
    !unzip $FILE/crawl-300d-2M.vec.zip -d $FILE

In [8]:
# Create a dictionary {word: tensor of shape (1, vector_length)} holding the pretrained
# vectors of the words that occur in our corpus.
# tqdm.tqdm_notebook is deprecated; tqdm.notebook.tqdm is its replacement. The old
# local name is kept so nothing else in the notebook has to change.
from tqdm.notebook import tqdm as tqdm_notebook

fname="fastText/crawl-300d-2M.vec"
embedding_dic={}
# Set membership is O(1); testing against the `vocab` list would scan it for every line.
vocab_lookup = set(vocab)

# The context manager closes the file even if parsing fails part-way through.
with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
    # The first line holds two integers; n is used as the progress-bar total below.
    n, d = map(int, fin.readline().split())
    for line in tqdm_notebook(fin, total=n):
        tokens = line.rstrip().split(' ')
        word = tokens[0]
        if word in vocab_lookup:
            # Only lines for corpus words are converted to floats.
            embedding_dic[word] = torch.tensor(list(map(float, tokens[1:]))).unsqueeze(0)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


0it [00:00, ?it/s]

In [14]:
# Create the embeddings for our text: one tensor per sentence whose rows are the
# vectors of the words found in embedding_dic, in sentence order.
text_embedding = []
for sentence in texts:
    # Test membership on the dict itself (O(1)) instead of building a list of all
    # keys for every word, and concatenate once per sentence instead of once per word.
    word_vectors = [embedding_dic[word] for word in sentence.split(' ') if word in embedding_dic]
    if word_vectors:
        sentence_embedding = torch.cat(word_vectors, 0)
    else:
        # No word of this sentence has a vector: keep an empty (0, 300) tensor,
        # the same result the sentence produced before this rewrite.
        sentence_embedding = torch.zeros(0, 300)
    text_embedding.append(sentence_embedding)

## Create the final form and divide

In [15]:
# Pair every sentence embedding with its class index (wrapped in a tensor).
data = []
for embedding, label in zip(text_embedding, labels):
    data.append([embedding, torch.tensor(label)])

In [16]:
# Split off 10% of the samples as test data, then 10% of the remainder as validation data.
from sklearn.model_selection import train_test_split

split_options = dict(test_size=0.1, random_state=1)
rest_data, test_data = train_test_split(data, **split_options)
train_data, val_data = train_test_split(rest_data, **split_options)

# Define our model and train

* We can use a bidirectional LSTM (Bi-LSTM) or a unidirectional one.
* We can concatenate the first and the final output vectors.
* We can add the first and the final output vectors together.
* We can add together the output vectors at 1/3 and 2/3 of the sequence and the final vector.
* We can assign weights to the different vectors.
* Across all of these experiments, the best accuracy on test_data + val_data is 0.871, while the accuracy on the training data is always 1.

In [228]:
class TextRNN(torch.nn.Module):
    """Bidirectional-LSTM sequence classifier.

    The input sequence is fed through a bidirectional LSTM. The LSTM outputs at the
    first and at the last time step are concatenated, passed through dropout and
    projected to class logits by a linear layer.

    Args:
        hidden_size: number of hidden units per LSTM direction. This argument was
            previously ignored in favour of a hard-coded 120; it is now honoured.
        input_size: size of each input vector (default 300).
        num_classes: number of output classes (default 5).
        dropout: dropout probability applied before the linear layer (default 0.3).
    """

    def __init__(self, hidden_size, input_size=300, num_classes=5, dropout=0.3):
        super().__init__()
        # Attribute names are unchanged so previously saved state_dicts keep loading.
        self.LSTM = torch.nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                                  batch_first=True, bidirectional=True)
        # Every time step yields 2 * hidden_size features (forward + backward);
        # two time steps are concatenated, hence 4 * hidden_size inputs.
        self.linear = torch.nn.Linear(in_features=4 * hidden_size, out_features=num_classes)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for x of shape (batch, seq_len, input_size)."""
        outputs, _ = self.LSTM(x)      # (batch, seq_len, 2 * hidden_size)
        first_step = outputs[:, 0]     # (batch, 2 * hidden_size)
        last_step = outputs[:, -1]     # (batch, 2 * hidden_size)
        features = torch.cat((first_step, last_step), 1)  # (batch, 4 * hidden_size)
        features = self.dropout(features)
        return self.linear(features)   # (batch, num_classes)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Instantiate the classifier (hidden_size of 120 is passed to TextRNN) and move it to the device.
hidden_size = 120
model = TextRNN(hidden_size)
model = model.to(device)

In [229]:
def _evaluate(model, dataloader, criterion, device):
    """Return (accuracy, mean per-batch loss) of `model` over `dataloader`, without gradients."""
    model.eval()
    n_correct = 0
    n_samples = 0
    n_batches = 0
    loss_sum = 0.0
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch[0].to(device)
            # Targets must live on the same device as the logits for the loss.
            targets = batch[1].to(device)
            logits = model(inputs)
            loss_sum += criterion(logits, targets).item()
            # argmax over the logits selects the same class as argmax over softmax(logits).
            n_correct += (torch.argmax(logits, -1) == targets).sum().item()
            n_samples += inputs.shape[0]
            n_batches += 1
    accuracy = n_correct / n_samples if n_samples else 0.0
    mean_loss = loss_sum / n_batches if n_batches else 0.0
    return accuracy, mean_loss


def train(train_dataloader,model,batchsize_grad,epochs,optimizer,criterion, num_batch, val_dataloader,len_val,
          save_path="best_model.pt", save_threshold=0.85):
    """Train `model` with gradient accumulation and validate it after every epoch.

    Args:
        train_dataloader: iterable of batches; batch[0] are the inputs, batch[1] the targets.
        model: torch module to train. Batches are moved to the device of its parameters.
        batchsize_grad: number of batches whose gradients are accumulated before each
            optimizer step.
        epochs: number of passes over `train_dataloader`.
        optimizer: optimizer updating the model parameters.
        criterion: loss function called as criterion(logits, targets).
        num_batch: unused; kept so existing calls keep working.
        val_dataloader: iterable of validation batches, same layout as the training ones.
        len_val: unused; kept so existing calls keep working. Accuracy is normalised by
            the number of validation samples and loss by the number of validation
            batches, both counted while evaluating, so the metrics stay correct for
            any batch size.
        save_path: where the best model's state_dict is written.
        save_threshold: minimum validation accuracy a model must exceed to be saved.

    Returns:
        None. Progress is printed: the training loss at every optimizer step, and after
        each epoch the last training loss, the validation accuracy and the validation loss.
    """
    # Follow the model's own device instead of relying on a notebook-level global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    best_accuracy = save_threshold
    accumulating_batch_count = 0
    # Start from clean gradients so leftovers from an earlier run are not applied.
    optimizer.zero_grad()
    for epoch in range(epochs):
        print(f"Training epoch {epoch}")
        model.train()
        loss = None
        for batch in train_dataloader:
            inputs = batch[0].to(device)
            targets = batch[1].to(device)
            logits = model(inputs)
            loss = criterion(logits, targets)
            # backward() adds this batch's gradients to the ones already accumulated.
            loss.backward()
            # Count the batch first, so a step happens only once batchsize_grad batches
            # have been accumulated (and not right after the very first batch).
            accumulating_batch_count += 1
            if accumulating_batch_count % batchsize_grad == 0:
                optimizer.step()
                #scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()
                print(loss.item())

        # Validate once per epoch.
        val_accuracy, val_loss = _evaluate(model, val_dataloader, criterion, device)
        last_train_loss = loss.item() if loss is not None else float("nan")
        print(last_train_loss, val_accuracy, val_loss)

        # Save only when the model beats both the threshold and every earlier epoch,
        # so a later, worse epoch cannot overwrite the best checkpoint.
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), save_path)


In [None]:
# Optionally resume from a saved checkpoint:
#model.load_state_dict(torch.load(path))

# Hyper-parameters.
batch_size = 1
batchsize_grad = 20  # passed to train() as the gradient-accumulation interval
epochs = 20  # simple model uses more epochs
lr = 0.006  # simple models uses larger lr
num_batch = round(len(train_data) / batch_size) - 1

# Training batches come from train_data; validation uses test_data and val_data combined.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(test_data + val_data, batch_size=batch_size, shuffle=True)
len_val = len(val_dataloader)

# Loss and optimizer.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)

train(
    train_dataloader=train_dataloader,
    model=model,
    batchsize_grad=batchsize_grad,
    epochs=epochs,
    optimizer=optimizer,
    criterion=criterion,
    num_batch=num_batch,
    val_dataloader=val_dataloader,
    len_val=len_val,
)

# Model test

In [227]:
# Optionally restore the saved checkpoint before evaluating:
#path="best_model.pt"
#model.load_state_dict(torch.load(path))
from sklearn.metrics import accuracy_score

# NOTE(review): `data` holds every example, including the ones in train_data,
# so the number printed here is not a held-out score.
val_dataloader = DataLoader(data, batch_size=1, shuffle=True)
model.eval()
accuracy = 0
with torch.no_grad():
    for batch in val_dataloader:
        inputs = batch[0].to(device)
        targets = batch[1].to('cpu')
        logits = model(inputs)
        probabilities = torch.softmax(logits, -1)
        predict_label = torch.argmax(probabilities, -1).to('cpu')
        # Per-batch accuracy times batch size gives the number of correct predictions.
        accuracy += accuracy_score(targets, predict_label) * inputs.shape[0]
# With batch_size=1 the number of batches equals the number of samples.
print(accuracy / len(val_dataloader))

0.9754838709677419
