# Reference
* Example: [here](https://chriskhanhtran.github.io/posts/cnn-sentence-classification/)

# Import

In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F # for use some functions like F.relu(), F.dropout()
import string


# Dataset preparation

## General processing

In [2]:
# For general processing of the data
df=pd.read_json('drive/MyDrive/Colab Notebooks/All_Beauty_5.json', lines=True)
texts = [' '.join([str(i),str(j)]) for i,j in zip(df['reviewText'],df['summary'])]
labels = [i for i in df['overall']]

new_texts=[]
new_labels=[]
N_5=0
for i in range(len(labels)):
  if labels[i]==5:
    N_5+=1
    if N_5<156:new_texts.append(texts[i]);new_labels.append(labels[i])
  else: new_texts.append(texts[i]);new_labels.append(labels[i])
texts=new_texts
labels=new_labels

#map labels to [0,1,2,3,4]
target_names = list(set(labels))
label2idx = {label: idx for idx, label in enumerate(target_names)}
labels=[(label2idx[i]) for i in labels]

## Tokenization for text
* Use NLTK for tokenization

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import stopwords
nltk.download('stopwords')

In [4]:
# lowercase, remove punctuation, tokenization 
new_texts=[]
for i in texts:
  new_texts.append(' '.join(w for w in word_tokenize(i.lower()) if w not in string.punctuation))
texts=new_texts

## Embeddings for text

In [5]:
# Create our own vocab because this would make computation faster
text_combine=' '.join(texts)
vocab=set(text_combine.split(' '))
vocab=list(vocab)

In [None]:
# download fasttext/word2vec pretrained embeddings
import os
URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
FILE = "fastText"

if os.path.isdir(FILE):
    print("fastText exists.")
else:
    !wget -P $FILE $URL
    !unzip $FILE/crawl-300d-2M.vec.zip -d $FILE

In [None]:
# Create a dictionary of embeddings of our corpus
fname="fastText/crawl-300d-2M.vec"
fin=open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
n, d = map(int, fin.readline().split())
embedding_dic={}
from tqdm import tqdm_notebook

for line in tqdm_notebook(fin):
  tokens=line.rstrip().split(' ')
  if tokens[0] in vocab: embedding_dic[tokens[0]]=torch.tensor(list(map(float, tokens[1:]))).unsqueeze(0)

In [8]:
# Create the embeddings for our text
text_embedding=[]
for sentence in texts:
  sentence_embedding=torch.zeros(1,300)
  for word in sentence.split(' '):
    if word in [*embedding_dic]: 
      sentence_embedding=torch.cat((sentence_embedding,embedding_dic[word]),0)
  text_embedding.append(sentence_embedding[1:])

## Create the final form and divide

In [9]:
data=[[i, torch.tensor(j)]for i,j in zip (text_embedding,labels)]

In [10]:
# divide
from sklearn.model_selection import train_test_split
rest_data, test_data = train_test_split(data, test_size=0.1, random_state=1)
train_data, val_data = train_test_split(rest_data, test_size=0.1, random_state=1)

# Define our model and train

In [105]:
class CNN_NLP(nn.Module):
    def __init__(self, filter_sizes, num_filters, num_classes):
        super(CNN_NLP, self).__init__()
        self.embed_dim=300
        self.conv1d_list = nn.ModuleList([nn.Conv1d(in_channels=self.embed_dim, out_channels=num_filters[i],kernel_size=filter_sizes[i]) 
                                          for i in range(len(filter_sizes))])
        self.fc = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x_embed): #m,len,300
        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        x_reshaped = x_embed.permute(0, 2, 1)
        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out). A list 
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]
        # Max pooling. Output shape: (b, num_filters[i], 1), a list
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2]) for x_conv in x_conv_list]
        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters))
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list], dim=1)
        # Compute logits. Output shape: (b, n_classes)
        logits = self.fc(self.dropout(x_fc))
        return logits
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
filter_sizes=[2,3]
num_filters=[20,30]
num_classes=5
model=CNN_NLP(filter_sizes,num_filters,num_classes).to(device)

In [106]:
def train(train_dataloader,model,batchsize_grad,epochs,optimizer,criterion, num_batch, val_dataloader,len_val):
    acc_steps = 100
    model.train()
    accumulating_batch_count = 0
    for epoch in range(epochs):
        print(f"Training epoch {epoch}")
        for i, batch in enumerate(train_dataloader):
            model.train()
            inputs=batch[0].to(device) #[m,len,300]
            logits = model(inputs) #[m,5] 
            targets=batch[1].to(device) #m
            loss = criterion(logits,targets)
            loss.backward() #The gradients are computed when we call loss. backward() and are stored by PyTorch until we call optimizer.
            if accumulating_batch_count % batchsize_grad == 0: #when accumulated batch=16, we do optimizer after 16 batches of gradients are accumulated
                optimizer.step()
                #scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()
            accumulating_batch_count += 1
            # for evaluate the model after certain batches
            if accumulating_batch_count % len(train_dataloader)==0:
                model.eval()
                accuracy=0
                for i, batch in enumerate(val_dataloader):
                    inputs=batch[0].to(device) #[m,512]
                    with torch.no_grad():
                      logits = model(inputs) #[m,5]
                    softmaxed=torch.softmax(logits,-1) #[m,5]
                    predict_label=torch.argmax(softmaxed,-1).to('cpu')
                    targets=batch[1].to('cpu') #m
                    from sklearn.metrics import accuracy_score
                    accuracy+=accuracy_score(targets,predict_label)*batch[0].shape[0]
        #print the loss and accuracy of the validation set after each epoch
        print (loss.item(),accuracy/len_val)
        #save the best model
        if accuracy/len_val>0.9: path="best_model.pt"; torch.save(model.state_dict(), path) 


In [None]:
model=CNN_NLP(filter_sizes,num_filters,num_classes).to(device)
batch_size=1
epochs=40 #simple model uses more epochs
lr=0.01 #simple models uses larger lr
#num_batch=round(len(train_data)/batch_size)-1
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(test_data+val_data, batch_size=batch_size, shuffle=True)
len_val=len(val_dataloader)
batchsize_grad=20
criterion=torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)
train(train_dataloader,model,batchsize_grad,epochs,optimizer,criterion, num_batch,val_dataloader, len_val)

# Model test

In [103]:
path="best_model.pt" 
model.load_state_dict(torch.load(path))

val_dataloader = DataLoader(data, batch_size=1, shuffle=True)
model.eval()
accuracy=0
for i, batch in enumerate(val_dataloader):
    inputs=batch[0].to(device) #[m,512]
    with torch.no_grad():
      logits = model(inputs) #[m,5]
    softmaxed=torch.softmax(logits,-1) #[m,5]
    predict_label=torch.argmax(softmaxed,-1).to('cpu')
    targets=batch[1].to('cpu') #m
    from sklearn.metrics import accuracy_score
    accuracy+=accuracy_score(targets,predict_label)*batch[0].shape[0]
print (accuracy/len(val_dataloader))

0.9819354838709677
