In [41]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
!rm glove.6B.zip
!tar -xzf /content/drive/MyDrive/NLP_Training/yelp_review_full_csv.tar.gz

In [99]:
import torch
from torch import nn
from torch.nn import GRU
import torch.nn.functional as F
from torch.utils.data import DataLoader , Dataset
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from transformers import get_scheduler
import os
import numpy as np
import re
from tqdm import tqdm
import pandas as pd

In [179]:


class WordAttn(nn.Module):
  def __init__(self , embed_size :int  , hidden_size : int ):
    super(WordAttn, self).__init__()
    self.gru = GRU(input_size = embed_size , hidden_size = hidden_size , bidirectional=True)
    self.MLP = nn.Linear(2*hidden_size , 2*hidden_size , bias = True)
    self.word_context = nn.Parameter(torch.Tensor(2 * hidden_size, 1))
  def forward(self , x):
    x , _ = self.gru(x.float())
    u_it = self.MLP(x)
    u_it = torch.matmul(u_it , self.word_context)
    u_it = F.tanh(u_it)
    out = F.softmax(u_it , dim = 0)
    out = torch.mul(out , x)
    out = torch.sum(out, dim = 0)
    return out


class SentenceAttn(nn.Module):
  def __init__(self , embed_size :int  , hidden_size : int):
    super(SentenceAttn , self).__init__()
    self.gru = GRU(input_size = embed_size , hidden_size = hidden_size , bidirectional=True)
    self.MLP = nn.Linear(2*hidden_size , 2*hidden_size , bias = True)
    self.word_context = nn.Parameter(torch.Tensor(2 * hidden_size, 1))
  def forward(self , x):
    x , _ = self.gru(x.float())
    u_it = self.MLP(x)
    u_it = torch.matmul(u_it , self.word_context)
    out = F.softmax(u_it , dim = 0)
    out = torch.mul(out , x)


    out = torch.sum(out, dim = 0)

    return out

class MyModel(nn.Module):
  def __init__(self , embed_size : int , hidden_size : int):
    super(MyModel , self).__init__()
    self.WordAttn = WordAttn(embed_size , hidden_size)
    self.SentenceAttn = SentenceAttn(hidden_size *2 , hidden_size)
    self.MLP = nn.Linear(hidden_size*2 , 5)
  def forward(self , input):
    out_sent = []
    input = input.permute(1, 0, 2 , 3)

    for i in input :
      out = self.WordAttn(i.permute(1, 0 , 2))
      out_sent.append(out)
    out_sent = torch.stack(out_sent )


    out_sent = self.SentenceAttn(out_sent)
    out_sent = self.MLP(out_sent)
    out_sent = F.softmax(out_sent)
    return out_sent

In [96]:

class Mydataset(Dataset):
  def __init__(self , data  , label):
    self.data = data
    self.label = label
  def __len__(self):
    return len(self.data)
  def __getitem__(self, idx):
    dt = {"data": self.data[idx] , 'y' : self.label[idx]}
    return dt


In [103]:

embed_size = 50
max_len_sen = 5
max_word = 5
def load_glove(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array([float(x) for x in line[1:]])
    return words, word_to_vec_map
def split_text_to_sentences(text):
    # Sử dụng biểu thức chính quy để tách văn bản thành các câu
    sentences = re.split(r'[.!?]+', text)

    # Loại bỏ các câu trống
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

    return sentences
def split_sentence_to_words(sentence):
    # Thay thế các dấu phẩy bằng dấu cách
    sentence = sentence.replace(',', ' , ')

    # Tách câu thành các từ bằng cách sử dụng phương thức split()
    words = sentence.split()

    return words
def get_data(word_to_vec_map , root_path : str  ):
  train_csv_path = os.path.join(root_path , "train.csv")
  test_csv_path = os.path.join(root_path , 'test.csv')
  dt_train = pd.read_csv(train_csv_path ,nrows=50000).values
  dt_test = pd.read_csv(test_csv_path).values
  embed_train = []
  label_train = []
  for x in tqdm(dt_train,desc = "train_process"):
    label = x[0]
    sentences = x[1].strip()
    sentences = split_text_to_sentences(sentences)
    encode = []
    for st in sentences:
        sen2vec = []
        sen2vec = [word_to_vec_map[word.lower()] if word in word_to_vec_map else np.zeros((embed_size))  for word in split_sentence_to_words(st)]
        if(len(sen2vec) > max_word) :
          sen2vec = sen2vec[0:max_word]
        else :
          pad = [np.zeros((embed_size)) for _ in range(max_word - len(sen2vec) )]
          sen2vec.extend(pad)
        sen2vec = np.stack(sen2vec)
        encode.append(sen2vec)
    if(len(encode) > max_len_sen) :
      encode = encode[0:max_len_sen]
    else :
      pad = [np.zeros((max_word ,embed_size )) for _ in range(max_len_sen - len(encode))]
      encode.extend(pad)
    try:
      encode = np.stack(encode)
    except:
      continue
    embed_train.append(encode)
    label_train.append(label)



  embed_test = []
  label_test = []
  for x in tqdm(dt_test , desc = " test_process"):
    label = x[0]
    sentences = x[1].strip()
    sentences = split_text_to_sentences(sentences)
    encode = []
    for st in sentences:
        sen2vec = []
        sen2vec = [word_to_vec_map[word.lower()] if word in word_to_vec_map else np.zeros((embed_size))  for word in split_sentence_to_words(st)]
        if(len(sen2vec) > max_word) :
          sen2vec = sen2vec[0:max_word]
        else :
          pad = [np.zeros((embed_size)) for _ in range(max_word - len(sen2vec))]
          sen2vec.extend(pad)
        sen2vec = np.stack(sen2vec)
        encode.append(sen2vec)
    if(len(encode) > max_len_sen) :
      encode = encode[0:max_len_sen]
    else :
      pad = [np.zeros((max_word ,embed_size )) for _ in range(max_len_sen - len(encode))]
      encode.extend(pad)
    try:
      encode = np.stack(encode)
    except:
      continue
    embed_test.append(encode)
    label_test.append(label)
  return embed_train ,label_train ,  embed_test , label_test



In [36]:
words, word_to_vec_map = load_glove("/content/glove.6B.50d.txt")

In [104]:
embed_train ,label_train ,  embed_test , label_test = get_data( word_to_vec_map , '/content/yelp_review_full_csv' )

train_process: 100%|██████████| 50000/50000 [00:16<00:00, 3029.50it/s]
 test_process: 100%|██████████| 49999/49999 [00:21<00:00, 2321.56it/s]


In [106]:
train_set = Mydataset(embed_train , label_train)
Train_loader = DataLoader(train_set , batch_size = 32 , shuffle = True)

In [None]:
num_epochs = 5
model = MyModel(50 , 128)
loss_fc = CrossEntropyLoss(reduce=True)
opt = Adam(model.parameters() , lr = 0.001 , weight_decay = 0)
for epoch in range(num_epochs):
  pbar = tqdm(Train_loader , desc="training")

  total_loss = 0
  model.train()
  for i , data in enumerate(pbar):
      opt.zero_grad()
      out = model(data['data'])
      out.cpu()
      loss = loss_fc(out.squeeze() , torch.nn.functional.one_hot(data['y']-1, 5).to(torch.float))
      total_loss += loss.item()
      loss.backward()
      opt.step()
      pbar.set_description("Epoch: {}, Loss: {:4f}".format(epoch + 1, total_loss/(i+1)))
