In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import get_linear_schedule_with_warmup, AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import time, datetime, random, optuna, re, string
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
import re
from optuna.pruners import SuccessiveHalvingPruner
from optuna.samplers import TPESampler
from torch.cuda.amp import GradScaler
from torch.cuda.amp import autocast
from sklearn.model_selection import train_test_split
from collections import Counter
from transformers import BertModel, BertTokenizer, AutoModel, AutoTokenizer
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from string import ascii_lowercase
import warnings
warnings.filterwarnings('ignore')

SEED = 15
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x7d3cf260f170>

In [None]:
device = torch.device("cuda")

In [None]:
def clean(data):
    data = data.lower() 
    with open('/kaggle/input/datacomments/teencode.txt','r') as file:
      file = file.read()
      lines = file.split('\n')
      for line in lines:
        elements = line.split('\t')
        data = re.sub(r'\b{}+\b'.format(elements[0]), elements[1], data)
    alphabet = 'abcdefghijlmnopqrstuvwxyz'
    for c in alphabet:
      data = re.sub(r'{}+'.format(c), c, data)
    data = re.sub(r'\s+', ' ', data)
    return data

In [None]:
df_train = pd.read_excel('../input/datacomments/train.xlsx')
df_test =  pd.read_excel('../input/datacomments/test.xlsx')
df_valid = pd.read_excel('../input/datacomments/valid.xlsx')


train_texts = df_train['Sentence'].apply(clean)
test_texts = df_test['Sentence'].apply(clean)
valid_texts = df_valid['Sentence'].apply(clean)

y= LabelEncoder()

train_labels = y.fit_transform(df_train['Emotion'])
valid_labels = y.fit_transform(df_valid['Emotion'])
test_labels = y.fit_transform(df_test['Emotion'])

In [None]:
model_name =   'vinai/phobert-base'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, output_hidden_states=True).cuda()

Downloading (…)lve/main/config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
max_length = 256
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=max_length, return_tensors='pt' )
valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True, max_length=max_length, return_tensors='pt')
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=max_length, return_tensors='pt')


# https://huggingface.co/transformers/v3.4.0/custom_datasets.html
class NewsGroupsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset

train_dataset = NewsGroupsDataset(train_encodings, train_labels)
valid_dataset = NewsGroupsDataset(valid_encodings, valid_labels)
test_dataset = NewsGroupsDataset(test_encodings, test_labels)

train_dataloader = DataLoader(train_dataset,
                              batch_size=8,
                              shuffle=False)

valid_dataloader = DataLoader(valid_dataset,
                              batch_size=8,
                              shuffle=False)

test_dataloader = DataLoader(test_dataset,
                              batch_size=8,
                              shuffle=False)

In [None]:
class RNN(nn.Module):
    
    def __init__(self, num_classes):
        super().__init__()
        num_classes = num_classes
        hidden_size = 128
        dropout = 0.4
        embedding_dim = 768
        num_layers = 1  # Increase the number of layers to 3
        self.rnn = nn.GRU(embedding_dim, hidden_size, num_layers, batch_first=True, dropout =dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(hidden_size, num_classes)

    def forward(self, x, **kwargs):
        mean_x = torch.mean(x, dim=1, keepdim=True)
        batch_size, num_models, seq_len, hidden_size = mean_x.shape
        x = mean_x.reshape(batch_size*num_models, seq_len, hidden_size)
#         x = x.transpose(1,0)
        
        x, _ = self.rnn(x)
        
        x = F.relu(x)
        x = F.max_pool1d(x.transpose(1, 2), x.size(1)).squeeze(2)
        
        x = self.dropout(x)
        logit = self.fc1(x)
        return logit

In [None]:
def train(model, dataloader, optimizer):

    # reset total loss for epoch
    train_total_loss = 0

    # put both models into traning mode
    model.train()
    RNN_model.train()
    # for each batch of training data...
    for step, batch in enumerate(dataloader):

        b_input_ids = batch['input_ids'].cuda()
        b_input_mask = batch['attention_mask'].cuda()
        b_labels = batch['labels'].cuda().long()
        # clear previously calculated gradients
        optimizer.zero_grad()

        # runs the forward pass with autocasting.
        with autocast():
            # forward propagation (evaluate model on training batch)
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
            hidden_layers = outputs['hidden_states']  # get hidden layers

            hidden_layers = torch.stack(hidden_layers, dim=1)  # stack the layers



            hidden_layers = hidden_layers[:, :]

        logits = RNN_model(hidden_layers)
        loss = criterion(logits, b_labels.view(-1))

        train_total_loss += loss.item()

        scaler.scale(loss).backward()

        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()

        # Update the scheduler
        scheduler.step()

        # calculate preds
        _, predicted = torch.max(logits, 1)

        # move logits and labels to CPU
        predicted = predicted.detach().cpu().numpy()
        y_true = b_labels.detach().cpu().numpy().ravel()

        # calculate f1


    # calculate the average loss over all of the batches
    avg_train_loss = train_total_loss / len(dataloader)


    # Record all statistics from this epoch.
    training_stats.append(
        {
            'Train Loss': avg_train_loss,
        }
    )
    #torch.cuda.empty_cache()

    return None


def validating(model, dataloader):
    # put both models in evaluation mode
    model.eval()
    RNN_model.eval()
    total_valid_loss = 0
    total_accuracy=0
    # evaluate data for one epoch
    for batch in dataloader:

        b_input_ids = batch['input_ids'].cuda()
        b_input_mask = batch['attention_mask'].cuda()
        b_labels = batch['labels'].cuda().long()

        # tell pytorch not to bother calculating gradients
        with torch.no_grad():
            # forward propagation (evaluate model on training batch)
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)

            hidden_layers = outputs['hidden_states']  # get hidden layers

            hidden_layers = torch.stack(hidden_layers, dim=1)  # stack the layers
            hidden_layers = hidden_layers[:, :]

        logits = RNN_model(hidden_layers)
        loss = criterion(logits, b_labels.view(-1))
        
        _, predicted = torch.max(logits, 1)
      # move logits and labels to CPU
        predicted = predicted.detach().cpu().numpy()
        y_true = b_labels.detach().cpu().numpy().ravel()

        # accumulate validation loss
        total_valid_loss += loss.item()
        total_accuracy += accuracy_score(predicted, y_true)

    avg_val_loss = total_valid_loss / len(dataloader)
    avg_accuracy = total_accuracy / len(dataloader)
    # Record all statistics from this epoch.
    valid_stats.append(
        {
            'Val Loss': avg_val_loss,
            'Accuracy': avg_accuracy,
            
        }
    )
    return None


def testing(model, RNN_model,dataloader):
    # put both models in evaluation mode
    y_pred = []

    # evaluate data for one epoch
    for batch in dataloader:

        b_input_ids = batch['input_ids'].cuda()
        b_input_mask = batch['attention_mask'].cuda()
        b_labels = batch['labels'].cuda().long()

        # tell pytorch not to bother calculating gradients
        with torch.no_grad():
            # forward propagation (evaluate model on training batch)
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
            hidden_layers = outputs['hidden_states']  # get hidden layers

            hidden_layers = torch.stack(hidden_layers, dim=1)  # stack the layers

            hidden_layers = hidden_layers[:,:]

        logits = RNN_model(hidden_layers)

        # calculate preds
        _, predicted = torch.max(logits, 1)

        # move logits and labels to CPU
        predicted = predicted.detach().cpu().numpy()
        y_true = b_labels.detach().cpu().numpy().ravel()
        
        for i in predicted:
            y_pred.append(i)


    return y_pred

In [None]:
# instantiate CNN
RNN_model = RNN(num_classes=7).cuda()

# set loss
criterion = nn.CrossEntropyLoss()

# set number of epochs
epochs = 5

# optimizer = AdamW(model.parameters(), lr=5e-5)  #weight_decay 
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.1) 

# set LR scheduler
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# create gradient scaler for mixed precision
scaler = GradScaler()

In [None]:
training_stats = []
valid_stats = []
best_valid_loss = float('inf')
best_valid_acc = 0
timeline = []

# for each epoch
print('Training...')
print("epoch | train loss | val loss | accuracy")
for epoch in range(epochs):
    time_start = time.time()
    # train
    train(model, train_dataloader, optimizer)
    # validate
    validating(model, valid_dataloader)
    print(f"{epoch+1:5d} | {training_stats[epoch]['Train Loss']:<10.5f} | {valid_stats[epoch]['Val Loss']:<8.5f} | {valid_stats[epoch]['Accuracy']:<5.3f}")
    if valid_stats[epoch]['Accuracy'] > best_valid_acc:
        best_valid_acc = valid_stats[epoch]['Accuracy']
        torch.save(RNN_model.state_dict(), 'rnn-model.pt')  # torch save
        torch.save(model.state_dict(), 'bert-rnn-model.pt')
        model.save_pretrained('./model_save/bert-rnn/') 
    time_end = time.time()
    timeline.append(time_end - time_start)

Training...
epoch | train loss | val loss | accuracy
    1 | 1.57742    | 1.35280  | 0.540
    2 | 1.21103    | 1.29414  | 0.561
    3 | 0.98453    | 1.26258  | 0.587
    4 | 0.80168    | 1.21835  | 0.599
    5 | 0.70111    | 1.24444  | 0.582


In [None]:
model.load_state_dict(torch.load('bert-rnn-model.pt'))
RNN_model.load_state_dict(torch.load('rnn-model.pt'))
y_pred = testing(model, RNN_model,test_dataloader)
print(classification_report(test_dataset[:]['labels'].ravel(),y_pred,digits=3)) #3 # 0.558 

              precision    recall  f1-score   support

           0      0.462     0.450     0.456        40
           1      0.589     0.674     0.629       132
           2      0.690     0.705     0.697       193
           3      0.692     0.587     0.635        46
           4      0.578     0.574     0.576       129
           5      0.725     0.638     0.679       116
           6      0.649     0.649     0.649        37

    accuracy                          0.638       693
   macro avg      0.627     0.611     0.617       693
weighted avg      0.641     0.638     0.638       693



In [None]:
timeline

[131.21999382972717,
 130.83610653877258,
 130.3486773967743,
 130.4610631465912,
 128.2597906589508]