In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW, lr_scheduler
from transformers import AutoModel, AutoTokenizer, AutoConfig
from sklearn.model_selection import train_test_split
from collections import defaultdict
import time
import os
from tqdm import tqdm
from copy import deepcopy
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

In [None]:
# Record the installed PyTorch version in the notebook output for reproducibility.
# (torch is already imported in the first cell; this re-import is a harmless no-op.)
import torch

print(torch.__version__)

In [None]:
# Central experiment configuration read by the model, training-loop and scheduler cells below.
CONFIG = {
    'batch_size': 2,  # DataLoader batch size for both the train and validation loaders
    'epochs': 5,
    'lr': 8e-6,  # learning rate handed to AdamW
    'n_accumulate': 8,  # train_loop divides each batch loss by this and updates the optimizer every n_accumulate batches
#     'pretrained_model_path': '../input/deberta-base/mlm_base/mlm_base/Deberta_large-v3-itpt-e3',
#     'pretrained_config_path': '../input/deberta-base/mlm_base/mlm_base/model_tokenizer',
#     'model_name': '../input/deberta-v3-large',
    'model_name': 'microsoft/deberta-v3-large',  # passed to AutoTokenizer / AutoConfig / AutoModel.from_pretrained
    'weight_decay': 1e-4,  # weight decay handed to AdamW
    'seed': 2022,  # default seed used by set_seed
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    # NOTE(review): patience is larger than 'epochs', so with a single fold the
    # early-stopping counter in the training cell can never reach it.
    'patience': 10,
    'num_cycles':0.5,  # forwarded to get_cosine_schedule_with_warmup
    'scheduler':'cosine', # one of ['linear', 'cosine']
    'batch_scheduler':True,  # when True, train_loop calls scheduler.step() after each optimizer update
    'num_warmup_steps':0,
    'folds':[3],  # values of the 'fold' column held out as validation data, one training run each
    'saved_model_path': "saved_model_state_deberta.pt"  # where the best model state_dict is written
}

In [None]:
# Training rows; the 'fold' column of this frame is used later to split train/validation.
data = pd.read_csv('../input/feedback-stratified-folds-disctype-eff/feedback_train_folds.csv')
# Test rows; loaded here but not referenced by any other cell visible in this notebook.
test_data = pd.read_csv('../input/feedback-prize-effectiveness/test.csv')

def fetch_essay_texts(df, train=True):
    """Attach whitespace-normalised essay text to every row of ``df``.

    Every file in the train (or test) essay directory is read, all runs of
    whitespace (including newlines) are collapsed to single spaces, and the
    result is stored in a new ``essay_text`` column looked up via ``essay_id``.

    Args:
        df: DataFrame with an ``essay_id`` column whose values equal the essay
            file names without their extension. Modified in place.
        train: Read from the train essay directory when True, otherwise from
            the test essay directory.

    Returns:
        The same ``df`` object, with the ``essay_text`` column added.

    Raises:
        KeyError: If an ``essay_id`` in ``df`` has no matching essay file.
    """
    if train:
        base_path = '../input/feedback-prize-effectiveness/train/'
    else:
        base_path = '../input/feedback-prize-effectiveness/test/'

    essay_texts = {}
    for filename in os.listdir(base_path):
        # splitext strips whatever extension is present instead of assuming
        # it is exactly four characters long.
        essay_key, _ = os.path.splitext(filename)
        # Explicit encoding so the result does not depend on the platform locale.
        with open(os.path.join(base_path, filename), encoding='utf-8') as f:
            essay_texts[essay_key] = ' '.join(f.read().split())

    # Plain dict lookup (not Series.map) so a missing essay fails loudly.
    df['essay_text'] = [essay_texts[essay_id] for essay_id in df['essay_id'].values]
    return df
    
# Add the 'essay_text' column to the training frame, then preview the first rows.
data = fetch_essay_texts(data)
data.head()

In [None]:
# Encode the effectiveness labels as integer class ids (the targets fed to the loss).
label_dict = {'Effective': 0, 'Adequate': 1, 'Ineffective': 2}

# Assign the mapped column back explicitly. Calling replace(..., inplace=True) on
# `data[col]` is chained assignment: it operates on a column selection and is not
# guaranteed to write through to `data` (it is a no-op under pandas Copy-on-Write).
encoded_labels = data['discourse_effectiveness'].map(label_dict)
if encoded_labels.isna().any():
    # Fail loudly on labels outside label_dict instead of training on bad targets.
    unknown_labels = sorted(
        set(data.loc[encoded_labels.isna(), 'discourse_effectiveness'].astype(str))
    )
    raise ValueError(f'Unexpected discourse_effectiveness values: {unknown_labels}')
data['discourse_effectiveness'] = encoded_labels.astype('int64')

# The id columns are not read by any later cell visible in this notebook.
data = data.drop(columns=['discourse_id', 'essay_id'])

In [None]:
def set_seed(seed=CONFIG['seed']):
    """Seed every RNG used by the notebook and request deterministic cuDNN kernels.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU and all CUDA devices). Defaults to ``CONFIG['seed']``.
    """
    # Local import: the notebook's import cell does not bring `random` into scope.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU, not only the current one; ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run to
    # run, which defeats the deterministic setting above; it must be off.
    torch.backends.cudnn.benchmark = False
    # Only affects subprocesses started later; the hash seed of the running
    # interpreter was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)


set_seed()

In [None]:
# class ModelEmaV2(nn.Module):
#     def __init__(self, model, decay=0.9995, device=None):
#         super(ModelEmaV2, self).__init__()
#         # make a copy of the model for accumulating moving average of weights
#         self.module = deepcopy(model)
#         self.module.eval()
#         self.decay = decay
#         self.device = device  # perform ema on different device from model if set
#         if self.device is not None:
#             self.module.to(device=device)

#     def _update(self, model, update_fn):
#         with torch.no_grad():
#             for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()):
#                 if self.device is not None:
#                     model_v = model_v.to(device=self.device)
#                 ema_v.copy_(update_fn(ema_v, model_v))

#     def update(self, model):
#         self._update(model, update_fn=lambda e, m: self.decay * e + (1. - self.decay) * m)

#     def set(self, model):
#         self._update(model, update_fn=lambda e, m: m)

In [None]:
class FeedbackDataset(Dataset):
    """Map-style dataset pairing each discourse element with its full essay.

    Every item holds the token ids and attention mask of the discourse prompt
    (``"<type> <sep> <text>"``) and of the essay, plus the target label.
    """

    # Tokenizer options shared by the discourse and the essay encodings.
    _ENCODE_KWARGS = dict(
        return_token_type_ids=False,
        return_attention_mask=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='pt',
    )

    def __init__(self, tokenizer, essay_text, discourse_type, discourse_text, effectiveness):
        """Store the tokenizer and the parallel, index-aligned sequences."""
        self.tokenizer = tokenizer
        self.essay_text = essay_text
        self.discourse_type = discourse_type
        self.discourse_text = discourse_text
        self.target = effectiveness

    def __len__(self):
        """Number of discourse elements."""
        return len(self.discourse_text)

    def _encode(self, text):
        """Tokenize ``text`` with the shared options (padded/truncated to 512)."""
        return self.tokenizer.encode_plus(text, **self._ENCODE_KWARGS)

    def __getitem__(self, idx):
        """Return the encoded discourse prompt, encoded essay and label at ``idx``."""
        prompt_parts = (
            self.discourse_type[idx],
            self.tokenizer.sep_token,
            self.discourse_text[idx],
        )
        discourse_enc = self._encode(' '.join(prompt_parts))
        essay_enc = self._encode(self.essay_text[idx])

        # return_tensors='pt' yields a leading batch dimension of 1; drop it.
        item = {}
        for prefix, encoded in (('discourse', discourse_enc), ('essay', essay_enc)):
            item[f'{prefix}_input_ids'] = encoded['input_ids'].reshape(-1)
            item[f'{prefix}_attention_mask'] = encoded['attention_mask'].reshape(-1)
        item['target'] = self.target[idx]
        return item

In [None]:
# Load the tokenizer that matches the pretrained encoder named in CONFIG['model_name'].
tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])

In [None]:
class MeanPoolingLayer(nn.Module):
    """Average token embeddings over the positions selected by the attention mask."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` along dim 1.

        Positions whose mask value is 0 do not contribute. The denominator is
        clamped at 1e-9 so an all-zero mask yields zeros rather than NaN.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_count = mask.sum(1).clamp(min=1e-9)
        summed = (last_hidden_state * mask).sum(1)
        return summed / token_count

In [None]:
class WeightedLayerPooling(nn.Module):
    """Weighted average over the hidden states of layers ``layer_start`` onwards."""

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        """Store the layer range and the per-layer weights.

        When ``layer_weights`` is None, a learnable all-ones parameter with
        ``num_hidden_layers + 1 - layer_start`` entries is created; otherwise
        the supplied object is stored unchanged.
        """
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            n_weights = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_weights, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        """Combine the selected layers of a 4-D stacked hidden-state tensor.

        The first dimension of ``all_hidden_states`` indexes layers; the result
        drops that dimension.
        """
        selected = all_hidden_states[self.layer_start:, :, :, :]
        # One weight per layer, broadcast across the remaining three dimensions.
        weights = self.layer_weights[:, None, None, None].expand(selected.size())
        weighted_sum = (weights * selected).sum(dim=0)
        return weighted_sum / self.layer_weights.sum()

In [None]:
class DiscourseEffectivenessModel(nn.Module):
    """Classifier that scores a discourse element against its essay with one shared encoder.

    Both inputs go through the same pretrained transformer and are mean-pooled
    using their attention masks. The pooled vectors are combined as
    ``[discourse, essay, |essay - discourse|]`` and passed through dropout and a
    linear layer that outputs ``num_classes`` logits.
    """

    def __init__(self, num_classes=3, config_path=None):
        """Build the encoder, pooling layer and classification head.

        Args:
            num_classes: Number of output logits.
            config_path: Optional path to a config object loadable with
                ``torch.load``; when None the config is fetched for
                ``CONFIG['model_name']``.
        """
        super(DiscourseEffectivenessModel, self).__init__()
        if config_path is None:
            self.config = AutoConfig.from_pretrained(CONFIG['model_name'], output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles the file; only use it on trusted config files.
            self.config = torch.load(config_path)
        self.num_classes = num_classes
#         self.pretrained_layer = AutoModel.from_pretrained(CONFIG['pretrained_model_path'])
        # NOTE(review): self.config is not passed to from_pretrained, so the
        # output_hidden_states=True set above presumably does not reach the
        # encoder. forward() only reads last_hidden_state. Confirm this is intended.
        self.pretrained_layer = AutoModel.from_pretrained(CONFIG['model_name'])
        
        self.pooler = MeanPoolingLayer()
#         self.pooler = WeightedLayerPooling()
        # NOTE(review): this BiLSTM is constructed (so its weights are registered
        # as model parameters) but forward() never calls it.
        self.bilstm = nn.LSTM(self.config.hidden_size, (self.config.hidden_size) // 2, num_layers=2, 
                              dropout=self.config.hidden_dropout_prob, batch_first=True,
                              bidirectional=True)
        self.dropout = nn.Dropout(p=0.3)
        # The head consumes three concatenated pooled vectors, each hidden_size wide.
        self.fc = nn.Linear(3*self.pretrained_layer.config.hidden_size, num_classes)
    
    def forward(self, discourse_input_ids, discourse_attention_mask, essay_input_ids, essay_attention_mask):
        """Return class logits for a batch of (discourse, essay) pairs.

        Inputs are token-id and attention-mask tensors for the discourse prompt
        and the essay (assumed shape (batch, seq_len); confirm against the
        dataset). The last dimension of the output has ``num_classes`` entries.
        """
        # The same encoder instance (shared weights) processes both texts.
        discourse_out = self.pretrained_layer(input_ids=discourse_input_ids, attention_mask=discourse_attention_mask)
        discourse_emb = self.pooler(discourse_out.last_hidden_state, discourse_attention_mask)
        essay_out = self.pretrained_layer(input_ids=essay_input_ids, attention_mask=essay_attention_mask)
        essay_emb = self.pooler(essay_out.last_hidden_state, essay_attention_mask)
        # Concatenate both embeddings with their element-wise absolute difference.
        concat_emb = torch.cat([discourse_emb, essay_emb, torch.abs(essay_emb - discourse_emb)], dim=-1)
        x = self.dropout(concat_emb)
        x = self.fc(x)
        return x
        

In [None]:
# Instantiate the model on the configured device together with its optimizer and loss.
model = DiscourseEffectivenessModel().to(CONFIG['device'])
# model_ema = ModelEmaV2(model)
# AdamW receives every model parameter in one group, so the same learning rate
# and weight decay apply to all of them.
optimizer = AdamW(model.parameters(), lr=CONFIG['lr'], weight_decay=CONFIG['weight_decay'])
loss = nn.CrossEntropyLoss().to(CONFIG['device'])
        

In [None]:
def train_loop(model, dataloader, loss, optimizer, scheduler, model_ema=None):
    """Run one training epoch with mixed precision and gradient accumulation.

    Gradients are accumulated over ``CONFIG['n_accumulate']`` batches before a
    single optimizer update. Batches left over at the end of the epoch are
    flushed with one final update, so no gradient leaks into the next epoch.

    Args:
        model: Module called as ``model(discourse_input_ids,
            discourse_attention_mask, essay_input_ids, essay_attention_mask)``
            that returns class logits.
        dataloader: Iterable of dict batches holding those four keys plus
            ``'target'``.
        loss: Callable ``loss(logits, targets)`` returning a scalar tensor.
        optimizer: Optimizer updated once per accumulation window.
        scheduler: LR scheduler, stepped after each optimizer update when
            ``CONFIG['batch_scheduler']`` is true.
        model_ema: Unused; kept so existing callers keep working.

    Returns:
        Mean of the per-batch loss values, on the same scale as the value
        returned by ``validation_loop`` (not divided by ``n_accumulate``).
    """
    model.train()
    device = CONFIG['device']
    n_accumulate = CONFIG['n_accumulate']
    # AMP is CUDA-only here; on CPU both the scaler and autocast are disabled.
    use_amp = torch.device(device).type == 'cuda'
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    batch_losses = []

    def _apply_update():
        # All optimizer updates go through the scaler so gradients are unscaled
        # first and steps containing inf/NaN gradients are skipped.
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)
        if CONFIG['batch_scheduler']:
            scheduler.step()

    # Start from clean gradients regardless of what ran before this epoch.
    optimizer.zero_grad(set_to_none=True)
    pending_batches = 0
    for batch in tqdm(dataloader):
        discourse_input_ids = batch['discourse_input_ids'].to(device, non_blocking=True)
        discourse_attention_mask = batch['discourse_attention_mask'].to(device, non_blocking=True)
        essay_input_ids = batch['essay_input_ids'].to(device, non_blocking=True)
        essay_attention_mask = batch['essay_attention_mask'].to(device, non_blocking=True)
        targets = batch['target'].to(device, non_blocking=True)

        with torch.cuda.amp.autocast(enabled=use_amp):
            logits = model(discourse_input_ids, discourse_attention_mask, essay_input_ids, essay_attention_mask)
            batch_loss = loss(logits, targets)

        # Log the true batch loss; only the back-propagated copy is divided.
        batch_losses.append(batch_loss.item())
        scaler.scale(batch_loss / n_accumulate).backward()
        pending_batches += 1

        if pending_batches == n_accumulate:
            _apply_update()
            pending_batches = 0

    # Flush an incomplete final window. Its gradient is still divided by
    # n_accumulate, so this last update is proportionally smaller.
    if pending_batches:
        _apply_update()

    return np.mean(batch_losses)
        

In [None]:
def validation_loop(model, loss, dataloader):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Module called as ``model(discourse_input_ids,
            discourse_attention_mask, essay_input_ids, essay_attention_mask)``
            that returns class logits. It is switched to eval mode and left
            there.
        loss: Callable ``loss(logits, targets)`` returning a scalar tensor.
        dataloader: Iterable of dict batches holding those four keys plus
            ``'target'``.

    Returns:
        Mean of the per-batch loss values.
    """
    model.eval()
    device = CONFIG['device']
    batch_losses = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            discourse_input_ids = batch['discourse_input_ids'].to(device, non_blocking=True)
            discourse_attention_mask = batch['discourse_attention_mask'].to(device, non_blocking=True)
            essay_input_ids = batch['essay_input_ids'].to(device, non_blocking=True)
            essay_attention_mask = batch['essay_attention_mask'].to(device, non_blocking=True)
            targets = batch['target'].to(device, non_blocking=True)
            # Evaluation runs in full precision (no autocast).
            logits = model(discourse_input_ids, discourse_attention_mask, essay_input_ids, essay_attention_mask)
            batch_losses.append(loss(logits, targets).item())

    return np.mean(batch_losses)

In [None]:
def get_scheduler(cfg, optimizer, num_train_steps):
    """Create the warmup LR scheduler selected by ``cfg['scheduler']``.

    Args:
        cfg: Mapping with ``'scheduler'`` (``'linear'`` or ``'cosine'``),
            ``'num_warmup_steps'`` and, for cosine, ``'num_cycles'``.
        optimizer: Optimizer whose learning rate is scheduled.
        num_train_steps: Total number of ``scheduler.step()`` calls expected
            over the whole run.

    Returns:
        The scheduler built by the matching transformers helper.

    Raises:
        ValueError: If ``cfg['scheduler']`` is not a supported name.
    """
    if cfg['scheduler'] == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg['num_warmup_steps'], num_training_steps=num_train_steps
        )
    if cfg['scheduler'] == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg['num_warmup_steps'], num_training_steps=num_train_steps, num_cycles=cfg['num_cycles']
        )
    raise ValueError(f"Unsupported scheduler {cfg['scheduler']!r}; expected 'linear' or 'cosine'.")


history = defaultdict(list)
best_loss = np.inf
prev_loss = np.inf
earlystop_trigger = 0
epochs = CONFIG['epochs']
# NOTE(review): model, optimizer, the loss trackers above and the checkpoint
# path are shared by every fold, so only a single-fold CONFIG['folds'] gives a
# clean run; multiple folds would keep training the same weights.
for fold in CONFIG['folds']:
    train_data = data[data['fold'] != fold].reset_index(drop=True)
    val_data = data[data['fold'] == fold].reset_index(drop=True)
    train_dataset = FeedbackDataset(tokenizer,
                                    train_data['essay_text'].values,
                                    train_data['discourse_type'].values,
                                    train_data['discourse_text'].values,
                                    train_data['discourse_effectiveness'].values)
    val_dataset = FeedbackDataset(tokenizer,
                                  val_data['essay_text'].values,
                                  val_data['discourse_type'].values,
                                  val_data['discourse_text'].values,
                                  val_data['discourse_effectiveness'].values)
    train_dataloader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True, pin_memory=True)
    val_dataloader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False, pin_memory=True)

    # The scheduler is stepped once per optimizer update, i.e. once every
    # n_accumulate batches, so the schedule length must be counted in updates.
    # Counting batches instead would leave the LR decay only partly traversed.
    updates_per_epoch = -(-len(train_dataloader) // CONFIG['n_accumulate'])  # ceil division
    num_train_steps = updates_per_epoch * epochs
    scheduler = get_scheduler(CONFIG, optimizer, num_train_steps)

    print('Training Start ......')
    for epoch in range(epochs):
        print(f'Epoch {epoch+1} of {epochs}')
        train_loss = train_loop(model, train_dataloader, loss, optimizer, scheduler)
        print(f'Training Loss: {train_loss}')
        val_loss = validation_loop(model, loss, val_dataloader)
        print(f'Val Loss: {val_loss}')
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)

        # Early stopping counts consecutive epochs whose validation loss is
        # worse than the previous epoch's.
        if val_loss <= prev_loss:
            earlystop_trigger = 0
        else:
            earlystop_trigger += 1
            if earlystop_trigger >= CONFIG['patience']:
                print(f'Early Stopping Triggered after {epoch+1} epochs. Aborting Training...')
                break
        # Checkpoint only when the validation loss beats the best seen so far.
        if val_loss < best_loss:
            print(f'New best val loss, saving model checkpoint at epoch {epoch+1}.')
            torch.save(model.state_dict(), CONFIG['saved_model_path'])
            best_loss = val_loss
        prev_loss = val_loss

In [None]:
# Plot the per-epoch training and validation loss collected in `history`.
fig, ax = plt.subplots()
for history_key, curve_label in (('train_loss', 'Train Loss'), ('val_loss', 'Val Loss')):
    ax.plot(history[history_key], label=curve_label)
ax.legend()
ax.grid()
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Loss Diagnostic Curves')
plt.show()