**Based on the training notebook:** https://www.kaggle.com/code/raj26000/pytorch-feedback-deberta-base-training

In [1]:
import pandas as pd
import os
import gc
import time
import numpy as np
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel, AutoConfig
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, Dataset

In [2]:
CONFIG = {
    'pretrained_config_path': '../input/feedback-pretrain-deb-v3-base/',
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    'train': False
}

In [3]:
def fetch_essay_texts(df, train=True):
    if train:
        base_path = '../input/feedback-prize-effectiveness/train/'
    else:
        base_path = '../input/feedback-prize-effectiveness/test/'
        
    essay_texts = {}
    for filename in os.listdir(base_path):
        with open(base_path + filename) as f:
            text = f.readlines()
            full_text = ' '.join([x for x in text])
            essay_text = ' '.join([x for x in full_text.split()])
        essay_texts[filename[:-4]] = essay_text
    df['essay_text'] = [essay_texts[essay_id] for essay_id in df['essay_id'].values]   
    return df

In [4]:
if CONFIG['train']:
    data = pd.read_csv('../input/feedback-stratified-folds-disctype-eff/feedback_train_folds.csv')
else:
    data = pd.read_csv('../input/feedback-prize-effectiveness/test.csv')
data = fetch_essay_texts(data, train=CONFIG['train'])
tokenizer = AutoTokenizer.from_pretrained(CONFIG['pretrained_config_path'])

In [5]:
data.shape

(10, 5)

In [6]:
class MeanPoolingLayer(nn.Module):
    def __init__(self):
        super(MeanPoolingLayer, self).__init__()
    
    def forward(self, last_hidden_state, attention_mask):
        expanded_mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        mask_sum = expanded_mask.sum(1)
        mask_sum = torch.clamp(mask_sum, min=1e-9)
        masked_hidden_state = torch.sum(last_hidden_state * expanded_mask, 1)
        return masked_hidden_state / mask_sum

In [7]:
class DiscourseEffectivenessModel(nn.Module):
    def __init__(self, num_classes=3, config_path=None):
        super(DiscourseEffectivenessModel, self).__init__()
        if config_path is None:
            self.config = AutoConfig.from_pretrained('../input/debertav3base', output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        self.num_classes = num_classes
        self.pretrained_layer = AutoModel.from_pretrained('../input/debertav3base')
        self.pooler = MeanPoolingLayer()
        self.bilstm = nn.LSTM(self.config.hidden_size, (self.config.hidden_size) // 2, num_layers=2, 
                              dropout=self.config.hidden_dropout_prob, batch_first=True,
                              bidirectional=True)
        self.dropout = nn.Dropout(p=0.3)
        self.fc = nn.Linear(3*self.pretrained_layer.config.hidden_size, num_classes)
    
    def forward(self, discourse_input_ids, discourse_attention_mask, essay_input_ids, essay_attention_mask):
        discourse_out = self.pretrained_layer(input_ids=discourse_input_ids, attention_mask=discourse_attention_mask)
        discourse_emb = self.pooler(discourse_out.last_hidden_state, discourse_attention_mask)
        essay_out = self.pretrained_layer(input_ids=essay_input_ids, attention_mask=essay_attention_mask)
        essay_emb = self.pooler(essay_out.last_hidden_state, essay_attention_mask)
        concat_emb = torch.cat([discourse_emb, essay_emb, torch.abs(essay_emb - discourse_emb)], dim=-1)
        x = self.dropout(concat_emb)
        x = self.fc(x)
        return x

In [8]:
@torch.no_grad()
def inference(essay_text, discourse_type, discourse_text, model):
    model.eval()
    input_discourse = discourse_type + ' ' + tokenizer.sep_token + ' ' + discourse_text
    tokenized_discourse = tokenizer.encode_plus(
        input_discourse,
        return_token_type_ids=False,
        return_attention_mask=True,
        max_length=512,
        truncation=True,
        padding=True,
        add_special_tokens=True,
        return_tensors='pt'
    )
    tokenized_essay = tokenizer.encode_plus(
                            essay_text,
                            return_token_type_ids=False,
                            return_attention_mask=True,
                            max_length=512,
                            truncation=True,
                            padding=True,
                            add_special_tokens=True,
                            return_tensors='pt',
                        )
    discourse_input_ids = tokenized_discourse['input_ids'].to(CONFIG['device'], non_blocking=True)
    discourse_attention_mask = tokenized_discourse['attention_mask'].to(CONFIG['device'], non_blocking=True)
    essay_input_ids = tokenized_essay['input_ids'].to(CONFIG['device'], non_blocking=True)
    essay_attention_mask = tokenized_essay['attention_mask'].to(CONFIG['device'], non_blocking=True)
    with torch.cuda.amp.autocast():
        logits = model(discourse_input_ids, discourse_attention_mask, essay_input_ids, essay_attention_mask)
        probs = nn.Softmax(dim=1)(logits)
    return probs

In [9]:
model_paths = ['../input/feedback-baseline-7/deb_base_512_mlm_fold0_best.pt',
               '../input/feedback-baseline-7/deb_base_512_mlm_fold1_best.pt',
               '../input/feedback-baseline-7/deb_base_512_mlm_fold2_best.pt',
               '../input/feedback-baseline-7/deb_base_512_mlm_fold3_best.pt',
               '../input/feedback-baseline-7/deb_base_512_mlm_fold4_best.pt']

In [10]:
###### Reduce Padding Inference ######

# sort by token num

def sort_df(df):
    input_lengths = []
    tk0 = tqdm(df['discourse_text'].fillna("").values, total=len(df))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=True)['input_ids'])
        input_lengths.append(length)
    df['input_lengths'] = input_lengths
    length_sorted_idx = np.argsort([-len_ for len_ in input_lengths])
    # sort dataframe
    sort_df = df.iloc[length_sorted_idx]
    # calc max_len per batch
    sorted_input_length = sort_df['input_lengths'].values
    batch_max_length = np.zeros_like(sorted_input_length)
    bs = 32
    for i in range((len(sorted_input_length)//bs)+1):
        batch_max_length[i*bs:(i+1)*bs] = np.max(sorted_input_length[i*bs:(i+1)*bs])    
    sort_df['batch_max_length'] = batch_max_length
    return sort_df

In [11]:
ensemble_preds = np.zeros((len(data), 3), dtype=np.float16)
data = sort_df(data)
for i, path in enumerate(model_paths):
    print(f'....inference for fold {i}....')
    model = DiscourseEffectivenessModel().to(CONFIG['device'])
    model.load_state_dict(torch.load(model_paths[i]))
    preds = []
    for essay_text, discourse_type, discourse_text in zip(data['essay_text'].tolist(), data['discourse_type'].tolist(), data['discourse_text'].tolist()):
        probs = inference(essay_text, discourse_type, discourse_text, model)
        preds.append(probs.to('cpu').numpy())
    ensemble_preds += np.concatenate(preds)
    del model, preds
    gc.collect()
    torch.cuda.empty_cache()

ensemble_preds /= len(model_paths)

submission = pd.DataFrame()
submission['discourse_id'] = data['discourse_id']
submission['Adequate'] = ensemble_preds[:, 1]
submission['Effective'] = ensemble_preds[:, 0]
submission['Ineffective'] = ensemble_preds[:, 2]
submission = submission.sort_index()
submission.to_csv('submission.csv', index=False)

  0%|          | 0/10 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


....inference for fold 0....


Some weights of the model checkpoint at ../input/debertav3base were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


....inference for fold 1....


Some weights of the model checkpoint at ../input/debertav3base were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


....inference for fold 2....


Some weights of the model checkpoint at ../input/debertav3base were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


....inference for fold 3....


Some weights of the model checkpoint at ../input/debertav3base were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


....inference for fold 4....


Some weights of the model checkpoint at ../input/debertav3base were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
submission.head()

Unnamed: 0,discourse_id,Adequate,Effective,Ineffective
0,a261b6e14276,0.683594,0.300049,0.016205
1,5a88900e7dc1,0.864258,0.116089,0.019867
2,9790d835736b,0.725586,0.245483,0.029251
3,75ce6d68b67b,0.723145,0.235107,0.041748
4,93578d946723,0.655762,0.287354,0.056488
