In [None]:
import warnings
warnings.filterwarnings("ignore")

### Yev Submission

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import codecs
from text_unidecode import unidecode
from typing import Tuple

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# Replace the preinstalled transformers/tokenizers packages with the wheels
# shipped in the `fb3-pip-wheels` input dataset (`--no-index`: nothing is
# downloaded). This happens before the imports below so that they pick up the
# reinstalled packages.
# NOTE(review): os.system return codes are ignored, so a failed install is
# only visible through the version printout below.
os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels tokenizers')
import tokenizers
import transformers
# Show which versions were actually imported.
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
# Disable the tokenizers library's own parallelism through its env variable.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Args:
        y_trues: 2-D array of ground-truth values, one column per target.
        y_preds: 2-D array of predictions, same shape as ``y_trues``.

    Returns:
        ``(mcrmse_score, scores)``: the mean of the per-column RMSE values and
        the list of per-column RMSE values (in column order).
    """
    scores = []
    for i in range(y_trues.shape[1]):
        # RMSE is computed with NumPy directly: the `squared=False` argument of
        # sklearn's mean_squared_error was deprecated in scikit-learn 1.4 and
        # removed in 1.6, so relying on it breaks on current releases.
        diff = np.asarray(y_trues[:, i], dtype=float) - np.asarray(y_preds[:, i], dtype=float)
        scores.append(np.sqrt(np.mean(diff ** 2)))
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Return ``(mcrmse_score, per_column_scores)`` as computed by ``MCRMSE``."""
    overall, per_target = MCRMSE(y_trues, y_preds)
    return overall, per_target


def get_logger(filename='inference'):
    """Return this module's logger, writing to the console and ``<filename>.log``.

    The function is safe to call more than once (for example when a cell is
    re-run): handlers installed by an earlier call are removed and closed
    first, so each message is emitted once per destination instead of once
    per call.

    Args:
        filename: log file path without the ``.log`` suffix.

    Returns:
        The configured ``logging.Logger`` (level INFO, message-only format).
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object for the same name, so handlers from a
    # previous call would otherwise accumulate and duplicate every line.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: encode the failing slice as UTF-8 and resume after it."""
    begin, stop = error.start, error.end
    failing_slice = error.object[begin:stop]
    return failing_slice.encode("utf-8"), stop


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the failing bytes as cp1252 and resume after them."""
    begin, stop = error.start, error.end
    failing_bytes = error.object[begin:stop]
    return failing_bytes.decode("cp1252"), stop


# Make both handlers available by name to the `errors=` argument of
# str.encode / bytes.decode (used by resolve_encodings_and_normalize).
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)


def resolve_encodings_and_normalize(text: str) -> str:
    """Run the encode/decode round-trip with the custom error handlers, then ``unidecode``.

    The chain is: raw_unicode_escape bytes -> UTF-8 text (cp1252 fallback)
    -> cp1252 bytes (UTF-8 fallback) -> UTF-8 text (cp1252 fallback).
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    as_text = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = as_text.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)


def replace_newline(text):
    """Replace every newline character with the literal marker ``[BR]``."""
    return '[BR]'.join(text.split('\n'))

# Anonymisation placeholders found in the essays (including misspelled
# variants) mapped to the bracketed token that replaces them. Replacements are
# applied in this insertion order.
placeholders_replacements = dict([
    ('Generic_School', '[GENERIC_SCHOOL]'),
    ('Generic_school', '[GENERIC_SCHOOL]'),
    ('SCHOOL_NAME', '[SCHOOL_NAME]'),
    ('STUDENT_NAME', '[STUDENT_NAME]'),
    ('Generic_Name', '[GENERIC_NAME]'),
    ('Genric_Name', '[GENERIC_NAME]'),
    ('Generic_City', '[GENERIC_CITY]'),
    ('LOCATION_NAME', '[LOCATION_NAME]'),
    ('HOTEL_NAME', '[HOTEL_NAME]'),
    ('LANGUAGE_NAME', '[LANGUAGE_NAME]'),
    ('PROPER_NAME', '[PROPER_NAME]'),
    ('OTHER_NAME', '[OTHER_NAME]'),
    ('PROEPR_NAME', '[PROPER_NAME]'),
    ('RESTAURANT_NAME', '[RESTAURANT_NAME]'),
    ('STORE_NAME', '[STORE_NAME]'),
    ('TEACHER_NAME', '[TEACHER_NAME]'),
])


def replace_placeholders(text):
    """Substitute every known placeholder in ``text`` with its bracketed token."""
    for placeholder in placeholders_replacements:
        text = text.replace(placeholder, placeholders_replacements[placeholder])
    return text


def preprocess_text(text):
    """Normalize encodings, then mark newlines, then bracket placeholders."""
    normalized = resolve_encodings_and_normalize(text)
    return replace_placeholders(replace_newline(normalized))


def get_max_len_from_df(df, tokenizer, n_special_tokens=3):
    """Return the longest tokenized ``full_text`` length plus ``n_special_tokens``.

    Missing texts are treated as empty strings; texts are tokenized without
    special tokens.
    """
    texts = df['full_text'].fillna("").values
    token_counts = [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(texts, total=len(df))
    ]
    return max(token_counts) + n_special_tokens


class TestDataset(Dataset):
    """Dataset over the ``full_text`` column that tokenizes one essay per item.

    Tokenization uses the module-level ``CFG`` (``CFG.tokenizer`` and
    ``CFG.max_len``) at access time, so ``CFG`` must be bound before items
    are read.
    """
    def __init__(self, df):
        self.texts = df['full_text'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenizer output for row ``item`` as ``torch.long`` tensors."""
        text = self.texts[item]
        inputs = CFG.tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=CFG.max_len,
            # `padding='max_length'` is the documented replacement for the
            # deprecated `pad_to_max_length=True`; the resulting padding is the same.
            padding='max_length',
            truncation=True,
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)

        return inputs
    

def collate(inputs):
    """Trim every tensor in the batch to the longest attention-mask sum.

    The mapping is modified in place and also returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs


class MeanPooling(nn.Module):
    """Attention-mask-weighted mean of the token embeddings along dim 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked tokens contribute 0.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * mask).sum(1)
        # The clamp keeps the division finite when a row has no unmasked tokens.
        token_count = mask.sum(1).clamp(min=1e-9)
        return token_sum / token_count
    
    
class WeightedLayerPooling(nn.Module):
    """Weighted average of hidden states from ``layer_start`` onward, read at position 0.

    When ``layer_weights`` is not supplied, a learnable all-ones weight vector
    of length ``num_hidden_layers + 1 - layer_start`` is created.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights=None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            n_weights = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_weights, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        # all_hidden_states is indexed with four axes; the first one selects the layer.
        selected = all_hidden_states[self.layer_start:, :, :, :]
        weights = self.layer_weights[:, None, None, None].expand(selected.size())
        averaged = (weights * selected).sum(dim=0) / self.layer_weights.sum()
        return averaged[:, 0]
    
    
class LSTMPooling(nn.Module):
    """Run an LSTM (or bidirectional GRU) over the position-0 vectors of layers 1..num_layers.

    The recurrent module is stored as ``self.lstm`` in both modes. The output
    is the last step of the recurrent output, passed through dropout.
    """

    def __init__(self, num_layers, hidden_size, hiddendim_lstm, dropout_rate, is_lstm=True):
        super().__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_lstm = hiddendim_lstm

        if not is_lstm:
            self.lstm = nn.GRU(self.hidden_size, self.hiddendim_lstm, batch_first=True, bidirectional=True)
        else:
            self.lstm = nn.LSTM(self.hidden_size, self.hiddendim_lstm, batch_first=True)

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, all_hidden_states):
        first_token_per_layer = []
        for layer_i in range(1, self.num_hidden_layers + 1):
            first_token_per_layer.append(all_hidden_states[layer_i][:, 0].squeeze())
        # NOTE(review): the vectors are stacked on the last axis and then
        # reshaped with `view` (no transpose); kept exactly as is because
        # trained weights presumably depend on this layout.
        stacked = torch.stack(first_token_per_layer, dim=-1)
        sequence = stacked.view(-1, self.num_hidden_layers, self.hidden_size)
        rnn_out, _ = self.lstm(sequence, None)
        return self.dropout(rnn_out[:, -1, :])
    
class ConcatPooling(nn.Module):
    """Concatenate the last ``n_layers`` hidden states on the feature axis, read at position 0."""

    def __init__(self, n_layers=4):
        super().__init__()
        self.n_layers = n_layers

    def forward(self, all_hidden_states):
        # Last layer first: indices -1, -2, ..., -n_layers.
        last_layers = [all_hidden_states[-depth] for depth in range(1, self.n_layers + 1)]
        return torch.cat(last_layers, -1)[:, 0]


class CustomModel(nn.Module):
    """Transformer backbone followed by a pooling module and a 6-output linear head.

    The pooling module is chosen by ``cfg.pooling_type`` and the head's input
    width is sized to match it.

    NOTE(review): 'GRUPooling' is handled when sizing ``fc`` and in
    ``feature`` but has no branch in the pooling selection below, so
    constructing the model with that value raises
    ``ValueError('Unknown pooling type')``.
    """
    def __init__(self, cfg, config_path=None, pretrained=False):
        """Build backbone, pooling module and head.

        Args:
            cfg: settings object; ``model`` and ``pooling_type`` are read, plus
                ``n_layers`` (ConcatPooling) or ``hidden_size`` (LSTMPooling).
            config_path: file holding a backbone config object, loaded with
                ``torch.load``. When None the config comes from
                ``AutoConfig.from_pretrained(cfg.model)`` with dropout zeroed.
            pretrained: when True the backbone is created with
                ``AutoModel.from_pretrained``; otherwise with
                ``AutoModel.from_config``.
        """
        super().__init__()
        self.cfg = cfg
        
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Both spellings of each dropout attribute are set to 0.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
        else:
            # torch.load unpickles the file: only use config files from a trusted source.
            self.config = torch.load(config_path)
            
        if pretrained:
            self.backbone = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.backbone = AutoModel.from_config(self.config)
        
        # Select the pooling module.
        if cfg.pooling_type == 'MeanPooling':
            self.pool = MeanPooling()
        elif cfg.pooling_type == 'WeightedLayerPooling':
            self.pool = WeightedLayerPooling(self.config.num_hidden_layers)
        elif cfg.pooling_type == 'ConcatPooling':
            self.pool = ConcatPooling(n_layers=cfg.n_layers)
            
        elif cfg.pooling_type == 'LSTMPooling':
            # Arguments: number of layers, backbone width, LSTM width, dropout rate.
            self.pool =  LSTMPooling(self.config.num_hidden_layers,
                                       self.config.hidden_size,
                                       self.cfg.hidden_size,
                                       0.1,
                                       is_lstm=True
                           )
        else:
            raise ValueError('Unknown pooling type')
        
        
        # Size the head's input to the pooled feature width.
        if cfg.pooling_type == 'GRUPooling':
            self.fc = nn.Linear(self.cfg.hidden_size, 6)
        elif cfg.pooling_type == 'LSTMPooling':
            self.fc = nn.Linear(self.cfg.hidden_size, 6)
        elif cfg.pooling_type == 'ConcatPooling':
            self.fc = nn.Linear(cfg.n_layers*self.config.hidden_size, 6)
        else:
            self.fc = nn.Linear(self.config.hidden_size, 6)

        
    def _init_weights(self, module):
        """Initialise one Linear, Embedding or LayerNorm module.

        Linear/Embedding weights are drawn from a normal distribution with std
        ``config.initializer_range``; biases and the padding row are zeroed;
        LayerNorm gets weight 1 and bias 0. Not called anywhere in this class.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        """Run the backbone on ``inputs`` (passed as keyword arguments) and pool the result."""
        outputs = self.backbone(**inputs)
        
        last_hidden_states = outputs[0]
        
        if self.cfg.pooling_type == 'MeanPooling':
            feature = self.pool(last_hidden_states, inputs['attention_mask'])
        elif self.cfg.pooling_type == 'WeightedLayerPooling':
            # outputs[1] is stacked into one tensor with a leading layer axis
            # (presumably the per-layer hidden states — the config enables them).
            all_hidden_states = torch.stack(outputs[1])
            feature = self.pool(all_hidden_states)
        elif self.cfg.pooling_type == 'ConcatPooling':
            all_hidden_states = torch.stack(outputs[1])
            feature = self.pool(all_hidden_states)
        elif self.cfg.pooling_type in ['GRUPooling', 'LSTMPooling']:
            all_hidden_states = torch.stack(outputs[1])
            feature = self.pool(all_hidden_states)
        else:
            raise ValueError('Unknown pooling type')
        
        return feature

    def forward(self, inputs):
        """Return the head's 6 outputs for the pooled features of ``inputs``."""
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output
    
    
def inference_fn(test_loader, model, device):
    """Run ``model`` in eval mode over ``test_loader`` and return stacked predictions.

    Each batch is trimmed with ``collate`` and moved to ``device``; the
    per-batch outputs are concatenated into one NumPy array.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        batch = collate(batch)
        for name, tensor in batch.items():
            batch[name] = tensor.to(device)
        with torch.no_grad():
            batch_pred = model(batch)
        batch_outputs.append(batch_pred.to('cpu').numpy())
    return np.concatenate(batch_outputs)



class CFG_model23:
    """Inference settings for the ``model23`` checkpoints (mean pooling)."""
    # --- checkpoint location ---
    model_name = 'model23'
    path = "../input/model23/"
    config_path = path + 'config.pth'
    # --- architecture ---
    model = "microsoft-deberta-v3-large"
    pooling_type = 'MeanPooling'
    gradient_checkpointing = False
    # --- data loading ---
    batch_size = 12
    num_workers = 4
    max_len = 1462
    set_from_df = True  # the inference loop recomputes max_len from the test set
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    # --- folds ---
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]

    
    
class CFG_model52:
    """Inference settings for the ``model52`` checkpoints (mean pooling)."""
    # --- checkpoint location ---
    model_name = 'model52'
    path = "../input/model52/"
    config_path = path + 'config.pth'
    # --- architecture ---
    model = "microsoft-deberta-v3-large"
    pooling_type = 'MeanPooling'
    gradient_checkpointing = False
    # --- data loading ---
    batch_size = 12
    num_workers = 4
    max_len = 1462
    set_from_df = True  # the inference loop recomputes max_len from the test set
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    # --- folds ---
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    
    
class CFG_model68:
    """Inference settings for the ``model68`` checkpoints (weighted layer pooling)."""
    # --- checkpoint location ---
    model_name = 'model68'
    path = "../input/model68/"
    config_path = path + 'config.pth'
    # --- architecture ---
    model = "microsoft-deberta-v3-large"
    pooling_type = 'WeightedLayerPooling'
    gradient_checkpointing = False
    # --- data loading ---
    batch_size = 8
    num_workers = 4
    max_len = 1462
    set_from_df = True  # the inference loop recomputes max_len from the test set
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    # --- folds ---
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    
    
class CFG_model70:
    """Inference settings for the ``model70`` checkpoints (LSTM pooling)."""
    # --- checkpoint location ---
    model_name = 'model70'
    path = "../input/model70/"
    config_path = path + 'config.pth'
    # --- architecture ---
    model = "microsoft-deberta-v3-large"
    pooling_type = 'LSTMPooling'
    hidden_size = 512  # LSTM width, also the head's input width for this pooling
    gradient_checkpointing = False
    # --- data loading ---
    batch_size = 8
    num_workers = 4
    max_len = 1462
    set_from_df = True  # the inference loop recomputes max_len from the test set
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    # --- folds ---
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    
    
class CFG_model71:
    """Inference settings for the ``model71`` checkpoints (weighted layer pooling)."""
    # --- checkpoint location ---
    model_name = 'model71'
    path = "../input/model71/"
    config_path = path + 'config.pth'
    # --- architecture ---
    model = "microsoft-deberta-v3-large"
    pooling_type = 'WeightedLayerPooling'
    gradient_checkpointing = False
    # --- data loading ---
    batch_size = 8
    num_workers = 4
    max_len = 1462
    set_from_df = True  # the inference loop recomputes max_len from the test set
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    # --- folds ---
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    
    
class CFG_model75:
    """Inference settings for the ``model75`` checkpoints (concat pooling)."""
    # --- checkpoint location ---
    model_name = 'model75'
    path = "../input/model75/"
    config_path = path + 'config.pth'
    # --- architecture ---
    model = "microsoft-deberta-v3-large"
    pooling_type = 'ConcatPooling'
    n_layers = 4  # number of trailing hidden states that are concatenated
    gradient_checkpointing = False
    # --- data loading ---
    batch_size = 8
    num_workers = 4
    max_len = 1462
    set_from_df = True  # the inference loop recomputes max_len from the test set
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    # --- folds ---
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]

class CFG_model55:
    """Inference settings for the ``model55`` checkpoints (mean pooling)."""
    # --- checkpoint location ---
    model_name = 'model55'
    path = "../input/model55/"
    config_path = path + 'config.pth'
    # --- architecture ---
    model = "microsoft-deberta-v3-large"
    pooling_type = 'MeanPooling'
    gradient_checkpointing = False
    # --- data loading ---
    batch_size = 12
    num_workers = 4
    max_len = 1462
    set_from_df = True  # the inference loop recomputes max_len from the test set
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    # --- folds ---
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    
    
# Model configurations to run; each one writes its own
# `submission_<model_name>.csv`.
cfg_list = [
    CFG_model23,
    CFG_model52, 
    CFG_model55,
    CFG_model68,
    CFG_model70,
    CFG_model71,
    CFG_model75
]


# cfg_list = [CFG_model23, CFG_model52, CFG_model55]
# cfg_list = [CFG_model23, CFG_model52]


# The loop variable is the global name `CFG`, which TestDataset reads when it
# tokenizes items; it is therefore rebound for every configuration.
for CFG in cfg_list:
    print(CFG.model_name)
    
    CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path+'tokenizer/')
    
    # The test set is re-read and re-preprocessed for every configuration.
    test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
    test['full_text'] = test['full_text'].apply(preprocess_text)
    submission = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')
    
    # sort by length to speed up inference
    test['tokenize_length'] = [len(CFG.tokenizer(text)['input_ids']) for text in test['full_text'].values]
    test = test.sort_values('tokenize_length', ascending=True).reset_index(drop=True)
    
    # Optionally replace the configured max_len by the longest test essay.
    if CFG.set_from_df:
        CFG.max_len = get_max_len_from_df(test, CFG.tokenizer)
    print(CFG.max_len)
    test_dataset = TestDataset(test)

    test_loader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size,
                             shuffle=False,
                             collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
                             num_workers=CFG.num_workers, 
                             pin_memory=True, 
                             drop_last=False)

    # One prediction array per fold checkpoint.
    predictions = []
    for fold in CFG.trn_fold:
        
        # With exactly 3 test rows only fold 0 is evaluated
        # (presumably the small placeholder test set — TODO confirm).
        if test.shape[0] == 3 and fold > 0:
            continue
        
        model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
        # Checkpoints are loaded on CPU; the weights sit under the 'model' key.
        state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                           map_location=torch.device('cpu'))
        
        model.load_state_dict(state['model'])
        prediction = inference_fn(test_loader, model, device)
        predictions.append(prediction)
        
        # Free the fold's model before the next checkpoint is loaded.
        del model, state, prediction; gc.collect()
        torch.cuda.empty_cache()
        
    # Average over the folds that were run.
    predictions = np.mean(predictions, axis=0)

    # Predictions follow the length-sorted order of `test`; merging on text_id
    # puts them back in sample_submission order.
    test[CFG.target_cols] = predictions
    submission = submission.drop(columns=CFG.target_cols).merge(test[['text_id'] + CFG.target_cols], on='text_id', how='left')
    display(submission.head())
    submission[['text_id'] + CFG.target_cols].to_csv(f'submission_{CFG.model_name}.csv', index=False)

### nischay

In [None]:
import os
import gc
from tqdm.auto import tqdm

import numpy as np 
import pandas as pd 
from transformers import get_polynomial_decay_schedule_with_warmup,get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup
from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
from transformers import DataCollatorWithPadding,DataCollatorForTokenClassification


from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs
from datasets import concatenate_datasets,load_dataset,load_from_disk

from sklearn.metrics import log_loss

from transformers import AutoModel, AutoTokenizer, AdamW, DataCollatorWithPadding

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

from datasets import Dataset, load_from_disk
import pickle
import re
from transformers import TrainingArguments, AutoConfig, AutoModelForTokenClassification, DataCollatorForTokenClassification


class config:
    """Paths and hyper-parameters for the `nischay` models."""
    seed = 69
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # --- paths ---
    base_dir = "/kaggle/working/"
    save_dir = "../input/colab-models-download-v2-0/"
    train_dataset_path = "../input/feedback-prize-english-language-learning/train.csv"
    test_dataset_path = "../input/feedback-prize-english-language-learning/test.csv"
    sample_submission_path = "../input/feedback-prize-english-language-learning/sample_submission.csv"

    # --- tokenizer ---
    truncation = True
    padding = False  # alternative noted by the author: 'max_length'
    max_length = 512

    # --- model ---
    model_name = "microsoft/deberta-v3-large"
    target_cols = [
        'cohesion', 'syntax', 'vocabulary',
        'phraseology', 'grammar', 'conventions',
    ]
    load_from_disk = None

    # --- training ---
    learning_rate = 9e-6
    batch_size = 2
    epochs = 3
    NFOLDS = 5

# `seed_everything` resolves to the pytorch_lightning function imported in this
# cell, which rebinds the name of the helper defined in the first cell.
seed_everything(config.seed)


def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: encode the failing slice as UTF-8 and resume after it."""
    begin, stop = error.start, error.end
    failing_slice = error.object[begin:stop]
    return failing_slice.encode("utf-8"), stop


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the failing bytes as cp1252 and resume after them."""
    begin, stop = error.start, error.end
    failing_bytes = error.object[begin:stop]
    return failing_bytes.decode("cp1252"), stop

# Register the two handlers under the names that are passed as `errors=` to
# str.encode / bytes.decode when text is normalized.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Run the encode/decode round-trip with the custom error handlers, then ``unidecode``.

    The chain is: raw_unicode_escape bytes -> UTF-8 text (cp1252 fallback)
    -> cp1252 bytes (UTF-8 fallback) -> UTF-8 text (cp1252 fallback).
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    as_text = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = as_text.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

# Load the competition CSVs from the paths held in `config`.
df_train = pd.read_csv(config.train_dataset_path)
df_test = pd.read_csv(config.test_dataset_path)
df_ss = pd.read_csv(config.sample_submission_path)

# Apply the same encoding repair / normalization to both splits.
df_train['full_text'] = df_train['full_text'].apply(resolve_encodings_and_normalize)
df_test['full_text'] = df_test['full_text'].apply(resolve_encodings_and_normalize)
# Dummy targets: `tokenize` builds a "labels" field from these columns.
df_test[config.target_cols] = 0.

# Tokenizer loaded from the local deberta-v3-large input directory.
tokenizer = AutoTokenizer.from_pretrained("../input/deberta-v3-large/deberta-v3-large")

def tokenize(example):
    """Tokenize one row's ``full_text`` and attach ``labels`` and ``length``.

    ``labels`` holds the row's values for ``config.target_cols`` (in that
    order); ``length`` is the number of input ids.
    """
    encoded = tokenizer(
        example["full_text"],
        padding=config.padding,
        truncation=True,
        max_length=config.max_length,
        add_special_tokens=True,
    )
    encoded["labels"] = [example[col] for col in config.target_cols]
    encoded["length"] = len(encoded["input_ids"])
    return encoded


# Tokenize the test set once and cache it under `config.base_dir`; the per-fold
# loop reloads it from there.
if config.load_from_disk is None:

    ds = Dataset.from_pandas(df_test)

    ds = ds.map(
        tokenize,
        batched=False,
        num_proc=4,
        desc="Tokenizing",
    )

    dataset_dir = f"{config.base_dir}data.dataset"
    ds.save_to_disk(dataset_dir)
    with open(f"{config.base_dir}_pkl", "wb") as fp:
        pickle.dump(df_test, fp)
    # Report the directory that was actually written. The message used to print
    # config.save_dir, which is where the checkpoints are read from, not where
    # the dataset is saved.
    print("Saving dataset to disk:", dataset_dir)


class MeanPooling(nn.Module):
    """Attention-mask-weighted mean of the token embeddings along dim 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked tokens contribute 0.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * mask).sum(1)
        # The clamp keeps the division finite when a row has no unmasked tokens.
        token_count = mask.sum(1).clamp(min=1e-9)
        return token_sum / token_count
    
    
class FeedbackPrizeModel(pl.LightningModule):
    """DeBERTa-v3-large regressor with layer-wise mean/max pooling.

    The hidden states of all layers (except the first entry of the tuple
    returned by the backbone) are combined in two ways — a softmax-weighted
    sum and an element-wise max — concatenated, layer-normalised, read at
    position 0 and projected to ``len(cfg.target_cols)`` outputs.
    """
    def __init__(self,config):
        """Build the backbone and head.

        Args:
            config: settings object; ``target_cols`` is read here, and
                ``learning_rate``, ``data_length``, ``batch_size`` and
                ``epochs`` are read by ``configure_optimizers``.
        """
        super().__init__()
        self.cfg = config

        self.model_config = AutoConfig.from_pretrained('../input/deberta-v3-large/deberta-v3-large/config.json')
        self.model_config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": 0.,
                "add_pooling_layer": False,
                "attention_probs_dropout_prob":0,
            }
        )

        self.transformers_model = AutoModel.from_pretrained("../input/deberta-v3-large/deberta-v3-large",config=self.model_config)

        # Both operate on the concatenation of the mean- and max-pooled states.
        self.layer_norm = nn.LayerNorm(self.transformers_model.config.hidden_size * 2)
        self.output = nn.Linear(self.transformers_model.config.hidden_size * 2, len(self.cfg.target_cols))

        # One learnable logit per hidden layer, softmaxed in forward().
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        # Multi-sample dropout: rates 0.0, 0.1, 0.2, 0.3, 0.4.
        self.dropouts = nn.ModuleList([
                nn.Dropout(0.1*i) for i in range(5)
            ])
        self.loss_function = nn.SmoothL1Loss(reduction='mean')

    def forward(self, input_ids, attention_mask,train):
        """Return ``(logits, max_indices)``.

        Args:
            input_ids: token ids passed to the backbone.
            attention_mask: attention mask passed to the backbone.
            train: accepted for interface compatibility; not used.

        Returns:
            A tuple whose first element is the averaged head output and whose
            second element is the index tensor produced by the layer-wise
            ``torch.max`` (kept so the return shape stays a 2-tuple).
        """
        backbone_out = self.transformers_model(input_ids,attention_mask = attention_mask)
        hidden_states = backbone_out.hidden_states

        # Stack every entry except the first one, last layer first.
        hidden_states = torch.stack(tuple(hidden_states[-i-1] for i in range(len(hidden_states) - 1)), dim = 0)
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        out_mean = torch.sum(hidden_states * layer_weight, dim = 0)
        out_max, max_indices = torch.max(hidden_states, dim = 0)
        pooled = torch.cat((out_mean, out_max), dim = -1)
        pooled = self.layer_norm(pooled)[:,0,:]

        # Multi-sample dropout: average the head over every dropout module.
        # The dropout modules were previously never applied, so the loop just
        # averaged five identical projections. In eval mode dropout is the
        # identity, so inference results are unchanged by this fix.
        logits = None
        for dropout in self.dropouts:
            sample = self.output(dropout(pooled))
            logits = sample if logits is None else logits + sample

        logits = logits / len(self.dropouts)
        return (logits, max_indices)

    def train_dataloader(self):
        # NOTE(review): `_train_dataloader` is not assigned anywhere in this class.
        return self._train_dataloader

    def validation_dataloader(self):
        # NOTE(review): `_validation_dataloader` is not assigned anywhere in this class.
        return self._validation_dataloader

    def get_optimizer_params(self, encoder_lr, decoder_lr, weight_decay=0.0):
        """Return three optimizer parameter groups.

        Backbone parameters are split into decayed and non-decayed (bias and
        LayerNorm) groups at ``encoder_lr``; all other parameters use
        ``decoder_lr`` without weight decay.
        """
        # A stale, unused `list(model.named_parameters())` was removed here: it
        # read a module-level `model` unrelated to `self` and raised NameError
        # whenever that global did not exist.
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in self.transformers_model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in self.transformers_model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in self.named_parameters() if "transformers_model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    def configure_optimizers(self):
        """Create AdamW plus a per-step polynomial-decay schedule with 10% warmup."""
        # Read the learning rate from the settings given to this instance
        # instead of the module-level `config`.
        optimizer = AdamW(self.parameters(), lr = self.cfg.learning_rate)

        epoch_steps = self.cfg.data_length
        batch_size = self.cfg.batch_size

        # The floor division already yields a whole number; int() only drops
        # the float type so the scheduler receives integer step counts.
        warmup_steps = int(0.1 * epoch_steps // batch_size)
        training_steps = self.cfg.epochs * epoch_steps // batch_size
        scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, warmup_steps, training_steps, lr_end=7e-7, power=3.0)

        lr_scheduler_config = {
                'scheduler': scheduler,
                'interval': 'step',
                'frequency': 1,
            }

        return {'optimizer': optimizer, 'lr_scheduler': lr_scheduler_config}
    
    
def predict(data_loader, model):
    """Run ``model`` in eval mode over ``data_loader`` and return stacked predictions.

    Every tensor of a batch is reshaped to ``(batch, -1)`` and moved to
    ``config.device``; the first element of the model output is collected.
    """
    model.to(config.device)
    model.eval()
    collected = []
    for batch in tqdm(data_loader):
        with torch.no_grad():
            inputs = {}
            for key, val in batch.items():
                inputs[key] = val.reshape(val.shape[0], -1).to(config.device)
            model_out = model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                train=False,
            )
            outputs = model_out[0]
        collected.extend(outputs.detach().cpu().numpy())
    return np.vstack(collected)



# Pad each batch to its longest sequence, rounded up to a multiple of 16.
collator = DataCollatorWithPadding(
    tokenizer=tokenizer, pad_to_multiple_of= 16, padding="longest"
)

# Predictions of every fold that was actually run.
fold_preds = []
for fold in range(config.NFOLDS):

    # With exactly 3 test rows only fold 0 is evaluated. The check uses this
    # cell's own `df_test`; it used to read `test`, a variable left behind by
    # the previous section's loop.
    if df_test.shape[0] == 3 and fold > 0:
        continue

    print(f"====== FOLD RUNNING {fold}======")

    keep_cols = {"input_ids", "attention_mask", "labels"}

    # Reload the cached tokenized test set, shortest essays first.
    test_ds = load_from_disk(f'{config.base_dir}data.dataset').sort("length")
    test_ds = test_ds.remove_columns([c for c in test_ds.column_names if c not in keep_cols])
    config.data_length = len(test_ds)

    print('Dataset Loaded....')
    print((test_ds[0].keys()))

    print("Generating Test DataLoader")
    test_dataloader = DataLoader(test_ds, batch_size = config.batch_size, shuffle = False, num_workers= 2, pin_memory=True,collate_fn = collator)


    print("Model Creation")

    model = FeedbackPrizeModel.load_from_checkpoint(f'{config.save_dir}microsoft/deberta-v3-large_{fold}.ckpt',train_dataloader=None,validation_dataloader=None,config=config)
    preds = predict(test_dataloader, model)
    fold_preds.append(preds)

    del model,test_dataloader,test_ds
    gc.collect()
    torch.cuda.empty_cache()

# Average over the folds that ran. Dividing by config.NFOLDS, as before,
# scaled the predictions down whenever folds were skipped.
final_preds = np.mean(fold_preds, axis=0)

# Recover the ids in the same length-sorted order as the predictions.
test_ds = load_from_disk(f'{config.base_dir}data.dataset').sort("length")
test_ids = [row['text_id'] for row in test_ds]

print(final_preds.shape)

# `columns` must be an ordered collection; a set is not.
sub_df = pd.DataFrame(test_ids,columns=["text_id"])
sub_df[config.target_cols] = final_preds
sub_df.to_csv('submission_nischay.csv',index=False)
print(sub_df.head())

### KOJIMAR

In [None]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=false

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



    
class CFG1:
    """Settings for the ``0926-deberta-v3-large`` checkpoints (10 folds)."""
    # --- identity / locations ---
    model_name = '0926-deberta-v3-large'
    model = "microsoft/deberta-v3-large"
    path = "../input/0926-deberta-v3-large-unscale/"
    base = "../input/fb3models/microsoft-deberta-v3-large/"
    config_path = base + "config/config.json"
    # The tokenizer is loaded when the class body is executed.
    tokenizer = AutoTokenizer.from_pretrained(base + 'tokenizer/')
    # --- inference ---
    gradient_checkpointing = False
    batch_size = 6
    num_workers = 4
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    # --- folds / ensembling ---
    seed = 42
    n_fold = 10
    trn_fold = list(range(n_fold))
    weight = 1.0
    
class CFG2:
    """Settings for the ``0919-deberta-v2-xlarge-mnli`` checkpoints (5 folds)."""
    # --- identity / locations ---
    model_name = '0919-deberta-v2-xlarge-mnli'
    model = "microsoft/deberta-v2-xlarge-mnli"
    path = "../input/0919-deberta-v2-xlarge-mnli/"
    base = "../input/fb3models/microsoft-deberta-v2-xlarge/"
    config_path = base + "config/config.json"
    # The tokenizer is loaded when the class body is executed.
    tokenizer = AutoTokenizer.from_pretrained(base + 'tokenizer/')
    # --- inference ---
    gradient_checkpointing = False
    batch_size = 4
    num_workers = 4
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    # --- folds / ensembling ---
    seed = 42
    n_fold = 5
    trn_fold = list(range(n_fold))
    weight = 1.0
    
class CFG3:
    # Inference settings for the 10-fold "0911-deberta-v3-large" checkpoints.

    # --- identity / locations -------------------------------------------
    model = "microsoft/deberta-v3-large"
    model_name = '0911-deberta-v3-large'
    path = "../input/0911-deberta-v3-large/"                  # fold checkpoints + oof_df.pkl
    base = "../input/fb3models/microsoft-deberta-v3-large/"   # offline config + tokenizer
    config_path = f"{base}config/config.json"
    tokenizer = AutoTokenizer.from_pretrained(f"{base}tokenizer/")

    # --- task -------------------------------------------------------------
    target_cols = [
        'cohesion', 'syntax', 'vocabulary',
        'phraseology', 'grammar', 'conventions',
    ]

    # --- folds ------------------------------------------------------------
    seed = 42
    n_fold = 10
    trn_fold = [fold for fold in range(n_fold)]

    # --- runtime ----------------------------------------------------------
    gradient_checkpointing = False
    batch_size = 6
    num_workers = 4
    weight = 1.0
    

    
# Every config in this list is scored on its OOF file and then run on the test set below.
CFG_list = [CFG1, CFG2, CFG3]

# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Args:
        y_trues: 2-D array indexed as ``[row, target]`` holding the labels.
        y_preds: 2-D array indexed the same way holding the predictions.

    Returns:
        ``(mcrmse_score, scores)`` where ``scores`` is the list of per-column
        RMSE values and ``mcrmse_score`` is their mean.
    """
    scores = []
    for col in range(y_trues.shape[1]):
        # Take the square root ourselves instead of passing ``squared=False``:
        # that keyword is deprecated in scikit-learn 1.4 and removed in 1.6,
        # whereas sqrt(MSE) gives the same value on every version.
        rmse = np.sqrt(mean_squared_error(y_trues[:, col], y_preds[:, col]))
        scores.append(rmse)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores

def get_score(y_trues, y_preds):
    """Return ``(mcrmse_score, per_column_scores)`` as computed by ``MCRMSE``."""
    overall, per_column = MCRMSE(y_trues, y_preds)
    result = (overall, per_column)
    return result

def get_logger(filename='inference'):
    """Return this module's logger, writing bare messages to the console and a file.

    The logger is looked up by ``__name__``, so repeated calls return the same
    object. Handlers are attached only if an equivalent one is not already
    present; unconditionally adding them (as before) made every message appear
    once per call to this function.

    Args:
        filename: log file path without the ``.log`` suffix.

    Returns:
        The configured ``logging.Logger`` at INFO level.
    """
    import os
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    log_path = os.path.abspath(f"{filename}.log")  # FileHandler stores an absolute path
    # FileHandler subclasses StreamHandler, so test for the exact console type.
    has_console = any(type(h) is StreamHandler for h in logger.handlers)
    has_file = any(
        isinstance(h, FileHandler) and h.baseFilename == log_path
        for h in logger.handlers
    )

    if not has_console:
        handler1 = StreamHandler()
        handler1.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler1)
    if not has_file:
        handler2 = FileHandler(filename=f"{filename}.log")
        handler2.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler2)
    return logger

# Notebook-wide logger; with the default argument it writes to "inference.log".
LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU + current CUDA device) RNGs.

    Also exports ``PYTHONHASHSEED`` and asks cuDNN for deterministic kernels.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)


# ====================================================
# oof
# ====================================================
# Log the out-of-fold MCRMSE of every configured model.
# NOTE: read_pickle executes pickle data; only point CFG.path at trusted datasets.
for CFG in CFG_list:
    oof_df = pd.read_pickle(CFG.path+'oof_df.pkl')
    # Labels live in the target columns, predictions in matching "pred_<target>" columns.
    labels = oof_df[CFG.target_cols].values
    preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
    score, scores = get_score(labels, preds)
    LOGGER.info(f'Model: {CFG.model} Score: {score:<.4f}  Scores: {scores}')
    
    
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize ``text`` with ``cfg.tokenizer`` and convert every field to a LongTensor.

    No padding or truncation is requested here, so each sample keeps its own
    length. The encoding returned by the tokenizer is updated in place and
    returned.
    """
    encoded = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
    )
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded


class TestDataset(Dataset):
    """Dataset over the ``full_text`` column; items are tokenized lazily via ``prepare_input``."""

    def __init__(self, cfg, df):
        self.texts = df['full_text'].values
        self.cfg = cfg

    def __len__(self):
        return self.texts.shape[0]

    def __getitem__(self, item):
        return prepare_input(self.cfg, self.texts[item])
    
    
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Average token embeddings over the positions where ``attention_mask`` is non-zero."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask across the hidden dimension as floats.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Clamp the token count so an all-padding row does not divide by zero.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        masked_total = torch.sum(last_hidden_state * mask, 1)
        return masked_total / token_count

class MaxPooling(nn.Module):
    """Element-wise maximum over the sequence axis, ignoring masked-out positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        is_padding = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() == 0
        # Work on a copy: padded slots get a very low sentinel so they never win the max.
        filled = last_hidden_state.masked_fill(is_padding, -1e4)
        return torch.max(filled, dim=1).values
    
class MinPooling(nn.Module):
    """Element-wise minimum over the sequence axis, ignoring masked-out positions.

    ``forward(last_hidden_state, attention_mask)`` returns a tensor with the
    sequence axis (dim 1) reduced away.
    """

    def __init__(self):
        super(MinPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()  # never modify the caller's tensor
        # Padded slots must never win the minimum, so fill them with a large
        # positive sentinel (mirror image of MaxPooling's -1e4). The previous
        # fill value of 1e-4 was smaller than any ordinary positive feature,
        # so padding leaked into the result.
        embeddings[input_mask_expanded == 0] = 1e4
        min_embeddings, _ = torch.min(embeddings, dim = 1)
        return min_embeddings
        

class CustomModel(nn.Module):
    """Transformer backbone + masked mean pooling + a linear head with 6 outputs.

    Args:
        cfg: config object; ``cfg.model`` and ``cfg.gradient_checkpointing`` are read.
        config_path: when given, the backbone config is loaded from this path;
            otherwise it is fetched by ``cfg.model`` and all dropout settings
            are zeroed.
        pretrained: load pretrained backbone weights (True) or only build the
            architecture from the config (False).
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        if config_path is not None:
            self.config = AutoConfig.from_pretrained(config_path, output_hidden_states=True)
        else:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            for dropout_field in (
                'hidden_dropout',
                'hidden_dropout_prob',
                'attention_dropout',
                'attention_probs_dropout_prob',
            ):
                setattr(self.config, dropout_field, 0.)

        # Attribute names below are part of the checkpoint's state_dict keys.
        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise a Linear / Embedding / LayerNorm module in the backbone's style."""
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            pad_row = module.padding_idx
            if pad_row is not None:
                module.weight.data[pad_row].zero_()
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def feature(self, inputs):
        """Run the backbone and pool its first output with the attention mask."""
        backbone_out = self.model(**inputs)
        return self.pool(backbone_out[0], inputs['attention_mask'])

    def forward(self, inputs):
        """Map a tokenized batch to the head's 6 outputs per sample."""
        return self.fc(self.feature(inputs))
    
    
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` without gradients.

    Returns the per-batch outputs concatenated along the first axis as a
    NumPy array.
    """
    model.eval()
    model.to(device)
    collected = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        # Move each field to the target device, updating the batch in place.
        for name, tensor in batch.items():
            batch[name] = tensor.to(device)
        with torch.no_grad():
            batch_out = model(batch)
        collected.append(batch_out.to('cpu').numpy())
    return np.concatenate(collected)


# For every config: tokenize the test set, average the predictions of all fold
# checkpoints and write one submission CSV named after CFG.model_name.
for _idx, CFG in enumerate(CFG_list):
    # Re-read the inputs on each iteration; they are deleted at the end of the loop body.
    test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
    submission = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')
    # sort by length to speed up inference
    test['tokenize_length'] = [len(CFG.tokenizer(text)['input_ids']) for text in test['full_text'].values]
    test = test.sort_values('tokenize_length', ascending=True).reset_index(drop=True)

    test_dataset = TestDataset(CFG, test)
    # Samples are unpadded, so the collator pads each batch to its longest sequence.
    test_loader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size,
                             shuffle=False,
                             collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
                             num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    predictions = []
    for fold in CFG.trn_fold:
        # With exactly 3 test rows only fold 0 is evaluated
        # (presumably the public sample test set, to keep dry runs fast -- TODO confirm).
        if test.shape[0] == 3 and fold > 0:
            continue
        
        # Build the architecture only; weights come from the fold checkpoint.
        model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
        state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                           map_location=torch.device('cpu'))
        model.load_state_dict(state['model'])
        prediction = inference_fn(test_loader, model, device)
        predictions.append(prediction)
        
        # Release the model before the next fold is loaded.
        del model, state, prediction; gc.collect()
        torch.cuda.empty_cache()
        
    # Unweighted mean over the folds that were run.
    predictions = np.mean(predictions, axis=0)
    test[CFG.target_cols] = predictions
    # `test` was re-ordered by length, so merge on text_id to restore the sample-submission order.
    submission = submission.drop(columns=CFG.target_cols).merge(test[['text_id'] + CFG.target_cols], on='text_id', how='left')
    display(submission.head())
    submission[['text_id'] + CFG.target_cols].to_csv(f'submission_{CFG.model_name}.csv', index=False)
    
    del test, submission, predictions, test_dataset, test_loader; gc.collect()
    torch.cuda.empty_cache()

### Rohit

In [None]:
import codecs
from text_unidecode import unidecode
from typing import Tuple

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: encode the offending slice as UTF-8 and resume after it."""
    offending = error.object[error.start : error.end]
    replacement = offending.encode("utf-8")
    return replacement, error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending bytes as cp1252 and resume after them."""
    offending = error.object[error.start : error.end]
    replacement = offending.decode("cp1252")
    return replacement, error.end


# Register both handlers under the names later passed as `errors=` to str.encode / bytes.decode.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)


def resolve_encodings_and_normalize(text: str) -> str:
    """Round-trip ``text`` through utf-8 / cp1252 with the custom error handlers, then unidecode it."""
    raw_bytes = text.encode("raw_unicode_escape")
    first_pass = raw_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    second_pass = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(second_pass)


def replace_newline(text):
    """Swap every newline character for the literal marker ``[BR]``."""
    return '[BR]'.join(text.split('\n'))

# Anonymisation placeholders found in the essays (including misspelt variants)
# mapped to the bracketed token that replaces them.
placeholders_replacements = dict([
    ('Generic_School', '[GENERIC_SCHOOL]'),
    ('Generic_school', '[GENERIC_SCHOOL]'),
    ('SCHOOL_NAME', '[SCHOOL_NAME]'),
    ('STUDENT_NAME', '[STUDENT_NAME]'),
    ('Generic_Name', '[GENERIC_NAME]'),
    ('Genric_Name', '[GENERIC_NAME]'),
    ('Generic_City', '[GENERIC_CITY]'),
    ('LOCATION_NAME', '[LOCATION_NAME]'),
    ('HOTEL_NAME', '[HOTEL_NAME]'),
    ('LANGUAGE_NAME', '[LANGUAGE_NAME]'),
    ('PROPER_NAME', '[PROPER_NAME]'),
    ('OTHER_NAME', '[OTHER_NAME]'),
    ('PROEPR_NAME', '[PROPER_NAME]'),
    ('RESTAURANT_NAME', '[RESTAURANT_NAME]'),
    ('STORE_NAME', '[STORE_NAME]'),
    ('TEACHER_NAME', '[TEACHER_NAME]'),
])


def replace_placeholders(text):
    """Apply every placeholder substitution to ``text``, in table order.

    Not idempotent: a second pass wraps already-bracketed names such as
    ``[SCHOOL_NAME]`` in another pair of brackets.
    """
    result = text
    for placeholder in placeholders_replacements:
        result = result.replace(placeholder, placeholders_replacements[placeholder])
    return result


def pad_punctuation(text):
    """Surround ``. , ! ? ( ) -`` with spaces, then collapse whitespace runs.

    Any run of two or more whitespace characters (including newlines) is
    reduced to a single space.
    """
    # Raw strings: '\s' in a normal string literal is an invalid escape sequence
    # that newer Python versions warn about (and will eventually reject).
    text = re.sub(r'([.,!?()-])', r' \1 ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text


def preprocess_text(text):
    """Normalise encodings, then bracket the anonymisation placeholders.

    ``replace_newline`` and ``pad_punctuation`` are intentionally not applied.
    """
    normalized = resolve_encodings_and_normalize(text)
    return replace_placeholders(normalized)

In [None]:
# ====================================================
# Library
# ====================================================

import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=false

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# One entry per ensemble member. Keys:
#   model_name  - backbone identifier (also selects the tokenizer max length)
#   model_path  - directory holding config.pth and checkpoint_<fold>.pth
#   tok_path    - directory of the saved tokenizer
#   pool        - pooling head: "MeanPool", "ConcatPool", "WLP" or "GeM"
#   batch_size  - inference batch size
#   id          - suffix of the submission_<id>.csv written for this model
#   preprocess  - whether preprocess_text is applied to the essays first
models = [
    # ---- trained on raw text -------------------------------------------
    {
        "model_name": "microsoft/deberta-v3-base",
        "model_path": "../input/exp01-fb3-part2/microsoft-deberta-v3-base-pl",
        "tok_path": "../input/exp01-fb3-part2/microsoft-deberta-v3-base-pl/tokenizer/",
        "pool": "MeanPool",
        "batch_size": 12,
        "id": "exp01_fb3_part2",
        "preprocess": False,
    },
    {
        "model_name": "roberta-large",
        "model_path": "../input/exp14-fb3",
        "tok_path": "../input/exp14-fb3/tokenizer/",
        "pool": "GeM",
        "batch_size": 6,
        "id": "exp14_fb3_roberta",
        "preprocess": False,
    },
    {
        "model_name": "microsoft/deberta-v3-large",
        "model_path": "../input/exp13-fb3",
        "tok_path": "../input/exp13-fb3/tokenizer/",
        "pool": "WLP",
        "batch_size": 6,
        "id": "exp13_fb3",
        "preprocess": False,
    },
    {
        "model_name": "distilbert-base-uncased",
        "model_path": "../input/exp02-fb3-part2/distilbert-base-uncased",
        "tok_path": "../input/exp02-fb3-part2/distilbert-base-uncased/tokenizer/",
        "pool": "ConcatPool",
        "batch_size": 12,
        "id": "exp02_fb3_part2_distilbert",
        "preprocess": False,
    },
    # ---- disabled ------------------------------------------------------
    # {"model_name": "microsoft/deberta-v3-base",
    #  "model_path": "../input/exp01-fb3-trainpl/microsoft-deberta-v3-base",
    #  "tok_path": "../input/exp01-fb3-trainpl/microsoft-deberta-v3-base/tokenizer/",
    #  "pool": "MeanPool", "batch_size": 12, "id": "exp01_fb3_pl_train", "preprocess": True},
    # {"model_name": "roberta-large",
    #  "model_path": "../input/exp14-fb3-trainpl/roberta-large",
    #  "tok_path": "../input/exp14-fb3-trainpl/roberta-large/tokenizer/",
    #  "pool": "GeM", "batch_size": 6, "id": "exp14_fb3_pl_train", "preprocess": True},
    # ---- trained on preprocessed text ----------------------------------
    {
        "model_name": "microsoft/deberta-v3-base",
        "model_path": "../input/exp11-fb3-01/microsoft-deberta-v3-base",
        "tok_path": "../input/exp11-fb3-01/microsoft-deberta-v3-base/tokenizer/",
        "pool": "MeanPool",
        "batch_size": 12,
        "id": "exp11_fb3_pl_train",
        "preprocess": True,
    },
    {
        "model_name": "roberta-large",
        "model_path": "../input/exp11-fb3-02/roberta-large",
        "tok_path": "../input/exp11-fb3-02/roberta-large/tokenizer/",
        "pool": "MeanPool",
        "batch_size": 6,
        "id": "exp11_fb3_rlarge",
        "preprocess": True,
    },
    {
        "model_name": "microsoft/deberta-large",
        "model_path": "../input/exp11-fb3-03/microsoft-deberta-large",
        "tok_path": "../input/exp11-fb3-03/microsoft-deberta-large/tokenizer/",
        "pool": "MeanPool",
        "batch_size": 6,
        "id": "exp11_fb3_dlarge",
        "preprocess": True,
    },
    # ---- tmp (disabled) ------------------------------------------------
    # {"model_name": "microsoft/deberta-v3-large",
    #  "model_path": "../input/exp02-fb3",
    #  "tok_path": "../input/exp02-fb3/tokenizer/",
    #  "pool": "ConcatPool", "batch_size": 12, "id": "exp02_fb3", "preprocess": False},
    # {"model_name": "roberta-large",
    #  "model_path": "../input/exp12-fb3",
    #  "tok_path": "../input/exp12-fb3/tokenizer/",
    #  "pool": "ConcatPool", "batch_size": 6, "id": "exp12_fb3", "preprocess": False},
]


# Settings shared by every entry of `models`.
num_workers=4
gradient_checkpointing=False
target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
seed=42
n_fold=5
trn_fold=[0, 1, 2, 3, 4]

# Tokenizer of the first model only; each model loads its own tokenizer inside the inference loop.
tokenizer = AutoTokenizer.from_pretrained(f"{models[0]['tok_path']}")

# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Args:
        y_trues: 2-D array indexed as ``[row, target]`` holding the labels.
        y_preds: 2-D array indexed the same way holding the predictions.

    Returns:
        ``(mcrmse_score, scores)`` where ``scores`` is the list of per-column
        RMSE values and ``mcrmse_score`` is their mean.
    """
    scores = []
    for col in range(y_trues.shape[1]):
        # Take the square root ourselves instead of passing ``squared=False``:
        # that keyword is deprecated in scikit-learn 1.4 and removed in 1.6,
        # whereas sqrt(MSE) gives the same value on every version.
        rmse = np.sqrt(mean_squared_error(y_trues[:, col], y_preds[:, col]))
        scores.append(rmse)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Return ``(mcrmse_score, per_column_scores)`` as computed by ``MCRMSE``."""
    overall, per_column = MCRMSE(y_trues, y_preds)
    result = (overall, per_column)
    return result


def get_logger(filename='inference'):
    """Return this module's logger, writing bare messages to the console and a file.

    The logger is looked up by ``__name__``, so repeated calls return the same
    object. Handlers are attached only if an equivalent one is not already
    present; unconditionally adding them (as before) made every message appear
    once per call to this function.

    Args:
        filename: log file path without the ``.log`` suffix.

    Returns:
        The configured ``logging.Logger`` at INFO level.
    """
    import os
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    log_path = os.path.abspath(f"{filename}.log")  # FileHandler stores an absolute path
    # FileHandler subclasses StreamHandler, so test for the exact console type.
    has_console = any(type(h) is StreamHandler for h in logger.handlers)
    has_file = any(
        isinstance(h, FileHandler) and h.baseFilename == log_path
        for h in logger.handlers
    )

    if not has_console:
        handler1 = StreamHandler()
        handler1.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler1)
    if not has_file:
        handler2 = FileHandler(filename=f"{filename}.log")
        handler2.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler2)
    return logger

# Notebook-wide logger; with the default argument it writes to "inference.log".
LOGGER = get_logger()


def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU + current CUDA device) RNGs.

    Also exports ``PYTHONHASHSEED`` and asks cuDNN for deterministic kernels.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

# ====================================================
# Data Loading 
# ====================================================
# Competition test essays and the sample submission (defines the expected output columns).
test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')

submission = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')

print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

# sort by length to speed up inference
# Lengths are measured with the module-level `tokenizer`, not with each model's own tokenizer.
test['tokenize_length'] = [len(tokenizer(text)['input_ids']) for text in test['full_text'].values]
test = test.sort_values('tokenize_length', ascending=True).reset_index(drop=True)
display(test.head())


# ====================================================
# Dataset
# ====================================================
def prepare_input(text,tokenizer, model_type):
    """Tokenize ``text`` to a fixed length and convert every field to a LongTensor.

    RoBERTa and DistilBERT models are padded/truncated to 512 tokens, every
    other model type to 768. The encoding returned by the tokenizer is updated
    in place and returned.
    """
    uses_short_context = "roberta" in model_type or "distilbert" in model_type
    token_budget = 512 if uses_short_context else 768

    encoded = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=token_budget,
        pad_to_max_length=True,
        truncation=True,
    )
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded


class TestDataset(Dataset):
    """Dataset over the ``full_text`` column; items are tokenized lazily via ``prepare_input``."""

    def __init__(self, df,tokenizer, model_type=None):
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.texts = df['full_text'].values

    def __len__(self):
        return self.texts.shape[0]

    def __getitem__(self, item):
        essay = self.texts[item]
        return prepare_input(essay, self.tokenizer, self.model_type)
    
    
# MeanPool
class MeanPooling(nn.Module):
    """Average token embeddings over the positions where ``attention_mask`` is non-zero."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask across the hidden dimension as floats.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Clamp the token count so an all-padding row does not divide by zero.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        masked_total = torch.sum(last_hidden_state * mask, 1)
        return masked_total / token_count
    
# WLP 
class WeightedLayerPooling(nn.Module):
    """Weighted average of the hidden states from ``layer_start`` onwards.

    ``layer_weights`` defaults to a learnable vector of ones with one entry per
    pooled layer (``num_hidden_layers + 1 - layer_start``).
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            pooled_layers = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.ones(pooled_layers, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, features):
        """Add ``'token_embeddings'`` to ``features`` (in place) and return it."""
        stacked = torch.stack(features['all_layer_embeddings'])
        selected = stacked[self.layer_start:, :, :, :]

        # Give each layer weight three trailing singleton axes so it scales its whole layer.
        per_layer = self.layer_weights[:, None, None, None].expand(selected.size())
        pooled = (per_layer * selected).sum(dim=0) / self.layer_weights.sum()

        features.update({'token_embeddings': pooled})
        return features

# GeM
class GeMText(nn.Module):
    """Generalised-mean pooling over ``dim`` with a learnable exponent ``p``.

    ``forward(x, attention_mask)`` clamps ``x`` to at least ``eps``, zeroes the
    masked positions, raises to the power ``p``, averages over the unmasked
    count and takes the ``1/p`` root.
    """

    def __init__(self, dim=1, cfg=None, p=3, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.feat_mult = 1
        self.p = Parameter(torch.ones(1) * p)

    def forward(self, x, attention_mask):
        # x is indexed like the mask plus one trailing feature axis.
        mask = attention_mask.unsqueeze(-1).expand(x.shape)
        powered_total = (x.clamp(min=self.eps) * mask).pow(self.p).sum(self.dim)
        valid_count = mask.sum(self.dim).clip(min=self.eps)
        pooled = powered_total / valid_count
        return pooled.pow(1 / self.p)
    

# ====================================================
# Model
# ====================================================

class CustomModel(nn.Module):
    """Transformer backbone + configurable pooling + a linear head with 6 outputs.

    Args:
        cfg: one entry of ``models``; ``cfg["model_name"]`` and ``cfg["pool"]``
            are read.
        config_path: path of the saved backbone config, loaded with ``torch.load``.
        pretrained: load pretrained backbone weights (True) or only build the
            architecture from the config (False).
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        self.config = torch.load(config_path)

        # Attribute names below are part of the checkpoint's state_dict keys.
        self.model = (
            AutoModel.from_pretrained(cfg["model_name"], config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )
        if gradient_checkpointing:  # module-level switch
            self.model.gradient_checkpointing_enable()

        pool_kind = cfg["pool"]
        if pool_kind in ("MeanPool", "ConcatPool"):
            self.pooling = MeanPooling()
        elif pool_kind == "WLP":
            self.pooling = WeightedLayerPooling(self.config.num_hidden_layers, layer_start=9)
        elif pool_kind == "GeM":
            self.pooling = GeMText()

        # ConcatPool feeds four pooled layers side by side into the head.
        head_inputs = self.config.hidden_size * 4 if pool_kind == "ConcatPool" else self.config.hidden_size
        self.head = nn.Linear(head_inputs, 6)

        if self._is_bart(cfg["model_name"]):
            self.config.use_cache = False
            self.initializer_range = self.config.init_std
        else:
            self.initializer_range = self.config.initializer_range

        self._init_weights(self.head)

    @staticmethod
    def _is_bart(model_name):
        """True for BART-family names, whose config and outputs use different fields."""
        return 'facebook/bart' in model_name or 'distilbart' in model_name

    def _init_weights(self, module):
        """Initialise a Linear / Embedding / LayerNorm module with ``self.initializer_range``."""
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.initializer_range)
            pad_row = module.padding_idx
            if pad_row is not None:
                module.weight.data[pad_row].zero_()
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def feature(self, inputs):
        """Run the backbone and reduce its output to one vector per sample."""
        attention_mask = inputs["attention_mask"]
        input_ids = inputs["input_ids"]
        pool_kind = self.cfg["pool"]

        if pool_kind == "WLP":
            # Weighted average of the upper layers, then the first token's vector.
            backbone_out = self.model(input_ids=input_ids, attention_mask=attention_mask)
            pooled = self.pooling({'all_layer_embeddings': backbone_out.hidden_states})
            return pooled['token_embeddings'][:, 0]

        if pool_kind == "ConcatPool":
            if self._is_bart(self.cfg["model_name"]):
                layers = torch.stack(
                    self.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        output_hidden_states=True,
                    ).decoder_hidden_states
                )
            else:
                layers = torch.stack(
                    self.model(input_ids=input_ids, attention_mask=attention_mask).hidden_states
                )
            # Mask-pool each of the last four layers, last layer first.
            pooled_layers = tuple(
                self.pooling(layers[-depth], attention_mask) for depth in (1, 2, 3, 4)
            )
            return torch.cat(pooled_layers, -1)

        backbone_out = self.model(**inputs)
        return self.pooling(backbone_out[0], inputs['attention_mask'])

    def forward(self, inputs):
        """Map a tokenized batch to the head's 6 outputs per sample."""
        return self.head(self.feature(inputs))
    
    
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` without gradients.

    Returns the per-batch outputs concatenated along the first axis as a
    NumPy array.
    """
    model.eval()
    model.to(device)
    collected = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        # Move each field to the target device, updating the batch in place.
        for name, tensor in batch.items():
            batch[name] = tensor.to(device)
        with torch.no_grad():
            batch_out = model(batch)
        collected.append(batch_out.to('cpu').numpy())
    return np.concatenate(collected)


all_preds = []

# For every ensemble member: tokenize the essays with that model's tokenizer,
# average the predictions of its fold checkpoints and write submission_<id>.csv.
for cfg in models:

    tokenizer = AutoTokenizer.from_pretrained(f"{cfg['tok_path']}")

    # Work on a per-model copy of the test frame. Preprocessing used to be
    # written back into the shared `test`, so each later preprocess=True model
    # received text that had already been preprocessed by the previous ones;
    # replace_placeholders is not idempotent ("[SCHOOL_NAME]" becomes
    # "[[SCHOOL_NAME]]"), which changed the input of those models.
    model_test = test.copy()
    if cfg['preprocess']:
        model_test['full_text'] = model_test['full_text'].apply(preprocess_text)

    test_dataset = TestDataset(model_test, tokenizer, cfg["model_name"])
    batch_size=cfg["batch_size"]

    test_loader = DataLoader(test_dataset,
                         batch_size=batch_size,
                         shuffle=False,
                         collate_fn=DataCollatorWithPadding(tokenizer=tokenizer, padding='longest'),
                         num_workers=num_workers, pin_memory=True, drop_last=False)

    predictions = []
    for fold in trn_fold:

        # With exactly 3 test rows only fold 0 is evaluated
        # (presumably the public sample test set, to keep dry runs fast -- TODO confirm).
        if test.shape[0] == 3 and fold > 0:
            continue

        # Build the architecture only; weights come from the fold checkpoint.
        model = CustomModel(cfg, config_path=cfg["model_path"]+"/config.pth", pretrained=False)
        state = torch.load(f"{cfg['model_path']}/checkpoint_{fold}.pth",
                       map_location=torch.device('cpu'))

        model.load_state_dict(state['model'])
        prediction = inference_fn(test_loader, model, device)
        predictions.append(prediction)

        # Release the model before the next fold is loaded.
        del model, state, prediction; gc.collect()
        torch.cuda.empty_cache()

    # Unweighted mean over the folds that were run.
    predictions = np.mean(predictions, axis=0)

    # Rows are in the same order as `test`, so predictions line up with text_id.
    test_sub = model_test
    test_sub[target_cols] = predictions

    test_sub.to_csv(f'submission_{cfg["id"]}.csv', index=False)
    del tokenizer, test_dataset, test_loader; gc.collect()

### Fb_Bart_Large Sub

In [None]:
import numpy as np 
import pandas as pd 
import os, gc, re, warnings
warnings.filterwarnings("ignore")

from transformers import AutoModel,AutoTokenizer
import torch
import torch.nn.functional as F
from tqdm import tqdm

import sys
sys.path.append('../input/iterative-stratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Labelled training essays and the unlabelled test essays.
train_df = pd.read_csv("/kaggle/input/feedback-prize-english-language-learning/train.csv")
test_df = pd.read_csv("/kaggle/input/feedback-prize-english-language-learning/test.csv")

# Tag each row with its origin before stacking both frames.
train_df["src"]="train"
test_df["src"]="test"

df = pd.concat([train_df, test_df], ignore_index=True)

target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions',]

FOLDS = 25

# Assign every training row to one of FOLDS validation folds, stratified jointly on all targets.
skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
for i,(train_index, val_index) in enumerate(skf.split(train_df, train_df[target_cols])):
    train_df.loc[val_index,'fold'] = i
    
def mean_pooling(model_output, attention_mask):
    """Mask-weighted mean of ``model_output.last_hidden_state`` over axis 1, computed on the CPU."""
    hidden = model_output.last_hidden_state.detach().cpu()
    weights = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    weighted_total = torch.sum(hidden * weights, 1)
    # Clamp the token count so an all-padding row does not divide by zero.
    token_count = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_total / token_count


class EmbedDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes ``full_text`` to a fixed ``max_length`` on access."""

    def __init__(self, df, max_length, tokenizer):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Positional 0..n-1 index so __getitem__ can look rows up by idx.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self,idx):
        essay = self.df.loc[idx,"full_text"]
        encoded = self.tokenizer(
            essay,
            None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading axis of size 1; drop it per field.
        sample = {}
        for field, tensor in encoded.items():
            sample[field] = tensor.squeeze(0)
        return sample
    
    
def get_embeddings(df, model_name, max_length, batch_size, device, verbose=True):
    """Embed every ``full_text`` of ``df`` with a frozen transformer.

    Each essay is tokenized to ``max_length``, passed through the model,
    mean-pooled over its unmasked tokens and L2-normalised.

    Args:
        df: frame with a ``full_text`` column.
        model_name: name or path given to ``AutoModel`` / ``AutoTokenizer``.
        max_length: fixed token length per essay.
        batch_size: inference batch size.
        device: device the model and batches are moved to.
        verbose: print the shape of the result.

    Returns:
        NumPy array with one embedding row per row of ``df``.
    """
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    dataset = EmbedDataset(df, max_length, tokenizer)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=4, shuffle=False)

    model = model.to(device)
    model.eval()
    all_train_text_feats = []

    for batch in tqdm(dataloader, total=len(dataloader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with torch.no_grad():
            model_output = model(input_ids=input_ids, attention_mask=attention_mask)

        sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        # Keep the batch axis. The former squeeze(0) turned a final batch of
        # size 1 into a 1-D vector, so extend() appended its scalars one by one
        # instead of a single row and the result was no longer a clean 2-D matrix.
        sentence_embeddings = sentence_embeddings.detach().cpu().numpy()
        all_train_text_feats.extend(sentence_embeddings)

    all_train_text_feats = np.array(all_train_text_feats)
    if verbose:
        print('Embeddings shape',all_train_text_feats.shape)

    return all_train_text_feats


# Local copy of the facebook/bart-large weights used as a frozen feature extractor.
model_name = '../input/d/kozodoi/transformers/facebook-bart-large'

# Embeddings stored in fb_bart_large.npy are used as the training features;
# they are indexed by train_df row position in the CV loop.
with open(os.path.join('../input/fb3-embeddings/fb_bart_large.npy'), 'rb') as file:
    fb_bart_large_embeddings = np.load(file)
    
# Test-set embeddings are computed on the fly; note the hard-coded 'cuda' device.
fb_bart_large_embeddings_test = get_embeddings(test_df, model_name, max_length=512, batch_size=4, device='cuda',)



# Single-source "concatenations": kept as lists so more embedding sets can be appended column-wise.
all_train_text_feats = np.concatenate([
    fb_bart_large_embeddings,
],axis=1)


te_text_feats = np.concatenate([
    fb_bart_large_embeddings_test,
],axis=1)


gc.collect()
print('Our concatenated embeddings have shape', all_train_text_feats.shape )

from cuml.svm import SVR
import cuml
print('RAPIDS version',cuml.__version__)

from sklearn.metrics import mean_squared_error

# Per-fold test predictions and validation scores, filled by the CV loop.
preds = []
scores = []


def comp_score(y_true,y_pred):
    """Mean of the per-target RMSE values, one column per entry of ``target_cols``."""
    per_target_rmse = [
        np.sqrt(mean_squared_error(y_true[:, col], y_pred[:, col]))
        for col in range(len(target_cols))
    ]
    return np.mean(per_target_rmse)

# For every fold: fit one SVR per target on the other folds, score the held-out
# fold, and predict the test set. Test predictions are averaged over folds below.
for fold in tqdm(range(FOLDS),total=FOLDS):
    dftr_ = train_df[train_df["fold"]!=fold]
    dfev_ = train_df[train_df["fold"]==fold]

    # train_df index labels are used as row positions into the embedding matrix.
    tr_text_feats = all_train_text_feats[list(dftr_.index),:]
    ev_text_feats = all_train_text_feats[list(dfev_.index),:]

    # One output column per target (was a hard-coded 6).
    ev_preds = np.zeros((len(ev_text_feats),len(target_cols)))
    test_preds = np.zeros((len(te_text_feats),len(target_cols)))
    for i,t in enumerate(target_cols):
        clf = SVR(C=1)
        clf.fit(tr_text_feats, dftr_[t].values)
        ev_preds[:,i] = clf.predict(ev_text_feats)
        test_preds[:,i] = clf.predict(te_text_feats)

    score = comp_score(dfev_[target_cols].values,ev_preds)
    scores.append(score)
    preds.append(test_preds)


print('\nOverall CV RSME =',np.mean(scores))

sub = test_df.copy()

# Unweighted mean of the per-fold test predictions.
sub.loc[:,target_cols] = np.average(np.array(preds),axis=0)
# Keep only the columns of the sample submission, in its order.
sub_columns = pd.read_csv("../input/feedback-prize-english-language-learning/sample_submission.csv").columns
sub = sub[sub_columns]
# index=False is the documented way to omit the row index (was index=None).
sub.to_csv("submission_fb_bart_large.csv",index=False)
sub.head()

### ad_v3_roberta_large Sub

In [None]:
import numpy as np 
import pandas as pd 
import os, gc, re, warnings
warnings.filterwarnings("ignore")

from transformers import AutoModel,AutoTokenizer
import torch
import torch.nn.functional as F
from tqdm import tqdm

import sys
sys.path.append('../input/iterative-stratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Read both competition splits and tag every row with the split it came from.
train_df = pd.read_csv("../input/feedback-prize-english-language-learning/train.csv").assign(src="train")
test_df = pd.read_csv("../input/feedback-prize-english-language-learning/test.csv").assign(src="test")

# Both splits stacked into one frame with a fresh 0..n-1 index.
df = pd.concat([train_df, test_df], ignore_index=True)

# Columns predicted by the models in this cell.
target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

FOLDS = 25

# Store, for each training row, the id of the fold in which it is held out.
skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
fold_splits = skf.split(train_df, train_df[target_cols])
for i, (train_index, val_index) in enumerate(fold_splits):
    train_df.loc[val_index, 'fold'] = i
    
def mean_pooling(model_output, attention_mask):
    """Mask-weighted mean of ``model_output.last_hidden_state`` over dimension 1.

    The hidden states are detached and moved to CPU before pooling. The mask
    is broadcast over the last dimension, and the per-row mask total is
    clamped to at least 1e-9 so a fully masked row yields zeros rather than
    a division by zero.
    """
    hidden_states = model_output.last_hidden_state.detach().cpu()
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total


class EmbedDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes the ``full_text`` column of a DataFrame row by row."""

    def __init__(self, df, max_length, tokenizer):
        # Re-index so that positions 0..len-1 are valid `.loc` labels.
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the tokenizer output for row ``idx`` with the leading size-1 axis removed."""
        essay = self.df.loc[idx, "full_text"]
        encoded = self.tokenizer(
            essay,
            None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        item = {}
        for name, tensor in encoded.items():
            item[name] = tensor.squeeze(0)
        return item
    
    
def get_embeddings(df, model_name, max_length, batch_size, device, verbose=True):
    """Compute one L2-normalised embedding per row of ``df``.

    Loads a model and tokenizer from ``model_name``, feeds ``df`` through
    ``EmbedDataset`` in unshuffled batches, pools each batch with
    ``mean_pooling`` and normalises every pooled vector to unit L2 norm.

    Args:
        df: DataFrame handed to ``EmbedDataset``.
        model_name: Path or identifier passed to ``from_pretrained``.
        max_length: Forwarded to ``EmbedDataset``.
        batch_size: Number of rows per forward pass.
        device: Device the model and the batch tensors are moved to.
        verbose: When true, print the shape of the resulting array.

    Returns:
        ``np.ndarray`` holding one embedding row per input row, in input order.
    """
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    dataset = EmbedDataset(df, max_length, tokenizer)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=4, shuffle=False)

    model = model.to(device)
    model.eval()
    all_train_text_feats = []

    for batch in tqdm(dataloader, total=len(dataloader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with torch.no_grad():
            model_output = model(input_ids=input_ids, attention_mask=attention_mask)

        sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        # Keep the batch axis. Squeezing it would turn a one-row batch (the last
        # batch when len(df) % batch_size == 1) into a 1-D vector, and `extend`
        # would then append one scalar per feature instead of a single row.
        sentence_embeddings = sentence_embeddings.detach().cpu().numpy()
        all_train_text_feats.extend(sentence_embeddings)

    all_train_text_feats = np.array(all_train_text_feats)
    if verbose:
        print('Embeddings shape',all_train_text_feats.shape)

    return all_train_text_feats


# Checkpoint directory used to embed the test essays.
# NOTE(review): this path ends with a space inside the quotes. Confirm that it
# matches the actual directory name before changing it.
model_name = '/kaggle/input/all-datasets-v3-robertalarge/roberta-large '

# Train-side features are read from a saved .npy file instead of being recomputed.
# Despite the variable name, this holds the contents of ad_v3_roberta_large.npy.
with open('../input/fb3-embeddings/ad_v3_roberta_large.npy', 'rb') as file:
    fb_bart_large_embeddings = np.load(file)

# Test-side features are computed here with the same checkpoint.
fb_bart_large_embeddings_test = get_embeddings(
    test_df,
    model_name,
    max_length=512,
    batch_size=4,
    device='cuda',
)

# Concatenating a one-element list along axis 1 just copies the (2-D) array.
all_train_text_feats = np.concatenate([fb_bart_large_embeddings], axis=1)
te_text_feats = np.concatenate([fb_bart_large_embeddings_test], axis=1)

gc.collect()
print('Our concatenated embeddings have shape', all_train_text_feats.shape)

from cuml.svm import SVR
import cuml
print('RAPIDS version',cuml.__version__)

from sklearn.metrics import mean_squared_error

# Filled by the fold loop below: per-fold test predictions and validation scores.
preds = []
scores = []
def comp_score(y_true,y_pred):
    """Return the mean of the per-column RMSE over the first len(target_cols) columns."""
    per_target_rmse = [
        np.sqrt(mean_squared_error(y_true[:, col], y_pred[:, col]))
        for col in range(len(target_cols))
    ]
    return np.mean(per_target_rmse)

# For every fold: fit one SVR per target on the other folds, score the held-out
# fold, and predict the test set. Test predictions are averaged over folds below.
for fold in tqdm(range(FOLDS),total=FOLDS):
    dftr_ = train_df[train_df["fold"]!=fold]
    dfev_ = train_df[train_df["fold"]==fold]

    # train_df index labels are used as row positions into the embedding matrix.
    tr_text_feats = all_train_text_feats[list(dftr_.index),:]
    ev_text_feats = all_train_text_feats[list(dfev_.index),:]

    # One output column per target (was a hard-coded 6).
    ev_preds = np.zeros((len(ev_text_feats),len(target_cols)))
    test_preds = np.zeros((len(te_text_feats),len(target_cols)))
    for i,t in enumerate(target_cols):
        clf = SVR(C=1)
        clf.fit(tr_text_feats, dftr_[t].values)
        ev_preds[:,i] = clf.predict(ev_text_feats)
        test_preds[:,i] = clf.predict(te_text_feats)

    score = comp_score(dfev_[target_cols].values,ev_preds)
    scores.append(score)
    preds.append(test_preds)


print('\nOverall CV RSME =',np.mean(scores))

sub = test_df.copy()

# Unweighted mean of the per-fold test predictions.
sub.loc[:,target_cols] = np.average(np.array(preds),axis=0)
# Keep only the columns of the sample submission, in its order.
sub_columns = pd.read_csv("../input/feedback-prize-english-language-learning/sample_submission.csv").columns
sub = sub[sub_columns]
# index=False is the documented way to omit the row index (was index=None).
sub.to_csv("submission_ad_v3_roberta_large.csv",index=False)
sub.head()

### fb_bart_large_mnli Sub

In [None]:
import numpy as np 
import pandas as pd 
import os, gc, re, warnings
warnings.filterwarnings("ignore")

from transformers import AutoModel,AutoTokenizer
import torch
import torch.nn.functional as F
from tqdm import tqdm

import sys
sys.path.append('../input/iterative-stratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Read both competition splits and tag every row with the split it came from.
train_df = pd.read_csv("/kaggle/input/feedback-prize-english-language-learning/train.csv").assign(src="train")
test_df = pd.read_csv("/kaggle/input/feedback-prize-english-language-learning/test.csv").assign(src="test")

# Both splits stacked into one frame with a fresh 0..n-1 index.
df = pd.concat([train_df, test_df], ignore_index=True)

# Columns predicted by the models in this cell.
target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

FOLDS = 25

# Store, for each training row, the id of the fold in which it is held out.
skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
fold_splits = skf.split(train_df, train_df[target_cols])
for i, (train_index, val_index) in enumerate(fold_splits):
    train_df.loc[val_index, 'fold'] = i
    
def mean_pooling(model_output, attention_mask):
    """Mask-weighted mean of ``model_output.last_hidden_state`` over dimension 1.

    The hidden states are detached and moved to CPU before pooling. The mask
    is broadcast over the last dimension, and the per-row mask total is
    clamped to at least 1e-9 so a fully masked row yields zeros rather than
    a division by zero.
    """
    hidden_states = model_output.last_hidden_state.detach().cpu()
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total


class EmbedDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes the ``full_text`` column of a DataFrame row by row."""

    def __init__(self, df, max_length, tokenizer):
        # Re-index so that positions 0..len-1 are valid `.loc` labels.
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the tokenizer output for row ``idx`` with the leading size-1 axis removed."""
        essay = self.df.loc[idx, "full_text"]
        encoded = self.tokenizer(
            essay,
            None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        item = {}
        for name, tensor in encoded.items():
            item[name] = tensor.squeeze(0)
        return item
    
    
def get_embeddings(df, model_name, max_length, batch_size, device, verbose=True):
    """Compute one L2-normalised embedding per row of ``df``.

    Loads a model and tokenizer from ``model_name``, feeds ``df`` through
    ``EmbedDataset`` in unshuffled batches, pools each batch with
    ``mean_pooling`` and normalises every pooled vector to unit L2 norm.

    Args:
        df: DataFrame handed to ``EmbedDataset``.
        model_name: Path or identifier passed to ``from_pretrained``.
        max_length: Forwarded to ``EmbedDataset``.
        batch_size: Number of rows per forward pass.
        device: Device the model and the batch tensors are moved to.
        verbose: When true, print the shape of the resulting array.

    Returns:
        ``np.ndarray`` holding one embedding row per input row, in input order.
    """
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    dataset = EmbedDataset(df, max_length, tokenizer)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=4, shuffle=False)

    model = model.to(device)
    model.eval()
    all_train_text_feats = []

    for batch in tqdm(dataloader, total=len(dataloader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with torch.no_grad():
            model_output = model(input_ids=input_ids, attention_mask=attention_mask)

        sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        # Keep the batch axis. Squeezing it would turn a one-row batch (the last
        # batch when len(df) % batch_size == 1) into a 1-D vector, and `extend`
        # would then append one scalar per feature instead of a single row.
        sentence_embeddings = sentence_embeddings.detach().cpu().numpy()
        all_train_text_feats.extend(sentence_embeddings)

    all_train_text_feats = np.array(all_train_text_feats)
    if verbose:
        print('Embeddings shape',all_train_text_feats.shape)

    return all_train_text_feats


# Checkpoint directory used to embed the test essays.
model_name = '/kaggle/input/facebook/bart-large-mnli'

# Train-side features are read from a saved .npy file instead of being recomputed.
# Despite the variable name, this holds the contents of fb_bart_large_mnli.npy.
with open('/kaggle/input/fb3-embeddings/fb_bart_large_mnli.npy', 'rb') as file:
    fb_bart_large_embeddings = np.load(file)

# Test-side features are computed here with the same checkpoint.
fb_bart_large_embeddings_test = get_embeddings(
    test_df,
    model_name,
    max_length=512,
    batch_size=4,
    device='cuda',
)

# Concatenating a one-element list along axis 1 just copies the (2-D) array.
all_train_text_feats = np.concatenate([fb_bart_large_embeddings], axis=1)
te_text_feats = np.concatenate([fb_bart_large_embeddings_test], axis=1)

gc.collect()
print('Our concatenated embeddings have shape', all_train_text_feats.shape)

from cuml.svm import SVR
import cuml
print('RAPIDS version',cuml.__version__)

from sklearn.metrics import mean_squared_error

# Filled by the fold loop below: per-fold test predictions and validation scores.
preds = []
scores = []
def comp_score(y_true,y_pred):
    """Return the mean of the per-column RMSE over the first len(target_cols) columns."""
    per_target_rmse = [
        np.sqrt(mean_squared_error(y_true[:, col], y_pred[:, col]))
        for col in range(len(target_cols))
    ]
    return np.mean(per_target_rmse)

# For every fold: fit one SVR per target on the other folds, score the held-out
# fold, and predict the test set. Test predictions are averaged over folds below.
for fold in tqdm(range(FOLDS),total=FOLDS):
    dftr_ = train_df[train_df["fold"]!=fold]
    dfev_ = train_df[train_df["fold"]==fold]

    # train_df index labels are used as row positions into the embedding matrix.
    tr_text_feats = all_train_text_feats[list(dftr_.index),:]
    ev_text_feats = all_train_text_feats[list(dfev_.index),:]

    # One output column per target (was a hard-coded 6).
    ev_preds = np.zeros((len(ev_text_feats),len(target_cols)))
    test_preds = np.zeros((len(te_text_feats),len(target_cols)))
    for i,t in enumerate(target_cols):
        clf = SVR(C=1)
        clf.fit(tr_text_feats, dftr_[t].values)
        ev_preds[:,i] = clf.predict(ev_text_feats)
        test_preds[:,i] = clf.predict(te_text_feats)

    score = comp_score(dfev_[target_cols].values,ev_preds)
    scores.append(score)
    preds.append(test_preds)


print('\nOverall CV RSME =',np.mean(scores))

sub = test_df.copy()

# Unweighted mean of the per-fold test predictions.
sub.loc[:,target_cols] = np.average(np.array(preds),axis=0)
# Keep only the columns of the sample submission, in its order.
sub_columns = pd.read_csv("../input/feedback-prize-english-language-learning/sample_submission.csv").columns
sub = sub[sub_columns]
# index=False is the documented way to omit the row index (was index=None).
sub.to_csv("submission_fb_bart_large_mnli.csv",index=False)
sub.head()

### all-roberta-large-v1

In [None]:
import numpy as np 
import pandas as pd 
import os, gc, re, warnings
warnings.filterwarnings("ignore")

from transformers import AutoModel,AutoTokenizer
import torch
import torch.nn.functional as F
from tqdm import tqdm

import sys
sys.path.append('../input/iterative-stratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Read both competition splits and tag every row with the split it came from.
train_df = pd.read_csv("/kaggle/input/feedback-prize-english-language-learning/train.csv").assign(src="train")
test_df = pd.read_csv("/kaggle/input/feedback-prize-english-language-learning/test.csv").assign(src="test")

# Both splits stacked into one frame with a fresh 0..n-1 index.
df = pd.concat([train_df, test_df], ignore_index=True)

# Columns predicted by the models in this cell.
target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

FOLDS = 25

# Store, for each training row, the id of the fold in which it is held out.
skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
fold_splits = skf.split(train_df, train_df[target_cols])
for i, (train_index, val_index) in enumerate(fold_splits):
    train_df.loc[val_index, 'fold'] = i
    
def mean_pooling(model_output, attention_mask):
    """Mask-weighted mean of ``model_output.last_hidden_state`` over dimension 1.

    The hidden states are detached and moved to CPU before pooling. The mask
    is broadcast over the last dimension, and the per-row mask total is
    clamped to at least 1e-9 so a fully masked row yields zeros rather than
    a division by zero.
    """
    hidden_states = model_output.last_hidden_state.detach().cpu()
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total


class EmbedDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes the ``full_text`` column of a DataFrame row by row."""

    def __init__(self, df, max_length, tokenizer):
        # Re-index so that positions 0..len-1 are valid `.loc` labels.
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the tokenizer output for row ``idx`` with the leading size-1 axis removed."""
        essay = self.df.loc[idx, "full_text"]
        encoded = self.tokenizer(
            essay,
            None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        item = {}
        for name, tensor in encoded.items():
            item[name] = tensor.squeeze(0)
        return item
    
    
def get_embeddings(df, model_name, max_length, batch_size, device, verbose=True):
    """Compute one L2-normalised embedding per row of ``df``.

    Loads a model and tokenizer from ``model_name``, feeds ``df`` through
    ``EmbedDataset`` in unshuffled batches, pools each batch with
    ``mean_pooling`` and normalises every pooled vector to unit L2 norm.

    Args:
        df: DataFrame handed to ``EmbedDataset``.
        model_name: Path or identifier passed to ``from_pretrained``.
        max_length: Forwarded to ``EmbedDataset``.
        batch_size: Number of rows per forward pass.
        device: Device the model and the batch tensors are moved to.
        verbose: When true, print the shape of the resulting array.

    Returns:
        ``np.ndarray`` holding one embedding row per input row, in input order.
    """
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    dataset = EmbedDataset(df, max_length, tokenizer)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=4, shuffle=False)

    model = model.to(device)
    model.eval()
    all_train_text_feats = []

    for batch in tqdm(dataloader, total=len(dataloader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with torch.no_grad():
            model_output = model(input_ids=input_ids, attention_mask=attention_mask)

        sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        # Keep the batch axis. Squeezing it would turn a one-row batch (the last
        # batch when len(df) % batch_size == 1) into a 1-D vector, and `extend`
        # would then append one scalar per feature instead of a single row.
        sentence_embeddings = sentence_embeddings.detach().cpu().numpy()
        all_train_text_feats.extend(sentence_embeddings)

    all_train_text_feats = np.array(all_train_text_feats)
    if verbose:
        print('Embeddings shape',all_train_text_feats.shape)

    return all_train_text_feats


# Checkpoint directory used to embed the test essays.
model_name = '/kaggle/input/all-roberta-large-v1/all_roberta_large_v1'

# Train-side features are read from a saved .npy file instead of being recomputed.
# Despite the variable name, this holds the contents of all_roberta_large_v1.npy.
with open('/kaggle/input/fb3-embeddings/all_roberta_large_v1.npy', 'rb') as file:
    fb_bart_large_embeddings = np.load(file)

# Test-side features are computed here with the same checkpoint.
fb_bart_large_embeddings_test = get_embeddings(
    test_df,
    model_name,
    max_length=512,
    batch_size=4,
    device='cuda',
)

# Concatenating a one-element list along axis 1 just copies the (2-D) array.
all_train_text_feats = np.concatenate([fb_bart_large_embeddings], axis=1)
te_text_feats = np.concatenate([fb_bart_large_embeddings_test], axis=1)

gc.collect()
print('Our concatenated embeddings have shape', all_train_text_feats.shape)

from cuml.svm import SVR
import cuml
print('RAPIDS version',cuml.__version__)

from sklearn.metrics import mean_squared_error

# Filled by the fold loop below: per-fold test predictions and validation scores.
preds = []
scores = []
def comp_score(y_true,y_pred):
    """Return the mean of the per-column RMSE over the first len(target_cols) columns."""
    per_target_rmse = [
        np.sqrt(mean_squared_error(y_true[:, col], y_pred[:, col]))
        for col in range(len(target_cols))
    ]
    return np.mean(per_target_rmse)

# For every fold: fit one SVR per target on the other folds, score the held-out
# fold, and predict the test set. Test predictions are averaged over folds below.
for fold in tqdm(range(FOLDS),total=FOLDS):
    dftr_ = train_df[train_df["fold"]!=fold]
    dfev_ = train_df[train_df["fold"]==fold]

    # train_df index labels are used as row positions into the embedding matrix.
    tr_text_feats = all_train_text_feats[list(dftr_.index),:]
    ev_text_feats = all_train_text_feats[list(dfev_.index),:]

    # One output column per target (was a hard-coded 6).
    ev_preds = np.zeros((len(ev_text_feats),len(target_cols)))
    test_preds = np.zeros((len(te_text_feats),len(target_cols)))
    for i,t in enumerate(target_cols):
        clf = SVR(C=1)
        clf.fit(tr_text_feats, dftr_[t].values)
        ev_preds[:,i] = clf.predict(ev_text_feats)
        test_preds[:,i] = clf.predict(te_text_feats)

    score = comp_score(dfev_[target_cols].values,ev_preds)
    scores.append(score)
    preds.append(test_preds)


print('\nOverall CV RSME =',np.mean(scores))

sub = test_df.copy()

# Unweighted mean of the per-fold test predictions.
sub.loc[:,target_cols] = np.average(np.array(preds),axis=0)
# Keep only the columns of the sample submission, in its order.
sub_columns = pd.read_csv("../input/feedback-prize-english-language-learning/sample_submission.csv").columns
sub = sub[sub_columns]
# index=False is the documented way to omit the row index (was index=None).
sub.to_csv("submission_all_roberta_large_v1.csv",index=False)
sub.head()

### Submission

In [None]:
# Per-target blend weights, one dict per target column, each with keys 'w1'..'w20'.
# Which model each key belongs to is defined where `best_weights` is built.
# The weights are not normalised here; the blending cell divides by their sum.

# Weights for the 'cohesion' target.
cohesion_params = {
'w1': 0.0,
'w2': 0.582090873995617,
'w3': 0.08868670946014108,
'w4': 0.0,
'w5': 0.14256297158423278,
'w6': 0.0038073273644528005,
'w7': 0.24051324161351678,
'w8': 0.0,
'w9': 0.0,
'w10': 0.7862496103372187,
'w11': 0.0,
'w12': 0.2681822633546716,
'w13': 0.0,
'w14': 0.0,
'w15': 0.28655998078690104, 
'w16': 0.0,
'w17': 0.13052768112895427,
'w18': 0.2576961605746973,
'w19': 0.01791441665701174,
'w20': 0.5
}


# Weights for the 'syntax' target.
syntax_params = {
'w1': 0.0,
'w2': 0.8466339874016717,
'w3': 0.24699160902252312,
'w4': 0.45172032437366627,
'w5': 0.20230744199065628,
'w6': 0.08572055526000201,
'w7': 0.15072082833029135,
'w8': 0.11290609048162618,
'w9': 0.17578301415435557,
'w10': 0.0,
'w11': 0.21168366686691406,
'w12': 0.3072189141092456,
'w13': 0.13570732112856576,
'w14': 0.0,
'w15': 0.0, 
'w16': 0.0,
'w17': 0.05030880871152592,
'w18': 0.0,
'w19': 0.0043819554664268415,
'w20': 0.5
}


# Weights for the 'vocabulary' target.
vocabulary_params = {
'w1': 0.0,
'w2': 0.9806452939462846,
'w3': 0.2723605069766004,
'w4': 0.09320920775166273,
'w5': 0.15775977843609892,
'w6': 0.0,
'w7': 0.28650798741718775,
'w8': 0.0,
'w9': 0.0,
'w10': 0.0,
'w11': 0.8348332049215981,
'w12': 0.46847877776107516,
'w13': 0.0,
'w14': 0.0,
'w15': 0.0, 
'w16': 0.21262565761978575,
'w17': 0.0,
'w18': 0.4149318319469712,
'w19': 0.0,
'w20': 0.5
}


# Weights for the 'phraseology' target.
phraseology_params = {
'w1': 0.4185029337443942,
'w2': 0.31017696697752817,
'w3': 0.3490430198213265,
'w4': 0.1684639587018389,
'w5': 0.14104578942538834,
'w6': 0.0,
'w7': 0.25846087373557347,
'w8': 0.10550665830893159,
'w9': 0.16239205775383195,
'w10': 0.0,
'w11': 0.6032314132416007,
'w12': 0.08037899037994861,
'w13': 0.1066308024357757,
'w14': 0.0,
'w15': 0.0057755331819348155,
'w16': 0.0,
'w17': 0.08670696928285031,
'w18': 0.11775531837030963,
'w19': 0.008131936097102142,
'w20': 0.5
}



# Weights for the 'grammar' target.
grammar_params = {
'w1': 0.13881704283983315,
'w2': 0.6868627209462645,
'w3': 0.41919000155474695,
'w4': 0.0,
'w5': 0.0,
'w6': 0.0,
'w7': 0.1733616109624987,
'w8': 0.17540674579032667,
'w9':  0.0,
'w10': 0.45361409581385786,
'w11': 0.9999963915756588,
'w12': 0.2560889054054799,
'w13': 0.0,
'w14': 0.5504465764503463,
'w15': 0.9322318619811504,  
'w16': 0.0,
'w17': 0.0,
'w18': 0.0,
'w19': 0.0,
'w20': 0.5
}



# Weights for the 'conventions' target.
conventions_params = {
'w1': 0.5519369189067282,
'w2': 0.7761298427228378,
'w3': 0.0,
'w4': 0.09113458324323657,
'w5': 0.09573713242683393,
'w6': 0.10942092087350164,
'w7': 0.007929047182409905,
'w8': 0.0,
'w9':  0.0,
'w10': 0.44350363206724763,
'w11': 0.20780569898513726,
'w12': 0.09938029141537365,
'w13': 0.23671579507997162,
'w14': 0.0,
'w15': 0.0,  
'w16': 0.03204307159190761,
'w17': 0.15707093503412223,
'w18': 0.0,
'w19': 0.0,
'w20': 0.5
}


In [None]:
# Model name -> key of that model's weight inside each per-target ``*_params`` dict.
# Insertion order is carried over into every inner dict of `best_weights`.
# 'exp12_fb3' and 'model75' are left out of the blend.
_WEIGHT_KEY_BY_MODEL = {
    'exp01_fb3_part2': 'w1',
    'model23': 'w2',
    '0926-deberta-v3-large': 'w3',
    'nischay': 'w4',
    '0919-deberta-v2-xlarge-mnli': 'w5',
    'exp14_fb3_roberta': 'w6',
    '0911-deberta-v3-large': 'w7',
    'exp13_fb3': 'w8',
    'exp02_fb3_part2_distilbert': 'w9',
    'model52': 'w10',
    'exp11_fb3_pl_train': 'w11',
    'exp11_fb3_rlarge': 'w12',
    'model68': 'w13',
    'model70': 'w14',
    'model71': 'w15',
    'fb_bart_large': 'w16',
    'ad_v3_roberta_large': 'w17',
    'fb_bart_large_mnli': 'w18',
    'all_roberta_large_v1': 'w19',
    'model55': 'w20',
}

# Target column -> the params dict that holds its weights.
_PARAMS_BY_TARGET = {
    'cohesion': cohesion_params,
    'syntax': syntax_params,
    'vocabulary': vocabulary_params,
    'phraseology': phraseology_params,
    'grammar': grammar_params,
    'conventions': conventions_params,
}

# best_weights[target][model] -> blend weight.
# Every target now reads all of its weights, including 'model55' (w20), from its
# own params dict. Previously 'model55' was always taken from conventions_params,
# which only worked because all six w20 values are equal.
best_weights = {
    target: {model: params[key] for model, key in _WEIGHT_KEY_BY_MODEL.items()}
    for target, params in _PARAMS_BY_TARGET.items()
}


In [None]:
# Every model named in the blend weights must have a ./submission_<model>.csv file.
model_names = list(best_weights['cohesion'])
submission = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')[['text_id']]

for model in model_names:
    df = pd.read_csv(f'./submission_{model}.csv')
    if 'full_text' in df.columns:
        df = df.drop(columns='full_text')
    # The new header list puts 'text_id' first, so the file's first column is
    # expected to be text_id; every other column gets a '_<model>' suffix.
    df.columns = ['text_id'] + [f'{col}_{model}' for col in df.columns if col != 'text_id']
    submission = submission.merge(df, on='text_id', how='left')

In [None]:
target_columns = ['cohesion', 'syntax', 'vocabulary',
                  'phraseology', 'grammar', 'conventions']

# Start every blended column at zero, then accumulate the weighted model columns.
submission[target_columns] = 0

for col in target_columns:
    w_ = best_weights[col]
    weighted_sum = submission[col]
    for fn, w in w_.items():
        weighted_sum = weighted_sum + submission[f'{col}_{fn}'] * w

    # Normalise by the total weight so the result is a weighted average.
    submission[col] = weighted_sum / np.sum(list(w_.values()))

# Drop the per-model columns, clamp predictions to [1, 5], and write the file.
submission = submission[['text_id'] + target_columns]
submission[target_columns] = submission[target_columns].clip(lower=1, upper=5)
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the first rows of the blended submission as the cell output.
submission.head()