# 3rd Place Solution - Feedback Prize English Language Learning
This notebook is team "Now You See Me"'s 3rd place solution to Kaggle's Feedback Prize English Language Learning competition. Team members are:
* Amed ( @amedprof )
* CroDoc ( @crodoc )
* Chris Deotte ( @cdeotte )

Our solution is an ensemble of 24 NLP models. There are 19 deberta-v3-large, 2 roberta-large, 2 deberta-v3-base, and 1 RAPIDS SVR. The models are all very diverse because they use different pooling methods, different pretraining, different learning schedules, etc. The ensemble was chosen from 50+ models using the hill-climbing ensemble technique. Our full solution write-up is [here][1].

[1]: https://www.kaggle.com/competitions/feedback-prize-english-language-learning/discussion/369609

# Select Models
Use `NUM_MODELS=24` to include SVR and TF models. Use `NUM_MODELS=22` to use only PyTorch models.

In [None]:
import numpy as np

# Ensemble size switch: 24 also runs the SVR and TF models, 22 runs the PyTorch models only.
NUM_MODELS = 24

# A later cell clears the whole namespace, so keep a copy of the setting on disk
# (np.save appends the .npy suffix, producing num_models.npy).
np.save('num_models', NUM_MODELS)

# SVR Model
from Chris Deotte's notebook [here][1]

[1]: https://www.kaggle.com/code/cdeotte/rapids-svr-cv-0-450-lb-0-44x

In [None]:
# Run the RAPIDS SVR notebook only when the full 24-model ensemble is selected.
if NUM_MODELS==24:
    %run /kaggle/input/fp3ensemblescripts/rapids-svr-cv-0-450.ipynb
    # clear every name from the interactive namespace (this also removes NUM_MODELS and all imports)
    %reset -f
    # the SVR notebook is expected to have written submission.csv; the blend later reads it as submission_svr1.csv
    !mv submission.csv submission_svr1.csv
    import glob
    # delete everything in the working directory except submission* files and the saved num_models file
    for f in glob.glob('*'):
        if (not f.startswith('submission'))&(not f.startswith('num_models')):
            !rm -rf {f}
            
import numpy as np #because variables were erased
NUM_MODELS = np.load('num_models.npy').item() 

# TF Model
from Xiang's notebook [here][2]

[2]: https://www.kaggle.com/code/electro/deberta-layerwiselr-lastlayerreinit-tensorflow

In [None]:
# Run the TensorFlow DeBERTa script only when the full 24-model ensemble is selected.
if NUM_MODELS==24:
    # launched through the shell as a separate python process, with warnings silenced by -W ignore
    !python -W ignore /kaggle/input/fp3ensemblescripts/TF-deberta-v3-base-CV-0-455.py
    # the script is expected to have written submission.csv; the blend later reads it as submission_tf.csv
    !mv submission.csv submission_tf.csv
    import glob
    # delete everything in the working directory whose name does not start with "submission"
    # (unlike the SVR cleanup this also removes num_models.npy; NUM_MODELS is already in memory here)
    for f in glob.glob('*'):
        if not f.startswith('submission'):
            !rm -rf {f}

# Transformer Models
trained offline by team "Now You See Me".

In [None]:
import transformers
# Report which transformers release this kernel provides.
print('Transformers version',transformers.__version__)

import sys
import os
import yaml

# Put this directory first on the module search path.
sys.path.insert(0, "/kaggle/input/amed-github-src2/src")

In [None]:
import torch
import gc 
import json
import joblib
import numpy as np
from tqdm.notebook import tqdm
from data.data_utils import batch_to_device,clean_text
from ml.embeddings import get_embeddings,prediction_lgbm
from train_utils import prediction_step

from types import SimpleNamespace
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModel, AutoConfig

from utils.utils import save_pickle,load_pickle,make_sub,Path,pd
%env TOKENIZERS_PARALLELISM = true

In [None]:
# When True, inference runs on selected folds of train.csv and the score is printed;
# when False, test.csv is predicted and submission files are written.
DEBUG = False

# Inference batch size for every model whose name does not contain
# 'sentence', 'paragraph' or 'word' (those are run with batch size 1).
BATCH_SIZE = 4

# Fold column of df_folds.csv and the fold ids kept from it (only read when DEBUG is True).
fold_name = "fold_k_5_seed_42"
Folds = [0]

def mcrmse(targets, predictions):
    """Mean column-wise root mean squared error between two arrays.

    The squared difference is averaged along axis 0, the square root of each
    of those means is taken, and the resulting values are averaged along axis 0.
    """
    per_column_rmse = np.sqrt(np.mean(np.square(targets - predictions), axis=0))
    return np.mean(per_column_rmse, axis=0)

# Names of the six score columns that are predicted and blended.
TARGET = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar','conventions']

if not DEBUG:
    # Normal run: predict the competition test set.
    test_df = pd.read_csv("../input/feedback-prize-english-language-learning/test.csv")
else:
    # Debug run: attach the fold assignment to the training data and keep only the selected folds.
    test_df = (
        pd.read_csv("../input/feedback-prize-english-language-learning/train.csv")
        .merge(pd.read_csv("../input/fp3-blending-weight/df_folds.csv"),how='left',on="text_id")
    )
    test_df = test_df.loc[test_df[fold_name].isin(Folds)].reset_index(drop=True)

print( test_df.shape )
test_df.head(2)

# Infer Transformers

In [None]:
def predict_folder(test_df,folder,name,bs=1,max_len=640,IDX=0):
    """Run inference with the checkpoints stored in one model folder.

    Args:
        test_df: DataFrame to predict on; a copy is handed to `prediction_step`.
        folder: directory holding `params.json`, `config.pth`, the tokenizer
            and the `*.pth` checkpoints.
        name: tag used for the output file `submission_{name}.csv`
            (only written when DEBUG is False).
        bs: batch size written into the validation-loader settings.
        max_len: value written to both `max_len` and `max_len_eval` of the
            model settings.
        IDX: unused; kept so existing callers that pass it keep working.

    Returns:
        When DEBUG is True, whatever `prediction_step` returns for the
        fold-0 checkpoints. Otherwise None; the predictions are written to
        `submission_{name}.csv`.

    Raises:
        FileNotFoundError: if no matching checkpoint is found in `folder`
            (previously this surfaced as an opaque ZeroDivisionError).
    """
    # Load the saved settings; the context manager closes the file even if parsing fails.
    with open(f'{folder}/params.json') as f:
        args = SimpleNamespace(**json.load(f))

    # Point config and tokenizer at the copies shipped inside the model folder.
    args.model['pretrained_config'] = f"{folder}/config.pth"
    args.model['pretrained_tokenizer'] = f"{folder}/tokenizer/tokenizer"
    args.val_loader['batch_size'] = bs
    args.model['max_len_eval'] = max_len
    args.model['max_len'] = max_len
    args.model['additional_features'] = []
    # Leave one core free; os.cpu_count() may return None and the result must not go negative.
    args.val_loader['num_workers'] = max(0, (os.cpu_count() or 1) - 1)
    # Parameter files without span-pooling settings get empty defaults.
    if 'spans pooling params' not in args.model:
        args.model['spans pooling params'] = ""
        args.model['spans'] = ""

    args.device = 0

    weight_files = list(Path(folder).glob("*.pth"))
    if DEBUG:
        # Debug mode only uses the fold-0 checkpoint(s).
        checkpoints = [x.as_posix() for x in weight_files if ('fold_0' in str(x)) or ('fold0' in str(x))]
    else:
        # config.pth sits next to the weights but is not a checkpoint.
        checkpoints = [x.as_posix() for x in weight_files if 'config' not in str(x)]
        if len(checkpoints) != 5:
            print(f'=> ERROR (only {len(checkpoints)}) not 5 folds in',folder)

    if not checkpoints:
        # Without this guard the equal-weight computation below divides by zero.
        raise FileNotFoundError(f'No usable *.pth checkpoints found in {folder}')

    # Every checkpoint contributes equally to the folder's prediction.
    weights = [1/len(checkpoints)]*len(checkpoints)
    if DEBUG:
        return prediction_step(args,test_df.copy(),checkpoints=checkpoints,weights=weights)
    prediction_step(args,test_df.copy(),checkpoints=checkpoints,weights=weights).to_csv(f'submission_{name}.csv',index=False)
    gc.collect()
    
def read_n_sort(path_file):
    """Load a CSV and return it ordered by `text_id` with a fresh 0..n-1 index."""
    frame = pd.read_csv(path_file)
    frame = frame.sort_values('text_id')
    return frame.reset_index(drop=True)

## Map Model Names to Kaggle Datasets

In [None]:
# File names identifying every candidate model. Each entry is paired BY POSITION with the
# entry at the same index in `paths`, so the two lists must be edited together.
names = ['oof_deberta-v3-large_psl-dv3l-08-10-2022--01_0.4388_bs_1_ml_640.csv',
       'oof_deberta-v3-large_psl-dv3l-01-11-2022--01_0.4390_bs_1_ml_640.csv',
       'oof_deberta-v3-large-squad2_psl-dv3lsq-01-11-2022--01_0.4399_bs_1_ml_640.csv',
       'oof_deberta-large_psl-dl-01-11-2022--01_0.4410_bs_1_ml_640.csv',
       'oof_deberta-v3-base_psl-dv3b-01-11-2022--01_0.4425_bs_1_ml_640.csv',
       'oof_deberta-v3-large-squad2_dv3ls-cls-12-10-2022--02_0.4524_bs_1_ml_640.csv',
       'oof_deberta-v3-large-squad2_dv3ls-cls-14-10-2022--02_0.4525_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-14-10-2022--01_0.4495_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-cls-15-10-2022--03_0.4509_bs_1_ml_640.csv',
       'oof_roberta-large-ner-english_jb-GradNorm-MeanP-02-10-2022--01_0.4571_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-GradNorm-CLS-04-10-2022--04_0.4515_bs_1_ml_640.csv',
       'oof_deberta-v3-large_sentences-att-mp-13-11-2022---01_0.4528_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-mp-fc-15-10-2022--02_0.4503_bs_1_ml_640.csv',
       'oof_deberta-v3-base_psl-dv3b-10-10-2022--02_0.4437_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-GradNorm-MP-04-10-2022--05_0.4492_bs_1_ml_640.csv',
       'oof_deberta-large_MeanP-01-10-2022--01_0.4542_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-mp-15-10-2022--03_0.4508_bs_1_ml_640.csv',
       'oof_deberta-v3-large-squad2_dv3ls-12-10-2022--02_0.4565_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-GradNorm-MeanP-03-10-2022--01_0.4491_bs_1_ml_640.csv',
       'oof_deberta-v3-base_dv3b-GradNorm-MeanP-02-10-2022--01_0.4575_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-12-10-2022--02_0.4522_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-mp-17-10-2022--01_0.4512_bs_1_ml_640.csv',
       'oof_deberta-v3-large_words-att-mp-13-11-2022---01_0.4574_bs_1_ml_640.csv',
       'oof_deberta-v3-large_paragraph-att-mp-13-11-2022---01_0.4527_bs_1_ml_640.csv',
       'oof_deberta-v3-large-squad2_dv3ls-14-10-2022--01_0.4570_bs_1_ml_640.csv',
       'oof_deberta-v3-large_psl-18m-dv3l-21-11-2022--01_0.4396_bs_1_ml_640_cd.csv', 
       'oof_deberta-v3-base_psl-18m-dv3l-21-11-2022--01-base_0.4415_bs_1_ml_640_cd.csv', 
       'oof_deberta-v3-large_MeanP-01-10-2022--01_0.4498_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-mp-fc-15-10-2022--01_0.4516_bs_1_ml_640.csv',
       'oof_deberta-v3-large_psl-dv3l-08-10-2022--02_0.4396_bs_1_ml_640.csv'] 

# Two deberta-v3-small entries, appended in the same order as the matching `paths += [...]`.
names += ['oof_deberta-v3-small_clean-deberta-v3-small_0.4567_bs_1_ml_640_cd3.csv', 
          'oof_deberta-v3-small_attention-deberta-v3-small_0.4575_bs_1_ml_640_cd3.csv'] 

# Further entries; the last three (SVR1, SVR2, TF) are the non-PyTorch models listed in SKIP.
names += ['oof_deberta-large-mnli_dmnl-GradNorm-MP-03-10-2022--01_0.4548_bs_1_ml_640.csv',
       'oof_deberta-v3-large-squad2_dv3ls-12-10-2022--01_0.4501_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-12-10-2022--01_0.4500_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-14-10-2022--02_0.4511_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-GradNorm-MeanP-02-10-2022--01_0.4489_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-GradNorm-MeanP-04-10-2022--02_0.4500_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-cls-29-10-2022--01_0.4502_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-cls-29-10-2022--02_0.4509_bs_1_ml_640.csv',
       'oof_deberta-v3-large_dv3l-mp-19-11-2022--02_0.4470_bs_1_ml_640.csv',
       'oof_deberta-v3-large_sentences-max-mp-13-11-2022---01_0.4528_bs_1_ml_640.csv',
       'oof_xlm-roberta-large-finetuned-conll03-english_kaggle_sub_0.4575_bs_1_ml_640.csv',
       'SVR1_OOF_4526.csv', 'SVR2_OOF_4544.csv', 'TF_OOF_4554.csv']

# Kaggle dataset folder for every candidate model. Each entry is paired BY POSITION with the
# entry at the same index in `names`, so the two lists must be edited together.
paths = [
            "/kaggle/input/psl-dv3l-08-10-2022-01/psldv3l0810202201",
            "/kaggle/input/psl-dv3l-01-11-2022-01/psldv3l0111202201",
            "/kaggle/input/psl-dv3lsq-01-11-2022-01/psldv3lsq0111202201",
            "/kaggle/input/psl-dl-01-11-2022-01/psldl0111202201",
            "/kaggle/input/psl-dv3b-01-11-2022-01/psldv3b0111202201",
            "/kaggle/input/dv3ls-cls-12-10-2022-02/dv3lscls1210202202",
            "/kaggle/input/dv3ls-cls-14-10-2022-02/dv3lscls1410202202",
            "/kaggle/input/dv3l-14-10-2022-01-0-4495/dv3l1410202201-04495",
            "/kaggle/input/dv3l-cls-15-10-2022-03/dv3lcls1510202203",
            "/kaggle/input/jbgradnormmeanp0210202201",
            "/kaggle/input/dv3l-gradnorm-cls-04-10-2022-04/dv3lgradnormcls0410202204",
            "/kaggle/input/sentences-att-mp-13-11-2022-01/sentencesattmp1311202201",
            "/kaggle/input/dv3l-mp-fc-15-10-2022-02/dv3lmpfc1510202202",
            "/kaggle/input/notebook525f2d2f62/psldv3b1010202202",
            "/kaggle/input/dv3l-gradnorm-mp-04-10-2022-05-0-4494/dv3lgradnormmp0410202205-04494",
            "/kaggle/input/fp3-deberta-large-meanp0110202201",
            "/kaggle/input/dv3l-mp-15-10-2022-03/dv3lmp1510202203",
            "/kaggle/input/dv3ls-12-10-2022-02/dv3ls1210202202",
            "/kaggle/input/dv3lgradnormmeanp0310202201",
            # NOTE(review): the next path is paired by position with the deberta-v3-base
            # "dv3b-GradNorm-MeanP-02-10-2022--01" entry of `names`, yet the same path appears
            # again further down for the deberta-v3-large "dv3l-GradNorm-MeanP-02-10-2022--01"
            # entry. Confirm this is the intended dataset for the base model.
            "/kaggle/input/dv3lgradnormmeanp0210202201",
            "/kaggle/input/dv3l-12-10-2022-02/dv3l1210202202",
            "/kaggle/input/dv3l-mp-17-10-2022-01/dv3lmp1710202201",
            "/kaggle/input/words-att-mp-13-11-2022-01/wordsattmp1311202201",
            "/kaggle/input/paragraph-att-mp-13-11-2022-01/paragraphattmp1311202201",
            "/kaggle/input/dv3ls-14-10-2022-01/dv3ls1410202201",
            "/kaggle/input/deberta-v3-large-psl18/debertav3largepsl18", 
            "/kaggle/input/deberta-v3-base-18psl/debertav3base18psl", 
            "/kaggle/input/fp3-deberta-v3-large-meanp0110202201",
            "/kaggle/input/dv3l-mp-fc-15-10-2022-01/dv3lmpfc1510202201",
            "/kaggle/input/psl-dv3l-08-10-2022-02/psldv3l081020222"
] 

# Folders of the two deberta-v3-small entries, in the same order as the matching `names += [...]`.
paths += ['/kaggle/input/clean-deberta-v3-small/cleandebertav3small', 
          '/kaggle/input/attention-deberta-v3-small/attentiondebertav3small'] 

# Further folders; the last three entries are not folders but the SKIP markers of the
# non-PyTorch models, which the inference loop skips.
paths += ['/kaggle/input/dmnlgradnormmp0310202201',
       '/kaggle/input/dv3ls-12-10-2022-01-0-4499/dv3ls1210202201-0',
       '/kaggle/input/dv3l-12-10-2022-01-0-4496/dv3l1210202201-04496',
       '/kaggle/input/dv3l-14-10-2022-02/dv3l1410202202',
       '/kaggle/input/dv3lgradnormmeanp0210202201',
       '/kaggle/input/dv3lgradnormmeanp0410202202',
       '/kaggle/input/dv3l-cls-29-10-2022-01/dv3lcls2910202201',
       '/kaggle/input/dv3l-cls-29-10-2022-02/dv3lcls2910202202',
       '/kaggle/input/dv3l-mp-19-11-2022-02/dv3lmp1911202202',
       '/kaggle/input/sentences-max-mp-13-11-2022-01/sentencesmaxmp1311202201',
       '/kaggle/input/fp3-xlmmeanp0210202201',
       'SVR1_OOF_4526.csv', 'SVR2_OOF_4544.csv', 'TF_OOF_4554.csv']

# NON-PyTorch NLP MODELS
# Entries of `paths` that are markers rather than model folders; the PyTorch inference loop skips them.
SKIP = ['SVR1_OOF_4526.csv', 'SVR2_OOF_4544.csv', 'TF_OOF_4554.csv','oof_v1169.csv']
# Marker -> suffix of the submission_<suffix>.csv file that the corresponding model wrote.
skip_names = {'SVR1_OOF_4526.csv':'svr1', 'SVR2_OOF_4544.csv':'svr2', 'TF_OOF_4554.csv':'tf','oof_v1169.csv':'chris'}

# `names` and `paths` are paired by position. zip() stops at the shorter input, so a missing
# entry would silently drop or mis-pair models; fail loudly instead.
assert len(names) == len(paths), f'names ({len(names)}) and paths ({len(paths)}) must have the same length'
name2path = dict(zip(names, paths))

In [None]:
import pandas as pd

# Batch size per model: 1 for names containing 'sentence', 'paragraph' or 'word',
# BATCH_SIZE for everything else.
bss = [
    1 if any(tag in n for tag in ('sentence', 'paragraph', 'word')) else BATCH_SIZE
    for n in names
]
name2bs = dict(zip(names, bss))

# Table of selected models (column `n`) and blend weights (column `w`) for the chosen ensemble size.
e_wgt = pd.read_csv(f'/kaggle/input/fp3ensemblescripts/ensemble_{NUM_MODELS}_model_no_psl.csv')
e_wgt.head()

In [None]:
# Resolve the folder of every model listed in the weights table.
folders = [name2path[model_name] for model_name in e_wgt.n.values]
print( folders ,'\n')

# Blend weights, in the same order as `folders`.
weights = list( e_wgt.w.values )
print( weights ,'\n')

# Inference batch size of every selected model.
BS = [name2bs[model_name] for model_name in e_wgt.n.values]
print( BS ,'\n')

# From here on `names` holds short tags m0, m1, ... (one per selected model) used in the
# per-model submission file names; the original list of model names is overwritten.
names = [f"m{i}" for i, _ in enumerate(folders)]
len(folders),np.sum(weights),len(weights)

## Infer PyTorch Models

In [None]:
# One pass over the selected models; non-PyTorch entries (SKIP markers) are ignored here.
for i,(folder,name,bs) in enumerate(zip(folders,names,BS)):
    if folder in SKIP: continue
    if DEBUG:
        # Debug run: keep the returned predictions and print their score against the known targets.
        res = predict_folder(test_df,folder,name,bs=bs,IDX=i)
        print(mcrmse(res.sort_values("text_id")[TARGET].values,test_df.sort_values("text_id")[TARGET].values))
    else:
        # Normal run: predict_folder writes submission_{name}.csv itself.
        predict_folder(test_df,folder,name,bs=bs,IDX=i)

# Create Ensemble Submission CSV

In [None]:
# Order rows by text_id and renumber the index. read_n_sort orders every model file the
# same way, and the blend below adds values by position.
test_df = test_df.sort_values('text_id')
test_df = test_df.reset_index(drop=True)

In [None]:
# Start from zero and accumulate each model's weighted predictions.
test_df[TARGET] = 0
for folder, model_tag, weight in zip(folders, names, weights):
    # Non-PyTorch models saved their file under a short alias instead of the m{i} tag.
    file_tag = skip_names[folder] if folder in SKIP else model_tag
    csv_path = f'submission_{file_tag}.csv'
    model_preds = read_n_sort(csv_path)[TARGET].values
    test_df[TARGET] += model_preds * weight
    # The per-model file is deleted once blended, so re-running this cell requires redoing inference.
    os.remove(csv_path)

In [None]:
# Write the blended scores (id column plus the six targets) as the final submission, then preview it.
test_df[["text_id", *TARGET]].to_csv('submission.csv',index=False)
test_df[["text_id", *TARGET]].head()