In [4]:
import json
import itertools
from pathlib import Path
import numpy as np
import pandas as pd
from re import search
from typing import List, Tuple, Any, Union
import nltk
import torch
from datasets import Dataset
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, AutoTokenizer
from tqdm.auto import tqdm
import collections
from sklearn.utils import shuffle

def load_json(file: str):
    return json.loads(Path(file).read_bytes())

p_data = pd.read_csv("./dataset/train.csv").drop(["Unnamed: 6", "total no.: 7987"], axis=1)

split_ids = load_json('./dataset/splitIds__splitBy-id_stratifyBy-s_train-0.6_valid-0.2_test-0.2_seed-42.json')
# train_data, valid_data, test_data = [p_data[p_data.id.isin(split_ids[split])] for split in ["train", "valid", "test"]]
# print(train_data.shape[0],valid_data.shape[0],test_data.shape[0])

train_data, train2_data, valid_data = [p_data[p_data.id.isin(split_ids[split])] for split in ["train", "valid", "test"]]
print(train_data.shape[0],train2_data.shape[0],valid_data.shape[0])
train_data = pd.concat([train_data, train2_data],axis=0)
print(train_data.shape[0], valid_data.shape[0])

# split_ids = load_json('./dataset/splitIds__nsplits-5_seed-3.json')
# train_data, valid_data = [p_data[p_data.id.isin(split_ids[1][split])] for split in ["train", "valid"]]
# print(p_data.shape[0], train_data.shape[0],valid_data.shape[0])

22919 7728 7699
30647 7699


In [26]:
args = {
    "max_len" : 512,
    "batch_size" : 8,
    "model_name" : "janeel/muppet-roberta-base-finetuned-squad",
    "learning_rate" : 3e-5,
    "warmup_ratio" : 0.06,
    "seed" : 24,
    "split" : "6+22",
    "special" : "shuffle"
}

### Functions

In [7]:
def contains(small, big):
    for i in range(len(big)-len(small)+1):
        for j in range(len(small)):
            if big[i+j] != small[j]:
                break
        else:
            return i, i+len(small)-1
    return False

def keep_continuous(data: pd.DataFrame):
    keep=[]
    for i in range(data.shape[0]):
        qp_not_in_q = data.iloc[i]['q\''][1:-1] not in data.iloc[i]['q'][1:-1]
        rp_not_in_r = data.iloc[i]['r\''][1:-1] not in data.iloc[i]['r'][1:-1]
        if not (qp_not_in_q or rp_not_in_r):
            keep.append(i)
    
    data = data.iloc[keep]
    return data

#format data 
#token type id sepid is in 0 not 1, context 0 question 1
#attention mask有東西的1其他0
#TODO: pad use tokenizer.pad_token_id
def format_data_qp(q: List[int], r: List[int], s: int, qp: List[int], rp: List[int]) -> Tuple[List[int], List[int], List[int], int, int]:
    cls_q_sep = [clsid] + q + [sepid]
    
    q_r_s = [clsid] + q + [sepid] + r + [sepid] + s + [sepid]
    attention_mask = [1 if _ in range(len(q_r_s)) else 0 for _ in range(args['max_len'])]
    input_id = [q_r_s[_] if _ in range(len(q_r_s)) else padid for _ in range(args['max_len'])]
    
    if contains(qp, q_r_s):
        start_pos, end_pos = contains(qp, q_r_s)
        print('qp:', qp)
        print('q_r_s:', q_r_s)
        print(start_pos, end_pos)
    else:
        start_pos, end_pos = 0, 0
        
    return input_id, attention_mask, start_pos, end_pos

def format_data_rp(q: List[int], r: List[int], s: int, qp: List[int], rp: List[int]) -> Tuple[List[int], List[int], List[int], int, int]:
    cls_q_sep = [clsid] + r + [sepid]
    
    q_r_s = [clsid] + r + [sepid] + q + [sepid] + s + [sepid]
    attention_mask = [1 if _ in range(len(q_r_s)) else 0 for _ in range(args['max_len'])]
    input_id = [q_r_s[_] if _ in range(len(q_r_s)) else padid for _ in range(args['max_len'])]
    
    if contains(rp, q_r_s):
        start_pos, end_pos = contains(rp, q_r_s)
    else:
        start_pos, end_pos = 0, 0
        
    return input_id, attention_mask, start_pos, end_pos

### Tokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained(args["model_name"])
clsid = tokenizer.cls_token_id
sepid = tokenizer.sep_token_id
padid = tokenizer.pad_token_id
def model_tokenize(text: str) -> List[int]:
    text = text.strip('"')
    token_ids = tokenizer(text)["input_ids"]
    return token_ids[1:-1] #without cls sep

In [101]:
tokenizer('This is is a book.', 'This is is a book two.')

{'input_ids': [0, 713, 16, 16, 10, 1040, 4, 2, 2, 713, 16, 16, 10, 1040, 80, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [45]:
def preprocess(data: pd.DataFrame, choice:str):
    data = keep_continuous(data)
    print('ids left:', data['id'].nunique())
    print('instances left', data.shape[0])
    ids = list(data.id)
    Q, R, S, QP, RP = [data[field] for field in ["q", "r", "s", "q'", "r'"]]
    Q, R, QP, RP, S = [list(map(model_tokenize, x)) for x in [Q, R, QP, RP, S]]

    # only keep those Q+R+S < 512 tokens
    count = 0
    keep = []
    for i in range(len(Q)):
        if (len(Q[i])+len(R[i])) > 512-5:
            count += 1
        else:
            keep.append(i)
    print(f"Q+R+S longer than {args['max_len']} tokens:", count, " Remains:",len(keep))
    Q = [Q[i] for i in keep]
    R = [R[i] for i in keep]
    QP = [QP[i] for i in keep]
    RP = [RP[i] for i in keep]
    S = [S[i] for i in keep]
    ids = [ids[i] for i in keep]
    
    #find start end positions make dict
    if choice == 'qp':
        data = list(map(format_data_qp, Q, R, S, QP, RP))
    elif choice == 'rp':
        data = list(map(format_data_rp, Q, R, S, QP, RP))
    else:
        return 'ERROR'
    return 0
    input_list, token_list, attention_list, s_pos, e_pos =[], [], [], [], []
    for i in range(len(data)):
        input_list.append(data[i][0])
        attention_list.append(data[i][1])
        s_pos.append(data[i][2])
        e_pos.append(data[i][3])
        
    data = {
        'input_ids': input_list,
        'attention_masks': attention_list,
        'start_positions': s_pos,
        'end_positions': e_pos
    }
    
    #make dataset
    ds = Dataset.from_dict(data)
    return ds

In [50]:
train_data_qp_done=preprocess(train_data, 'qp')
valid_data_qp_done=preprocess(valid_data, 'qp')

train_data_rp_done=preprocess(train_data, 'rp')
valid_data_rp_done=preprocess(valid_data, 'rp')

ids left: 1
instances left 3
Q+R+S longer than 512 tokens: 0  Remains: 3
qp: [243, 64, 213, 258, 1319, 479, 166, 70, 2980, 479, 85, 16, 99, 47, 109, 19, 24, 14, 3510, 479]
q_r_s: [0, 243, 64, 213, 258, 1319, 479, 166, 70, 2980, 479, 85, 16, 99, 47, 109, 19, 24, 14, 3510, 479, 2, 36948, 479, 2, 3450, 30009, 2]
1 20
qp: [243, 64, 213, 258, 1319, 479, 166, 70, 2980, 479, 85, 16, 99, 47, 109, 19, 24, 14, 3510, 479]
q_r_s: [0, 243, 64, 213, 258, 1319, 479, 166, 70, 2980, 479, 85, 16, 99, 47, 109, 19, 24, 14, 3510, 479, 2, 36948, 479, 2, 3450, 30009, 2]
1 20


### Model / Collator / Trainer

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator

model = AutoModelForQuestionAnswering.from_pretrained(args["model_name"])
model_args = TrainingArguments(
    f'{args["model_name"]}-qp-82-b_{args["batch_size"]}-lr_{args["learning_rate"]}',
    evaluation_strategy = "epoch",
    learning_rate=args["learning_rate"],
    per_device_train_batch_size=args['batch_size'],
    per_device_eval_batch_size=args['batch_size'],
    warmup_ratio=args["warmup_ratio"],
    seed=args["seed"],
    num_train_epochs=6,
    weight_decay=0.01,
    gradient_accumulation_steps=4
)


data_collator = default_data_collator

trainer = Trainer(
    model,
    model_args,
    train_dataset=train_data_qp_done,
    eval_dataset=valid_data_qp_done,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()
#trainer.save_model(f'aicup-trained-qp-{args["model_name"]}-82')

In [None]:
model = AutoModelForQuestionAnswering.from_pretrained(args["model_name"])
model_args = TrainingArguments(
    f'{args["model_name"]}-rp-82-b_{args["batch_size"]}-lr_{args["learning_rate"]}',
    evaluation_strategy = "epoch",
    learning_rate=args["learning_rate"],
    per_device_train_batch_size=args['batch_size'],
    per_device_eval_batch_size=args['batch_size'],
    warmup_ratio=args["warmup_ratio"],
    seed=args["seed"],
    num_train_epochs=6,
    weight_decay=0.01,
    gradient_accumulation_steps=4
)


data_collator = default_data_collator

trainer = Trainer(
    model,
    model_args,
    train_dataset=train_data_rp_done,
    eval_dataset=valid_data_rp_done,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()
#trainer.save_model(f'aicup-trained-rp-{args["model_name"]}-82')

### Predict Q'

In [9]:
def leave_unique_id(data: pd.DataFrame):
    ids = []
    index = []
    for i in range(data.shape[0]):
        if data.iloc[i]['id'] not in ids:
            ids.append(data.iloc[i]['id'])
            index.append(i)
    print(len(index), len(ids))
    data = data.iloc[index]
    return data

def format_data_post_qp(q: str, r: str, s: str, ids: str):
    q_r_s =  q + '</s>' + r + '</s>' + s 
    tokenized_q_r_s = tokenizer(q_r_s, return_offsets_mapping=True, padding="max_length", max_length=512, truncation=True)
    
    cls_q_sep =  q
    tokenized_q = tokenizer(cls_q_sep)["input_ids"]
    
    tokenized_q_r_s["example_id"] = ids
    tokenized_q_r_s["offset_mapping"] = [tokenized_q_r_s["offset_mapping"][_] if _ in range(len(tokenized_q)-1) else None for _ in range(len(tokenized_q_r_s["offset_mapping"]))]
    tokenized_q_r_s["offset_mapping"][0] = None
    return tokenized_q_r_s

def format_data_post_rp(q: str, r: str, s: str, ids: str):
    q_r_s =  r + '</s>' + q + '</s>' + s 
    tokenized_q_r_s = tokenizer(q_r_s, return_offsets_mapping=True, padding="max_length", max_length=512, truncation=True)
    
    cls_q_sep =  r
    tokenized_q = tokenizer(cls_q_sep)["input_ids"]
    
    tokenized_q_r_s["example_id"] = ids
    tokenized_q_r_s["offset_mapping"] = [tokenized_q_r_s["offset_mapping"][_] if _ in range(len(tokenized_q)-1) else None for _ in range(len(tokenized_q_r_s["offset_mapping"]))]
    tokenized_q_r_s["offset_mapping"][0] = None
    return tokenized_q_r_s
        
def postprocess(data: pd.DataFrame, choice: str):
    #data = keep_continuous(data) #seems unusefull in predicting
    print('ids left:', data['id'].nunique())
    print('instances left', data.shape[0])
    ids = list(data.id)
    Q, R, S = [data[field] for field in ["q", "r", "s"]]
    Q, R, S = [list(map(lambda x: x.strip('"'), y)) for y in [Q, R, S]]
    
    if choice == 'qp':
        data = list(map(format_data_post_qp, Q, R, S, ids))
    elif choice == 'rp':
        data = list(map(format_data_post_rp, Q, R, S, ids))
    input_list, token_list, attention_list, offset, ex_id =[], [], [], [], []
    
    for i in range(len(data)):
        input_list.append(data[i]["input_ids"])
        attention_list.append(data[i]["attention_mask"])
        offset.append(data[i]["offset_mapping"])
        ex_id.append(data[i]["example_id"])
        
    data = {
        'input_ids': input_list,
        'attention_mask': attention_list,
        'offset_mapping': offset,
        'example_id': ex_id
    }
    
    #make dataset
    ds = Dataset.from_dict(data)
    
    return ds

# load model
def getPredictFromCkpt(ckpt: str, choice: str, test_post):
    model = AutoModelForQuestionAnswering.from_pretrained(f'corrected_models/{args["model_name"]}-{choice}-{args["split"]}-b_{args["batch_size"]}-lr_{args["learning_rate"]}-warm_{args["warmup_ratio"]}-seed_{args["seed"]}-{args["special"]}/checkpoint-{ckpt}')
    test_args = TrainingArguments(
        output_dir = f'corrected_models/{args["model_name"]}-{choice}-{args["split"]}-b_{args["batch_size"]}-lr_{args["learning_rate"]}-warm_{args["warmup_ratio"]}-seed_{args["seed"]}-{args["special"]}/checkpoint-{ckpt}',
        do_train = False,
        do_predict = True,
        per_device_eval_batch_size = args["batch_size"],
        gradient_accumulation_steps=2
    )

    # init trainer
    trainer = Trainer(model = model, args = test_args)
    raw_predictions = trainer.predict(test_post)
    return raw_predictions

# turn raw predictions (start/end span) to strings
def postprocess_qa_predictions(examples, features, raw_predictions, choice, n_best_size = 10, max_answer_length = 510):
    all_start_logits, all_end_logits = raw_predictions
    n_best_size = 10
    predictions = collections.OrderedDict()
    #print(examples.shape[0], len(features))
    # Let's loop over all the examples!
    for example_index in range(examples.shape[0]):
        # Those are the indices of the features associated to the current example.
        valid_answers = []
        
        if choice == 'qp':
            context = examples.iloc[example_index]["q"][1:-1] #strip "
        elif choice == 'rp':
            context = examples.iloc[example_index]["r"][1:-1] #strip "
        #print("Q :", context)
        
        # We grab the predictions of the model for this feature.
        start_logits = all_start_logits[example_index]
        end_logits = all_end_logits[example_index]
        # This is what will allow us to map some the positions in our logits to span of texts in the original
        # context.
        offset_mapping = features[example_index]["offset_mapping"]
        #print(offset_mapping)
        # Update minimum null prediction.
        cls_index = features[example_index]["input_ids"].index(tokenizer.cls_token_id)
        feature_null_score = start_logits[cls_index] + end_logits[cls_index]

        # Go through all possibilities for the `n_best_size` greater start and end logits.
        start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
        end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                # to part of the input_ids that are not in the context.
                if (
                    start_index >= len(offset_mapping)
                    or end_index >= len(offset_mapping)
                    or offset_mapping[start_index] is None
                    or offset_mapping[end_index] is None
                ):
                    continue
                # Don't consider answers with a length that is either < 0 or > max_answer_length.
                if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                    continue

                start_char = offset_mapping[start_index][0]
                end_char = offset_mapping[end_index][1]
                #print(start_index, end_index, start_char, end_char)
                valid_answers.append(
                    {
                        "score": start_logits[start_index] + end_logits[end_index],
                        "text": context[start_char: end_char] # +1 because of the starting "
                    }
                )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        #print("Q':", best_answer["text"])
        #print("=================")
        predictions[examples.iloc[example_index]["id"]] = best_answer["text"]
        
    return predictions

# make gold csv from valid/ test set data
def make_gold_csv(data, output_name):
    test_dropped = data.drop(['q', 'r', 's'], axis=1)
    test_dropped = test_dropped.fillna('')
    test_dropped.to_csv(output_name, header=False)

In [41]:
#make_gold_csv(valid_data,"split82_3_0.csv")

In [27]:
# 1 if we are going to predict the final submission test
real_predict_test = 1
if real_predict_test == 1:
    valid_data = pd.read_csv("./dataset/test.csv")

test_data_unique = leave_unique_id(valid_data)
test_qp_post = postprocess(test_data_unique, 'qp')
test_rp_post = postprocess(test_data_unique, 'rp')

ckpts = ['500', '1000', '1500', '2000', '2500', '3000', '3500']
ckpts = list(range(400, 2800, 200))
if real_predict_test == 1:
    ckpts = ['1600']
for ckpt in ckpts:
    raw_predictions_qp = getPredictFromCkpt(ckpt, 'qp', test_qp_post)
    raw_predictions_rp = getPredictFromCkpt(ckpt, 'rp', test_rp_post)
    
    test_qp_post.set_format(type=test_qp_post.format["type"], columns=list(test_qp_post.features.keys()))
    test_rp_post.set_format(type=test_rp_post.format["type"], columns=list(test_rp_post.features.keys()))
    
    final_predictions_qp = postprocess_qa_predictions(test_data_unique, test_qp_post, raw_predictions_qp.predictions, 'qp')
    final_predictions_rp = postprocess_qa_predictions(test_data_unique, test_rp_post, raw_predictions_rp.predictions, 'rp')
    #print(final_predictions_qp)
    ids, qp, rp = [], [], []
    for k,v in final_predictions_qp.items():
        ids.append(k)
        qp.append(v)
    for k,v in final_predictions_rp.items():
        rp.append(v)

    dict = {'id': ids, "q'": qp, "r'": rp} 
    df = pd.DataFrame(dict) 
    df = df.replace(np.nan, '', regex=True)
    if real_predict_test == 1:
        df.to_csv(f'./outputs/corrected_models/HALF_predict_test_muppet-roberta-base-finetuned-squad-{args["split"]}-b_{args["batch_size"]}-lr_{args["learning_rate"]}-warm_{args["warmup_ratio"]}-seed_{args["seed"]}-{args["special"]}-checkpoint-{ckpt}.csv',header=False)
    else:
        df.to_csv(f'./outputs/corrected_models/predict_test_{args["model_name"]}-{args["split"]}-b_{args["batch_size"]}-lr_{args["learning_rate"]}-warm_{args["warmup_ratio"]}-seed_{args["seed"]}-{args["special"]}-checkpoint-{ckpt}.csv',header=False)
    print("Predictions to csv done.")

2016 2016
ids left: 2016
instances left 2016
ids left: 2016
instances left 2016


loading configuration file corrected_models/janeel/muppet-roberta-base-finetuned-squad-qp-6+22-b_8-lr_3e-05-warm_0.06-seed_24-shuffle/checkpoint-1600/config.json
Model config RobertaConfig {
  "_name_or_path": "corrected_models/janeel/muppet-roberta-base-finetuned-squad-qp-6+22-b_8-lr_3e-05-warm_0.06-seed_24-shuffle/checkpoint-1600",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights fil

loading configuration file corrected_models/janeel/muppet-roberta-base-finetuned-squad-rp-6+22-b_8-lr_3e-05-warm_0.06-seed_24-shuffle/checkpoint-1600/config.json
Model config RobertaConfig {
  "_name_or_path": "corrected_models/janeel/muppet-roberta-base-finetuned-squad-rp-6+22-b_8-lr_3e-05-warm_0.06-seed_24-shuffle/checkpoint-1600",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights fil

Predictions to csv done.


In [28]:
#ckpt='1600'
how_to_fill_na = 1
if real_predict_test == 1:
    pred_df = pd.read_csv(f'./outputs/corrected_models/HALF_predict_test_muppet-roberta-base-finetuned-squad-{args["split"]}-b_{args["batch_size"]}-lr_{args["learning_rate"]}-warm_{args["warmup_ratio"]}-seed_{args["seed"]}-{args["special"]}-checkpoint-{ckpt}.csv', names=["id", "q'", "r'"], dtype=str)
    valid_data = pd.read_csv("./dataset/test.csv")
    print('Any nas?', pred_df.isna().sum().sum())
    
    if how_to_fill_na == 0:
        pred_df.fillna(value='', inplace=True)
    elif how_to_fill_na == 1:
        r, _ = np.where(pred_df.isna())
        print(r, _)
        for i in range(len(r)):
            for j in range(valid_data.shape[0]):
                if valid_data.iloc[j]['id'] == int(pred_df.iloc[r[i]][0]):
                    getq = valid_data.iloc[j]["q"]
                    getr = valid_data.iloc[j]["r"]
                    if _[i] == 1:
                        pred_df.iloc[r[i]]["q'"] = getq
                    elif _[i] == 2:
                        pred_df.iloc[r[i]]["r'"] = getr
                    break
    elif how_to_fill_na == 2:
        helper_df = pd.read_csv('./outputs/others/11-ckwu-roberta_large_cvseed-3_idx-2_lr-3e-05_warmup-6pct_ckpt-3000.csv', names=["id", "q", "r"], dtype=str)
        r, _ = np.where(pred_df.isna())
        print(helper_df.shape[0], helper_df.iloc[0]['id'])
        print(r, _)
        for i in range(len(r)):
            for j in range(helper_df.shape[0]):
                if int(helper_df.iloc[j]['id']) == int(pred_df.iloc[r[i]][0]):
                    getq = helper_df.iloc[j]["q"].replace('"','')
                    getr = helper_df.iloc[j]["r"].replace('"','')
                    if _[i] == 1 and getq != '':
                        pred_df.iloc[r[i]]["q'"] = getq
                    elif _[i] == 2 and getr != '':
                        pred_df.iloc[r[i]]["r'"] = getr
                    break
    # fill na in df
    print('Any nas?', pred_df.isna().sum().sum())

    pred_df = pred_df.rename({"q'": 'q', "r'": 'r'}, axis=1)
    pred_df.loc[:, ['q', 'r']] = pred_df[['q', 'r']].applymap(lambda s: '"' + str(s).strip('"') + '"')
    pred_df.to_csv(f'./outputs/corrected_models/REAL_predict_test_muppet-roberta-base-finetuned-squad-{args["split"]}-b_{args["batch_size"]}-lr_{args["learning_rate"]}-warm_{args["warmup_ratio"]}-seed_{args["seed"]}-checkpoint-{ckpt}.csv', header=True, quotechar='"', index=False, encoding="utf-8")

Any nas? 0
[] []
Any nas? 0


In [112]:
make_gold_csv(valid_data, 'split6+22.csv')

### Evaluation

In [13]:
from pathlib import Path
import json
from typing import Dict
from typing import List, Tuple, Any, Union
from pathlib import Path
import pandas as pd
import nltk
from transformers import EvalPrediction

In [14]:
def nltk_tokenize(text: str, filter_puncts: bool = True) -> List[str]:
    punctuations = set([ch for ch in "!\"#$%&'()*+, -./:;<=>?@[\]^_`{|}~"])
    text = text.strip('"') # NOTE: remove the quotes first
    tokens = nltk.tokenize.word_tokenize(text)
    if filter_puncts:
        tokens = list(filter(lambda t: t not in punctuations, tokens))
    return tokens
    
def longestCommonSubsequence(text1: list, text2: list) -> int:
    if len(text2) > len(text1):
        text1, text2 = text2, text1

    lcs = [[0] * (len(text2) + 1) for _ in range(2)]
    for i in range(1, len(text1)+1):
        for j in range(1, len(text2)+1):
            if text1[i-1] == text2[j-1]:
                lcs[i % 2][j] = lcs[(i-1) % 2][j-1] + 1
            else:
                lcs[i % 2][j] = max(lcs[(i-1) % 2][j], lcs[i % 2][j-1])

    return lcs[len(text1) % 2][len(text2)]

def compute_lcs_score(pred: list, ans: list) -> float:
    intersection = longestCommonSubsequence(pred, ans)
    union = len(pred) + len(ans) - intersection
    if union == 0:
        return 0
    lcs_score = intersection / union
    if (lcs_score < 0) or (lcs_score) > 1:
        raise ValueError("LCS score must be between 0 and 1")
    return lcs_score

def compute_lcs_scores(pred_df: pd.DataFrame, ans_df: pd.DataFrame) -> pd.DataFrame:
    ids, qp_scores, rp_scores = list(), list(), list()
    for _, prow in pred_df.iterrows():
        pid, qp_pred, rp_pred = prow["id"], prow["q'"], prow["r'"]
        qp_pred, rp_pred = [nltk_tokenize(pred) for pred in [qp_pred, rp_pred]]
        ans_rows = ans_df[ans_df.id == pid]

        for _, arow in ans_rows.iterrows():
            qp_ans, rp_ans = arow["q'"], arow["r'"]
            qp_ans, rp_ans = [nltk_tokenize(ans) for ans in [qp_ans, rp_ans]]
            qp_score, rp_score = compute_lcs_score(qp_pred, qp_ans), compute_lcs_score(rp_pred, rp_ans)

            for item, l in zip([pid, qp_score, rp_score], [ids, qp_scores, rp_scores]):
                l.append(item)

    assert ids == ans_df.id.tolist()
    lcs_df = pd.DataFrame(data={
        "id": ids,
        "qp_scores": qp_scores,
        "rp_scores": rp_scores
    })
    return lcs_df

def compute_final_score(lcs_df: pd.DataFrame) -> float:
    lcs_df["total_scores"] = lcs_df["qp_scores"] + lcs_df["rp_scores"]
    max_scores = lcs_df.groupby("id")["total_scores"].max()
    final_score = max_scores.sum() / (2 * len(max_scores))
    if (final_score < 0) or (final_score > 1):
        raise ValueError("The final score must be between 0 and 1, please check the implementation.")
    return final_score

In [15]:
how_to_fill_nas = [0, 1]
# ckpts = ['500', '1000', '1500', '2000', '2500', '3000', '3500']
#ckpts = list(range(400, 2800, 200))
for how_to_fill_na in how_to_fill_nas:
    for ckpt in ckpts:
        pred_df = pd.read_csv(f'./outputs/corrected_models/predict_test_muppet-roberta-base-finetuned-squad-{args["split"]}-b_{args["batch_size"]}-lr_{args["learning_rate"]}-warm_{args["warmup_ratio"]}-seed_{args["seed"]}-{args["special"]}-checkpoint-{ckpt}.csv', names=["id", "q'", "r'"], dtype=str)
        ans_df = pd.read_csv("split6+22.csv", names=["id", "q'", "r'"], dtype=str)
    #     pred_df = pd.read_csv(f'./outputs/predict_test.csv', names=["id", "q'", "r'"], dtype=str)
    #     ans_df = pd.read_csv("test_gold_622.csv", names=["id", "q'", "r'"], dtype=str)

        #print('Any nas?', pred_df.isna().sum().sum())
        if how_to_fill_na == 0:
            pred_df.fillna(value='', inplace=True)
        elif how_to_fill_na == 1:
            r, _ = np.where(pred_df.isna())
            for i in range(len(r)):
                for j in range(valid_data.shape[0]):
                    if valid_data.iloc[j]['id'] == int(pred_df.iloc[r[i]][0]):
                        getq = valid_data.iloc[j]["q"]
                        getr = valid_data.iloc[j]["r"]
                        if _[i] == 1:
                            pred_df.iloc[r[i]][1] = getq
                        elif _[i] == 2:
                            pred_df.iloc[r[i]][2] = getr
                        break
        #print('Any nas?', pred_df.isna().sum().sum())

        if len(pred_df) != len(ans_df.groupby("id").size()):
            raise ValueError("The prediction file must have the same number of rows as the number of unique IDs in the answer file")

        lcs_df = compute_lcs_scores(pred_df, ans_df) # has len(ans_df) rows of lcs_q' and lcs_r'
        final_score = compute_final_score(lcs_df) # derive the final score by "1/2N (\sum_i^N(max_j(score_q' + score_r')))"
        print(f'# {args["model_name"]}-{args["split"]}-ckpt-{ckpt} final score: {final_score}')
    print('#=======fill na with whole=====')

# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-400 final score: 0.7898045707366026
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-600 final score: 0.7930742405174477
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-800 final score: 0.8049952559754687
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1000 final score: 0.8040372669881446
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1200 final score: 0.8045085022138415
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1400 final score: 0.8043813878888371
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1600 final score: 0.806858283762203
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1800 final score: 0.8017252510026983
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2000 final score: 0.8071309043125297
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2200 final score: 0.8044431428409163
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2400 final score: 0.802117535

In [None]:
#### bert-large-uncased 622 #####
#500: 0.5845899012911759
#1000: 0.5673322547362882
#1500: 0.618273462886458
#2000: 0.5929113627103975
#2500: 0.5836752600091908
#3000: 0.5987110975060392
#3500: 0.5981470868846762
#4000: 0.5981470868846762
#4500: 0.5907715976593922

#### bert-base 82 ####
# bert-base-uncased-checkpoint-3000 final test score: 0.73248
# bert-base-uncased-checkpoint-3500 final test score: 0.738717087187761
# bert-base-uncased-checkpoint-4000 final test score: 0.7291726756958699
# bert-base-uncased-checkpoint-4500 final test score: 0.7372288863123244
# bert-base-uncased-checkpoint-5000 final test score: 0.7349707304725923
# bert-base-uncased-checkpoint-5500 final test score: 0.7308097458575068
# bert-base-uncased-checkpoint-6000 final test score: 0.7300140861374874

### roberta-large 82 b_4 lr_1.5e-5 ###
# roberta-large-checkpoint-500 final test score: 0.7257905628659318
# roberta-large-checkpoint-1000 final test score: 0.7568639627720252
# roberta-large-checkpoint-1500 final test score: 0.725855670985741
# roberta-large-checkpoint-2000 final test score: 0.7357508737312826
# roberta-large-checkpoint-2500 final test score: 0.7247474600774396
# roberta-large-checkpoint-3000 final test score: 0.7230775245966534
# roberta-large-checkpoint-3500 final test score: 0.6832446910327563

### roberta-large 82 b_4 lr_3e-5
# roberta-large-checkpoint-500 final test score: 0.7090992655072061
# roberta-large-checkpoint-1000 final test score: 0.7353674714986471
# roberta-large-checkpoint-1500 final test score: 0.7252969458937408
# roberta-large-checkpoint-2000 final test score: 0.7479845181818972
# roberta-large-checkpoint-2500 final test score: 0.7540305270800959
# roberta-large-checkpoint-3000 final test score: 0.7466347808457232
# roberta-large-checkpoint-3500 final test score: 0.7517260866741773
# --- with nan fill
# roberta-large-checkpoint-500 final test score: 0.7247626604212086
# roberta-large-checkpoint-1000 final test score: 0.7368753560158637
# roberta-large-checkpoint-1500 final test score: 0.7308504918469712
# roberta-large-checkpoint-2000 final test score: 0.7501619558881207
# roberta-large-checkpoint-2500 final test score: 0.7560358746736786
# roberta-large-checkpoint-3000 final test score: 0.7493604083679714
# roberta-large-checkpoint-3500 final test score: 0.7546514831383723

### roberta-large 82 b_4 lr_1.5e-5 warm_0.06 seed 42
# roberta-large-checkpoint-600 final test score: 0.7254769792233624
# roberta-large-checkpoint-800 final test score: 0.7509314441409943
# roberta-large-checkpoint-1000 final test score: 0.7375322469505107
# roberta-large-checkpoint-1200 final test score: 0.7422551023687668
# roberta-large-checkpoint-1400 final test score: 0.7531907527812947
# roberta-large-checkpoint-1600 final test score: 0.7368397793987354
# roberta-large-checkpoint-1800 final test score: 0.746870872686432
# roberta-large-checkpoint-2000 final test score: 0.753262026941743
# roberta-large-checkpoint-2200 final test score: 0.762348043931238
# roberta-large-checkpoint-2400 final test score: 0.7523878335254599
# roberta-large-checkpoint-2600 final test score: 0.7545938197649968
# roberta-large-checkpoint-2800 final test score: 0.75427218198298
# roberta-large-checkpoint-3000 final test score: 0.7516087253918616
# roberta-large-checkpoint-3200 final test score: 0.7540679019577344
# roberta-large-checkpoint-3400 final test score: 0.7446163572437777
# roberta-large-checkpoint-3600 final test score: 0.7566421271082923
# --- with nan fill
# roberta-large-checkpoint-600 final test score: 0.7313529963712819
# roberta-large-checkpoint-800 final test score: 0.7545260799321256
# roberta-large-checkpoint-1000 final test score: 0.7380537321403313
# roberta-large-checkpoint-1200 final test score: 0.7453525889456024
# roberta-large-checkpoint-1400 final test score: 0.7588837647477735
# roberta-large-checkpoint-1600 final test score: 0.7379924987069616
# roberta-large-checkpoint-1800 final test score: 0.7499828003916013
# roberta-large-checkpoint-2000 final test score: 0.7542079208475869
# roberta-large-checkpoint-2200 final test score: 0.7635827604037518
# roberta-large-checkpoint-2400 final test score: 0.7547314397176267
# roberta-large-checkpoint-2600 final test score: 0.7569374259571638
# roberta-large-checkpoint-2800 final test score: 0.7617093213975912
# roberta-large-checkpoint-3000 final test score: 0.755765971288782
# roberta-large-checkpoint-3200 final test score: 0.7611585480659026
# roberta-large-checkpoint-3400 final test score: 0.7495593170443721
# roberta-large-checkpoint-3600 final test score: 0.7624908243438382


### roberta-large 622valid b_4 lr_3e-5 warm_0.06 seed 42
### 0 post
# roberta-large-checkpoint-600 final test score: 0.6500376887617668
# roberta-large-checkpoint-800 final test score: 0.6683186190128978
# roberta-large-checkpoint-1000 final test score: 0.7084029250350091
# roberta-large-checkpoint-1200 final test score: 0.7140576600198701
# roberta-large-checkpoint-1400 final test score: 0.7025639686006379
# roberta-large-checkpoint-1600 final test score: 0.7306867263515114
# roberta-large-checkpoint-1800 final test score: 0.6997363305052697
# roberta-large-checkpoint-2000 final test score: 0.713913067718691
# roberta-large-checkpoint-2200 final test score: 0.7351097835486486
# roberta-large-checkpoint-2400 final test score: 0.7564515992217088
# roberta-large-checkpoint-2600 final test score: 0.7623857063025837
# roberta-large-checkpoint-2800 final test score: 0.7641920976533865
# roberta-large-checkpoint-3000 final test score: 0.7704696176038958
# roberta-large-checkpoint-3200 final test score: 0.7689731713834116
# roberta-large-checkpoint-3400 final test score: 0.763272945108771
# roberta-large-checkpoint-3600 final test score: 0.777026525952762
# roberta-large-checkpoint-3800 final test score: 0.7777010637854648
######### remove char after period
# roberta-large-checkpoint-600 final test score: 0.6740745295817693
# roberta-large-checkpoint-800 final test score: 0.6912120244156451
# roberta-large-checkpoint-1000 final test score: 0.7326908515772249
# roberta-large-checkpoint-1200 final test score: 0.7385665228015272
# roberta-large-checkpoint-1400 final test score: 0.7262132584232915
# roberta-large-checkpoint-1600 final test score: 0.7558633975654591
# roberta-large-checkpoint-1800 final test score: 0.725013460127671
# roberta-large-checkpoint-2000 final test score: 0.7392653076153044
# roberta-large-checkpoint-2200 final test score: 0.7600332405505135
# roberta-large-checkpoint-2400 final test score: 0.7810522283949544
# roberta-large-checkpoint-2600 final test score: 0.7881007482584063
# roberta-large-checkpoint-2800 final test score: 0.7906632047340922
# roberta-large-checkpoint-3000 final test score: 0.7944929732379744
# roberta-large-checkpoint-3200 final test score: 0.7935074650035119
# roberta-large-checkpoint-3400 final test score: 0.788747348261974
# roberta-large-checkpoint-3600 final test score: 0.799180242654962
# roberta-large-checkpoint-3800 final test score: 0.8002754357899929
#roberta-large-checkpoint-4000 final test score: 0.795986979623897
# roberta-large-checkpoint-4200 final test score: 0.7961234418364423

### roberta-large 622test b_4 lr_3e-5 warm_0.06 seed 42
# roberta-large-checkpoint-2000 final test score: 0.7279140946133679
# roberta-large-checkpoint-2200 final test score: 0.7429188422388359
# roberta-large-checkpoint-2400 final test score: 0.7642529418099401
# roberta-large-checkpoint-2600 final test score: 0.7654203286241084
# roberta-large-checkpoint-2800 final test score: 0.7679979745272225
# roberta-large-checkpoint-3000 final test score: 0.7808798827981996
# roberta-large-checkpoint-3200 final test score: 0.7774418167785709
# roberta-large-checkpoint-3400 final test score: 0.771937339919355
# roberta-large-checkpoint-3600 final test score: 0.7832835233229676
# roberta-large-checkpoint-3800 final test score: 0.7825181480287842

### roberta-large 622 valid b_4 lr_3e-5 warm_0.06 shuffle corrected seed 42 
#==== raw
# roberta-large-checkpoint-200 final test score: 0.6249067319603748
# roberta-large-checkpoint-400 final test score: 0.7508643564681374
# roberta-large-checkpoint-600 final test score: 0.7638696318401204
# roberta-large-checkpoint-800 final test score: 0.7541996657066029
# roberta-large-checkpoint-1000 final test score: 0.767665948217005
# roberta-large-checkpoint-1200 final test score: 0.7666249742193298
# roberta-large-checkpoint-1400 final test score: 0.7584889936916765
# roberta-large-checkpoint-1600 final test score: 0.7832800173610261
# roberta-large-checkpoint-1800 final test score: 0.7563130996739712
# roberta-large-checkpoint-2000 final test score: 0.7766218727076956
# roberta-large-checkpoint-2200 final test score: 0.7615458247207753
# roberta-large-checkpoint-2400 final test score: 0.7611164912747018
# roberta-large-checkpoint-2600 final test score: 0.7495980436303449
# roberta-large-checkpoint-2800 final test score: 0.7531305793656844
# roberta-large-checkpoint-3000 final test score: 0.7503282049931337
# roberta-large-checkpoint-3200 final test score: 0.7497488083470066
# roberta-large-checkpoint-3400 final test score: 0.7468628734762588
# roberta-large-checkpoint-3600 final test score: 0.73780700308036
# roberta-large-checkpoint-3800 final test score: 0.7378683993988545
#==== fill na with orig <p.s. remove period is useless>
# roberta-large-checkpoint-200 final test score: 0.6312645786034125
# roberta-large-checkpoint-400 final test score: 0.7565967394376918
# roberta-large-checkpoint-600 final test score: 0.779170971359442
# roberta-large-checkpoint-800 final test score: 0.7652257230489458
# roberta-large-checkpoint-1000 final test score: 0.7778082605503085
# roberta-large-checkpoint-1200 final test score: 0.7701981090570195
# roberta-large-checkpoint-1400 final test score: 0.7667329485490181
# roberta-large-checkpoint-1600 final test score: 0.7864791857407051
# roberta-large-checkpoint-1800 final test score: 0.7578354512534279
# roberta-large-checkpoint-2000 final test score: 0.7812647161563059
# roberta-large-checkpoint-2200 final test score: 0.7660047408029417
# roberta-large-checkpoint-2400 final test score: 0.7629594586149209
# roberta-large-checkpoint-2600 final test score: 0.7522053325432061
# roberta-large-checkpoint-2800 final test score: 0.7615965720846072
# roberta-large-checkpoint-3000 final test score: 0.7603165456396559
# roberta-large-checkpoint-3200 final test score: 0.7586114583375456
# roberta-large-checkpoint-3400 final test score: 0.755211742035463
# roberta-large-checkpoint-3600 final test score: 0.7474518808556296
# roberta-large-checkpoint-3800 final test score: 0.7475132771741244

### roberta-large 622 test b_4 lr_3e-5 warm_0.06 shuffle corrected seed 42 
# raw
# roberta-large-checkpoint-400 final test score: 0.757497569412286
# roberta-large-checkpoint-600 final test score: 0.7664141200048084
# roberta-large-checkpoint-800 final test score: 0.7643846334898196
# roberta-large-checkpoint-1000 final test score: 0.7714245732422911
# roberta-large-checkpoint-1200 final test score: 0.7678900789020857
# roberta-large-checkpoint-1400 final test score: 0.7649841945293655
# roberta-large-checkpoint-1600 final test score: 0.7880313027732246
# roberta-large-checkpoint-1800 final test score: 0.7681806911747383
# roberta-large-checkpoint-2000 final test score: 0.7848556635092381
# roberta-large-checkpoint-2200 final test score: 0.7622602800421855
# roberta-large-checkpoint-2400 final test score: 0.7678715310868255
# roberta-large-checkpoint-2600 final test score: 0.7540259758095897
# roberta-large-checkpoint-2800 final test score: 0.7610216214665433
# roberta-large-checkpoint-3000 final test score: 0.756047131549043
# fill na with orig <p.s. remove period is useless>
# roberta-large-checkpoint-400 final test score: 0.7649961672626635
# roberta-large-checkpoint-600 final test score: 0.7838210361872578
# roberta-large-checkpoint-800 final test score: 0.7767477535396258
# roberta-large-checkpoint-1000 final test score: 0.7840407000138133
# roberta-large-checkpoint-1200 final test score: 0.771052574827685
# roberta-large-checkpoint-1400 final test score: 0.7721745679186325
# roberta-large-checkpoint-1600 final test score: 0.7939244426455291
# roberta-large-checkpoint-1800 final test score: 0.7698764619062523
# roberta-large-checkpoint-2000 final test score: 0.7883790070826443
# roberta-large-checkpoint-2200 final test score: 0.7685556064060656
# roberta-large-checkpoint-2400 final test score: 0.770218012184319
# roberta-large-checkpoint-2600 final test score: 0.7597592791156415
# roberta-large-checkpoint-2800 final test score: 0.7700800081365856
# roberta-large-checkpoint-3000 final test score: 0.7673177023569069

# roberta-large 82_3_1 test b_8 lr_3e-5 warm_0.06 shuffle corrected seed 42 
# raw
# roberta-large-82_3_1-ckpt-400 final score: 0.7654934567233056
# roberta-large-82_3_1-ckpt-600 final score: 0.7491009976348183
# roberta-large-82_3_1-ckpt-800 final score: 0.7433897649619771
# roberta-large-82_3_1-ckpt-1000 final score: 0.7449358625638374
# roberta-large-82_3_1-ckpt-1200 final score: 0.7750203592650212
# roberta-large-82_3_1-ckpt-1400 final score: 0.7569425023616523
# roberta-large-82_3_1-ckpt-1600 final score: 0.7757300583214787
# roberta-large-82_3_1-ckpt-1800 final score: 0.7679848981011869
# roberta-large-82_3_1-ckpt-2000 final score: 0.7630122421414384
# roberta-large-82_3_1-ckpt-2200 final score: 0.7688467340317298
# roberta-large-82_3_1-ckpt-2400 final score: 0.769385044338644
# roberta-large-82_3_1-ckpt-2600 final score: 0.7591458679829894
# roberta-large-82_3_1-ckpt-2800 final score: 0.7639454750323195
#=======fill na with whole=====
# roberta-large-82_3_1-ckpt-400 final score: 0.7700512372823377
# roberta-large-82_3_1-ckpt-600 final score: 0.7523299842649315
# roberta-large-82_3_1-ckpt-800 final score: 0.7610256312565378
# roberta-large-82_3_1-ckpt-1000 final score: 0.7833575452485532
# roberta-large-82_3_1-ckpt-1200 final score: 0.7787087199794752
# roberta-large-82_3_1-ckpt-1400 final score: 0.7687006257178984
# roberta-large-82_3_1-ckpt-1600 final score: 0.7799043597215102
# roberta-large-82_3_1-ckpt-1800 final score: 0.7770942483714922
# roberta-large-82_3_1-ckpt-2000 final score: 0.782135095103325
# roberta-large-82_3_1-ckpt-2200 final score: 0.7700982984872993
# roberta-large-82_3_1-ckpt-2400 final score: 0.7719846231772717
# roberta-large-82_3_1-ckpt-2600 final score: 0.7673873642703445
# roberta-large-82_3_1-ckpt-2800 final score: 0.7798807741778196

# roberta-large 82_3_0 test b_8 lr_3e-5 warm_0.06 shuffle corrected seed 42 
# raw
# roberta-large-82_3_0-ckpt-400 final score: 0.7519655552098666
# roberta-large-82_3_0-ckpt-600 final score: 0.7607480517889734
# roberta-large-82_3_0-ckpt-800 final score: 0.7686799960586809
# roberta-large-82_3_0-ckpt-1000 final score: 0.7552695630703136
# roberta-large-82_3_0-ckpt-1200 final score: 0.779041918260717
# roberta-large-82_3_0-ckpt-1400 final score: 0.7610133622889663
# roberta-large-82_3_0-ckpt-1600 final score: 0.7648633883452065
# roberta-large-82_3_0-ckpt-1800 final score: 0.7493947443757345
# roberta-large-82_3_0-ckpt-2000 final score: 0.7620568353519128
# roberta-large-82_3_0-ckpt-2200 final score: 0.756504689717084
# roberta-large-82_3_0-ckpt-2400 final score: 0.7686461644857229
# roberta-large-82_3_0-ckpt-2600 final score: 0.7662339209225864
# roberta-large-82_3_0-ckpt-2800 final score: 0.7625701341092707
# =======fill na with whole=====
# roberta-large-82_3_0-ckpt-400 final score: 0.7572833652679771
# roberta-large-82_3_0-ckpt-600 final score: 0.7686975376544027
# roberta-large-82_3_0-ckpt-800 final score: 0.7812658578460477
# roberta-large-82_3_0-ckpt-1000 final score: 0.7710109601822083
# roberta-large-82_3_0-ckpt-1200 final score: 0.7793548093746094
# roberta-large-82_3_0-ckpt-1400 final score: 0.7639358064626044
# roberta-large-82_3_0-ckpt-1600 final score: 0.7735388317794816
# roberta-large-82_3_0-ckpt-1800 final score: 0.7816834187917098
# roberta-large-82_3_0-ckpt-2000 final score: 0.7789240670929582
# roberta-large-82_3_0-ckpt-2200 final score: 0.7807226655952566
# roberta-large-82_3_0-ckpt-2400 final score: 0.7698081404599644
# roberta-large-82_3_0-ckpt-2600 final score: 0.7681112676059405
# roberta-large-82_3_0-ckpt-2800 final score: 0.7653521886860211

#roberta-large 6+22 test b_8 lr_3e-5 warm_0.06 shuffle corrected seed 42 
# roberta-large-6+22-ckpt-1000 final score: 0.7938537868578737
# roberta-large-6+22-ckpt-1200 final score: 0.7937386959154632
# roberta-large-6+22-ckpt-1400 final score: 0.7945439923747236
# roberta-large-6+22-ckpt-1600 final score: 0.7893022446707817
# roberta-large-6+22-ckpt-1800 final score: 0.7840754552778941
#=======fill na with whole=====
# roberta-large-6+22-ckpt-1000 final score: 0.7938537868578737
# roberta-large-6+22-ckpt-1200 final score: 0.7937386959154632
# roberta-large-6+22-ckpt-1400 final score: 0.7945439923747236
# roberta-large-6+22-ckpt-1600 final score: 0.7896151357846741
# roberta-large-6+22-ckpt-1800 final score: 0.7840754552778941

#roberta-large 6+22 2sep shuffle corrected seed 42 
# roberta-large-6+22-ckpt-400 final score: 0.7241265238737301
# roberta-large-6+22-ckpt-600 final score: 0.7855164767583794
# roberta-large-6+22-ckpt-800 final score: 0.7707064536359541
# roberta-large-6+22-ckpt-1000 final score: 0.7570536677382713
# roberta-large-6+22-ckpt-1200 final score: 0.788497813842746
# roberta-large-6+22-ckpt-1400 final score: 0.7788683772355874
# roberta-large-6+22-ckpt-1600 final score: 0.7779577865589014
# roberta-large-6+22-ckpt-1800 final score: 0.7869460633150134
# roberta-large-6+22-ckpt-2000 final score: 0.7644081397901518
#=======fill na with whole=====
# roberta-large-6+22-ckpt-400 final score: 0.7241265238737301
# roberta-large-6+22-ckpt-600 final score: 0.7855164767583794
# roberta-large-6+22-ckpt-800 final score: 0.7715657659111699
# roberta-large-6+22-ckpt-1000 final score: 0.7570536677382713
# roberta-large-6+22-ckpt-1200 final score: 0.7888107049566384
# roberta-large-6+22-ckpt-1400 final score: 0.7788683772355874
# roberta-large-6+22-ckpt-1600 final score: 0.7779577865589014
# roberta-large-6+22-ckpt-1800 final score: 0.7869460633150134
# roberta-large-6+22-ckpt-2000 final score: 0.7644081397901518

#roberta-large 622 2input shuffle corrected seed 42 valid
# roberta-large-622-ckpt-400 final score: 0.5798392303645943
# roberta-large-622-ckpt-600 final score: 0.5321390578059124
# roberta-large-622-ckpt-800 final score: 0.6845253001491609
# roberta-large-622-ckpt-1000 final score: 0.7854346665974302
# roberta-large-622-ckpt-1200 final score: 0.7354850139916187
# roberta-large-622-ckpt-1400 final score: 0.7433643910913482
# roberta-large-622-ckpt-1600 final score: 0.7644419415086158
# roberta-large-622-ckpt-1800 final score: 0.766091247946565
# roberta-large-622-ckpt-2000 final score: 0.7608942064589528
# roberta-large-622-ckpt-2200 final score: 0.7669459531461037
# roberta-large-622-ckpt-2400 final score: 0.7684356210623894
# roberta-large-622-ckpt-2600 final score: 0.7739133165758102
#=======fill na with whole=====
# roberta-large-622-ckpt-400 final score: 0.7061591841282925
# roberta-large-622-ckpt-600 final score: 0.7045455611931878
# roberta-large-622-ckpt-800 final score: 0.7121300855904636
# roberta-large-622-ckpt-1000 final score: 0.7859745171352401
# roberta-large-622-ckpt-1200 final score: 0.7736435346030568
# roberta-large-622-ckpt-1400 final score: 0.7691571510371267
# roberta-large-622-ckpt-1600 final score: 0.7654667732729589
# roberta-large-622-ckpt-1800 final score: 0.766237055205639
# roberta-large-622-ckpt-2000 final score: 0.7613063069504207
# roberta-large-622-ckpt-2200 final score: 0.7683260083515128
# roberta-large-622-ckpt-2400 final score: 0.768654644842114
# roberta-large-622-ckpt-2600 final score: 0.774205961794098

#roberta-large 622 2input shuffle corrected seed 42 test
# roberta-large-622-ckpt-400 final score: 0.7770712127379288
# roberta-large-622-ckpt-600 final score: 0.7892216469190373
# roberta-large-622-ckpt-800 final score: 0.695481349547146
# roberta-large-622-ckpt-1000 final score: 0.7947943177366188
# roberta-large-622-ckpt-1200 final score: 0.7489569764741351
# roberta-large-622-ckpt-1400 final score: 0.7486675718582444
# roberta-large-622-ckpt-1600 final score: 0.7746306127505268
# roberta-large-622-ckpt-1800 final score: 0.7713150616045515
# roberta-large-622-ckpt-2000 final score: 0.7746301007589347
# roberta-large-622-ckpt-2200 final score: 0.7781460743121015
# roberta-large-622-ckpt-2400 final score: 0.777065636946207
# roberta-large-622-ckpt-2600 final score: 0.7801924167026092
#=======fill na with whole=====
# roberta-large-622-ckpt-400 final score: 0.7780900644275407
# roberta-large-622-ckpt-600 final score: 0.7895241083291333
# roberta-large-622-ckpt-800 final score: 0.7218455129517211
# roberta-large-622-ckpt-1000 final score: 0.7951135327529325
# roberta-large-622-ckpt-1200 final score: 0.7850521502872703
# roberta-large-622-ckpt-1400 final score: 0.7743813346589259
# roberta-large-622-ckpt-1600 final score: 0.7757153019453537
# roberta-large-622-ckpt-1800 final score: 0.7716887585732569
# roberta-large-622-ckpt-2000 final score: 0.7751387625720191
# roberta-large-622-ckpt-2200 final score: 0.7803026105440398
# roberta-large-622-ckpt-2400 final score: 0.7779258223474501
# roberta-large-622-ckpt-2600 final score: 0.7808022892127383

# janeel/muppet-roberta-base-finetuned-squad-6+22 shuffle seed42
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-400 final score: 0.791835906091792
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-600 final score: 0.7847847802544615
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-800 final score: 0.7858727372469431
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1000 final score: 0.7947897598828904
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1200 final score: 0.7978136932439455
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1400 final score: 0.79244138941681
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1600 final score: 0.7975915095342249
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1800 final score: 0.7957700059979252
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2000 final score: 0.789255788341949
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2200 final score: 0.8001138267063729
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2400 final score: 0.793329970804117
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2600 final score: 0.7952500272784631
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2800 final score: 0.793452868630505
#=======fill na with whole=====
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-400 final score: 0.7924697205302017
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-600 final score: 0.7847847802544615
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-800 final score: 0.7873033732107745
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1000 final score: 0.7960128796917424
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1200 final score: 0.7984110308250127
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1400 final score: 0.7942902914534467
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1600 final score: 0.7979044006481172
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1800 final score: 0.7960828971118176
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2000 final score: 0.7903676579769726
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2200 final score: 0.8010240554013325
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2400 final score: 0.7939273083851843
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2600 final score: 0.7963325633715592
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2800 final score: 0.7943630973254646

# janeel/muppet-roberta-base-finetuned-squad-6+22 shuffle seed24
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-400 final score: 0.7898045707366026
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-600 final score: 0.7930742405174477
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-800 final score: 0.8049952559754687
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1000 final score: 0.8040372669881446
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1200 final score: 0.8045085022138415
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1400 final score: 0.8043813878888371
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1600 final score: 0.806858283762203
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1800 final score: 0.8017252510026983
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2000 final score: 0.8071309043125297
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2200 final score: 0.8044431428409163
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2400 final score: 0.8021175356177814
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2600 final score: 0.8044867752201994
#=======fill na with whole=====
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-400 final score: 0.7898045707366026
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-600 final score: 0.7930742405174477
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-800 final score: 0.8049952559754687
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1000 final score: 0.8040372669881446
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1200 final score: 0.8045085022138415
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1400 final score: 0.8043813878888371
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1600 final score: 0.806858283762203
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-1800 final score: 0.8017252510026983
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2000 final score: 0.8071309043125297
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2200 final score: 0.8044431428409163
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2400 final score: 0.8021175356177814
# janeel/muppet-roberta-base-finetuned-squad-6+22-ckpt-2600 final score: 0.8044867752201994

In [None]:
    # remove any urls
#     for i in range(pred_df.shape[0]):
#         while pred_df.iloc[i]["q'"].find("http : //") != -1: # There is a url
#             print(pred_df.iloc[i]["q'"])
#             if pred_df.iloc[i]["q'"].find(" ", 9 + pred_df.iloc[i]["q'"].find("http : //")) != -1: #
#                 pred_df.iloc[i]["q'"] = pred_df.iloc[i]["q'"][:pred_df.iloc[i]["q'"].find("http : //")] + pred_df.iloc[i]["q'"][pred_df.iloc[i]["q'"].find(" ", 9 + pred_df.iloc[i]["q'"].find("http : //"))+1:len(pred_df.iloc[i]["q'"])]
#             else:
#                 pred_df.iloc[i]["q'"] = pred_df.iloc[i]["q'"][:pred_df.iloc[i]["q'"].find("http : //")]
#             print("New: ",pred_df.iloc[i]["q'"])

#     print('Any nas?', pred_df.isna().sum().sum())
    
    # remove the only word after the last period if any
#     for i in range(pred_df.shape[0]):
#         tempq = str(pred_df.iloc[i]["q'"]).rsplit(' ', 1)[0]
#         tempr = str(pred_df.iloc[i]["r'"]).rsplit(' ', 1)[0]
#         pred_df.iloc[i]["q'"] = tempq
#         pred_df.iloc[i]["r'"] = tempr