# About

Trained a new MLM (`New MLM`, linked below) and added it on top of [External_Datasets_Matching + MLM](https://www.kaggle.com/chienhsianghung/external-datasets-matching-mlm).


`New MLM`
- [[Coleridge] BERT - MLMv4](https://www.kaggle.com/chienhsianghung/coleridge-bert-mlmv4)


# Setting

In [1]:
# Feature flags that select which stages run and how predictions are merged.
COMPUTE_CV = False  # True: use train.csv / the train folder as the "test" set; reset to False below when the sample submission has more than 4 rows
EDA_DEMO = False  # True: build the exploratory `df_input` frame from the train papers
ALL_BLENDED = False  # True: final merge joins literal, MLM and baseline predictions
BASELINE_HELPING = True  # True: run the baseline label matching and use it before falling back to MLM
MATCH_ONLY = False  # True: skip loading/running the MLM model
MLM_ONLY = False  # True: skip the literal / Ken matching loops
KEN_MATCHING = True  # True: match the additional govt dataset titles instead of `all_labels`
BS_CLEANING = False  # True: pick the punctuation-stripping variant of `text_cleaning`
THEO_MERGE = False  # True: merge literal and MLM labels, dropping MLM labels that overlap a literal one
SEED = 347  # seed passed to random, numpy and torch in the import cell

# Install packages

In [2]:
# Install the required packages from local files under ../input
# (a find-links directory for `datasets`, wheel files for the rest).
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Clear the pip logs from the cell output.
from IPython.display import clear_output
clear_output()

# Import

In [3]:
import os
import re
import json
import time
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm.autonotebook import tqdm

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, \
AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline

from typing import List
import string
from functools import partial
import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')

# Seed every random number generator used by the notebook with the same value.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
# More than 4 rows in the sample submission switches CV scoring off.
COMPUTE_CV = COMPUTE_CV and len(sample_submission) <= 4
print(
    'this submission notebook will compute CV score but commit notebook will not'
    if COMPUTE_CV
    else 'this submission notebook will only be used to submit result'
)

  if sys.path[0] == '':


this submission notebook will only be used to submit result


# Load data

In [4]:
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
train = pd.read_csv(train_path)

# In CV mode the train set plays the role of the test set.
if not COMPUTE_CV:
    sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
    paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
else:
    sample_submission = train
    paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = paper_test_folder

adnl_govt_labels_path = '../input/bigger-govt-dataset-list/data_set_800.csv'

In [5]:
# Parse the JSON file of every paper to predict on, keyed by paper id.
papers = {}
for paper_id in tqdm(sample_submission['Id']):
    paper_file = f'{paper_test_folder}/{paper_id}.json'
    with open(paper_file, 'r') as f:
        papers[paper_id] = json.load(f)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




## A Bit EDA

In [6]:
# Build an exploratory frame with one row per (paper section, dataset label)
# pair where the section text contains the label verbatim.
if EDA_DEMO:
    eda_columns = ['id', 'section_title', 'text', 'data_label']
    # The empty template keeps the column order even when nothing matches.
    eda_frames = [pd.DataFrame(columns=eda_columns)]

    # Group by paper so each JSON file is read once and each (paper, label)
    # pair is scanned once, even though train.csv repeats an Id per label.
    for paper_id, paper_rows in tqdm(train.groupby('Id', sort=False)):
        sections = pd.read_json('../input/coleridgeinitiative-show-us-the-data/train/{}.json'.format(paper_id))
        for data_label in paper_rows['dataset_label'].values:
            # regex=False: labels contain characters such as "(" and ")" that
            # must match literally; na=False: missing text never matches.
            hits = sections['text'].str.contains(data_label, regex=False, na=False)
            eda_frames.append(sections[hits].assign(data_label=data_label, id=paper_id))

    # Concatenate once instead of growing the frame inside the loop.
    df_input = pd.concat(eda_frames, ignore_index=True)
else:
    df_input = None

df_input

In [7]:
# Keep only the EDA rows whose section has a non-empty title.
if EDA_DEMO:
    df_input = df_input.loc[df_input['section_title'] != '']

df_input

# Literal Matching

### Create a Knowledge Bank

In [8]:
# Knowledge bank: every lower-cased title / label / cleaned label in train.csv.
all_labels = {
    str(value).lower()
    for column in ('dataset_title', 'dataset_label', 'cleaned_label')
    for value in train[column]
}

print(f'No. different labels: {len(all_labels)}')

No. different labels: 180


#### Additional Govt Datasets

In [9]:
# Extend the knowledge bank with the titles of the additional govt datasets.
adnl_govt_labels = pd.read_csv(adnl_govt_labels_path)

all_labels = all_labels.union(adnl_govt_labels.title)
print(f'No. different labels: {len(all_labels)}')

No. different labels: 2159


### Matching on test data

In [10]:
def clean_text(txt):
    """Lower-case `txt` and replace every run of non-alphanumerics with one space.

    `txt` is coerced with `str()`; leading and trailing spaces are stripped.
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered).strip()


def totally_clean_text(txt):
    """Apply `clean_text`, then collapse any remaining runs of spaces."""
    return re.sub(' +', ' ', clean_text(txt))

# Two variants of `text_cleaning`; BS_CLEANING selects which one is defined.
if BS_CLEANING:
    def text_cleaning(text):
        '''
        Drop punctuation characters, lower-case the text and replace every
        remaining run of non-alphanumeric characters with a single space.
        text - Sentence that needs to be cleaned
        '''
        no_punct = ''.join(ch for ch in text if ch not in string.punctuation)
        # text = re.sub("/'+/g", ' ', text)
        return re.sub('[^A-Za-z0-9]+', ' ', str(no_punct).lower()).strip()
else:
    def text_cleaning(text):
        '''
        Lower-case the text, replace every run of non-alphanumeric characters
        with a single space, collapse spaces and strip emoji code points.
        text - Sentence that needs to be cleaned
        '''
        cleaned = re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()
        cleaned = re.sub(' +', ' ', cleaned)
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', cleaned)


def read_json_pub(filename, train_data_path=train_files_path, output='text'):
    """Read `<train_data_path>/<filename>.json` and return its joined text.

    The file is expected to hold a list of sections, each with
    `section_title` and `text` entries.

    output='text' returns the section texts joined by spaces,
    output='head' returns the section titles joined by spaces,
    any other value returns titles and texts interleaved, joined by '. '.
    """
    json_path = os.path.join(train_data_path, (filename+'.json'))
    with open(json_path, 'r') as f:
        sections = json.load(f)

    headings = [section.get('section_title') for section in sections]
    contents = [section.get('text') for section in sections]
    combined = [part for pair in zip(headings, contents) for part in pair]

    # All three strings are built up front, as before.
    all_headings = ' '.join(headings)
    all_contents = ' '.join(contents)
    all_data = '. '.join(combined)

    if output == 'head':
        return all_headings
    if output == 'text':
        return all_contents
    return all_data

In [11]:
# Plain literal matching: a label is predicted when it occurs in the raw
# lower-cased paper text or in its fully cleaned version.
if not (KEN_MATCHING or MLM_ONLY):
    literal_preds = []
    for paper_id in tqdm(sample_submission['Id']):
        raw_text = '. '.join(section['text'] for section in papers[paper_id]).lower()
        cleaned_text = totally_clean_text(raw_text)

        matched = {
            clean_text(label)
            for label in all_labels
            if label in raw_text or label in cleaned_text
        }

        literal_preds.append('|'.join(matched))
    literal_preds[:5]

#### Ken Matching

In [12]:
literal_preds = []

# "Ken matching": look for every additional govt dataset title inside the
# cleaned full text of each paper; one '|'-joined string per paper.
if KEN_MATCHING and not MLM_ONLY:
    # The titles do not change between papers, so coerce them to str once
    # instead of calling iterrows() on the label table for every paper.
    govt_titles = [str(title) for title in adnl_govt_labels['title']]

    for paper_id in tqdm(sample_submission['Id']):
        large_string = str(read_json_pub(paper_id, test_files_path))
        clean_string = text_cleaning(large_string)

        matched = ''
        for query_string in govt_titles:
            if query_string not in clean_string:
                continue
            cleaned_query = clean_text(query_string)
            if matched == '':
                matched = cleaned_query
            # NOTE(review): this is a substring test on the joined string, so a
            # label contained in an already-matched longer label is skipped.
            # Kept as in the original; confirm this is intended.
            elif cleaned_query not in matched:
                matched = matched + '|' + cleaned_query
        literal_preds.append(matched)

elif MLM_ONLY:
    print('This kernel will only use MLM model to predict.')

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




# Masked Dataset Modeling

### Paths and Hyperparameters

In [13]:
if not MATCH_ONLY:
    # Locations of the masked-LM checkpoint and of its tokenizer.
    PRETRAINED_PATH = '../input/coleridge-bert-mlmv4/output-mlm/checkpoint-48000'
    TOKENIZER_PATH = '../input/coleridge-bert-mlmv4/model_tokenizer'

    # shorten_sentences splits sentences longer than MAX_LENGTH words into
    # chunks of MAX_LENGTH words that start every MAX_LENGTH - OVERLAP words.
    MAX_LENGTH = 64
    OVERLAP = 20

    PREDICT_BATCH = 32 # a higher value requires higher GPU memory usage

    DATASET_SYMBOL = '$' # this symbol represents a dataset name
    NONDATA_SYMBOL = '#' # this symbol represents a non-dataset name

# Transform data to MLM format

### Load model and tokenizer

In [14]:
# Load the masked-LM and wrap it in a fill-mask pipeline (GPU 0 when available).
if not MATCH_ONLY:
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=True)
    model = AutoModelForMaskedLM.from_pretrained(PRETRAINED_PATH)

    mlm_device = 0 if torch.cuda.is_available() else -1
    mlm = pipeline('fill-mask', model=model, tokenizer=tokenizer, device=mlm_device)

### Auxiliary functions

In [15]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are split on single spaces. The score is
    |words(s1) & words(s2)| / |words(s1) | words(s2)|, a float in [0, 1].
    The union is computed on sets, so a word repeated inside one string is
    counted once (the previous list-length formula under-scored such inputs).
    """
    words_1 = set(s1.split(" "))
    words_2 = set(s2.split(" "))
    # split(" ") always yields at least one token, so the union is never empty.
    return float(len(words_1 & words_2)) / len(words_1 | words_2)

def clean_paper_sentence(s):
    """
    Case-preserving counterpart of clean_text: every run of non-alphanumeric
    characters becomes a single space and the ends are stripped.
    """
    spaced = re.sub('[^A-Za-z0-9]+', ' ', str(s)).strip()
    return re.sub(' +', ' ', spaced)

def shorten_sentences(sentences):
    """
    Split every sentence longer than MAX_LENGTH words into overlapping chunks.

    Chunks hold up to MAX_LENGTH words and start every MAX_LENGTH - OVERLAP
    words; sentences that are short enough are passed through unchanged.
    """
    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) <= MAX_LENGTH:
            short_sentences.append(sentence)
            continue
        short_sentences.extend(
            ' '.join(words[start:start+MAX_LENGTH])
            for start in range(0, len(words), MAX_LENGTH - OVERLAP)
        )
    return short_sentences

connection_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'data', 'dataset'}
def find_mask_candidates(sentence):
    """
    Return the (start, end) word-index spans (inclusive) of masking candidates.

    $sentence is a list of words. A candidate is a maximal run of words,
    starting no earlier than index 1, in which every word either begins with
    an uppercase letter or is one of $connection_tokens. A run qualifies only
    if at least 2 words remain after leading and trailing connection tokens
    (compared case-insensitively) are ignored; the returned span is the whole
    run, connection tokens included.
    """
    def candidate_qualified(words):
        lo, hi = 0, len(words)
        while lo < hi and words[lo].lower() in connection_tokens:
            lo += 1
        while hi > lo and words[hi-1].lower() in connection_tokens:
            hi -= 1
        return hi - lo >= 2

    spans = []
    run_start = run_end = None

    # Index 0 is never inspected: the scan starts at the second word.
    for pos in range(1, len(sentence)):
        token = sentence[pos]
        if token[0].isupper() or token in connection_tokens:
            if run_start is None:
                run_start = pos
            run_end = pos
            continue

        if run_start is not None:
            if candidate_qualified(sentence[run_start:run_end+1]):
                spans.append((run_start, run_end))
            run_start = run_end = None

    # Flush a run that reaches the end of the sentence.
    if run_start is not None and candidate_qualified(sentence[run_start:run_end+1]):
        spans.append((run_start, run_end))

    return spans

### Transform

In [16]:
# Turn each paper into (masked sentence, masked phrase) pairs for the MLM.
if not MATCH_ONLY:
    mask = mlm.tokenizer.mask_token
    all_test_data = []

    for paper_id in tqdm(sample_submission['Id']):
        paper = papers[paper_id]

        # Unique cleaned sentences, obtained by splitting section text on '.'.
        unique_sentences = {
            clean_paper_sentence(chunk)
            for section in paper
            for chunk in section['text'].split('.')
        }

        # Keep chunks longer than one character that mention 'data' or 'study'
        # (case-insensitive) and turn each of them into a list of words.
        word_lists = [
            sentence.split()
            for sentence in shorten_sentences(unique_sentences)
            if len(sentence) > 1
            and ('data' in sentence.lower() or 'study' in sentence.lower())
        ]

        # One data point per candidate phrase: the phrase is replaced by the
        # mask token and kept alongside the masked text.
        test_data = [
            (
                ' '.join(words[:phrase_start] + [mask] + words[phrase_end+1:]),
                ' '.join(words[phrase_start:phrase_end+1]),
            )
            for words in word_lists
            for phrase_start, phrase_end in find_mask_candidates(words)
        ]

        all_test_data.append(test_data)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




### Predict

In [17]:
# Score every masked candidate with the fill-mask pipeline and keep, per
# paper, a list of cleaned phrases predicted to be dataset names.
if not MATCH_ONLY:
    pred_mlm_labels = []

    for test_data in tqdm(all_test_data):
        pred_bag = set()

        if len(test_data):
            texts, phrases = list(zip(*test_data))
            mlm_pred = []
            # Run the pipeline on PREDICT_BATCH masked texts at a time.
            for p_id in range(0, len(texts), PREDICT_BATCH):
                batch_texts = texts[p_id:p_id+PREDICT_BATCH]
                # Only the two symbol tokens are passed as targets for the mask.
                batch_pred = mlm(list(batch_texts), targets=[f' {DATASET_SYMBOL}', f' {NONDATA_SYMBOL}'])
               

                # Presumably the pipeline returns an un-nested result for a
                # single text; wrap it so each entry is one result pair.
                if len(batch_texts) == 1:
                    batch_pred = [batch_pred]

                mlm_pred.extend(batch_pred)
                
            # Keep a phrase when one target scores more than twice the other
            # and sits in the position checked by the matching clause.
            for (result1, result2), phrase in zip(mlm_pred, phrases):
                if (result1['score'] > result2['score']*2 and result1['token_str'] == DATASET_SYMBOL) or\
                   (result2['score'] > result1['score']*2 and result2['token_str'] == NONDATA_SYMBOL):
                    pred_bag.add(clean_text(phrase))
                    #print(phrase)
                    print(clean_text(phrase))

        # filter labels by jaccard score 
        filtered_labels = []
        

        # Longest labels first, so a shorter near-duplicate is the one dropped.
        for label in sorted(pred_bag, key=len, reverse=True):
            if len(filtered_labels) == 0 or all(jaccard_similarity(label, got_label) < 0.75 for got_label in filtered_labels): # avoid labels too similar to ones already added
                filtered_labels.append(label)

        # Each entry is kept as a list of labels rather than a '|'-joined string.
        #pred_mlm_labels.append('|'.join(filtered_labels))
        pred_mlm_labels.append(filtered_labels)
    pred_mlm_labels[:5]

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

lothian birth cohort study lbc1936



In [18]:
# Show the MLM labels: one list of label strings per paper.
pred_mlm_labels

[['lothian birth cohort study lbc1936'], [], [], []]

# Baseline Model

In [19]:
def read_append_return(filename, train_files_path=train_files_path, output='text'):
    """
    Read `<train_files_path>/<filename>.json` and return its content as one string.

    output='text' -> section texts joined by ' '
    output='head' -> section titles joined by ' '
    otherwise     -> titles and texts interleaved, joined by '. '
    """
    json_path = os.path.join(train_files_path, (filename+'.json'))
    with open(json_path, 'r') as f:
        sections = json.load(f)

    headings = [section.get('section_title') for section in sections]
    contents = [section.get('text') for section in sections]
    combined = [part for pair in zip(headings, contents) for part in pair]

    # All three strings are built regardless of `output`, as before.
    all_headings = ' '.join(headings)
    all_contents = ' '.join(contents)
    all_data = '. '.join(combined)

    if output == 'head':
        return all_headings
    if output == 'text':
        return all_contents
    return all_data
    
    
def text_cleaning(text):
    '''
    Remove punctuation characters, lower-case the text and replace every
    remaining run of non-alphanumeric characters with a single space.
    text - Sentence that needs to be cleaned
    '''
    without_punct = ''.join(ch for ch in text if ch not in string.punctuation)
    # text = re.sub("/'+/g", ' ', text)
    return re.sub('[^A-Za-z0-9]+', ' ', str(without_punct).lower()).strip()

In [20]:
# Attach the full text of every paper to the train frame and, outside CV
# mode, to the sample submission frame.
if BASELINE_HELPING or ALL_BLENDED:
    tqdm.pandas()

    train['text'] = train['Id'].progress_apply(read_append_return)

    if not COMPUTE_CV:
        read_test_paper = partial(read_append_return, train_files_path=test_files_path)
        sample_submission['text'] = sample_submission['Id'].progress_apply(read_test_paper)

    train.head()

  


HBox(children=(FloatProgress(value=0.0, max=19661.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [21]:
# Clean the train texts in place; at this point `text_cleaning` is the
# punctuation-stripping version defined in the Baseline Model cell.
if BASELINE_HELPING or ALL_BLENDED:
    tqdm.pandas()
    
    train['text'] = train['text'].progress_apply(text_cleaning)

HBox(children=(FloatProgress(value=0.0, max=19661.0), HTML(value='')))




In [22]:
# Baseline: labels of a train paper with identical cleaned text, plus every
# known label that occurs literally in the lower-cased paper text.
if BASELINE_HELPING or ALL_BLENDED:
    temp_1 = [x.lower() for x in train['dataset_label'].unique()]
    temp_2 = [x.lower() for x in train['dataset_title'].unique()]
    temp_3 = [x.lower() for x in train['cleaned_label'].unique()]

    existing_labels = set(temp_1 + temp_2 + temp_3)

    print(f'len(temp_1) = {len(temp_1)}')
    print(f'len(temp_2) = {len(temp_2)}')
    print(f'len(temp_3) = {len(temp_3)}')
    print(f'len(existing_labels) = {len(existing_labels)}')

    id_list = []
    lables_list = []
    for index, row in tqdm(sample_submission.iterrows()):
        sample_text = row['text']
        lowered_text = sample_text.lower()

        # Train rows whose cleaned text equals this paper's cleaned text.
        same_text_rows = train[train['text'] == text_cleaning(sample_text)]
        found = {clean_text(label) for label in same_text_rows['cleaned_label'].to_list()}
        found.update(
            clean_text(known_label)
            for known_label in existing_labels
            if known_label in lowered_text
        )

        lables_list.append('|'.join(found))
        id_list.append(row['Id'])

len(temp_1) = 130
len(temp_2) = 45
len(temp_3) = 130
len(existing_labels) = 180


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




# T5 inference

In [23]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

import os
import re
import json
import glob
from collections import defaultdict
from functools import partial
from pathlib import Path
import random
import time

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from tqdm.autonotebook import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

%matplotlib inline

In [24]:
# Re-seed through PyTorch Lightning; note this uses 42, not the notebook-level SEED.
pl.seed_everything(42)

42

In [25]:
# Kaggle version
# NOTE: this rebinds `train_files_path` and `test_files_path` from the
# "Load data" cell; `test_files_path` now points at the test folder
# regardless of COMPUTE_CV.

train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'


In [26]:
# Load the sections of the paper with the given id from its JSON file.
# NOTE: this redefines `read_append_return` from the Baseline Model cell;
# from here on the name returns a DataFrame instead of a string.
def read_append_return(filename, train_files_path=train_files_path, output='text'):
    """
    Read `<train_files_path>/<filename>.json` and return its sections as a DataFrame.

    filename         - paper id (file name without the '.json' extension)
    train_files_path - folder that contains the JSON file
    output           - unused; kept so existing callers keep working

    Returns a DataFrame with one row per section and the columns
    "section_title" and "text".
    """
    json_path = os.path.join(train_files_path, (filename+'.json'))
    with open(json_path, 'r') as f:
        sections = json.load(f)

    headings = [section.get('section_title') for section in sections]
    contents = [section.get('text') for section in sections]

    tmp_df = pd.DataFrame([headings, contents]).T
    tmp_df.columns = ("section_title", "text")

    return tmp_df

In [27]:
# Read the JSON of every row in train.csv; each "text" cell now holds a
# DataFrame of that paper's sections.
tqdm.pandas()
train_df["text"] = train_df["Id"].progress_apply(read_append_return)

HBox(children=(FloatProgress(value=0.0, max=19661.0), HTML(value='')))




In [28]:
# One (answer, context) pair per section of every train row: the answer is
# the dataset title when the section text contains it, otherwise "None".
answers, contexts = [], []
for row_pos in range(len(train_df)):
    record = train_df.iloc[row_pos]
    dataset_title = record["dataset_title"]
    for section_text in record["text"]["text"]:
        contexts.append(section_text)
        answers.append(dataset_title if dataset_title in section_text else "None")

df = pd.DataFrame([answers, contexts]).T

In [29]:
# Row labels of the pairs whose answer is "None".
remove_idx = df.index[df[0] == "None"].tolist()

In [30]:
# Drop the "None" rows and renumber. This rebinds `df`, so re-running the
# cell without rebuilding `df` and `remove_idx` would drop the wrong rows.
df = df.drop(remove_idx).reset_index(drop=True)

In [31]:
# Add a constant question column, then give the three columns their names.
df["question"] = "what is the dataset?"
df.columns = ['answer_text', 'context', 'question']
df.head()

Unnamed: 0,answer_text,context,question
0,National Education Longitudinal Study,This study used data from the National Educati...,what is the dataset?
1,National Education Longitudinal Study,Any college degree attainment The study author...,what is the dataset?
2,National Education Longitudinal Study,The study author collected information on bach...,what is the dataset?
3,National Education Longitudinal Study,Dropping out of high school is not necessarily...,what is the dataset?
4,National Education Longitudinal Study,There are a number of ways to define dropouts....,what is the dataset?


Tokenizer

In [32]:
# Load the T5 tokenizer. NOTE: this rebinds the global `tokenizer` that the
# MLM cell created when MATCH_ONLY is False.
tokenizer = T5Tokenizer.from_pretrained("../input/t5base")
#../input/t5trainbysh
#../input/t5base

In [33]:
# Tokenize the first context as a sanity check and report its token count.
sample_text = df[["context"]].iloc[0]
sample_encoding = tokenizer(sample_text.iloc[0])
print("check the tokenized sample")
print(sample_encoding)
print("\n")
print("token length: ",len(sample_encoding["input_ids"]))

check the tokenized sample
{'input_ids': [100, 810, 261, 331, 45, 8, 868, 2855, 3230, 155, 76, 26, 10270, 9165, 41, 4171, 7600, 10, 4060, 61, 12, 5443, 8, 1951, 13, 7013, 17938, 1356, 21, 306, 496, 481, 30, 1900, 1952, 14568, 297, 5, 37, 810, 92, 2196, 823, 8, 11737, 13, 7013, 17938, 1356, 130, 315, 21, 166, 3381, 1900, 481, 3, 8911, 481, 3, 2544, 1362, 141, 5526, 44, 709, 128, 1900, 5, 86, 811, 6, 3, 9, 3, 29672, 1693, 2279, 30, 8, 1113, 13, 315, 6201, 13, 7013, 17938, 503, 18, 14867, 11, 1900, 1952, 14568, 297, 5, 17338, 17938, 1356, 462, 1900, 18, 4563, 1036, 2704, 21, 306, 496, 481, 5, 37, 1356, 462, 1900, 2996, 11, 87, 127, 8, 1004, 12, 3807, 1900, 11893, 21, 481, 298, 341, 16, 306, 496, 5, 37, 7897, 563, 16, 8, 810, 47, 9418, 26, 13, 3, 4171, 7600, 3008, 113, 5526, 3, 9, 442, 12091, 1208, 496, 11, 113, 10627, 16, 3, 9, 7013, 17938, 478, 298, 16, 306, 496, 41, 29, 3274, 505, 2079, 137, 37, 810, 2291, 261, 813, 3801, 485, 2604, 8150, 2254, 12, 482, 3, 9, 4993, 563, 13, 3, 4171, 760

In [34]:
class KaggleDataset(Dataset):
    """Torch dataset that turns (question, context, answer_text) rows into T5 inputs.

    data                 - DataFrame with "question", "context" and
                           "answer_text" columns
    tokenizer            - callable tokenizer with a `pad_token_id` attribute
    source_max_token_len - padded/truncated length of the encoded question+context
    target_max_token_len - padded/truncated length of the encoded answer
    """

    def __init__(self, data, tokenizer, source_max_token_len=512, target_max_token_len=32):
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return the raw strings plus flattened input_ids / attention_mask / labels."""
        data_row = self.data.iloc[index]

        # Use the tokenizer given to this dataset, not whatever global
        # `tokenizer` happens to exist in the notebook.
        source_encoding = self.tokenizer(
            data_row["question"],
            data_row["context"],
            max_length=self.source_max_token_len,
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        target_encoding = self.tokenizer(
            data_row["answer_text"],
            max_length=self.target_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        labels = target_encoding["input_ids"]
        # Padding positions are set to -100 (presumably so the loss ignores
        # them). The pad id comes from the tokenizer instead of a literal 0.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            question=data_row["question"],
            context=data_row["context"],
            answer_text=data_row["answer_text"],
            input_ids=source_encoding["input_ids"].flatten(),
            attention_mask=source_encoding["attention_mask"].flatten(),
            labels=labels.flatten()
        )

In [35]:
# Hold out 10% of the QA pairs for validation. NOTE: `train_df` is rebound
# here from the raw train frame to the training part of `df`; no
# random_state is passed to the split.
train_df, val_df = train_test_split(df, test_size = 0.1)

In [36]:
class KaggleDataModule(pl.LightningDataModule):
    """Lightning data module that wraps the train / held-out frames in KaggleDataset.

    train_df, test_df    - DataFrames in the format KaggleDataset expects
    tokenizer            - tokenizer handed to both datasets
    batch_size           - batch size of every DataLoader
    source_max_token_len - encoded question+context length
    target_max_token_len - encoded answer length
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        source_max_token_len: int = 396,
        target_max_token_len: int = 32
    ):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def setup(self, stage=None):
        """Build both datasets.

        `stage` is accepted (and ignored) because a Lightning Trainer calls
        `setup(stage)`; calling `setup()` with no argument still works.
        """
        self.train_dataset = KaggleDataset(
            self.train_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )

        self.test_dataset = KaggleDataset(
            self.test_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
        )

    def val_dataloader(self):
        """Unshuffled loader over the held-out dataset (same data as test)."""
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=4
        )

    def test_dataloader(self):
        """Unshuffled loader over the held-out dataset."""
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=4
        )

In [37]:
BATCH_SIZE = 8  # batch size handed to KaggleDataModule
N_EPOCHS = 12

# Build the data module and create its datasets immediately.
data_module = KaggleDataModule(train_df, val_df, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

In [38]:
class KaggleModel(pl.LightningModule):
    """Lightning wrapper around a T5 conditional-generation model."""

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("../input/t5base/", return_dict = True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its (loss, logits)."""
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits

    def _logged_loss(self, batch, metric_name):
        """Compute the loss of one batch and log it under `metric_name`."""
        loss, _ = self(batch["input_ids"], batch["attention_mask"], batch["labels"])
        self.log(metric_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._logged_loss(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        return self._logged_loss(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        return self._logged_loss(batch, "test_loss")

    def configure_optimizers(self):
        """AdamW over all parameters with a fixed learning rate."""
        return AdamW(self.parameters(), lr=0.0001)

In [39]:
# Instantiate the T5 wrapper. NOTE: this rebinds the global `model` that the
# MLM cell created when MATCH_ONLY is False.
model = KaggleModel()

Inference

In [40]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [41]:
# Load the fine-tuned checkpoint and move the model to the selected device.
trained_model = KaggleModel.load_from_checkpoint("../input/t5trainbysh/best-checkpoint.ckpt").to(device)

In [42]:
# Freeze the loaded model before inference.
trained_model.freeze()

In [43]:
# Separate copy of the sample submission for the T5 pass
# (`sample_submission` above is left untouched).
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
sample_sub.head()

Unnamed: 0,Id,PredictionString
0,2100032a-7c33-4bff-97ef-690822c43466,
1,2f392438-e215-4169-bebf-21ac4ff253e1,
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,


In [44]:
# Each "text" cell becomes a DataFrame of that test paper's sections
# (the redefined `read_append_return` returns a DataFrame).
tqdm.pandas()
sample_sub['text'] = sample_sub["Id"].progress_apply(partial(read_append_return, train_files_path=test_files_path))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [45]:
# Preview the frame now that the "text" column has been added.
sample_sub.head()

Unnamed: 0,Id,PredictionString,text
0,2100032a-7c33-4bff-97ef-690822c43466,,sectio...
1,2f392438-e215-4169-bebf-21ac4ff253e1,,sectio...
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,,sectio...
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,,sectio...


In [46]:
def generate_answer(question):
    """Generate one answer string per row of `question`.

    `question` must provide 'question' and 'context' columns. All rows are
    encoded as one batch (padded / truncated to 396 tokens), passed to the
    trained model's `generate`, and the generated ids are decoded to text.
    Returns a list of decoded strings.
    """
    encoded = tokenizer(
        list(question['question']),
        list(question['context']),
        max_length=396,
        padding='max_length',
        truncation="only_second",
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )
    encoded.to(device)

    generation_options = dict(
        num_beams=1,
        max_length=80,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True,
    )
    generated_ids = trained_model.model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        **generation_options
    )

    return [
        tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for ids in generated_ids
    ]

In [47]:
"""
배치 처리해서 빠른 코드
"""
# Ask the question once per section of every test paper, one batch per paper.
start = time.time()
answers = []
for sections in sample_sub['text']:
    # Mutates the stored frame: adds the question column and renames "text".
    sections['question'] = pd.Series(["what is the dataset?"]*len(sections))
    sections.columns = ('section_title', 'context', 'question')
    answers.append(generate_answer(sections))
print("base line time: ", time.time()-start)

base line time:  3.5348739624023438


In [48]:
# De-duplicate the generated answers of each paper.
answers_copy = [set(paper_answers) for paper_answers in answers]
answers_copy

[{"Alzheimer's Disease Neuroimaging Initiative (ADNI)",
  'Baltimore Longitudinal Study of Aging (BLSA)',
  'None'},
 {'Common Core of Data',
  'None',
  'Program for the International Assessment of Adult Competencies',
  'Trends in International Mathematics and Science Study'},
 {'None', 'Sea, Lake, and Overland Surges from Hurricanes'},
 {'None'}]

In [49]:
def clean_text(txt):
    """Normalize a label: lowercase it, replace every run of characters
    other than ASCII letters and digits with a single space, and strip
    leading/trailing whitespace. Non-string input is converted with ``str``.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

In [50]:
# For every document: sort the distinct answers, drop the "None" placeholder,
# and normalize what is left with clean_text.
predictions = [
    [clean_text(raw_answer) for raw_answer in sorted(answer_set) if raw_answer != "None"]
    for answer_set in answers_copy
]

In [51]:
# Build one '|'-joined prediction string per document from the literal-match,
# MLM and label-match outputs. The first enabled strategy flag wins.
final_predictions = []

if ALL_BLENDED:
    # Concatenate every non-empty source.
    for literal_match, mlm_pred, labels_match in zip(literal_preds, pred_mlm_labels, lables_list):
        non_empty_sources = [pred for pred in (literal_match, mlm_pred, labels_match) if pred]
        final_predictions.append('|'.join(non_empty_sources))

elif BASELINE_HELPING:
    # Priority order: literal match, then label match, then MLM prediction.
    for literal_match, mlm_pred, labels_match in zip(literal_preds, pred_mlm_labels, lables_list):
        final_predictions.append(literal_match or labels_match or mlm_pred)

elif MATCH_ONLY:
    # Copy so that later in-place edits of final_predictions do not silently
    # rewrite literal_preds as well.
    final_predictions = list(literal_preds)

elif MLM_ONLY:
    # Copy for the same reason as above.
    final_predictions = list(pred_mlm_labels)

elif THEO_MERGE:
    # Keep all literal matches, plus every model label that neither contains
    # nor is contained in a literal match.
    for literal_match, mlm_pred in zip(literal_preds, pred_mlm_labels):
        # ''.split('|') yields [''], and '' is a substring of everything, so
        # empty pieces must be dropped or an empty literal match would
        # discard every model prediction.
        pred_naive = [piece for piece in literal_match.split('|') if piece]
        pred_model = [piece for piece in mlm_pred.split('|') if piece]
        pred_model_kept = [
            pred_m
            for pred_m in pred_model
            if not any(pred_m in pred_n or pred_n in pred_m for pred_n in pred_naive)
        ]
        final_predictions.append("|".join(pred_naive + pred_model_kept))

else:
    # Default: literal match when available, otherwise the MLM prediction.
    for literal_match, mlm_pred in zip(literal_preds, pred_mlm_labels):
        final_predictions.append(literal_match or mlm_pred)

sample_submission['PredictionString'] = final_predictions

In [52]:
# Append each cleaned generated label that does not already occur (as a
# substring) in the document's current prediction string.
for i in range(len(sample_submission)):
    for generated_label in predictions[i]:
        if generated_label in final_predictions[i]:
            continue
        # Skip the separator when nothing has been predicted yet; otherwise
        # the string would start with '|' and submit an empty label.
        final_predictions[i] = (
            f"{final_predictions[i]}|{generated_label}"
            if final_predictions[i]
            else generated_label
        )


In [53]:
predictions  # cleaned generate_answer labels per document (author's tag: t5)

[['alzheimer s disease neuroimaging initiative adni',
  'baltimore longitudinal study of aging blsa'],
 ['common core of data',
  'program for the international assessment of adult competencies',
  'trends in international mathematics and science study'],
 ['sea lake and overland surges from hurricanes'],
 []]

In [54]:
# Append each MLM label that does not already occur (as a substring) in the
# document's current prediction string.
for i in range(len(sample_submission)):
    mlm_entry = pred_mlm_labels[i]
    # Other cells treat pred_mlm_labels entries as '|'-joined strings (they
    # call .split('|') on them); indexing such a string walks single
    # characters, so split it into labels first. A list of labels is used as is.
    mlm_labels = mlm_entry.split('|') if isinstance(mlm_entry, str) else mlm_entry
    for mlm_label in mlm_labels:
        if not mlm_label or mlm_label in final_predictions[i]:
            continue
        # Skip the separator when nothing has been predicted yet, so the
        # string never starts with '|'.
        final_predictions[i] = (
            f"{final_predictions[i]}|{mlm_label}"
            if final_predictions[i]
            else mlm_label
        )
    

In [55]:
# Show the merged '|'-joined prediction string for every document.
final_predictions

['adni|alzheimer s disease neuroimaging initiative adni|pubmed|baltimore longitudinal study of aging blsa|lothian birth cohort study lbc1936',
 'common core of data|nces common core of data|trends in international mathematics and science study|schools and staffing survey|integrated postsecondary education data system|ipeds|progress in international reading literacy study|program for the international assessment of adult competencies',
 'slosh model|noaa storm surge inundation|sea lake and overland surges from hurricanes',
 'rural urban continuum codes']

## Aggregate final predictions and write submission file

In [56]:
# Store the merged predictions on the submission frame.
sample_submission['PredictionString'] = final_predictions

# Only the Id and PredictionString columns go into the submission file.
submission_frame = sample_submission[['Id', 'PredictionString']]
submission_frame.to_csv('submission.csv', index=False)

sample_submission.head()

Unnamed: 0,Id,PredictionString,text
0,2100032a-7c33-4bff-97ef-690822c43466,adni|alzheimer s disease neuroimaging initiati...,Cognitive deficits and reduced educational ach...
1,2f392438-e215-4169-bebf-21ac4ff253e1,common core of data|nces common core of data|t...,This report describes how the education system...
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,slosh model|noaa storm surge inundation|sea la...,"Cape Hatteras National Seashore (CAHA), locate..."
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes,A significant body of research has been conduc...


In [57]:
# Full (untruncated) prediction string for the row labelled 0.
sample_submission.loc[0,'PredictionString']

'adni|alzheimer s disease neuroimaging initiative adni|pubmed|baltimore longitudinal study of aging blsa|lothian birth cohort study lbc1936'

In [58]:
# Full (untruncated) prediction string for the row labelled 1.
sample_submission.loc[1,'PredictionString']

'common core of data|nces common core of data|trends in international mathematics and science study|schools and staffing survey|integrated postsecondary education data system|ipeds|progress in international reading literacy study|program for the international assessment of adult competencies'

In [59]:
# Full (untruncated) prediction string for the row labelled 2.
sample_submission.loc[2,'PredictionString']

'slosh model|noaa storm surge inundation|sea lake and overland surges from hurricanes'

In [60]:
# Full (untruncated) prediction string for the row labelled 3.
sample_submission.loc[3,'PredictionString']

'rural urban continuum codes'

# Evaluation Metric

In [61]:
# https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/discussion/230091
def compute_fbeta(y_true: List[List[str]],
                  y_pred: List[List[str]],
                  beta: float = 0.5) -> float:
    """Compute the Jaccard-based micro FBeta score.

    For each document, ground-truth labels are visited in sorted order and
    each is matched to the remaining prediction with the highest word-level
    Jaccard similarity. A similarity of at least 0.5 is a true positive and
    consumes that prediction; otherwise the label is a false negative.
    Predictions left unmatched are false positives. Counts are accumulated
    over all documents before the score is computed.

    Parameters
    ----------
    y_true : one list of ground-truth label strings per document.
    y_pred : one list of predicted label strings per document, paired with
        ``y_true`` by position.
    beta : weight of recall relative to precision.

    Returns
    -------
    float
        ``(1 + beta^2) * tp / ((1 + beta^2) * tp + fp + beta^2 * fn)``, or
        ``0.0`` when that denominator is zero (for example, empty inputs).

    References
    ----------
    - https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/overview/evaluation
    """

    def _jaccard_similarity(str1: str, str2: str) -> float:
        a = set(str1.split())
        b = set(str2.split())
        union_size = len(a | b)
        # Two strings without any tokens have an empty union; treat them as
        # non-matching instead of dividing by zero.
        if union_size == 0:
            return 0.0
        return len(a & b) / union_size

    tp = 0  # true positive
    fp = 0  # false positive
    fn = 0  # false negative
    for ground_truth_list, predicted_string_list in zip(y_true, y_pred):
        # Sorted copy: matched predictions are popped so each can be used once.
        remaining_predictions = sorted(predicted_string_list)
        for ground_truth in sorted(ground_truth_list):
            if not remaining_predictions:
                fn += 1
                continue
            similarity_scores = [
                _jaccard_similarity(ground_truth, predicted_string)
                for predicted_string in remaining_predictions
            ]
            matched_idx = np.argmax(similarity_scores)
            if similarity_scores[matched_idx] >= 0.5:
                remaining_predictions.pop(matched_idx)
                tp += 1
            else:
                fn += 1
        fp += len(remaining_predictions)

    weighted_tp = tp * (1 + beta ** 2)
    weighted_fn = fn * beta ** 2
    denominator = weighted_tp + fp + weighted_fn
    # Nothing to score (no labels and no predictions): avoid 0 / 0.
    if denominator == 0:
        return 0.0
    return weighted_tp / denominator

In [62]:
# Score against the known labels when running in CV mode; otherwise just
# report that CV is off.
if COMPUTE_CV:
    cv_truth = sample_submission['cleaned_label'].apply(lambda label: [label])
    cv_predicted = sample_submission['PredictionString'].apply(lambda joined: joined.split('|'))
    COMPUTE_CV_SCORE = compute_fbeta(cv_truth, cv_predicted)
    print('COMPUTE_CV_SCORE =', COMPUTE_CV_SCORE)
else:
    print(f'COMPUTE_CV = {COMPUTE_CV}')

# Echo the run configuration so it is recorded next to the result.
for flag_name, flag_value in (
    ('EDA_DEMO', EDA_DEMO),
    ('ALL_BLENDED', ALL_BLENDED),
    ('BASELINE_HELPING', BASELINE_HELPING),
    ('MATCH_ONLY', MATCH_ONLY),
    ('MLM_ONLY', MLM_ONLY),
    ('KEN_MATCHING', KEN_MATCHING),
    ('BS_CLEANING', BS_CLEANING),
    ('THEO_MERGE', THEO_MERGE),
    ('SEED', SEED),
):
    print(f'{flag_name} = {flag_value}')

COMPUTE_CV = False
EDA_DEMO = False
ALL_BLENDED = False
BASELINE_HELPING = True
MATCH_ONLY = False
MLM_ONLY = False
KEN_MATCHING = True
BS_CLEANING = False
THEO_MERGE = False
SEED = 347
