In [1]:
!pip install simpletransformers --no-index --find-links ../input/colleridge/packages
!pip install ../input/colleridge/packages/fsspec-2021.4.0-py3-none-any.whl --force-reinstall

Looking in links: ../input/colleridge/packages
Processing /kaggle/input/colleridge/packages/simpletransformers-0.61.4-py3-none-any.whl
Processing /kaggle/input/colleridge/packages/seqeval-1.2.2-py3-none-any.whl
Processing /kaggle/input/colleridge/packages/datasets-1.6.1-py3-none-any.whl
Processing /kaggle/input/colleridge/packages/streamlit-0.80.0-py2.py3-none-any.whl
Processing /kaggle/input/colleridge/packages/huggingface_hub-0.0.8-py3-none-any.whl
Processing /kaggle/input/colleridge/packages/xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl
Processing /kaggle/input/colleridge/packages/tqdm-4.49.0-py2.py3-none-any.whl
Processing /kaggle/input/colleridge/packages/watchdog-2.0.3-py3-none-manylinux2014_x86_64.whl
Processing /kaggle/input/colleridge/packages/astor-0.8.1-py2.py3-none-any.whl
Processing /kaggle/input/colleridge/packages/base58-2.1.0-py3-none-any.whl
Processing /kaggle/input/colleridge/packages/validators-0.18.2-py3-none-any.whl
Processing /kaggle/input/colleridg

In [2]:
import logging
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from simpletransformers.ner import NERModel, NERArgs

from tqdm import tqdm, notebook
import matplotlib.pyplot as plt

random.seed(123)
np.random.seed(456)

In [3]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.NOTSET)

## Loading the data

In [4]:
MAX_SAMPLE = None

In [5]:
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train = pd.read_csv(train_path)
train = train[:MAX_SAMPLE]

In [6]:
labels = train['cleaned_label'].unique()
labels

array(['national education longitudinal study', 'noaa tidal station',
       'slosh model', 'noaa c cap', 'aging integrated database agid ',
       'alzheimers disease neuroimaging initiative',
       'aging integrated database',
       'noaa national water level observation network',
       'noaa water level station',
       'baltimore longitudinal study of aging blsa ',
       'national water level observation network',
       'arms farm financial and crop production practices',
       'beginning postsecondary student',
       'noaa sea lake and overland surges from hurricanes',
       'noaa tide gauge',
       'the national institute on aging genetics of alzheimer s disease data storage site',
       'national center for education statistics common core of data',
       'national science foundation survey of industrial research and development',
       'baccalaureate and beyond',
       'noaa international best track archive for climate stewardship',
       'agricultural resource ma

In [7]:
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
papers = {}
for paper_id in train['Id'].unique():
    with open(f'{paper_train_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers[paper_id] = paper

In [8]:
sample_submission_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path)

test_papers = {}
paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
for paper_id in sample_submission['Id']:
    with open(f'{paper_test_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        test_papers[paper_id] = paper

In [9]:
def clean_text(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower())

def shorten_sentences(sentences):
    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > MAX_LENGTH:
            for p in range(0, len(words), MAX_LENGTH - OVERLAP):
                short_sentences.append(' '.join(words[p:p+MAX_LENGTH]))
        else:
            short_sentences.append(sentence)
    return short_sentences

In [10]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def remove_similars(arr1, arr2):
    for s2 in arr2:
        if any(jaccard(filtered_one, s2) >= 0.5 for filtered_one in arr1):
            continue
        arr1.add(s2)
    return arr1



In [11]:
all_labels = set()

for label_1, label_2, label_3 in train[['dataset_title', 'dataset_label', 'cleaned_label']].itertuples(index=False):
    all_labels.add(str(label_1).lower())
    all_labels.add(str(label_2).lower())
    all_labels.add(str(label_3).lower())

print(f'No. different labels: {len(all_labels)}')
with open("../input/bigger-govt-dataset-list/data_set_26897.csv") as big_extra, open("../input/bigger-govt-dataset-list/data_set_800.csv") as small_extra:
#     big_extra_arr = set([clean_text(ds).strip() for ds in big_extra.readlines()[1:]])
#     all_labels.update(set(big_extra_arr))
    small_extra_arr = [clean_text(ds).strip() for ds in small_extra.readlines()[1:]]
    all_labels.update(set(small_extra_arr))
    pass
print(f'No. different labels: {len(all_labels)}')

No. different labels: 180
No. different labels: 2163


In [12]:
from pprint import pprint

In [13]:
pprint(all_labels)

{'2013 2014 phap associates by state',
 '2014 child and adult health care quality measures',
 '2015 child and adult health care quality measures',
 '2015 edition market readiness for hospitals and clinicians data',
 '2015 plan selections by zip code in the health insurance marketplace',
 '2016 child and adult health care quality measures',
 '2017 child and adult health care quality measures',
 '2017 managed care programs by state',
 '2018 child and adult health care quality measures',
 '2018 managed care programs by state',
 '2019 child and adult health care quality measures quality',
 '2019 ncov complete genome sequences',
 '2019 ncov genome sequence',
 '2019 ncov genome sequences',
 '2019-ncov complete genome sequences',
 '2019-ncov genome sequence',
 '2019-ncov genome sequences',
 '500 cities census tract boundaries',
 '500 cities census tract level data gis friendly format 2016 release',
 '500 cities census tract level data gis friendly format 2017 release',
 '500 cities census tra

In [14]:
def clean_text(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower())

def clean_text_cased(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt))

def totally_clean_text(txt):
    txt = clean_text(txt)
    txt = re.sub(' +', ' ', txt)
    return txt

In [15]:
literal_preds = []

for paper_id in sample_submission['Id']:
    paper = test_papers[paper_id]
    text_1 = '. '.join(section['text'] for section in paper).lower()
    text_2 = totally_clean_text(text_1)
    
    labels = set()
    for label in all_labels:
        if label in text_1 or label in text_2:
            labels.add(clean_text(label))         
    literal_preds.append('|'.join(labels))

In [16]:
literal_preds

['alzheimer s disease neuroimaging initiative adni |pubmed|adni|alzheimer s disease neuroimaging initiative adni',
 'schools and staffing survey|trends in international mathematics and science study|common core of data|progress in international reading literacy study|ipeds|nces common core of data|integrated postsecondary education data system',
 'slosh model|noaa storm surge inundation|sea lake and overland surges from hurricanes',
 'rural urban continuum codes']

In [17]:
def deduplicate(predicted_labels):
    filtered_pred_labels = []
    for labels in predicted_labels:
        filtered = []

        for label in sorted(labels, key=len):
            label = clean_text(label)
            if len(filtered) == 0 or all(jaccard(label, got_label) < 0.75 for got_label in filtered):
                filtered.append(label)

        filtered_pred_labels.append('|'.join(filtered))
    return filtered_pred_labels

In [18]:
def make_predictions(paper, model):    
    sentences = [clean_text_cased(sentence) for section in paper for sentence in section['text'].split(". ")]
    predictions, raw_output = model.predict(sentences)
    tokens = []
    tags = []
    paper_datasets_tokens = []
    paper_datasets_tags = []
    for pred in predictions:
        pred_tokens, pred_tags = [], []
        if "B-DS" in [list(single.values())[0] for single in pred]:
            for single in pred:
                if len({'B-DS', 'I-DS'}.intersection(set(single.values()))) > 0:
                    pred_tokens.append(list(single.keys())[0])
                    pred_tags.append(list(single.values())[0])
            if len(pred_tokens) > 0 and len(pred_tags) > 0:
                paper_datasets_tokens.append(pred_tokens)
                paper_datasets_tags.append(pred_tags)
    return deduplicate([[" ".join(pred) for pred in paper_datasets_tokens]])[0]
    

## Testing model

In [19]:
# Configure the model
model_args = NERArgs()
model_args.num_train_epochs = 4
model_args.classification_report = True
model_args.overwrite_output_dir = True
model_args.train_batch_size = 16
model_args.lazy_loading = True
model_args.max_seq_length = 128
model_args.reprocess_input_data = True
model_args.evaluate_during_training = True
model_args.eval_batch_size = 32
model_args.use_multiprocessing = True  # Set to false for prediction
model_args.use_early_stopping = True
model_args.early_stopping_delta = 0.0001
model_args.early_stopping_metric_minimize = True
model_args.early_stopping_patience = 5
model_args.evaluate_during_training_steps = 800
model_args.wandb_project = "SciBert-colleridge"
model_args.wandb_kwargs = {"resume": True}
model_args.learning_rate = 1e-5
model_args.custom_parameter_groups = [
    {
        "params": ["classifier.weight"],
        "lr": 3e-4,
    },
    {
        "params": ["classifier.bias"],
        "lr": 3e-4
    },
]

# trans_model = NERModel(
#     "auto", "../input/scibertcasedcolleridge/outputs", args=model_args, labels=["O", "B-DS", "I-DS"]
# )

trans_model = NERModel(
    "roberta", "../input/colleridge/outputs", args=model_args, labels=["O", "B-DS", "I-DS"]
)


loading configuration file ../input/colleridge/outputs/config.json
Model config RobertaConfig {
  "_name_or_path": "outputs/best_model",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file ../input/colleridge/outputs/pytorch_model.bin
All model checkpoin

In [20]:
for i, (paper_id, paper) in enumerate(test_papers.items()):
    if literal_preds[i]:
        literal = literal_preds[i]
        print('literal @', literal)
        sample_submission.loc[sample_submission['Id'] == paper_id, 'PredictionString'] = literal
    else:
        model_prediction = make_predictions(paper, trans_model)
        print('model @', model_prediction)
        sample_submission.loc[sample_submission['Id'] == paper_id, 'PredictionString'] = model_prediction

literal @ alzheimer s disease neuroimaging initiative adni |pubmed|adni|alzheimer s disease neuroimaging initiative adni
literal @ schools and staffing survey|trends in international mathematics and science study|common core of data|progress in international reading literacy study|ipeds|nces common core of data|integrated postsecondary education data system
literal @ slosh model|noaa storm surge inundation|sea lake and overland surges from hurricanes
literal @ rural urban continuum codes


In [21]:
sample_submission.to_csv("submission.csv", index=False)

In [22]:
sample_submission

Unnamed: 0,Id,PredictionString
0,2100032a-7c33-4bff-97ef-690822c43466,alzheimer s disease neuroimaging initiative ad...
1,2f392438-e215-4169-bebf-21ac4ff253e1,schools and staffing survey|trends in internat...
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,slosh model|noaa storm surge inundation|sea la...
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes
