In [1]:
import os
import re
import json
import glob
from collections import defaultdict

from functools import partial
import random
from pathlib import Path
from tqdm import tqdm

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import nltk
import string

# Spacy model
import spacy
from spacy.util import minibatch, compounding
#from spacy.training.example import Example # version 3 only

KAGGLE = True

## Preprocessing

In [3]:
if KAGGLE:
    from model.model import longest_consecutive_caps as LCC
    from model.model import KMP

    train_df = pd.read_csv("/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv")
    sample_sub = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
    train_fp = "/kaggle/input/coleridgeinitiative-show-us-the-data/train/"
    test_fp = "/kaggle/input/coleridgeinitiative-show-us-the-data/test/"

else:
    from model import longest_consecutive_caps as LCC
    from model import KMP
    
    train_df = pd.read_csv("dataset/train.csv")
    sample_sub = pd.read_csv('dataset/sample_submission.csv')
    train_fp = "dataset/train/"
    test_fp = "dataset/test/"

In [2]:
#!python -m spacy download en_core_web_trf

In [4]:
#train_df.head(5)

In [5]:
#train_df.info()

In [6]:
def read_append_return(filename, train_files_path=train_fp, output='text'):
    """
    Function to read json file and then return the text data from them and append to the dataframe
    """
    json_path = os.path.join(train_files_path, (filename+'.json'))
    headings = []
    contents = []
    combined = []
    with open(json_path, 'r') as f:
        json_decode = json.load(f)
        for data in json_decode:
            headings.append(data.get('section_title'))
            contents.append(data.get('text'))
            combined.append(data.get('section_title'))
            combined.append(data.get('text'))
    
    all_headings = ' '.join(headings)
    all_contents = ' '.join(contents)
    all_data = '. '.join(combined)
    
    if output == 'text':
        return all_contents
    elif output == 'head':
        return all_headings
    else:
        return all_data

In [7]:
%%time
tqdm.pandas()   #tqdm is used to show any code running with a progress bar. 
train_df['text'] = train_df['Id'].progress_apply(read_append_return)

  from pandas import Panel
100%|██████████| 19661/19661 [00:05<00:00, 3877.35it/s]CPU times: user 4.19 s, sys: 900 ms, total: 5.09 s
Wall time: 5.08 s



In [8]:
def text_cleaning(text):
    '''
    Converts all text to lower case, Removes special charecters, emojis and multiple spaces
    text - Sentence that needs to be cleaned
    '''
    #text = re.sub(' +', ' ', str(text).lower()).strip()
    text = re.sub('[^A-Za-z0-9 ]+', '|', str(text).lower())
    #text = ''.join([k for k in text if k not in string.punctuation])
    #text = re.sub('[^A-Za-z0-9.]+', ' ', str(text).lower()).strip()
#     text = re.sub("/'+/g", ' ', text)
    return text

In [9]:
%%time
tqdm.pandas()
train_df['text'] = train_df['text'].progress_apply(text_cleaning)

100%|██████████| 19661/19661 [00:36<00:00, 539.98it/s]CPU times: user 35.6 s, sys: 1e+03 ms, total: 36.6 s
Wall time: 36.4 s



In [10]:
a = train_df["Id"].nunique()
b = train_df["Id"].size
print(a, b)
print(a/b)

14316 19661
0.7281420070189716


In [11]:
#train_df.head()

## Training
### Choose spaCy model and prepare training data

In [13]:
# training configurations
output_dir="output/"
n_iter = 25 # number of training iteration

# select model backbone
model = None 
#model = "specified" # specified for transformer + ner only
#model = "generic" # a model based on existing en_core_web_sm

token_anno = "entities" # note this may require changes for v3.0+ transformer models

In [14]:
TRAIN_DATA = []

for index, row in tqdm(train_df.iterrows()):
    # get text of each sample test
    train_text = row['text']
    row_id = row['Id']
    label = row['cleaned_label']
    m = len(label)
    text = train_text.lower().split('|')
    #text = re.split('[?.,;\n\t&!()]+', train_text) # can't have sample_text.lower() since I need to find consecutive caps

    # begin matching
    for sentence in text:
        # clean text 
        #sentence = clean_text(sentence)
        indexed = KMP(label, sentence)
        if indexed != []:
            n = len(sentence)
            elist = []
            for i in indexed:
                end = m + i
                a = ((end < n and sentence[end] == ' ') or end >= n) # can have NEGATIVE SAMPLING like "ADNI-2" (or are those negative ones?)
                b = ((i > 0 and sentence[i-1] == ' ') or i == 0)
                if a and b:
                    entity = (i, end, "DATASET") 
                    elist.append(entity)
            if elist == []: break # TEST (REMOVE IF NEEDED)
            x = (sentence, {token_anno:elist})
            TRAIN_DATA.append(x)

19661it [07:14, 45.24it/s]


### Configure training pipeline and train the spaCy model

Download official model with `python3 -m spacy download en_core_web_sm`

In [19]:
print("Using GPU:", spacy.prefer_gpu())

# load the model
if model == "specified":
    nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])
elif model == "generic":
    nlp = spacy.load("en_core_web_sm")
    print("Loaded generic model en_core_web_sm")
elif model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")

# add ner component to pipeline
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner') # version < 3.0
    nlp.add_pipe(ner, last=True) # verions < 3.0
    #nlp.add_pipe("ner", last=True) # version >= 3.0 only
    #ner = nlp.get_pipe("ner") # version >= 3.0 only
else:
    ner = nlp.get_pipe('ner')

# add all labels to the ner
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])
#ner.add_label("DATASET") # only one category so no for loop

# configure optimizer (may not work with spaCy v3 transformers)
optimizer = nlp.begin_training()

# configure pipeline components to disable
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

Using GPU: True
Loaded generic model en_core_web_sm


In [20]:
# start training
with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)) # HYPERPARAMETERS
        # version < 3.0 use thi
        for batch in tqdm(batches):
            texts, annotations = zip(*batch)
            nlp.update(
                texts, # or [texts] if not using batch
                annotations, # or [annotations] if not using batch
                drop=0.5,  
                sgd=optimizer,
                losses=losses)
        
        # For spaCy v3.0+ use the similar for loops in archived.ipynb
            
        print("\titeration %d, ner loss: %.2f"%(itn+1, losses['ner']))

2882it [01:56, 24.64it/s]
4it [00:00, 28.96it/s]	iteration 1, ner loss: 325564.31
2882it [01:53, 25.29it/s]
4it [00:00, 33.17it/s]	iteration 2, ner loss: 311908.03
2882it [01:54, 25.22it/s]
3it [00:00, 25.05it/s]	iteration 3, ner loss: 309088.46
2882it [01:54, 25.21it/s]
4it [00:00, 32.83it/s]	iteration 4, ner loss: 308337.13
2882it [01:55, 25.04it/s]
3it [00:00, 25.71it/s]	iteration 5, ner loss: 307145.44
2882it [01:57, 24.51it/s]
3it [00:00, 28.95it/s]	iteration 6, ner loss: 306298.50
2882it [01:57, 24.48it/s]
3it [00:00, 28.49it/s]	iteration 7, ner loss: 306292.75
2882it [01:52, 25.53it/s]
4it [00:00, 31.77it/s]	iteration 8, ner loss: 305150.08
2882it [01:50, 26.04it/s]
4it [00:00, 32.66it/s]	iteration 9, ner loss: 305558.32
2882it [01:49, 26.21it/s]
4it [00:00, 30.91it/s]	iteration 10, ner loss: 305292.76
2882it [01:46, 26.98it/s]
4it [00:00, 32.66it/s]	iteration 11, ner loss: 304080.17
2882it [01:40, 28.63it/s]
4it [00:00, 35.42it/s]	iteration 12, ner loss: 304364.03
2882it [01:40

## Testing and Exporting Model

In [21]:
# sample test
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    break

# save model
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to output


## Inference with Trained Model
Generates the submission file

In [22]:
def clean_text(txt):
    ''' DO NOT DELETE: Official function for submission text cleaning '''
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()
    #return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower())

In [23]:
%%time
tqdm.pandas()
sample_sub['text'] = sample_sub['Id'].progress_apply(partial(read_append_return, train_files_path=test_fp))

  from pandas import Panel
100%|██████████| 4/4 [00:00<00:00, 1164.52it/s]CPU times: user 4.64 ms, sys: 3.99 ms, total: 8.63 ms
Wall time: 7.43 ms



In [24]:
# STRING MATCHING BLOCK
temp_1 = [x.lower() for x in train_df['dataset_label'].unique()]
temp_2 = [x.lower() for x in train_df['dataset_title'].unique()]
temp_3 = [x.lower() for x in train_df['cleaned_label'].unique()]

existing_labels = set(temp_1 + temp_2 + temp_3)
id_list = []
lables_list = []

# load model 'EN'
#nlp2 = spacy.load(output_dir) # loading an model can be slower?

for index, row in tqdm(sample_sub.iterrows()):
    # get text of each sample test
    sample_text = row['text']
    row_id = row['Id']
    temp_df = train_df[train_df['text'] == text_cleaning(sample_text)]

    cleaned_labels = temp_df['cleaned_label'].to_list()

    texts = sample_text.lower().split('.')
    #text = re.split('[?.,;\n\t&!]', sample_text) # can't have sample_text.lower() since I need to find consecutive caps

    # begin search
    # matching
    
    for known_label in existing_labels:   # for each label in the known set
        # EXACT MATCH
        if known_label in sample_text.lower():   # find the EXACT label in text 
            cleaned_labels.append(clean_text(known_label)) # if found, then append to the list for further formatting
    
    # THIS METHOD BELOW IS MUCH FASTER!!!!!
    # SOURCE: https://spacy.io/usage/processing-pipelines
    # SOURCE 2: https://prrao87.github.io/blog/spacy/nlp/performance/2020/05/02/spacy-multiprocess.html
    # THANK YOU SOOO MUCH!!!
    # Disabling pipeline components also helps
    for doc in nlp.pipe(texts, disable=other_pipes):
        cleaned_labels.extend([clean_text(entity.text) for entity in doc.ents if entity.label_ == "DATASET"])
    

    #for sentence in texts:   
        #doc = nlp(text_cleaning(sentence))
    #    doc = nlp2(text_cleaning(sentence))

    #    for entity in doc.ents:
    #        if entity.label_ == 'DATASET':
    #            cleaned_labels.append(clean_text(entity.text))   
            
        # CASE 1: FUZZYMATCH (see archived)

        # CASE 2: Consecutive Capitalizations (see archived)
        
        
    #cleaned_labels = [clean_text(x) for x in cleaned_labels]
    cleaned_labels = set(cleaned_labels)
    lables_list.append('|'.join(cleaned_labels))
    id_list.append(row_id)

4it [00:01,  2.80it/s]


In [25]:
submission = pd.DataFrame()
submission['Id'] = id_list
submission['PredictionString'] = lables_list

In [26]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
submission.head()

Unnamed: 0,Id,PredictionString
0,2100032a-7c33-4bff-97ef-690822c43466,106k|5 000|alzheimer s disease neuroimaging in...
1,2f392438-e215-4169-bebf-21ac4ff253e1,higher education figure|nces common core of da...
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,slosh point|slosh model|coastal management coo...
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes|program participan...


In [27]:
submission.to_csv('submission.csv', index=False)

In [28]:
for item in submission["PredictionString"]:
    print(item)
    print()

106k|5 000|alzheimer s disease neuroimaging initiative adni|adni|alzheimer s disease neuroimaging initiative adni 

higher education figure|nces common core of data|education e|higher education 6 upper secondary education|mathematics teacher and the science teacher|world arguments|early childhood education|common core of data|program for international student|education qualifications|higher education|behavioral sciences|program taken in higher education departments|national education systems|high school|education in france|education science|school teacher salaries|higher education level|high school for upper secondary school|trends in international mathematics and science study|education figure|education in canada|program focusing on research and taken|beginning of the school year|education 6|trends in teacher preparation|higher education had higher|education and

slosh point|slosh model|coastal management coordinates|coastal observation station|slosh grid|coastal change science along 