In [1]:
import os
import re
import json
import csv
import glob
import random
import time
from collections import defaultdict

from functools import partial
from pathlib import Path
from tqdm import tqdm

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import nltk
import string

# spaCy 
import spacy
from spacy.util import minibatch, compounding
#from spacy.training.example import Example # version 3 only

In [2]:
# choose how to run
KAGGLE = False
TRAINING = False

In [3]:
start_time = time.time()

## Preprocessing

In [4]:
if KAGGLE:
    from model.model import longest_consecutive_caps as LCC
    from model.model import KMP

    train_df = pd.read_csv("/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv")
    sample_sub = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
    train_fp = "/kaggle/input/coleridgeinitiative-show-us-the-data/train/"
    test_fp = "/kaggle/input/coleridgeinitiative-show-us-the-data/test/"
    gvnt_dataset_path = "/kaggle/input/bigger-govt-dataset-list/data_set_800.csv"

    model_dir = "/kaggle/input/pretrained-models/output/" # where to store trained model or load pretrained model

else:
    from model import longest_consecutive_caps as LCC
    from model import KMP
    
    train_df = pd.read_csv("dataset/train.csv")
    sample_sub = pd.read_csv('dataset/sample_submission.csv')
    train_fp = "dataset/train/"
    test_fp = "dataset/test/"
    gvnt_dataset_path = "dataset/gvnt_800.csv"

    model_dir = "output/"

In [5]:
#!python -m spacy download en_core_web_trf

In [6]:
def read_append_return(filename, train_files_path=train_fp, output='text'):
    """
    Function to read json file and then return the text data from them and append to the dataframe
    """
    json_path = os.path.join(train_files_path, (filename+'.json'))
    headings = []
    contents = []
    combined = []
    with open(json_path, 'r') as f:
        json_decode = json.load(f)
        for data in json_decode:
            headings.append(data.get('section_title'))
            contents.append(data.get('text'))
            combined.append(data.get('section_title'))
            combined.append(data.get('text'))
    
    all_headings = ' '.join(headings)
    all_contents = ' '.join(contents)
    all_data = '. '.join(combined)
    
    if output == 'text':
        return all_contents
    elif output == 'head':
        return all_headings
    else:
        return all_data

In [7]:
%%time
tqdm.pandas()   #tqdm is used to show any code running with a progress bar. 
train_df['text'] = train_df['Id'].progress_apply(read_append_return)

  from pandas import Panel
100%|██████████| 19661/19661 [00:05<00:00, 3582.95it/s]CPU times: user 4.57 s, sys: 940 ms, total: 5.51 s
Wall time: 5.49 s



In [8]:
def text_cleaning(text):
    '''
    Converts all text to lower case, Removes special charecters, emojis and multiple spaces
    text - Sentence that needs to be cleaned
    '''
    #text = re.sub(' +', ' ', str(text).lower()).strip()
    text = re.sub('[^A-Za-z0-9 ]+', '|', str(text).lower())
    #text = ''.join([k for k in text if k not in string.punctuation])
    #text = re.sub('[^A-Za-z0-9.]+', ' ', str(text).lower()).strip()
#     text = re.sub("/'+/g", ' ', text)
    return text

In [9]:
%%time
tqdm.pandas()
train_df['text'] = train_df['text'].progress_apply(text_cleaning)

100%|██████████| 19661/19661 [00:36<00:00, 532.04it/s] CPU times: user 35.7 s, sys: 1.44 s, total: 37.1 s
Wall time: 37 s



In [10]:
a = train_df["Id"].nunique()
b = train_df["Id"].size
print(a, b)
print(a/b)

14316 19661
0.7281420070189716


In [11]:
#train_df.head()

## Training
If `TRAINING` is enabled, choose spaCy model and prepare training data

In [12]:
# TRAINING CONFIGURATIONS

# select model backbone
model = None 
#model = "specified"            # specified for transformer + ner only
#model = "generic"              # a model based on existing en_core_web_sm

# data preparation configurations
token_anno = "entities"         # note this may require changes for v3.0+ transformer models
negative_sample = True          # False: only prepares X_1; True: also prepares X_0
neg_sample_rate = 0.005         # only this portion of negative samples will be added to training set

# training configurations
n_iter = 15 # number of training iteration
minibatch_min = 4.0
minibatch_max = 32.0
minibatch_int = 1.001
droprate = 0.5

In [13]:
if TRAINING:
    TRAIN_DATA = []
    x1, x2, x3, x4 = 0, 0, 0, 0

    print("Preparing training data...")

    for index, row in tqdm(train_df.iterrows()):
        # get text of each sample test
        train_text = row['text']
        row_id = row['Id']
        label = row['cleaned_label']
        m = len(label)
        text = train_text.lower().split('|')
        #text = re.split('[?.,;\n\t&!()]+', train_text) # can't have sample_text.lower() since I need to find consecutive caps

        # begin matching
        for sentence in text:
            x1 += 1
            # clean text 
            #sentence = clean_text(sentence)
            indexed = KMP(label, sentence)
            if indexed != []:
                n = len(sentence)
                elist = []
                for i in indexed:
                    end = m + i
                    a = ((end < n and sentence[end] == ' ') or end >= n) # can have NEGATIVE SAMPLING like "ADNI-2" (or are those negative ones?)
                    b = ((i > 0 and sentence[i-1] == ' ') or i == 0)
                    if a and b:
                        entity = (i, end, "DATASET") 
                        elist.append(entity)
                        x3 += 1
                if elist == []: break # TEST (REMOVE IF NEEDED)
                x = (sentence, {token_anno:elist})
                TRAIN_DATA.append(x)
            
            elif negative_sample: 
                x2 += 1
                if random.random() < neg_sample_rate:
                    x4 += 1
                    TRAIN_DATA.append((sentence, {token_anno:[]})) # TEST

In [14]:
if TRAINING:
    print("[INFO]")
    print("Among %d sentences,"%x1)
    print("%d sentences have no title in them,"%x2)
    print("meaning %.3f%% do not have title."%(x2/x1*100))
    print("So we have %d sentences with titles."%(x1-x2))
    print("In %d sentences, we obtained %d tokens (positive samples) that perfectly match titles."%(x1-x2, x3))
    print("For sentences without any title, we chose %d negative samples."%x4)
    print("The ratio of positive vs negative samples is %.2f : 100."%(x3 / x4 * 100))

### Configure training pipeline and train the spaCy model

Download official model with `python3 -m spacy download en_core_web_sm`

In [15]:
if TRAINING:
    print("Using GPU:", spacy.prefer_gpu())

    # load the model
    if model == "specified":
        nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])
    elif model == "generic":
        nlp = spacy.load("en_core_web_sm")
        print("Loaded generic model en_core_web_sm")
    elif model is not None:
        nlp = spacy.load(model)  
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  
        print("Created blank 'en' model")

    # add ner component to pipeline
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner') # version < 3.0
        nlp.add_pipe(ner, last=True) # verions < 3.0
        #nlp.add_pipe("ner", last=True) # version >= 3.0 only
        #ner = nlp.get_pipe("ner") # version >= 3.0 only
    else:
        ner = nlp.get_pipe('ner')

    # add all labels to the ner
    #for _, annotations in TRAIN_DATA:
    #    for ent in annotations.get('entities'):
    #        ner.add_label(ent[2])
    ner.add_label("DATASET") # only one category so no for loop

    # configure optimizer (may not work with spaCy v3 transformers)
    optimizer = nlp.begin_training()

    # configure pipeline components to disable
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

In [16]:
if TRAINING:
    # start training
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(minibatch_min, minibatch_max, minibatch_int)) # HYPERPARAMETERS
            # version < 3.0 use this
            for batch in tqdm(batches):
                texts, annotations = zip(*batch)
                nlp.update(
                    texts, # or [texts] if not using batch
                    annotations, # or [annotations] if not using batch
                    drop=droprate,  
                    sgd=optimizer,
                    losses=losses)
            
            # For spaCy v3.0+ use the similar for loops in archived.ipynb
                
            print("\titeration %d, ner loss: %.2f"%(itn+1, losses['ner']))

## Testing and Exporting Model

In [17]:
if TRAINING:
    # sample test
    #i = 0
    #for text, _ in TRAIN_DATA:
    #    doc = nlp(text)
    #    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    #    i += 1
    #    if i > 50: break

    # save model
    if model_dir is not None:
        model_dir = Path(model_dir)
        if not model_dir.exists():
            model_dir.mkdir()
        nlp.to_disk(model_dir)
        print("Saved model to", model_dir)

## Inference with Trained Model
Generates the submission file

In [18]:
def clean_text(txt):
    ''' DO NOT DELETE: Official function for submission text cleaning '''
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()

In [19]:
%%time
tqdm.pandas()
sample_sub['text'] = sample_sub['Id'].progress_apply(partial(read_append_return, train_files_path=test_fp))

100%|██████████| 4/4 [00:00<00:00, 1115.65it/s]CPU times: user 7.79 ms, sys: 325 µs, total: 8.12 ms
Wall time: 6.98 ms



In [20]:
def load_gvnt_dataset():
    with open(gvnt_dataset_path) as f:
        reader = csv.reader(f)
        my_list = list(reader)
    dataset = [row[0] for row in my_list][1:]
    return dataset

In [21]:
# STRING MATCHING BLOCK
temp_1 = [x.lower() for x in train_df['dataset_label'].unique()]
temp_2 = [x.lower() for x in train_df['dataset_title'].unique()]
temp_3 = [x.lower() for x in train_df['cleaned_label'].unique()]
gvnt_set = load_gvnt_dataset()

existing_labels = set(temp_1 + temp_2 + temp_3 + gvnt_set)
id_list = []
lables_list = []

# load model 'EN'
if not TRAINING:
    print("Using GPU:", spacy.prefer_gpu())
    nlp = spacy.load(model_dir) # loading an model can be slower?
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']


for index, row in tqdm(sample_sub.iterrows()):
    # get text of each sample test
    sample_text = row['text']
    row_id = row['Id']
    temp_df = train_df[train_df['text'] == text_cleaning(sample_text)]

    cleaned_labels = temp_df['cleaned_label'].to_list()

    texts = sample_text.lower().split('.')
    #text = re.split('[?.,;\n\t&!]', sample_text) # can't have sample_text.lower() since I need to find consecutive caps

    # begin search
    # matching
    
    for known_label in existing_labels:   # for each label in the known set
        # EXACT MATCH
        if known_label in sample_text.lower():   # find the EXACT label in text 
            cleaned_labels.append(clean_text(known_label)) # if found, then append to the list for further formatting
    
    # THIS METHOD BELOW IS MUCH FASTER!!!!!
    # SOURCE: https://spacy.io/usage/processing-pipelines
    # SOURCE 2: https://prrao87.github.io/blog/spacy/nlp/performance/2020/05/02/spacy-multiprocess.html
    # THANK YOU SOOO MUCH!!!
    # Disabling pipeline components also helps
    for doc in nlp.pipe(texts, disable=other_pipes):
        cleaned_labels.extend([clean_text(entity.text) for entity in doc.ents if entity.label_ == "DATASET"])
    

    #for sentence in texts:   
        #doc = nlp(text_cleaning(sentence))
    #    doc = nlp2(text_cleaning(sentence))

    #    for entity in doc.ents:
    #        if entity.label_ == 'DATASET':
    #            cleaned_labels.append(clean_text(entity.text))   
            
        # CASE 1: FUZZYMATCH (see archived)

        # CASE 2: Consecutive Capitalizations (see archived)
        
        
    #cleaned_labels = [clean_text(x) for x in cleaned_labels]
    cleaned_labels = set(cleaned_labels)
    lables_list.append('|'.join(cleaned_labels))
    id_list.append(row_id)

Using GPU: True
4it [00:05,  1.33s/it]


In [22]:
submission = pd.DataFrame()
submission['Id'] = id_list
submission['PredictionString'] = lables_list

In [23]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
submission.head()

Unnamed: 0,Id,PredictionString
0,2100032a-7c33-4bff-97ef-690822c43466,alzheimer s disease neuroimaging initiative ad...
1,2f392438-e215-4169-bebf-21ac4ff253e1,education 3 to|nces common core of data|trends...
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,slosh display program|8 00|slosh and|slosh poi...
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes


In [24]:
submission.to_csv('submission.csv', index=False)

In [25]:
#for item in submission["PredictionString"]:
#    print(item)
#    print()

In [26]:
end_time = time.time()
print("Total time spent: %.2f seconds"%(end_time-start_time))

Total time spent: 61.73 seconds
