In [1]:
import os
import re
import json
import glob
from collections import defaultdict
from functools import partial

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
#from tqdm.autonotebook import tqdm

import nltk
import string
from fuzzywuzzy import fuzz

from model import longest_consecutive_caps as LCC
from model import KMP
#from model.model import longest_consecutive_caps as LCC
#from model.model import KMP


# Spacy model
import spacy
#from __future__ import unicode_literals, print_function
#import plac
import random
from pathlib import Path
from tqdm import tqdm
#from spacy.training.example import Example # version 3 only


output_dir="output/"
model = None 
#model = "specified" # specified for transformer + ner only
n_iter = 10 # number of training iteration



In [2]:
#!python -m spacy download en_core_web_trf

In [3]:
train_df = pd.read_csv("dataset/train.csv")
sample_sub = pd.read_csv('dataset/sample_submission.csv')
train_fp = "dataset/train/"
test_fp = "dataset/test/"

#train_df = pd.read_csv("/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv")
#sample_sub = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
#train_fp = "/kaggle/input/coleridgeinitiative-show-us-the-data/train/"
#test_fp = "/kaggle/input/coleridgeinitiative-show-us-the-data/test/"

In [4]:
#train_df.head(5)

In [5]:
#train_df.info()

In [6]:
def read_append_return(filename, train_files_path=train_fp, output='text'):
    """
    Function to read json file and then return the text data from them and append to the dataframe
    """
    json_path = os.path.join(train_files_path, (filename+'.json'))
    headings = []
    contents = []
    combined = []
    with open(json_path, 'r') as f:
        json_decode = json.load(f)
        for data in json_decode:
            headings.append(data.get('section_title'))
            contents.append(data.get('text'))
            combined.append(data.get('section_title'))
            combined.append(data.get('text'))
    
    all_headings = ' '.join(headings)
    all_contents = ' '.join(contents)
    all_data = '. '.join(combined)
    
    if output == 'text':
        return all_contents
    elif output == 'head':
        return all_headings
    else:
        return all_data

In [7]:
%%time
tqdm.pandas()   #tqdm is used to show any code running with a progress bar. 
train_df['text'] = train_df['Id'].progress_apply(read_append_return)

  from pandas import Panel
100%|██████████| 19661/19661 [00:06<00:00, 3200.50it/s]CPU times: user 5.01 s, sys: 1.13 s, total: 6.14 s
Wall time: 6.15 s



In [8]:
def text_cleaning(text):
    '''
    Converts all text to lower case, Removes special charecters, emojis and multiple spaces
    text - Sentence that needs to be cleaned
    '''
    #text = re.sub(' +', ' ', str(text).lower()).strip()
    text = re.sub('[^A-Za-z0-9 ]+', '|', str(text).lower())
    #text = ''.join([k for k in text if k not in string.punctuation])
    #text = re.sub('[^A-Za-z0-9.]+', ' ', str(text).lower()).strip()
#     text = re.sub("/'+/g", ' ', text)
    return text

In [9]:
%%time
tqdm.pandas()
train_df['text'] = train_df['text'].progress_apply(text_cleaning)

100%|██████████| 19661/19661 [00:43<00:00, 451.58it/s]CPU times: user 42.4 s, sys: 1.24 s, total: 43.7 s
Wall time: 43.6 s



In [10]:
a = train_df["Id"].nunique()
b = train_df["Id"].size
print(a, b)
print(a/b)

14316 19661
0.7281420070189716


In [11]:
#train_df.head()

### Prepare training data

In [13]:
TRAIN_DATA = []
if model == "specified":
    token_anno = "entities"
else:
    token_anno = "entities"


for index, row in tqdm(train_df.iterrows()):
    # get text of each sample test
    train_text = row['text']
    row_id = row['Id']
    label = row['cleaned_label']
    m = len(label)
    text = train_text.lower().split('|')
    #text = re.split('[?.,;\n\t&!()]+', train_text) # can't have sample_text.lower() since I need to find consecutive caps

    # begin matching
    for sentence in text:
        # clean text 
        #sentence = clean_text(sentence)
        indexed = KMP(label, sentence)
        if indexed != []:
            n = len(sentence)
            elist = []
            for i in indexed:
                end = m+i
                a = ((end < n and sentence[end] == ' ') or end >= n) # can have NEGATIVE SAMPLING like "ADNI-2" (or are those negative ones?)
                b = ((i > 0 and sentence[i-1] == ' ') or i == 0)
                if a and b:
                    entity = (i, end, "DATASET") 
                    elist.append(entity)
            if elist == []: break # TEST (REMOVE IF NEEDED)
            x = (sentence, {token_anno:elist})
            TRAIN_DATA.append(x)

19661it [08:02, 40.76it/s]


### Train the spaCy model

In [23]:
from spacy.util import minibatch, compounding
#load the model
n_iter = 25 # number of training iteration (overriding n_iter declared in cell 1)
print(spacy.prefer_gpu())

if model == "specified":
    nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])
elif model == "generic":
    nlp = spacy.load("en_core_web_sm")
elif model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")


#set up the pipeline
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner') # version < 3.0
    nlp.add_pipe(ner, last=True) # verions < 3.0
    #nlp.add_pipe("ner", last=True) # version >= 3.0 only
    #ner = nlp.get_pipe("ner") # version >= 3.0 only
else:
    ner = nlp.get_pipe('ner')

#for _, annotations in TRAIN_DATA:
#    for ent in annotations.get('entities'):
#        ner.add_label(ent[2])
ner.add_label("DATASET") # only one category so no for loop


other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    if model == None: 
        optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)) # HYPERPARAMETERS
        for batch in tqdm(batches):
            texts, annotations = zip(*batch)
            nlp.update(
                texts, # or [texts] if not using batch
                annotations, # or [annotations] if not using batch
                drop=0.5,  
                sgd=optimizer,
                losses=losses)
        #for batch in tqdm(batches):
        #for texts, annotations in tqdm(TRAIN_DATA):
        #    for texts, annotations in batch:
        #        doc = nlp.make_doc(texts) # version >= 3
        #        example = Example.from_dict(doc, annotations) # version >= 3
        #        nlp.update(
        #            [example],
        #            drop=0.5, 
        #            sgd=optimizer,
        #            losses=losses) # version >= 3
            
        print("\niteration", itn+1,"ner loss", round(losses['ner'], 3))

True
Created blank 'en' model
  proc.begin_training(
  proc.begin_training(
2882it [01:52, 25.66it/s]
3it [00:00, 26.44it/s]
iteration 1 ner loss 13404.515
2882it [01:51, 25.94it/s]
3it [00:00, 29.84it/s]
iteration 2 ner loss 8051.641
2882it [01:51, 25.94it/s]
3it [00:00, 25.31it/s]
iteration 3 ner loss 7739.962
2882it [01:50, 25.97it/s]
3it [00:00, 28.08it/s]
iteration 4 ner loss 7509.122
2882it [01:47, 26.73it/s]
4it [00:00, 31.12it/s]
iteration 5 ner loss 7297.483
2882it [01:48, 26.59it/s]
3it [00:00, 27.98it/s]
iteration 6 ner loss 7195.442
2882it [01:42, 28.06it/s]
3it [00:00, 29.20it/s]
iteration 7 ner loss 7043.471
2882it [01:42, 28.02it/s]
4it [00:00, 33.02it/s]
iteration 8 ner loss 6935.929
2882it [01:39, 28.93it/s]
4it [00:00, 34.58it/s]
iteration 9 ner loss 6658.808
2882it [01:35, 30.06it/s]
4it [00:00, 32.36it/s]
iteration 10 ner loss 6725.302
2882it [01:36, 30.00it/s]
4it [00:00, 34.12it/s]
iteration 11 ner loss 6642.548
2882it [01:36, 29.97it/s]
4it [00:00, 35.54it/s]
ite

In [15]:
# sample test
#for text, _ in TRAIN_DATA:
#    doc = nlp(text)
#    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
#    break

# save model
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to output


### Test the model

In [16]:
def clean_text(txt):
    ''' DO NOT DELETE: Official function for submission text cleaning '''
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()
    #return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower())

In [17]:
%%time
tqdm.pandas()
sample_sub['text'] = sample_sub['Id'].progress_apply(partial(read_append_return, train_files_path=test_fp))

  from pandas import Panel
100%|██████████| 4/4 [00:00<00:00, 1064.41it/s]CPU times: user 8.74 ms, sys: 305 µs, total: 9.05 ms
Wall time: 7.55 ms



In [18]:
# STRING MATCHING BLOCK
temp_1 = [x.lower() for x in train_df['dataset_label'].unique()]
temp_2 = [x.lower() for x in train_df['dataset_title'].unique()]
temp_3 = [x.lower() for x in train_df['cleaned_label'].unique()]

existing_labels = set(temp_1 + temp_2 + temp_3)
id_list = []
lables_list = []
# load model 'EN'
nlp2 = spacy.load(output_dir)
for index, row in tqdm(sample_sub.iterrows()):
    # get text of each sample test
    sample_text = row['text']
    row_id = row['Id']
    temp_df = train_df[train_df['text'] == text_cleaning(sample_text)]

    cleaned_labels = temp_df['cleaned_label'].to_list()

    text = sample_text.lower().split('.')
    #text = re.split('[?.,;\n\t&!]', sample_text) # can't have sample_text.lower() since I need to find consecutive caps

    # begin search
    # matching
    
    for known_label in existing_labels:   # for each label in the known set
        # EXACT MATCH
        if known_label in sample_text.lower():   # find the EXACT label in text 
            cleaned_labels.append(clean_text(known_label)) # if found, then append to the list for further formatting
            
    for sentence in text:
        doc = nlp2(text_cleaning(sentence))

        for entity in doc.ents:
            if entity.label_ == 'DATASET':
                cleaned_labels.append(clean_text(entity.text))   
            
        # CASE 1: FUZZY MATCH
        #value = fuzz.partial_ratio(sentence.lower(), known_label) # I moved .lower() here
        #if value > 85 and value < 100:
            # print('value: ', str(value), known_label) # Alex, you might wanna see what this prints
            # cleaned_labels.append(clean_text(known_label))
    
        # CASE 2: for unknown labels
        # sentence filtering (Longest Consecutive Capitalization)
        #print(sentence)
#             length, rate, filtered_sentence = LCC(sentence)
#             if rate <= 0 or length == 0 or (length == 1 and not sentence.isupper()): 
#                 continue # no consecutive caps found
#             # <insert classifier here>
#             else:
#                 for keyword in ["dataset", "data", "database", "survey", "study", "research", "statistics"]:
#                     if keyword in filtered_sentence.lower():
#                         #pass
#                         cleaned_labels.append(clean_text(filtered_sentence)) # naive
        
    #cleaned_labels = [clean_text(x) for x in cleaned_labels]
    cleaned_labels = set(cleaned_labels)
    lables_list.append('|'.join(cleaned_labels))
    id_list.append(row_id)

4it [00:14,  3.74s/it]


In [19]:
submission = pd.DataFrame()
submission['Id'] = id_list
submission['PredictionString'] = lables_list

In [20]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
submission.head()

Unnamed: 0,Id,PredictionString
0,2100032a-7c33-4bff-97ef-690822c43466,alzheimer s disease neuroimaging initiative ad...
1,2f392438-e215-4169-bebf-21ac4ff253e1,students in science literacy|common core of da...
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,sea lake and overland surges from hurricanes|n...
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes


In [21]:
submission.to_csv('submission.csv', index=False)

In [22]:
#for item in submission["PredictionString"]:
#    print(item)
#    print()