In [1]:
import os
import re
import json
import glob
from collections import defaultdict
from functools import partial

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
#from tqdm.autonotebook import tqdm

import nltk
import string
from fuzzywuzzy import fuzz

from model import longest_consecutive_caps as LCC
from model import KMP
# from model.model import longest_consecutive_caps as LCC
#from model.model import KMP


# Spacy model
import spacy
#from __future__ import unicode_literals, print_function
#import plac
import random
from pathlib import Path
from tqdm import tqdm
from spacy.training.example import Example


output_dir="output/"
model = None 
#model = "specified" # specified for transformer + ner only
n_iter = 10 # number of training iteration



In [2]:
#!python -m spacy download en_core_web_trf

In [3]:
train_df = pd.read_csv("dataset/train.csv")
sample_sub = pd.read_csv('dataset/sample_submission.csv')
train_fp = "dataset/train/"
test_fp = "dataset/test/"

# train_df = pd.read_csv("/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv")
# sample_sub = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
# train_fp = "/kaggle/input/coleridgeinitiative-show-us-the-data/train/"
# test_fp = "/kaggle/input/coleridgeinitiative-show-us-the-data/test/"

In [4]:
#train_df.head(5)

In [5]:
#train_df.info()

In [6]:
def read_append_return(filename, train_files_path=train_fp, output='text'):
    """
    Function to read json file and then return the text data from them and append to the dataframe
    """
    json_path = os.path.join(train_files_path, (filename+'.json'))
    headings = []
    contents = []
    combined = []
    with open(json_path, 'r') as f:
        json_decode = json.load(f)
        for data in json_decode:
            headings.append(data.get('section_title'))
            contents.append(data.get('text'))
            combined.append(data.get('section_title'))
            combined.append(data.get('text'))
    
    all_headings = ' '.join(headings)
    all_contents = ' '.join(contents)
    all_data = '. '.join(combined)
    
    if output == 'text':
        return all_contents
    elif output == 'head':
        return all_headings
    else:
        return all_data

In [7]:
%%time
tqdm.pandas()   #tqdm is used to show any code running with a progress bar. 
train_df['text'] = train_df['Id'].progress_apply(read_append_return)

  from pandas import Panel
100%|██████████| 19661/19661 [00:05<00:00, 3810.47it/s]

CPU times: user 4.34 s, sys: 845 ms, total: 5.18 s
Wall time: 5.17 s





In [8]:
def text_cleaning(text):
    '''
    Converts all text to lower case, Removes special charecters, emojis and multiple spaces
    text - Sentence that needs to be cleaned
    '''
    text = re.sub(' +', ' ', str(text).lower()).strip()
    #text = ''.join([k for k in text if k not in string.punctuation])
    #text = re.sub('[^A-Za-z0-9\(\)]+', ' ', str(text).lower()).strip()
#     text = re.sub("/'+/g", ' ', text)
    return text

In [9]:
%%time
tqdm.pandas()
train_df['text'] = train_df['text'].progress_apply(text_cleaning)

100%|██████████| 19661/19661 [00:57<00:00, 339.88it/s]

CPU times: user 55.5 s, sys: 2.65 s, total: 58.2 s
Wall time: 57.9 s





In [10]:
a = train_df["Id"].nunique()
b = train_df["Id"].size
print(a, b)
print(a/b)

14316 19661
0.7281420070189716


```
from model import search_sentences
from custom_classes import PosMap
```

```
%%time
tqdm.pandas()

i = 0
limit = 100
p = 5
q = 4
pre_tf = PosMap(p)
post_tf = PosMap(q)

for row in tqdm(train_df.iterrows()):
    label = row[1]["cleaned_label"]
    text = row[1]["text"].lower()
    act, deact = search_sentences(label, text, pre=p, post=q)

    for j in range(len(act)):
        pre_words = act[j].split()[::-1]
        post_words = deact[j].split()
        for k in range(min([len(pre_words), p])):
            try:
                word = pre_words[k]
                pre_tf[k][word] += 1
            except IndexError:
                print(act[j])
        
        for k in range(min([len(post_words), q])):
            try:
                word = post_words[k]
                post_tf[k][word] += 1
            except IndexError:
                print(deact[j])

    i += 1
    #if i >= limit: break
```

```
pre_tf.plot(idx=0) # the first closest pre words
pre_tf.plot(idx=1) # the second closest pre words
pre_tf.plot(idx=2) # the third
pre_tf.plot(idx=3)
```

```
post_tf.plot(idx=0)
post_tf.plot(idx=1)
post_tf.plot(idx=2)
```

In [11]:
#train_df.head()

In [12]:
%%time
tqdm.pandas()
sample_sub['text'] = sample_sub['Id'].progress_apply(partial(read_append_return, train_files_path=test_fp))

100%|██████████| 4/4 [00:00<00:00, 1380.50it/s]

CPU times: user 6.93 ms, sys: 0 ns, total: 6.93 ms
Wall time: 5.71 ms





In [13]:
def clean_text(txt):
    ''' DO NOT DELETE: Official function for submission text cleaning '''
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()
    #return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower())

In [14]:
# train data prep
TRAIN_DATA = []
if model == "specified":
    token_anno = "entities"
else:
    token_anno = "entities"


for index, row in tqdm(train_df.iterrows()):
    # get text of each sample test
    train_text = row['text']
    row_id = row['Id']
    label = row['cleaned_label']
    #text = sample_text.lower().split('.')
    text = re.split('[?.,;\n\t&!()]+', train_text) # can't have sample_text.lower() since I need to find consecutive caps

    # begin matching
    for sentence in text:
        # clean text 
        sentence = clean_text(sentence)
        indexed = KMP(label, sentence)
        if indexed != []:
            elist = []
            for i in indexed:
                entity = (i, len(label)+i, "DATASET") 
                elist.append(entity)
            dic = {token_anno:elist}
            x = (sentence, dic)
            TRAIN_DATA.append(x)

19661it [08:02, 40.73it/s] 


In [15]:
#load the model
print(spacy.require_gpu())

if model == "specified":
    nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])
elif model == "generic":
    nlp = spacy.load("en_core_web_sm")
elif model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")


#set up the pipeline
if 'ner' not in nlp.pipe_names:
    #ner = nlp.create_pipe('ner') # version < 3.0
    #nlp.add_pipe(ner, last=True) # verions < 3.0
    nlp.add_pipe("ner", last=True) # version >= 3.0 only
    ner = nlp.get_pipe("ner")
else:
    ner = nlp.get_pipe('ner')

#for _, annotations in TRAIN_DATA:
#    for ent in annotations.get('entities'):
#        ner.add_label(ent[2])
ner.add_label("DATASET")


other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    if model == None: 
        optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update(
                [example],
                drop=0.5,  
                losses=losses)
            #nlp.update(
            #    [text],
            #    [annotations],
            #    drop=0.5,  
            #    losses=losses)
        print(losses)

True
Created blank 'en' model








































































































































































































































100%|██████████| 61735/61735 [32:11<00:00, 31.97it/s]
  0%|          | 4/61735 [00:00<28:25, 36.20it/s]

{'ner': 12763.427802929644}


100%|██████████| 61735/61735 [32:11<00:00, 31.96it/s]
  0%|          | 3/61735 [00:00<37:01, 27.78it/s]

{'ner': 8673.824000597931}


100%|██████████| 61735/61735 [32:11<00:00, 31.97it/s]
  0%|          | 4/61735 [00:00<31:24, 32.76it/s]

{'ner': 8453.836583715862}


100%|██████████| 61735/61735 [32:13<00:00, 31.92it/s]
  0%|          | 3/61735 [00:00<38:12, 26.93it/s]

{'ner': 8301.717404040895}


100%|██████████| 61735/61735 [32:16<00:00, 31.88it/s]
  0%|          | 3/61735 [00:00<34:36, 29.73it/s]

{'ner': 8278.960201145888}


100%|██████████| 61735/61735 [32:12<00:00, 31.94it/s]
  0%|          | 4/61735 [00:00<30:41, 33.52it/s]

{'ner': 8028.587963753984}


100%|██████████| 61735/61735 [34:06<00:00, 30.17it/s]
  0%|          | 3/61735 [00:00<36:42, 28.03it/s]

{'ner': 8248.407982161856}


100%|██████████| 61735/61735 [33:05<00:00, 31.09it/s]
  0%|          | 4/61735 [00:00<32:06, 32.04it/s]

{'ner': 8123.722051320774}


100%|██████████| 61735/61735 [33:23<00:00, 30.82it/s]
  0%|          | 3/61735 [00:00<36:36, 28.11it/s]

{'ner': 8056.539387324536}


100%|██████████| 61735/61735 [32:50<00:00, 31.33it/s]

{'ner': 8173.5872163282975}





In [16]:
# sample test
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    break

# save model
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Entities [('adni', 'DATASET')]
Saved model to output


In [17]:
# STRING MATCHING BLOCK
temp_1 = [x.lower() for x in train_df['dataset_label'].unique()]
temp_2 = [x.lower() for x in train_df['dataset_title'].unique()]
temp_3 = [x.lower() for x in train_df['cleaned_label'].unique()]

existing_labels = set(temp_1 + temp_2 + temp_3)
id_list = []
lables_list = []
# load model 'EN'
nlp2 = spacy.load(output_dir)
for index, row in tqdm(sample_sub.iterrows()):
    # get text of each sample test
    sample_text = row['text']
    row_id = row['Id']
    temp_df = train_df[train_df['text'] == text_cleaning(sample_text)]

    cleaned_labels = temp_df['cleaned_label'].to_list()

    #text = sample_text.lower().split('.')
    text = re.split('[?.,;\n\t&!]', sample_text) # can't have sample_text.lower() since I need to find consecutive caps

    # begin search
    # matching
    
    for known_label in existing_labels:   # for each label in the known set
        # EXACT MATCH
        if known_label in sample_text.lower():   # find the EXACT label in text 
            cleaned_labels.append(clean_text(known_label)) # if found, then append to the list for further formatting
            
    for sentence in text:
        doc = nlp2(sentence.lower())

        for entity in doc.ents:
            if entity.label_ == 'DATASET':
                cleaned_labels.append(clean_text(entity.text))   
            
        # CASE 1: FUZZY MATCH
        #value = fuzz.partial_ratio(sentence.lower(), known_label) # I moved .lower() here
        #if value > 85 and value < 100:
            # print('value: ', str(value), known_label) # Alex, you might wanna see what this prints
            # cleaned_labels.append(clean_text(known_label))
    
        # CASE 2: for unknown labels
        # sentence filtering (Longest Consecutive Capitalization)
        #print(sentence)
#             length, rate, filtered_sentence = LCC(sentence)
#             if rate <= 0 or length == 0 or (length == 1 and not sentence.isupper()): 
#                 continue # no consecutive caps found
#             # <insert classifier here>
#             else:
#                 for keyword in ["dataset", "data", "database", "survey", "study", "research", "statistics"]:
#                     if keyword in filtered_sentence.lower():
#                         #pass
#                         cleaned_labels.append(clean_text(filtered_sentence)) # naive
        
    cleaned_labels = [clean_text(x) for x in cleaned_labels]
    cleaned_labels = set(cleaned_labels)
    lables_list.append('|'.join(cleaned_labels))
    id_list.append(row_id)

4it [00:12,  3.04s/it]


In [18]:
submission = pd.DataFrame()
submission['Id'] = id_list
submission['PredictionString'] = lables_list

In [19]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
submission.head()

Unnamed: 0,Id,PredictionString
0,2100032a-7c33-4bff-97ef-690822c43466,alzheimer s disease neuroimaging initiative ad...
1,2f392438-e215-4169-bebf-21ac4ff253e1,professional development in mathematics assess...
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,coastal observation station|coastal erosion st...
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,high school graduate|rural urban continuum codes


In [20]:
submission.to_csv('submission.csv', index=False)

In [21]:
for item in submission["PredictionString"]:
    print(item)
    print()

alzheimer s disease neuroimaging initiative adni|adni

professional development in mathematics assessment|engineering and engineering|program focusing on research and|common core of data|school teacher salaries|and teacher professional development in mathematics and science|trends in international mathematics and science study|professional development in mathematics content|early childhood education|professional development in science assessment|nces common core of data|professional development in mathematics|higher education and lower|national center for education|program for international student

coastal observation station|coastal erosion study|coastal management and|26|coastal resources commission|sea lake and overland surges from hurricanes|slosh model|high confidence of continued landward|noaa storm surge inundation|natural resources of national|coastal change science along|high tide in|slosh display

high school graduate|rural urban continuum codes

