In [1]:
import os
import re
import json
import glob
from collections import defaultdict
from functools import partial

import numpy as np 
import pandas as pd 
from tqdm.autonotebook import tqdm
import matplotlib.pyplot as plt 

import nltk
import string
from fuzzywuzzy import fuzz

from model import longest_consecutive_caps as LCC
from model import KMP
# from model.model import longest_consecutive_caps as LCC

# Spacy model
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm
from spacy.training.example import Example



  from tqdm.autonotebook import tqdm


In [2]:
train_df = pd.read_csv("dataset/train.csv")
sample_sub = pd.read_csv('dataset/sample_submission.csv')
train_fp = "dataset/train/"
test_fp = "dataset/test/"

# train_df = pd.read_csv("/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv")
# sample_sub = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
# train_fp = "/kaggle/input/coleridgeinitiative-show-us-the-data/train/"
# test_fp = "/kaggle/input/coleridgeinitiative-show-us-the-data/test/"

In [3]:
train_df.head(5)

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19661 entries, 0 to 19660
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             19661 non-null  object
 1   pub_title      19661 non-null  object
 2   dataset_title  19661 non-null  object
 3   dataset_label  19661 non-null  object
 4   cleaned_label  19661 non-null  object
dtypes: object(5)
memory usage: 768.1+ KB


In [5]:
def read_append_return(filename, train_files_path=train_fp, output='text'):
    """
    Function to read json file and then return the text data from them and append to the dataframe
    """
    json_path = os.path.join(train_files_path, (filename+'.json'))
    headings = []
    contents = []
    combined = []
    with open(json_path, 'r') as f:
        json_decode = json.load(f)
        for data in json_decode:
            headings.append(data.get('section_title'))
            contents.append(data.get('text'))
            combined.append(data.get('section_title'))
            combined.append(data.get('text'))
    
    all_headings = ' '.join(headings)
    all_contents = ' '.join(contents)
    all_data = '. '.join(combined)
    
    if output == 'text':
        return all_contents
    elif output == 'head':
        return all_headings
    else:
        return all_data

In [6]:
%%time
tqdm.pandas()   #tqdm is used to show any code running with a progress bar. 
train_df['text'] = train_df['Id'].progress_apply(read_append_return)

100%|██████████| 19661/19661 [00:10<00:00, 1794.76it/s]

CPU times: user 4.78 s, sys: 1.7 s, total: 6.48 s
Wall time: 11 s





In [7]:
def text_cleaning(text):
    '''
    Converts all text to lower case, Removes special charecters, emojis and multiple spaces
    text - Sentence that needs to be cleaned
    '''
    text = re.sub(' +', ' ', str(text).lower()).strip()
    #text = ''.join([k for k in text if k not in string.punctuation])
    #text = re.sub('[^A-Za-z0-9\(\)]+', ' ', str(text).lower()).strip()
#     text = re.sub("/'+/g", ' ', text)
     
    return text

In [8]:
%%time
tqdm.pandas()
train_df['text'] = train_df['text'].progress_apply(text_cleaning)

100%|██████████| 19661/19661 [00:52<00:00, 375.65it/s]


CPU times: user 49.9 s, sys: 2.65 s, total: 52.6 s
Wall time: 52.5 s


In [9]:
a = train_df["Id"].nunique()
b = train_df["Id"].size
print(a, b)
print(a/b)

14316 19661
0.7281420070189716


```
from model import search_sentences
from custom_classes import PosMap
```

```
%%time
tqdm.pandas()

i = 0
limit = 100
p = 5
q = 4
pre_tf = PosMap(p)
post_tf = PosMap(q)

for row in tqdm(train_df.iterrows()):
    label = row[1]["cleaned_label"]
    text = row[1]["text"].lower()
    act, deact = search_sentences(label, text, pre=p, post=q)

    for j in range(len(act)):
        pre_words = act[j].split()[::-1]
        post_words = deact[j].split()
        for k in range(min([len(pre_words), p])):
            try:
                word = pre_words[k]
                pre_tf[k][word] += 1
            except IndexError:
                print(act[j])
        
        for k in range(min([len(post_words), q])):
            try:
                word = post_words[k]
                post_tf[k][word] += 1
            except IndexError:
                print(deact[j])

    i += 1
    #if i >= limit: break
```

```
pre_tf.plot(idx=0) # the first closest pre words
pre_tf.plot(idx=1) # the second closest pre words
pre_tf.plot(idx=2) # the third
pre_tf.plot(idx=3)
```

```
post_tf.plot(idx=0)
post_tf.plot(idx=1)
post_tf.plot(idx=2)
```

In [10]:
train_df.head()

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label,text
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,this study used data from the national educati...
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,dropping out of high school is not necessarily...
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,", stress satisfactory outcomes for all youth, ..."
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,federal reserve bank of richmond s1. accountin...
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,this article investigates an important factor ...


In [11]:
%%time
tqdm.pandas()
sample_sub['text'] = sample_sub['Id'].progress_apply(partial(read_append_return, train_files_path=test_fp))

100%|██████████| 4/4 [00:00<00:00, 840.50it/s]

CPU times: user 5.23 ms, sys: 2.26 ms, total: 7.49 ms
Wall time: 7.78 ms





In [12]:
def clean_text(txt):
    ''' DO NOT DELETE: Official function for submission text cleaning '''
    #return re.sub('[^A-Za-z0-9\(\)]+', ' ', str(txt).lower()).strip()
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower())

In [19]:
# train data prep
TRAIN_DATA = []
for index, row in tqdm(train_df.iterrows()):
    # get text of each sample test
    train_text = row['text']
    row_id = row['Id']
    label = row['cleaned_label']
    #text = sample_text.lower().split('.')
    text = re.split('[?.,;\n\t&!]', train_text) # can't have sample_text.lower() since I need to find consecutive caps

    # begin search
    # matching
    for sentence in text:
        # clean text 
        sentence = clean_text(sentence)
        dic = {}
        indexed = KMP(label, sentence)
        if indexed != []:
            dic['entities'] = []
            for i in indexed:
                entity = (i, len(label)+i, "DATASET") 
                dic['entities'].append(entity)
            x = (sentence, dic)
            TRAIN_DATA.append(x)

19661it [09:35, 34.14it/s] 


In [None]:
# train data process
model = None
output_dir=Path("/Users/alexsalman/Applications/CSE272_Title_Matching/output")
n_iter=5

#load the model

if model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")

#set up the pipeline

if 'ner' not in nlp.pipe_names:
#     ner = nlp.create_pipe('ner')
    nlp.add_pipe('ner', last=True)
ner = nlp.get_pipe('ner')
    
    
    
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update(
                [example],
                drop=0.5,  
                sgd=optimizer,
                losses=losses)
        print(losses)
        
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    
    

if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Created blank 'en' model






  d_xhat = N * dY - sum_dy - dist * var ** (-1.0) * sum_dy_dist










































































































































































































































































































































































































































































 82%|████████▏ | 52724/64143 [14:22<03:22, 56.41it/s]

In [49]:
# STRING MATCHING BLOCK
temp_1 = [x.lower() for x in train_df['dataset_label'].unique()]
temp_2 = [x.lower() for x in train_df['dataset_title'].unique()]
temp_3 = [x.lower() for x in train_df['cleaned_label'].unique()]

existing_labels = set(temp_1 + temp_2 + temp_3)
id_list = []
lables_list = []
# load model 'EN'
nlp2 = spacy.load(output_dir)
for index, row in tqdm(sample_sub.iterrows()):
    # get text of each sample test
    sample_text = row['text']
    row_id = row['Id']
    temp_df = train_df[train_df['text'] == text_cleaning(sample_text)]

    cleaned_labels = temp_df['cleaned_label'].to_list()

    #text = sample_text.lower().split('.')
    text = re.split('[?.,;\n\t&!]', sample_text) # can't have sample_text.lower() since I need to find consecutive caps

    # begin search
    # matching
    
    for known_label in existing_labels:   # for each label in the known set
        # EXACT MATCH
        if known_label in sample_text.lower():   # find the EXACT label in text 
            cleaned_labels.append(clean_text(known_label)) # if found, then append to the list for further formatting
            
    for sentence in text:
        doc = nlp2(sentence.lower())

        for entity in doc.ents:
            if entity.label_ == 'DATASET':
                cleaned_labels.append(clean_text(entity.text))
                print(entity.text)
                
            
            # CASE 1: FUZZY MATCH
            #value = fuzz.partial_ratio(sentence.lower(), known_label) # I moved .lower() here
            #if value > 85 and value < 100:
                # print('value: ', str(value), known_label) # Alex, you might wanna see what this prints
                # cleaned_labels.append(clean_text(known_label))
        
            # CASE 2: for unknown labels
            # sentence filtering (Longest Consecutive Capitalization)
            #print(sentence)
#             length, rate, filtered_sentence = LCC(sentence)
#             if rate <= 0 or length == 0 or (length == 1 and not sentence.isupper()): 
#                 continue # no consecutive caps found
#             # <insert classifier here>
#             else:
#                 for keyword in ["dataset", "data", "database", "survey", "study", "research", "statistics"]:
#                     if keyword in filtered_sentence.lower():
#                         #pass
#                         cleaned_labels.append(clean_text(filtered_sentence)) # naive
        

    cleaned_labels = [clean_text(x) for x in cleaned_labels]
    cleaned_labels = set(cleaned_labels)
    lables_list.append('|'.join(cleaned_labels))
    id_list.append(row_id)

1it [00:00,  1.21it/s]

adni
trends in international mathematics and science study
students in higher
trends in international mathematics and science study
trends in teacher
school teacher salaries
common core of data
early childhood education
students in higher
trends in international mathematics and science study
students in science
trends in international mathematics and science study
trends in international mathematics and science study
trends in international mathematics and science study
beginning of the
beginning of the
beginning of the
beginning of the


2it [00:06,  3.73s/it]

beginning of the
beginning of the
~26
high confidence of continued
slosh display
slosh model
slosh grids
slosh display
slosh display
slosh and
slosh mom
coastal change science along


3it [00:08,  2.96s/it]

coastal observation station


4it [00:10,  2.56s/it]

trends in store





In [50]:
submission = pd.DataFrame()
submission['Id'] = id_list
submission['PredictionString'] = lables_list

In [51]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
submission.head()

Unnamed: 0,Id,PredictionString
0,2100032a-7c33-4bff-97ef-690822c43466,adni|alzheimer s disease neuroimaging initiati...
1,2f392438-e215-4169-bebf-21ac4ff253e1,school teacher salaries|trends in internationa...
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,coastal change science along|noaa storm surge ...
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,trends in store|rural urban continuum codes


In [52]:
submission.to_csv('submission.csv', index=False)

In [53]:
for item in submission["PredictionString"]:
    print(item)
    print()

adni|alzheimer s disease neuroimaging initiative adni 

school teacher salaries|trends in international mathematics and science study|nces common core of data|beginning of the|common core of data|students in higher|early childhood education|trends in teacher|students in science

coastal change science along|noaa storm surge inundation|slosh model| 26|slosh grids|slosh and|sea lake and overland surges from hurricanes|high confidence of continued|slosh mom|coastal observation station|slosh display

trends in store|rural urban continuum codes

