# Entropy filter for SQuAD-like dataset

In [1]:
import collections
import copy
import json
from math import log2
import matplotlib.pyplot as plt
import nltk
import os
import pandas as pd
import random
import re
import spacy
import statistics as st
import string
import threading
import time
from time import sleep

In [2]:
# Relative path of the artifacts directory. No cell visible in this part of the
# notebook reads it.
ARTIFACTS_PATH = '../artifacts'

In [3]:
# Per-language settings, keyed by region code.
# 'spacy_dict' is the pipeline name handed to spacy.load(). An empty string
# (Vietnamese) means no pipeline name is configured; get_word_tokenizer() then
# builds the bare spaCy language class for the region instead.
LANGS = {
    'en': {
        'region_code': 'en',
        'lang_name': 'English',
        'lang_code': 'en-EN',
        'spacy_dict': 'en_core_web_sm',
    },
    'es': {
        'region_code': 'es',
        'lang_name': 'Spanish',
        'lang_code': 'es-ES',
        'spacy_dict': 'es_core_news_sm',
    },
    'ru': {
        'region_code': 'ru',
        'lang_name': 'Russian',
        'lang_code': 'ru-RU',
        'spacy_dict': 'ru_core_news_sm',
    },
    'vi': {
        'region_code': 'vi',
        'lang_name': 'Vietnamese',
        'lang_code': 'vi-VN',
        'spacy_dict': '',
    },
    'ja': {
        'region_code': 'ja',
        'lang_name': 'Japanese',
        'lang_code': 'ja-JP',
        'spacy_dict': 'ja_core_news_sm',
    },
}

## Tokenization

In [4]:
def get_word_tokenizer(dictionary_name=None, region_code=None, basic_token=False):
    """Build a word-tokenizing callable backed by spaCy.

    Args:
        dictionary_name: Name of the spaCy pipeline to load with
            ``spacy.load``. When it is empty or ``None``, the bare language
            class selected by ``region_code`` is instantiated instead and a
            ``sentencizer`` pipe is added to it.
        region_code: One of 'en', 'es', 'ru', 'vi', 'ja'. Only consulted when
            no ``dictionary_name`` is given.
        basic_token: When true, the returned callable yields plain token
            strings; otherwise it yields ``(text, lemma, [start, end])``
            tuples, where start/end are character offsets into the input.

    Returns:
        A function that maps a string to a list of tokens.

    Raises:
        ValueError: No ``dictionary_name`` was given and ``region_code`` is
            not one of the supported codes.
    """
    if not dictionary_name:
        # `not dictionary_name` also covers the default None, which would
        # otherwise be passed straight to spacy.load().
        language_classes = {
            'en': 'English',
            'es': 'Spanish',
            'ru': 'Russian',
            'vi': 'Vietnamese',
            'ja': 'Japanese',
        }
        class_name = language_classes.get(region_code)
        if class_name is None:
            raise ValueError('Unknown region code: %s' % region_code)
        # Imported lazily so that only the requested language module is loaded.
        import importlib
        language_module = importlib.import_module('spacy.lang.%s' % region_code)
        nlp = getattr(language_module, class_name)()
        nlp.add_pipe('sentencizer')
    else:
        nlp = spacy.load(dictionary_name)

    if basic_token:
        def tokenize(text):
            return [token.text for token in nlp(text)]
    else:
        def tokenize(text):
            return [
                (token.text, token.lemma_, [token.idx, token.idx + len(token.text)])
                for token in nlp(text)
            ]

    return tokenize

In [5]:
def get_sent_tokenizer(dictionary_name=None, region_code=None):
    """Return a callable that splits a text into stripped sentence strings.

    For region code 'vi' a bare ``Vietnamese`` pipeline with a ``sentencizer``
    pipe is built; for every other region the pipeline named by
    ``dictionary_name`` is loaded through ``spacy.load``.
    """
    if region_code != 'vi':
        pipeline = spacy.load(dictionary_name)
    else:
        from spacy.lang.vi import Vietnamese
        pipeline = Vietnamese()
        pipeline.add_pipe('sentencizer')

    def split_sentences(text):
        # Each sentence span is reduced to its text with outer whitespace removed.
        return [span.text.strip() for span in pipeline(text).sents]

    return split_sentences

In [6]:
# Deletes every character of string.punctuation (ASCII punctuation only).
# Built once instead of rebuilding a set for each character of the input.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def normalize_text(text):
    """Strip ASCII punctuation, lowercase, and trim outer whitespace.

    Only the characters in ``string.punctuation`` are removed, so non-ASCII
    marks such as '¡' are kept.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    return text.translate(_PUNCTUATION_TABLE).lower().strip()

In [7]:
# Demo: word-tokenize a Spanish sample. With basic_token left at its default,
# each result is a (text, lemma, [start, end]) tuple.
text = '¡Hola mundo! ¡Adiós mundo!'
region_code = 'es'

dictionary_name = LANGS[region_code]['spacy_dict']
tokenizer = get_word_tokenizer(dictionary_name=dictionary_name, region_code=region_code)
tokenizer(text)

[('¡', '¡', [0, 1]),
 ('Hola', 'Hola', [1, 5]),
 ('mundo', 'mundo', [6, 11]),
 ('!', '!', [11, 12]),
 ('¡', '¡', [13, 14]),
 ('Adiós', 'Adiós', [14, 19]),
 ('mundo', 'mundo', [20, 25]),
 ('!', '!', [25, 26])]

In [8]:
# Demo: split a Spanish sample into sentences.
text = 'Hola mundo. ¡Adiós mundo!'
region_code = 'es'

dictionary_name = LANGS[region_code]['spacy_dict']
tokenizer = get_sent_tokenizer(dictionary_name=dictionary_name, region_code=region_code)
tokenizer(text)

['Hola mundo.', '¡Adiós mundo!']

## Formulas

In [13]:
def entropy_shannon(boe):
    """Return the Shannon entropy, in bits, of a frequency table.

    Args:
        boe: Mapping from element to its count (for example a
            ``collections.Counter``).

    Returns:
        ``-sum(p * log2(p))`` over the elements, with ``p = count / total``.
        Returns 0.0 when there is nothing to count.
    """
    # Elements with a zero (or negative) count contribute nothing; dropping
    # them avoids log2(0) and a division by a zero total.
    counts = [freq for freq in boe.values() if freq > 0]
    total = sum(counts)
    if total == 0:
        return 0.0
    return -sum(freq / total * log2(freq / total) for freq in counts)

In [15]:
# Demo: word-level Shannon entropy of a short text.
# NOTE(review): the sample is English but the Spanish settings are used;
# confirm this is intended.
text = 'this car this car this'
region_code = 'es'

# Look the pipeline name up here rather than relying on whatever value an
# earlier cell left in `dictionary_name`.
dictionary_name = LANGS[region_code]['spacy_dict']
tokenizer = get_word_tokenizer(dictionary_name=dictionary_name, region_code=region_code)
text = normalize_text(text)
words = [x[0] for x in tokenizer(text)]
boe = collections.Counter(words)

entropy_shannon(boe)

0.9709505944546686

## Stats

In [11]:
class ItemPicker():
    """Hand out the entries of a dict one at a time to concurrent workers.

    The entries are snapshotted at construction time and returned in the
    dict's iteration order. Access is serialized with a ``threading.Lock``,
    so each entry is handed to exactly one caller.

    Args:
        dataset: Mapping whose ``(key, value)`` pairs are handed out.
        timeout: Maximum number of seconds ``lock`` waits for the lock.
        sleep_interval: Accepted for backward compatibility; the lock blocks
            instead of polling, so the value is stored but not used.
    """

    def __init__(self, dataset, timeout=60, sleep_interval=1):
        # A deque gives O(1) removal from the front.
        self._items = collections.deque(dataset.items())
        self._timeout = timeout
        self._sleep_interval = sleep_interval
        self._lock = threading.Lock()

    def pick(self):
        """Remove and return the next ``(name, props)`` pair.

        Returns:
            The next pair, or ``(None, None)`` once every entry was handed out.

        Raises:
            TimeoutError: The lock could not be acquired within ``timeout``.
        """
        self.lock()
        try:
            if self._items:
                return self._items.popleft()
            return None, None
        finally:
            # Always release, even if something above raises.
            self.unlock()

    def lock(self):
        """Acquire the picker's lock, waiting at most ``timeout`` seconds.

        Raises:
            TimeoutError: The lock was still held when the timeout expired.
        """
        # Lock.acquire rejects negative timeouts other than -1, so clamp at 0.
        if not self._lock.acquire(timeout=max(0, self._timeout)):
            raise TimeoutError('Cannot pick an item (timeout)')

    def unlock(self):
        """Release the picker's lock; a no-op when it is not held."""
        try:
            self._lock.release()
        except RuntimeError:
            # Releasing an unlocked Lock raises; unlocking twice stays harmless.
            pass

In [12]:
def filter_dataset_by_entropy(input_file, output_file, region_code,
                              min_word_entropy, max_word_entropy, verbose=False):
    """Write a copy of a dataset keeping only paragraphs within an entropy range.

    The input JSON is read as ``{'version': ..., 'data': [{'title': ...,
    'paragraphs': [{'context': ..., 'qas': ...}, ...]}, ...]}``. For every
    paragraph the context is normalized, word-tokenized, and the Shannon
    entropy of its word counts is computed. A paragraph is kept when that
    entropy lies in ``[min_word_entropy, max_word_entropy]``. Each kept
    paragraph becomes its own entry in the output's ``data`` list, carrying
    the title of the item it came from.

    Args:
        input_file: Path of the JSON dataset to read.
        output_file: Path the filtered JSON dataset is written to.
        region_code: Key into ``LANGS`` selecting the tokenizer settings.
        min_word_entropy: Lowest word entropy a kept paragraph may have.
        max_word_entropy: Highest word entropy a kept paragraph may have.
        verbose: When true, print progress to stdout.

    Returns:
        A dict of statistic lists. Nothing in this function fills them, so
        every list is returned empty.
    """
    dictionary_name = LANGS[region_code]['spacy_dict']
    stats = {
        'n_words': [],
        'entropy_shannon_chars': [],
        'entropy_shannon_words': [],
        'n_words_ps': [],
        'entropy_shannon_chars_ps': [],
        'entropy_shannon_words_ps': [],
    }

    # Only word tokens are needed; no sentence tokenizer is built, which
    # avoids loading a second spaCy pipeline on every call.
    basic_tokenizer = get_word_tokenizer(
        dictionary_name=dictionary_name, region_code=region_code, basic_token=True)

    with open(input_file, 'r', encoding='utf8') as fp:
        dataset = json.load(fp)

    if verbose:
        print('Filtering elements...')

    filtered_dataset = {'version': 'filtered_%s' % dataset['version'], 'data': []}
    items = dataset['data']
    n_items = len(items)
    for position, item in enumerate(items, start=1):
        if verbose:
            # '\r' keeps the progress counter on a single console line.
            print('- Item %d / %d' % (position, n_items), end='\r')

        for paragraph_item in item['paragraphs']:
            context = paragraph_item['context']
            words = basic_tokenizer(normalize_text(context))
            words_entropy = entropy_shannon(collections.Counter(words))

            if words_entropy < min_word_entropy or words_entropy > max_word_entropy:
                continue

            filtered_dataset['data'].append({
                'title': item['title'],
                'paragraphs': [
                    {
                        'context': context,
                        'qas': paragraph_item['qas'],
                    }
                ]
            })
    if verbose:
        # Move past the '\r' progress line.
        print()

    with open(output_file, 'w', encoding='utf8') as fp:
        json.dump(filtered_dataset, fp)

    return stats

In [13]:
def thread_main(end_status, item_picker, min_word_entropy, max_word_entropy):
    """Worker loop: filter datasets from the picker until none are left.

    Each picked entry is unpacked as ``(input_file, output_file, region_code)``
    and passed to ``filter_dataset_by_entropy`` with the given entropy bounds.
    When the picker returns ``None`` as the name, ``1`` is appended to
    ``end_status`` and the function returns.
    """
    name, props = item_picker.pick()
    while name is not None:
        print('Processing dataset: %s' % name)
        source_path, target_path, region = props
        filter_dataset_by_entropy(
            source_path, target_path, region, min_word_entropy, max_word_entropy)
        print('Finished dataset: %s' % name)
        name, props = item_picker.pick()
    end_status.append(1)

In [15]:
# One entry per (top-n, language) pair, in top-n-major order. Each value is
# [input path, output path, region code].
_SYNTHETIC_ROOT = '../data/synthetic_google_top_n'
datasets = {
    'synthetic-%s-top%d' % (lang, top_n): [
        '%s/%s/top_%d/train-synthetic.json' % (_SYNTHETIC_ROOT, lang, top_n),
        '%s/%s/top_%d/filtered-train-synthetic.json' % (_SYNTHETIC_ROOT, lang, top_n),
        lang,
    ]
    for top_n in (1, 2, 3, 5)
    for lang in ('es', 'ru', 'vi', 'ja')
}

In [16]:
# Run the entropy filter over every entry of `datasets` using worker threads
# that pull their work from a shared ItemPicker.
item_picker = ItemPicker(datasets)
n_threads = 10

# Paragraphs whose word entropy falls outside
# [min_word_entropy, max_word_entropy] are dropped by the filter.
min_word_entropy = 5
max_word_entropy = 7

all_main_threads = []
end_status = []  # each worker appends 1 here when its loop ends
for _ in range(n_threads):
    x = threading.Thread(
        target=thread_main,
        args=(end_status, item_picker, min_word_entropy, max_word_entropy))
    x.start()
    all_main_threads.append(x)
    sleep(1)  # start the workers one second apart

# Wait for the workers in the order they were started.
for i, x in enumerate(all_main_threads):
    x.join()
    print('Finished %d / %d threads...' % (i + 1, n_threads))

Processing dataset: synthetic-vi-top1
Processing dataset: synthetic-vi-top2
Processing dataset: synthetic-vi-top3
Processing dataset: synthetic-vi-top5
Finished dataset: synthetic-vi-top1
Finished 1 / 10 threads...
Finished dataset: synthetic-vi-top2
Finished 2 / 10 threads...
Finished dataset: synthetic-vi-top3
Finished 3 / 10 threads...
Finished dataset: synthetic-vi-top5
Finished 4 / 10 threads...
Finished 5 / 10 threads...
Finished 6 / 10 threads...
Finished 7 / 10 threads...
Finished 8 / 10 threads...
Finished 9 / 10 threads...
Finished 10 / 10 threads...
