# Hierarchical Mining

## Environment Preparation

In [1]:
%load_ext autoreload
%autoreload 2

# generic imports
from enforce_typing import enforce_types
from collections.abc import Iterable
from typing import Callable, Any
import logging

# constants
# Directory (relative to the notebook's working directory) holding serialized data.
DATA_DIR = '../data'

# logging
# Configure the root logger so INFO-level progress messages are emitted.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

## Data Preparation

### POS Tagging

In [2]:
# import flair
from flair.data import Sentence, Label
from flair.models import SequenceTagger
# Load the "flair/pos-english" sequence tagger once; it is reused by the
# preprocessing function further down. The saved cell output reports that this
# model predicts a dictionary of 53 tags.
tagger = SequenceTagger.load("flair/pos-english")

# sentence = Sentence("I want to go to Burkina Faso")
# tagger.predict(sentence)

# for label in sentence.get_labels():
#     print(f"{label.data_point.text} - {label.value}")

  from .autonotebook import tqdm as notebook_tqdm


2023-05-26 15:23:08,343 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD


#### Preprocessing Utilities

In [3]:
# parse out data from Flair label
@enforce_types
def get_flair_hierarchy(label: Label) -> tuple:
    """Return the hierarchy tuple ``(label.value, label.shortstring)`` for a Flair label.

    The tuple is ordered from the general element (the label value) to the
    specific one (the label's short string). Judging by the mining output saved
    in this notebook, the lowercased short string looks like ``"even"/rb``.
    """
    general = label.value
    specific = label.shortstring
    return general, specific

# print(get_flair_hierarchy(label))

### Getting and Preprocessing Brown Data

In [4]:
# import brown dataset and stopwords from nltk
import nltk

# Fetch every nltk resource this notebook needs before importing the corpora.
for _resource in ('brown', 'stopwords'):
    nltk.download(_resource)

from nltk.corpus import brown, stopwords

[nltk_data] Downloading package brown to /home/alex/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# preprocess and serialize brown sentences with tagger
import pickle
import re

@enforce_types
def hierarchicalize_brown_sentences(tagger: SequenceTagger, sentences: Iterable):
    """Preprocess data from brown using Flair tagging and lower the strings.

    Args:
        tagger: Flair sequence tagger used to label every sentence.
        sentences: iterable of sentences, each an iterable of token strings.
            Only tokens made of letters, apostrophes and hyphens are kept;
            sentences left empty by that filter are dropped.

    Returns:
        A list with one entry per kept sentence. Each entry is a list of
        lowercased hierarchy tuples as produced by ``get_flair_hierarchy``,
        one per label of the tagged sentence.
    """
    word_pattern = re.compile(r"^[A-Za-z'\-]+$")
    joined_sentences = (
        ' '.join(word for word in sentence if word_pattern.match(word))
        for sentence in sentences
    )
    flair_sentences = [Sentence(words) for words in joined_sentences if words]
    batch_size = 1000   # Process sentences in batches of 1000 - https://github.com/flairNLP/flair/issues/1756#issuecomment-657194378

    # Batch over the filtered list that is actually being sliced. Sizing the
    # loop from the raw input would require the input to support len() and
    # would issue trailing empty predict() calls whenever sentences were dropped.
    for start in range(0, len(flair_sentences), batch_size):
        end = start + batch_size
        logger.info(f"Tagging corpus documents: {start}:{end}")
        tagger.predict(flair_sentences[start:end], mini_batch_size=32)

    hierarchicalized_sentences = []

    for flair_sentence in flair_sentences:
        # One lowercased hierarchy tuple per predicted label.
        items = [
            tuple(s.lower() for s in get_flair_hierarchy(label))
            for label in flair_sentence.get_labels()
        ]
        hierarchicalized_sentences.append(items)

    return hierarchicalized_sentences

# brown_sentences_hierarchicalized = hierarchicalize_brown_sentences(tagger, brown.sents())

# with open("data/brown_hierarchicalized.bin", 'wb') as file:
#     pickle.dump(brown_sentences_hierarchicalized, file)

In [7]:
# deserialize brown sentences
import pickle

# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project's own serialization step.
# The context manager guarantees the file handle is closed after loading.
with open(f'{DATA_DIR}/brown_hierarchicalized.bin', 'rb') as file:
    brown_sentences_hierarchicalized = pickle.load(file)

## Hierarchical Item Frequent Pattern Mining

In [9]:
# get unique hierarchy tuples from brown and put them into the tree
from mwe_discovery.phrase_mining.fhptree import FHPTree

# Every distinct hierarchy tuple seen anywhere in the preprocessed sentences.
items = list({
    hierarchy
    for tagged_sentence in brown_sentences_hierarchicalized
    for hierarchy in tagged_sentence
})

# The corpus keeps only the last (most specific) element of each hierarchy tuple.
corpus = [
    [hierarchy[-1] for hierarchy in tagged_sentence]
    for tagged_sentence in brown_sentences_hierarchicalized
]

tree = FHPTree(items)

In [10]:
# create hierarchical checker function
def hierarchical_equality_from_tree(tree: FHPTree) -> Callable[[Any, Any], bool]:
    '''Build a hierarchical equality predicate bound to the given tree.

    The returned function reports True when its two arguments are equal, or
    when either one is found among the other's ancestors as returned by
    ``tree.get_ancestors``.
    '''
    @enforce_types
    def _hierarchical_checker(a, b) -> bool:
        # First direction: a is b itself or one of b's ancestors.
        if a in {b, *tree.get_ancestors(b)}:
            return True
        # Second direction: b is a itself or one of a's ancestors.
        return b in {a, *tree.get_ancestors(a)}

    return _hierarchical_checker

hierarchical_equality_checker = hierarchical_equality_from_tree(tree)

# import itertools
# x = [corpus[0][1], corpus[0][2], 'nnp']
# for p in list(itertools.combinations(x, 2)):
#     print(f"{p[0]}=={p[1]}: {hierarchical_equality_checker(*p)}")

In [11]:
# get the stop items and supported items in brown
import re
from collections import Counter

min_support = 25  # minimum number of occurrences for an item to count as supported

# Occurrence count of every item across all documents (kept as a plain dict).
item_counts = dict(Counter(item for document in corpus for item in document))

items = list(item_counts.keys())
stop_tokens = set(stopwords.words('english'))

# Items look like '"token"/tag'. Parse each item once and test the token for
# set membership instead of building one regex per (stop word, item) pair.
# The tag part accepts any non-slash, non-space characters so that tags such
# as 'prp$' and 'wp$' (possessive pronouns like "my", "whose") are recognised;
# a letters-only tag pattern silently skips those stop words.
item_pattern = re.compile(r'"(.+)"/[^/\s]+')
stop_items = list()

for item in items:
    parsed = item_pattern.fullmatch(item)
    if parsed and parsed.group(1) in stop_tokens:
        stop_items.append(item)

# Each item is visited once, so stop_items cannot contain duplicates.
assert len(stop_items) == len(set(stop_items))
supported_items = [item for item, count in item_counts.items() if count >= min_support]

In [12]:
# create the candidate generation function
# The generator is built from the hierarchy tree and the list of stop items.
# NOTE(review): the saved output of this cell shows
# "ModuleNotFoundError: No module named 'phrase_mining'", yet the following
# cells carry much higher execution counts and use this generator, so that
# output is presumably stale. Re-run from a fresh kernel to confirm.
from mwe_discovery.phrase_mining.hierarchical_representation_utils import candidate_generator_from_tree
item_candidate_generator = candidate_generator_from_tree(tree, stop_items)

ModuleNotFoundError: No module named 'phrase_mining'

In [62]:
# create the pattern checking function
from mwe_discovery.phrase_mining.data_structure_utils import sublist_checker

def hierarchical_pattern_checker(x, y):
    """Delegate to ``sublist_checker(x, y, ...)`` using hierarchical equality.

    Elements are compared with ``hierarchical_equality_checker`` rather than
    plain ``==``.
    """
    return sublist_checker(x, y, hierarchical_equality_checker)

# _pattern = ('dt', '"jury"/nn')
# _pattern = ('"fulton"/nnp', '"county"/nnp')
# print([i for i in range(len(corpus)) if hierarchical_pattern_checker(_pattern, corpus[i])])

In [63]:
# Run the a-priori frequent pattern search over the tagged brown corpus.
# The result is later used as a mapping whose keys are the discovered patterns.
# The saved output of this cell shows pattern tuples being printed during the run.
from mwe_discovery.phrase_mining.hierarchical_representation_utils import a_priori
frequent_patterns = a_priori(
    corpus=corpus,                                 # documents as lists of most-specific items
    supported_tokens=supported_items,              # items occurring at least min_support times
    stop_tokens=stop_items,                        # items whose token is an English stop word
    candidate_generator=item_candidate_generator,  # built from the tree and the stop items
    pattern_checker=hierarchical_pattern_checker,  # sublist check using hierarchical equality
    min_support=min_support)

('"even"/rb', '"the"/dt')
('"even"/rb', '"in"/in')
('"and"/cc', '"even"/rb')
('"even"/rb', '"a"/dt')
('"or"/cc', '"even"/rb')
('"but"/cc', '"even"/rb')
('"even"/rb', '"when"/wrb')
('"not"/rb', '"even"/rb')
('"n\'t"/rb', '"even"/rb')
('rb', '"even"/rb')
('"even"/rb', '"if"/in')
('"even"/rb', 'rb')
('"even"/rb', '"more"/rbr')
('"even"/rb', '"though"/in')
('"even"/rb', 'in')
('"the"/dt', '"national"/nnp')
('"national"/nnp', 'nnp')
('"dominant"/jj', '"stress"/nn')
('"dominant"/jj', 'nn')
('"the"/dt', '"free"/jj')
('"of"/in', '"free"/jj')
('"fort"/nnp', 'nnp')
('jj', '"stress"/nn')
('"the"/dt', '"old"/jj')
('"an"/dt', '"old"/jj')
('"of"/in', '"old"/jj')
('"old"/jj', '"man"/nn')
('"old"/jj', 'nn')
('"years"/nns', '"old"/jj')
('nns', '"old"/jj')
('jj', '"old"/jj')
('"old"/jj', 'jj')
('"felt"/vbd', '"the"/dt')
('"felt"/vbd', '"that"/in')
('"he"/prp', '"felt"/vbd')
('"i"/prp', '"felt"/vbd')
('"instead"/rb', '"of"/in')
('"is"/vbz', '"known"/vbn')
('"known"/vbn', '"as"/in')
('"the"/dt', '"two"/cd

In [66]:
# Preview the last 100 discovered patterns (the keys of frequent_patterns).
list(frequent_patterns.keys())[-100:]

[('"at"/in', '"this"/dt', '"time"/nn'),
 ('"time"/nn', '"to"/in', 'nn'),
 ('"at"/in', '"that"/dt', '"time"/nn'),
 ('"the"/dt', 'jj', '"time"/nn'),
 ('"the"/dt', '"same"/jj', '"time"/nn'),
 ('"a"/dt', '"series"/nn', '"of"/in'),
 ('"a"/dt', '"little"/jj', 'jj'),
 ('"the"/dt', 'nn', '"line"/nn'),
 ('"it"/prp', '"might"/md', '"be"/vb'),
 ('"might"/md', '"have"/vb', '"been"/vbn'),
 ('"the"/dt', 'nn', '"table"/nn'),
 ('nnp', 'nnp', '"company"/nnp'),
 ('"the"/dt', '"national"/nnp', 'nnp', 'nnp'),
 ('"would"/md', '"like"/vb', '"to"/to', 'vb'),
 ('"a"/dt', '"lot"/nn', '"of"/in', 'nn'),
 ('"the"/dt', '"amount"/nn', '"of"/in', 'nn'),
 ('"the"/dt', '"center"/nn', '"of"/in', '"the"/dt'),
 ('"center"/nn', '"of"/in', '"the"/dt', 'nn'),
 ('"the"/dt', '"new"/nnp', '"york"/nnp', 'nnp'),
 ('"the"/dt', '"top"/nn', '"of"/in', '"the"/dt'),
 ('"the"/dt', '"department"/nnp', '"of"/in', 'nnp'),
 ('"i"/prp', '"do"/vbp', '"n\'t"/rb', '"know"/vb'),
 ('"i"/prp', '"do"/vbp', 'rb', '"know"/vb'),
 ('"on"/in', '"the"/

In [68]:
# output the frequent patterns that we found, one pattern per line
# NOTE(review): this writes under "data/" while the input was read from
# DATA_DIR ('../data'); confirm which directory is intended.
with open("data/brown_phrases_hierarchical.txt", 'w') as file:
    pattern_lines = map(str, frequent_patterns.keys())
    file.write('\n'.join(pattern_lines))