# String Mining

## Environment Preparation

In [3]:
# Make imports reload modules
%load_ext autoreload
%autoreload 2
DATA_DIR='../data'

## Data Preparation

In [4]:
# import brown dataset and stopwords from nltk
import nltk
nltk.download('brown')
nltk.download('stopwords')
from nltk.corpus import brown, stopwords

[nltk_data] Downloading package brown to /home/alex/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# preprocess data from brown
import re

def preprocess_brown_sentence(sentence):
    """Preprocess data from brown"""
    return ' '.join(list(filter(lambda x: re.match(r"^[A-Za-z'\-]+$", x), sentence))).lower()

brown_sentences_lowered = [preprocess_brown_sentence(sentence) for sentence in brown.sents()][:1000]

## Substring Mining

In [6]:
# get unique words from brown
from mwe_discovery.phrase_mining.string_utils import get_words
corpus = brown_sentences_lowered
tokens = get_words(corpus)

In [7]:
# get the counts of words in brown
min_support = 25
token_counts = {token: 0 for token in tokens}

for document in corpus:
    words = document.split(' ')

    for word in words:
        if word in token_counts:
            token_counts[word] += 1

supported_tokens = [word for word in token_counts if token_counts[word]>=min_support]

In [8]:
# get frequent patterns in brown
from mwe_discovery.phrase_mining.string_utils import text_pattern_checker, get_candidate_strings
from mwe_discovery.phrase_mining.algorithms import a_priori
pattern_checker = text_pattern_checker
stop_tokens = set(stopwords.words('english'))

frequent_patterns = a_priori(
    corpus=corpus,
    supported_tokens=supported_tokens,
    stop_tokens=stop_tokens,
    candidate_generator=get_candidate_strings,
    pattern_checker=text_pattern_checker,
    min_support=min_support)

In [9]:
# output the frequent patterns that we found
print(frequent_patterns.keys())

with open(f"{DATA_DIR}/brown_phrases_new.txt", 'w') as file:
    file.write('\n'.join(frequent_patterns.keys()))

dict_keys(['the', 'county', 'said', 'an', 'of', 'election', 'no', "''", 'that', 'in', 'city', 'committee', 'which', 'had', 'and', 'for', 'was', 'been', 'by', 'to', 'a', 'this', 'it', 'are', 'or', 'have', 'on', 'other', 'two', 'should', 'be', 'administration', 'is', 'as', 'also', 'at', 'state', 'one', 'program', 'but', 'has', 'with', 'they', 'we', 'some', 'will', 'its', 'from', 'new', 'not', 'there', 'plan', 'tax', 'his', 'more', 'than', 'year', 'home', 'council', 'he', 'who', 'after', 'would', 'party', 'up', 'out', '--', 'were', 'first', 'made', 'million', 'house', 'last', 'school', 'i', 'democratic', 'bill', 'their', 'if', 'president', 'states', 'government', 'united', 'the city', 'the state', 'he said', 'would be'])
