# Phrase (collocation) Detection Exercise

###### Author: Alex Sherman | alsherman@deloitte.com

#### Agenda
1. SpaCy POS phrases
2. Gensim Phrases and Phraser

In [None]:
from collections import defaultdict
from configparser import ConfigParser, ExtendedInterpolation

from IPython.core.display import display, HTML
import pandas as pd
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from spacy.lang.en.stop_words import STOP_WORDS
from sqlalchemy import create_engine

pd.set_option('display.max_colwidth', 100)

In [None]:
# configuration for data, acronyms, and gensim paths
CONFIG_PATH = '../../config.ini'

config = ConfigParser(interpolation=ExtendedInterpolation())

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail fast here with a clear message instead
# of hitting an opaque KeyError on the section lookups below
if not config.read(CONFIG_PATH):
    raise FileNotFoundError('could not read config file: {}'.format(CONFIG_PATH))

# database connection string and the NLP input/output paths used in this notebook
DB_PATH = config['DATABASES']['PROJECT_DB_PATH']
MATCHED_TEXT_PATH = config['NLP']['MATCHED_TEXT_PATH']
CLEANED_TEXT_PATH = config['NLP']['CLEANED_TEXT_PATH']
GENSIM_DICTIONARY_PATH = config['NLP']['GENSIM_DICTIONARY_PATH']
GENSIM_CORPUS_PATH = config['NLP']['GENSIM_CORPUS_PATH']

#### Load data

In [None]:
# count # of documents
engine = create_engine(DB_PATH)
pd.read_sql("SELECT COUNT(*) FROM pubmed ", con=engine)

In [None]:
# load a sample of at most 500 rows from the pubmed table
df = pd.read_sql("SELECT * FROM pubmed LIMIT 500", con=engine)

# preview the first rows of the sample (no filtering is applied here)
df.head()

In [None]:
# collect the 'text' column of the sample into a plain python list
text = list(df['text'])

# number of documents collected
len(text)

### SpaCy - Preprocessing

In [None]:
%%time

# load spacy nlp model
# use 'en' if you don't have the lg model
nlp = spacy.load('en_core_web_lg')

###### collect sentences using SpaCy matcher

In [None]:
def collect_phrase_model_sents(matcher, doc, i, matches):
    """ SpaCy Matcher callback: store the sentence that contains match i

    matcher -- the matcher that triggered the callback (unused)
    doc     -- the document being matched
    i       -- position of the current match within matches
    matches -- list of (match_id, start, end) tuples for the document

    The sentence text is appended to the module-level matched_sents list,
    which must exist before the matcher is run. Nothing is returned.
    """
    # only the token offsets are needed; the match id is ignored
    _, span_start, span_end = matches[i]

    # widen the matched span to the full sentence around it and keep its text
    sentence_text = doc[span_start:span_end].sent.text
    matched_sents.append(sentence_text)

##### match sentences

https://explosion.ai/demos/matcher

In [None]:
%%time

# guard for the slow matching step: the condition below is always true, so the
# block runs; change it to a false statement (e.g. 1 == 0) to skip the step
if 1 == 1:
    # match sentences with the word disease or disorder
    # the callback appends every matching sentence to this list
    matched_sents = []
    # two single-token patterns, one per keyword, compared on the LOWER attribute
    pattern = [[{'LOWER': 'disease'}], [{'LOWER': 'disorder'}]]

    matcher = Matcher(nlp.vocab)

    # use *patterns to add more than one pattern at once
    # collect_phrase_model_sents is registered as the callback for both patterns
    matcher.add('disease_disorder', collect_phrase_model_sents, *pattern)

    # tagger and ner are disabled for this pass; the parser is left enabled
    # NOTE(review): the callback reads span.sent, which presumably needs the
    # parser's sentence boundaries - confirm before disabling more components
    for doc in nlp.pipe(text, disable=['tagger','ner']):    
        matcher(doc)

In [None]:
print('Number of matches: {} \n'.format(len(matched_sents)))

print('Example Match:')
print(matched_sents[0])

##### Export matched text to avoid repeating processing

In [None]:
# view path to matched text
MATCHED_TEXT_PATH

In [None]:
# uncomment below to write the matched text to a .txt file for later use 

#with open(MATCHED_TEXT_PATH, 'w') as f:
#    for line in matched_sents:
#        line += '\n'
#        line = line.encode('ascii', errors='ignore').decode('ascii') 
#        f.write(line)

In [None]:
# read matched text
with open(MATCHED_TEXT_PATH, 'r') as f:
    matched_sents_full = [line.strip() for line in f.readlines()]

In [None]:
matched_sents_full[0]

In [None]:
# build a one-column dataframe of the matched sentences and drop repeated rows
matches_df = (
    pd.DataFrame(matched_sents_full, columns=['sentences'])
    .drop_duplicates()
)

# rebuild matched_sents as lists of whitespace-separated terms
# (reading the saved sentences avoids re-running the slow matching step)
matched_sents = [sentence.split() for sentence in matches_df['sentences']]

# preview the de-duplicated sentences
matches_df.head()

### Use SpaCy part of speech (POS) to create phrases

In [None]:
# combine the matched sentence tokens and parse it with SpaCy
text = ' '.join(matched_sents[2])
text

##### Determine which NLP components can be disabled

In [None]:
def view_pos(doc, n_tokens=5):
    """ print SpaCy POS information about each token in a provided document

    doc      -- parsed SpaCy document (any sliceable sequence of tokens)
    n_tokens -- number of leading tokens to print (default 5)

    Prints a header and then one row per token with the token text, the
    token's own part of speech (pos_), its dependency label (dep_), and the
    text of its left children (lefts). Nothing is returned.
    """
    row = '{:15} | {:10} | {:10} | {:30}'
    print(row.format('TOKEN', 'POS', 'DEP_', 'LEFTS'))

    for token in doc[0:n_tokens]:
        # the POS column reports the token's own tag; token.head.pos_ would
        # show the tag of the token's syntactic head under the wrong label
        lefts = str([t.text for t in token.lefts])
        print(row.format(token.text, token.pos_, token.dep_, lefts))

In [None]:
# observe which part of speech (pos) attributes are lost when named entity recognition (ner) is disabled
pos_doc = nlp(text, disable=['ner'])
view_pos(pos_doc)

In [None]:
# observe which part of speech (pos) attributes are lost when the parser is disabled
pos_doc = nlp(text, disable=['ner','parser'])
view_pos(pos_doc)

In [None]:
# observe which part of speech (pos) attributes are lost when the tagger is disabled
pos_doc = nlp(text, disable=['ner','tagger'])
view_pos(pos_doc, n_tokens=5)

In [None]:
# use explain to define any token.dep_ attributes
spacy.explain('dobj')

In [None]:
dependency_parsing_labels_url = 'https://spacy.io/api/annotation#dependency-parsing'
iframe = '<iframe src={} width=1000 height=400></iframe>'.format(dependency_parsing_labels_url)
HTML(iframe)

##### Extract phrases by identifying tokens describing an object

In [None]:
# add stop words to SpaCy
# this enables the .is_stop attribute with common stop words
from spacy.lang.en.stop_words import STOP_WORDS

for word in STOP_WORDS:
    lex = nlp.vocab[word]
    lex.is_stop = True

In [None]:
displacy_doc = nlp(text)

# show visualization in Jupyter Notebook
displacy.render(docs=displacy_doc, style='dep', jupyter=True)

In [None]:
def create_pos_phrases(doc):
    """ build underscore-joined phrases from object tokens and their left dependents

    doc -- raw text to parse with the module-level SpaCy nlp pipeline

    For every token whose dependency label contains 'obj', each non-stop-word
    token in token.lefts is joined to it with an underscore
    (e.g. travel agency ==> travel_agency).

    Returns the distinct phrases as one space-separated string, in the order
    they first appear in the text (an empty string when there are none).
    """
    phrases = []

    # ner and tagger are disabled for speed; the parser is left enabled
    doc = nlp(doc, disable=['ner','tagger'])
    for token in doc:
        # substring test: matches every label containing 'obj' (e.g. dobj, pobj),
        # not only direct objects
        is_object = 'obj' in token.dep_
        if not is_object:
            continue

        # find any dependent terms to the left of (preceding) the object
        for left in token.lefts:
            if left.is_stop:
                continue
            # combine the dependent term and object, separated by an underscore
            phrases.append('{}_{}'.format(left.text, token.text))

    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is reproducible between runs (iterating a set of strings is not)
    return ' '.join(dict.fromkeys(phrases))

In [None]:
# review data
matches_df.head()

In [None]:
%%time

for sent in matched_sents_full[0:5]:
    print(create_pos_phrases(sent))

In [None]:
%%time

# apply the custom function to the first five sentences in the dataframe
matches_df[0:5].sentences.apply(create_pos_phrases)

##### Pandas Apply

apply is an efficient and fast approach to 'apply' a function to every element in a column (a pandas Series). applymap does the same to every element in the entire dataframe (e.g. convert all ints to floats)

Example: https://chrisalbon.com/python/data_wrangling/pandas_apply_operations_to_dataframes/

In [None]:
# create a small dataframe with example data
example_data = {'col1':range(0,3),'col2':range(3,6)}
test_df = pd.DataFrame(example_data)
test_df

In [None]:
# apply a built-in function to each element in a column
test_df['col1'].apply(float)

In [None]:
# apply a custom function to every element in a column
def add_five(row):
    """ return the given value increased by five """
    increment = 5
    return row + increment

test_df['col1'].apply(add_five)

In [None]:
# apply an anonymous (lambda) function to every element in a column
test_df['col1'].apply(lambda x: x+5)

In [None]:
# apply a built-in function to every element in a dataframe 
test_df.applymap(float)  # applymap

In [None]:
# create a new empty column
matches_df['create_pos_phrases'] = ''

# select the first five rows by their index labels
# the index is not reset after drop_duplicates(), so it may have gaps: the
# label-based (and end-inclusive) slice .loc[0:5] and the positional slice
# [0:5] do not necessarily pick the same rows, which leaves NaN or
# unassigned rows when the two are mixed in one assignment
first_rows = matches_df.index[0:5]

# apply the custom function to the first five sentences only
matches_df.loc[first_rows, 'create_pos_phrases'] = (
    matches_df.loc[first_rows, 'sentences'].apply(create_pos_phrases)
)
matches_df.head()

### Collocations

"A collocation is an expression consisting of two or more words that
correspond to some conventional way of saying things. Or in the words
of Firth (1957: 181): “Collocations of a given word are statements of the
habitual or customary places of that word.” Collocations include noun
phrases like strong tea and weapons of mass destruction, phrasal verbs like
to make up, and other stock phrases like the rich and powerful. Particularly
interesting are the subtle and not-easily-explainable patterns of word usage
that native speakers all know: why we say a stiff breeze but not a stiff wind
(while either a strong breeze or a strong wind is okay), or why we speak of
broad daylight (but not bright daylight or narrow darkness)



There are actually different definitions of the notion of collocation. Some
authors in the computational and statistical literature define a collocation
as two or more consecutive words with a special behavior, for example
Choueka (1988):
[A collocation is defined as] a sequence of two or more consecutive
words, that has characteristics of a syntactic and semantic
unit, and whose exact and unambiguous meaning or connotation
cannot be derived directly from the meaning or connotation of its
components. In most linguistically oriented research, a phrase
can be a collocation even if it is not consecutive (as in the example knock
. . . door). The following criteria are typical of linguistic treatments of collocations:

**Non-compositionality**: The meaning of a collocation is not a straightforward
composition of the meanings of its parts. Either the meaning
is completely different from the free combination (as in the case of idioms
like kick the bucket) or there is a connotation or added element of
meaning that cannot be predicted from the parts. For example, white
wine, white hair and white woman all refer to slightly different colors, so
we can regard them as collocations. 

**Non-substitutability**: We cannot substitute near-synonyms for the
components of a collocation. For example, we can’t say yellow wine
instead of white wine even though yellow is as good a description of the
color of white wine as white is (it is kind of a yellowish white).

**Non-modifiability**: Many collocations cannot be freely modified with
additional lexical material or through grammatical transformations.
This is especially true for frozen expressions like idioms. For example,
we can’t modify frog in to get a frog in one’s throat into to get an ugly
frog in one’s throat although usually nouns like frog can be modified by
adjectives like ugly. Similarly, going from singular to plural can make
an idiom ill-formed, for example in people as poor as church mice."

SOURCE: https://nlp.stanford.edu/fsnlp/promo/colloc.pdf

### Exercise

Create a function that returns a window of size n over a given sentence. 

For the sentence **'rather than pay the fee'** return the following if the window is n=3:
- ['rather', 'than', 'pay'],
- ['than','pay','the']
- ['pay', 'the','fee']
- ...


In [None]:
# example sentence
sent = ' '.join(matches_df['sentences'][0:1]).split()
print(sent)

In [None]:
def create_sentence_windows(sentence, n=3):
    """ create a sliding window over the n terms in a list of terms

    sentence -- list of terms (e.g. a sentence split on whitespace)
    n        -- number of terms in each window (default 3)

    Returns a list of windows (each a list of terms) that advances one term
    at a time, e.g. ['rather', 'than', 'pay', 'the'] with n=3 gives
    [['rather', 'than', 'pay'], ['than', 'pay', 'the']].
    A sentence with fewer than n terms gives a single window holding all of
    its terms; an empty sentence gives an empty list.

    Raises ValueError when n is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be at least 1')

    # copy into a list so slices can be concatenated with a new term below
    terms = list(sentence)
    if not terms:
        return []

    # create a window on the first n terms by slicing the sentence into the first n terms
    window = terms[0:n]

    # create a list to store all windows
    # add the first window that was created above
    sentence_windows = [window]

    # iterate through the rest of the terms of the sentence
    # e.g. if n=3, then create a new window with terms 2 to 4
    for term in terms[n:]:
        # remove the first term of the window and add the next term from the sentence
        # (a new list is built each time, so stored windows are never mutated)
        window = window[1:] + [term]
        # add the updated window to the master list
        sentence_windows.append(window)

    return sentence_windows

# execute the function
sentence_window = create_sentence_windows(sent, n=3)
# view the first few results
sentence_window[0:5]

In [None]:
# build the windows for every sentence in the dataframe

# flat list holding every window from every sentence
sentence_window = []

for sent in matches_df['sentences']:
    # split the sentence string into terms and window over them
    windows = create_sentence_windows(sent.split(), n=3)

    # extend (rather than append) so each item is a single window,
    # not a list of windows
    sentence_window.extend(windows)

# view the first five results
sentence_window[0:5]

In [None]:
from itertools import combinations
from collections import defaultdict

# create a defaultdict to keep track of common phrases
# (int default: a phrase seen for the first time starts from a count of 0)
window_count = defaultdict(int)

# NOTE(review): sentence_window presumably holds overlapping windows, so a pair
# of terms that falls inside several windows is counted once per window - confirm
# that this weighting is intended
for sent in sentence_window:
    # remove stop words (exact, case-sensitive membership test against STOP_WORDS)
    # NOTE(review): capitalized stop words such as 'The' presumably pass this filter - confirm
    sentence = [term for term in sent if term not in STOP_WORDS]
    
    # create every 2-term combination of the remaining terms, in their original order
    # e.g. (a, b, c) --> (a,b), (a,c), (b,c)
    for combo in combinations(sentence, 2):
        # convert the tuple to a term
        # e.g. (heart, disease) --> 'heart_disease'
        phrase = '_'.join(combo)
        
        # increment the count for the term each time it appears to identify the most common terms
        window_count[phrase] += 1

# sort to view the most common terms
# the key (lambda x: x[1]) sorts by the count; reverse=True puts the highest counts first
# only the 20 most common phrases are kept for display
sorted(window_count.items(), key=lambda x: x[1], reverse=True)[0:20]

### Phrase (collocation) Detection

Phrase modeling is another approach to learning combinations of tokens that together represent meaningful multi-word concepts. We can develop phrase models by looping over the words in our sentences and looking for words that co-occur (i.e., appear one after another) together much more frequently than you would expect them to by random chance. The formula our phrase models will use to determine whether two tokens $A$ and $B$ constitute a phrase is:

$$\frac{count(A\ B) - count_{min}}{count(A) * count(B)} > threshold$$

- $count(A\ B)$ is the number of times the tokens $A\ B$ appear in the corpus in order
- $count_{min}$ is a user-defined parameter to ensure that accepted phrases occur a minimum number of times
- $count(A)$ is the number of times token $A$ appears in the corpus
- $count(B)$ is the number of times token $B$ appears in the corpus
- $threshold$ is a user-defined parameter to control how strong of a relationship between two tokens the model requires before accepting them as a phrase

Once our phrase model has been trained on our corpus, we can apply it to new text. When our model encounters two tokens in new text that it identifies as a phrase, it will merge the two into a single new token.

Phrase modeling is superficially similar to named entity detection in that you would expect named entities to become phrases in the model (so new york would become new_york). But you would also expect multi-word expressions that represent common concepts, but aren't specifically named entities (such as happy hour) to also become phrases in the model.

We turn to the indispensable gensim library to help us with phrase modeling — the Phrases class in particular.

SOURCE: 
- https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb
- https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf

##### Scikit-learn API for Gensim

In [None]:
print(matched_sents[0:10])

In [None]:
from gensim.sklearn_api.phrases import PhrasesTransformer

sklearn_phrases = PhrasesTransformer(min_count=5, threshold=5)
sklearn_phrases.fit(matched_sents)

In [None]:
# review phrase matches: keep every transformed term that contains an underscore
# (the delimiter placed between the terms of a detected phrase)
phrases = [
    term
    for terms in sklearn_phrases.transform(matched_sents)
    for term in terms
    if '_' in term
]

# show each distinct phrase once
print(set(phrases))

In [None]:
# create a list of stop words
from spacy.lang.en.stop_words import STOP_WORDS
common_terms = list(STOP_WORDS)

**common_terms:** optional list of “stop words” that won’t affect frequency count of expressions containing them.
- The common_terms parameter adds a way to give special treatment to common terms (aka stop words) such that their presence between two words won’t prevent bigram detection. It allows detecting expressions like “bank of america” or “eye of the beholder”.


##### Gensim API
A more complex API, though it is faster and has better integration with other gensim components (e.g. Phraser)

In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
phrases = Phrases(
      matched_sents
    , common_terms=common_terms
    , min_count=5
    , threshold=5
    , scoring='default'
)

phrases

### Phrases Params

- **scoring:** specifies how potential phrases are scored for comparison to the threshold setting. scoring can be set with either a string that refers to a built-in scoring function, or with a function with the expected parameter names. Two built-in scoring functions are available by setting scoring to a string:

    - ‘default’: from “Efficient Estimation of Word Representations in Vector Space” by Mikolov, et. al.: 
    
$$\frac{count(AB) - count_{min}}{count(A) * count(B)} * N > threshold$$
    

    - where N is the total vocabulary size.
    - Thus, it is easier to exceed the threshold when the two words occur together often or when the two words are rare (i.e. small product)

In [None]:
bigram = Phraser(phrases)

bigram

The Phrases object still holds the full table of token and candidate-phrase counts collected from the source text in memory. A gensim Phraser discards the state that is not needed for phrase detection, making it smaller and somewhat faster than the full Phrases model. To determine what data to remove, the Phraser uses the results of the source model’s min_count, threshold, and scoring settings. (You can tamper with those & create a new Phraser to try other values.)

SOURCE: https://radimrehurek.com/gensim/models/phrases.html

In [None]:
def print_phrases(phraser, text_stream, num_underscores=2):
    """ print the distinct phrases a phraser produces for a text stream

    phraser         -- object indexed with the text stream (phraser[text_stream])
                       to obtain the transformed lists of terms
    text_stream     -- iterable of term lists to transform
    num_underscores -- minimum number of underscores a term must contain
                       to be reported as a phrase (default 2)

    The matching terms are printed as a set; nothing is returned.
    """
    # flatten the transformed stream, keeping only terms with enough underscores
    phrases = [
        term
        for terms in phraser[text_stream]
        for term in terms
        if term.count('_') >= num_underscores
    ]
    print(set(phrases))

In [None]:
print_phrases(bigram, matched_sents)

### Tri-gram phrase model

We can place the text from the first phrase model into another Phrases object to create n-term phrase models. We can repeat this process multiple times.

In [None]:
phrases = Phrases(
      bigram[matched_sents]
    , common_terms=common_terms
    , min_count=1
    , threshold=1
)

trigram = Phraser(phrases)

print_phrases(trigram, bigram[matched_sents], num_underscores=3)

In [None]:
for doc_num in [1]:
    print('DOC NUMBER: {}\n'.format(doc_num))
    print('ORIGINAL SENTENT: {}\n'.format(' '.join(matched_sents[doc_num])))
    print('BIGRAM: {}\n'.format(' '.join(bigram[matched_sents[doc_num]])))
    print('TRIGRAM: {}'.format(' '.join(trigram[bigram[matched_sents[doc_num]]])))
    print()

#### Export Cleaned Text

In [None]:
# write the cleaned text to a new file for later use

#with open(CLEANED_TEXT_PATH, 'w') as f:
#    for line in bigram[matched_sents]:
#        line = ' '.join(line) + '\n'
#        line = line.encode('ascii', errors='ignore').decode('ascii')
#        f.write(line)