# Phrase (collocation) Detection Solution

###### Author: Alex Sherman | alsherman@deloitte.com

#### Agenda
1. Acronym replacement
2. SpaCy POS phrases
3. Gensim Phrases and Phraser

In [266]:
import spacy
import pandas as pd
from sqlalchemy import create_engine
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from collections import defaultdict
from spacy.lang.en.stop_words import STOP_WORDS
from IPython.core.display import display, HTML
from configparser import ConfigParser, ExtendedInterpolation

In [267]:
# configuration for data, acronyms, and gensim paths
config = ConfigParser(interpolation=ExtendedInterpolation())
config.read('../../config.ini')

DB_PATH = config['DATABASES']['PROJECT_DB_PATH']
AIRLINE_ACRONYMS_FILEPATH = config['NLP']['AIRLINE_ACRONYMS_FILEPATH']
AIRLINE_MATCHED_TEXT_PATH = config['NLP']['AIRLINE_MATCHED_TEXT_PATH']
AIRLINE_CLEANED_TEXT_PATH = config['NLP']['AIRLINE_CLEANED_TEXT_PATH']
GENSIM_DICTIONARY_PATH = config['NLP']['GENSIM_DICTIONARY_PATH']
GENSIM_CORPUS_PATH = config['NLP']['GENSIM_CORPUS_PATH']

#### Load data on airline fees

In [152]:
engine = create_engine(DB_PATH)
df = pd.read_sql("SELECT * FROM Sections", con=engine)

# the annual report from 1992 was scanned in poor quality
# and the text was not legible
df = df[df.filename != 'southwest-airlines-co_annual_report_1992.docx']

# filter to relevant sections
df = df[df['section_text'].str.contains('fee')]
df.head()

Unnamed: 0,section_id,filename,section_name,section_text,criteria,section_length
291,292,southwest-airlines-co_annual_report_1994.docx,DEPARTMENT OF TRANSPORTATION RANKINGS FOR 1994...,A multitude of challenges faced the People of ...,"<, f, u, n, c, t, i, o, n, , h, e, a, d, i, n...",2849
297,298,southwest-airlines-co_annual_report_1994.docx,RESULTS OF OPERATIONS,1994 COMPARED WITH 1993 The Company's consolid...,"<, f, u, n, c, t, i, o, n, , h, e, a, d, i, n...",13806
305,306,southwest-airlines-co_annual_report_1994.docx,ACQUISITION,"On December 31, 1993, Southwest exchanged 3,57...","<, f, u, n, c, t, i, o, n, , h, e, a, d, i, n...",2141
308,309,southwest-airlines-co_annual_report_1994.docx,ACCRUED LIABILITIES (IN THOUSANDS) LONG-TERM D...,"On March 1, 1993, the Company redeemed the $10...","<, f, u, n, c, t, i, o, n, , h, e, a, d, i, n...",1855
359,360,southwest-airlines-co_annual_report_1995.docx,SECRET NUMBER 1 STICK TO WHAT YOU’RE GOOD AT.,"Since 1971, Southwest Airlines has offered sin...","<, f, u, n, c, t, i, o, n, , s, t, y, l, e, ...",2566


In [153]:
# store section matches in list
text = [section for section in df['section_text'].values]

# review first sentence of a section match
text[0][0:299]

'A multitude of challenges faced the People of Southwest Airlines in 1994. The mark of a true champion is the ability to “rise to the occasion” and meet challenges. We believe our Employees showed their true Southwest Spirit in 1994, accomplishing three- or four-fold what a normal year would  bring.'

### SpaCy - Preprocessing

In [154]:
%%time

# load spacy nlp model
# use 'en' if you don't have the lg model
nlp = spacy.load('en_core_web_lg')

Wall time: 1min 28s


##### Text Preprocessing - Acronyms

SOURCE: https://www.faa.gov/airports/resources/acronyms/

In [155]:
# read csv with airline industry acronyms
airline_acronyms = pd.read_csv(AIRLINE_ACRONYMS_FILEPATH)
airline_acronyms.head(3)

Unnamed: 0,Acronym,Definition
0,A/C,Aircraft
1,A/G,Air to Ground
2,A/H,Altitude/Height


##### Exercise

**Curate the acronyms:**

1. Convert the acronyms into a dict
2. Clean acronyms and definitions (replace spaces with underscores, strip text, lowercase)
3. Remove any acronyms that are < two characters (e.g. 'at' == 'air traffic')

In [156]:
acronyms = {}

for ind, row in airline_acronyms.iterrows():
    # convert acronyms to lowercase
    acronym = row['Acronym'].lower()
    
    # clean acronym definitions
    definition = row['Definition'].lower().strip().replace(' ','_')
    
    # ignore two character acronyms as they often match actual words
    # e.g. 'at' == 'air traffic'
    if len(acronym) > 2:
        acronyms[acronym] = definition

# view the first few acronyms
list(acronyms.items())[0:5]  # convert to list as dict is not subscriptable

[('agl', 'above_ground_level'),
 ('afb', 'air_force_base'),
 ('aig', 'airbus_industries_group'),
 ('iap', 'instrument_approach_procedures'),
 ('malsr', 'mals_with_runway_alignment_indicator_lights')]

##### Identify acronyms that exist in text
WARNING SLOW!

In [157]:
%%time

# review the acronyms
acronym_matches = []

# create a nlp pipe to iterate through the text
for doc in nlp.pipe(text, disable=['tagger','ner']):    
    # iterate through the sentences of the doc
    for sent in doc.sents:
        # iterate through each word in the sentence
        for token in sent.text.split(' '):
            token = token.lower()
            # check if token is an acronym
            # add matches (acronym and definition) to acronym_matches
            if token in acronyms:
                acronym_matches.append((token, acronyms[token]))

# review all matching acronyms      
for match in set(acronym_matches):
    print(match)

('cat', 'clear')
('rnp', 'required_navigation_performance')
('mou', 'memorandum_of_understanding')
('app', 'approach')
('grade', 'graphical_airspace_design_environment')
('asm', 'available_seat_mile')
('tops', 'telecommunications_ordering_and_pricing_system_(gsa_software_tool)')
('dot', 'department_of_transportation')
('tsa', 'taxiway_safety_area')
('par', 'preferential_arrival_route')
('ata', 'air_transport_association_of_america')
('aid', 'airport_information_desk')
('faa', 'federal_aviation_administration')
('self', 'simplified_short_approach_lighting_system_with_sequenced_flashing_lights')
('basic', 'basic_contract_observing_station')
('gps', 'global_positioning_system')
('did', 'direct_inward_dial')
('far', 'federal_aviation_regulation')
Wall time: 11min 26s


In [158]:
# update acronyms list to remove ambiguous acronyms
acronyms_to_remove = ['cat','app','grade','self','basic','did','far']
for term in acronyms_to_remove:
    acronyms.pop(term)

###### collect sentences about fees for phrase model

In [176]:
def collect_phrase_model_sents(matcher, doc, i, matches):
    # identify matching spans (phrases)
    match_id, start, end = matches[i]
    span = doc[start : end]
    
    # keep only words, lemmatize tokens, remove punctuation
    sent = [str(token.lemma_).lower() 
            for token in span.sent if token.is_alpha]
    
    # replace acronyms
    sent = [acronyms[token] if token in acronyms 
            else token for token in sent]

    # collect matching (cleaned) sents
    matched_sents.append(sent)

##### match sentences with the word fee or fees

WARNING SLOW!

In [177]:
%%time 

# match sentences with the word fee or fees
matched_sents = []
pattern = [[{'LOWER': 'fee'}], [{'LOWER': 'fees'}]]

matcher = Matcher(nlp.vocab)
# use *patterns to add more than one pattern at once
matcher.add('fees', collect_phrase_model_sents, *pattern)

for doc in nlp.pipe(text, disable=['tagger','ner']):    
    matcher(doc)

Wall time: 11min 27s


In [234]:
print('Number of matches: {} \n'.format(len(matched_sents)))

print('Example Match:')
print(matched_sents[0])

Number of matches: 454 

Example Match:
['rather', 'than', 'pay', 'the', 'fee', 'demand', 'by', 'this', 'crss', 'we', 'respond', 'quickly', 'with', 'our', 'own', 'travel', 'agency', 'solution', 'direct', 'access', 'and', 'ticket', 'for', 'the', 'large', 'agency', 'swat', 'overnight', 'delivery', 'of', 'southwest', 'produce', 'ticket', 'for', 'approximately', 'large', 'travel', 'agency', 'improve', 'access', 'to', 'ticket', 'by', 'mail', 'for', 'direct', 'customers', 'by', 'reduce', 'the', 'time', 'limit', 'from', 'seven', 'day', 'out', 'from', 'the', 'date', 'of', 'travel', 'to', 'three', 'day', 'and', 'ticketless', 'travel', 'which', 'eliminate', 'the', 'need', 'to', 'print', 'a', 'paper', 'ticket', 'altogether']


##### Export matched text to avoid repeating processing

In [272]:
# write the matched text to a .txt file for later use
with open(AIRLINE_MATCHED_TEXT_PATH, 'w') as f:
    for line in matched_sents:
        line = ' '.join(line) + '\n'
        line = line.encode('ascii', errors='ignore').decode('ascii')
        f.write(line)

In [302]:
# read matched text
with open(AIRLINE_MATCHED_TEXT_PATH, 'r') as f:
    matched_sents_full = [line for line in f.readlines()]
    matched_sents = [line.split() for line in matched_sents_full]

In [303]:
# store all matched senteces in a dataframe
matches_df = pd.DataFrame(matched_sents_full, columns=['sentences'])

# remove duplicates
matches_df = matches_df[~matches_df.duplicated()]

matches_df.head()

Unnamed: 0,sentences
0,rather than pay the fee demand by this crss we...
1,these expense include million of various profe...
2,included in this one time cost result from the...
3,the commitment fee be per annum\n
5,landing fee and other rental per available_sea...


### Use SpaCy part of speech (POS) to create phrases

In [306]:
# combine the matched sentence tokens and parse it with SpaCy
text = ' '.join(matched_sents[0])
text

'rather than pay the fee demand by this crss we respond quickly with our own travel agency solution direct access and ticket for the large agency swat overnight delivery of southwest produce ticket for approximately large travel agency improve access to ticket by mail for direct customers by reduce the time limit from seven day out from the date of travel to three day and ticketless travel which eliminate the need to print a paper ticket altogether'

##### Determine which NLP components can be disabled

In [282]:
def view_pos(doc, n_tokens=5):
    """ print SpaCy POS information about each token in a provided document """
    print('{:15} | {:10} | {:10} | {:30}'.format('TOKEN','POS','DEP_','LEFTS'))
    for token in doc[0:n_tokens]:
        print('{:15} | {:10} | {:10} | {:30}'.format(
            token.text, token.head.pos_,token.dep_, str([t.text for t in token.lefts])))

In [278]:
# observe which part of speech (pos) attributes are disabled by named entity recognition (ner)
pos_doc = nlp(text, disable=['ner'])
view_pos(pos_doc)

TOKEN           | POS        | DEP_       | LEFTS                         
rather          | ADP        | advmod     | []                            
than            | VERB       | advmod     | ['rather']                    
pay             | VERB       | ROOT       | ['than']                      
the             | NOUN       | det        | []                            
fee             | NOUN       | compound   | []                            


In [280]:
# observe which part of speech (pos) attributes are disabled by parser
pos_doc = nlp(text, disable=['ner','parser'])
view_pos(pos_doc)

TOKEN           | POS        | DEP_       | LEFTS                         
rather          | ADV        |            | []                            
than            | ADP        |            | []                            
pay             | VERB       |            | []                            
the             | DET        |            | []                            
fee             | NOUN       |            | []                            


In [281]:
# observe which part of speech (pos) attributes are disabled by tagger
pos_doc = nlp(text, disable=['ner','tagger'])
view_pos(pos_doc, n_tokens=10)

TOKEN           | POS        | DEP_       | LEFTS                         
rather          |            | advmod     | []                            
than            |            | advmod     | ['rather']                    
pay             |            | ROOT       | ['than']                      
the             |            | det        | []                            
fee             |            | compound   | []                            
demand          |            | dobj       | ['the', 'fee']                
by              |            | prep       | []                            
this            |            | det        | []                            
crss            |            | pobj       | ['this']                      
we              |            | nsubj      | []                            


In [271]:
# use explain to define any token.dep_ attributes
spacy.explain('dobj')

'direct object'

In [318]:
dependency_parsing_labels_url = 'https://spacy.io/api/annotation#dependency-parsing'
iframe = '<iframe src={} width=1000 height=400></iframe>'.format(dependency_parsing_labels_url)
HTML(iframe)

##### Extract phrases by identifying tokens describing an object

In [321]:
def create_pos_phrases(doc):

    doc = nlp(doc, disable=['ner','tagger'])
    
    phrases = [] 
    for token in doc:
        # find any objects (e.g. direct objects )
        if 'obj' in token.dep_:
            token_text = token.lemma_.lower()
            # find any dependent terms to the left of (preceeding) the object
            for left_term in [t.text for t in token.lefts]:
                phrase = '{}_{}'.format(left_term,token_text)
                phrases.append(phrase)
    
    return ' '.join(set(phrases))

print(create_pos_phrases(matched_sents_full[0]))

agency_swat the_date large_swat the_demand the_need agency_solution produce_ticket this_crss travel_agency time_limit direct_customer seven_day our_solution the_limit paper_ticket three_day large_agency the_swat own_solution a_ticket fee_demand


In [329]:
%%time

# apply the custom function to every element in the dataframe
matches_df['pos_phrases'] = matches_df.sentences.apply(create_pos_phrases)

Wall time: 43.5 s


In [331]:
matches_df.head()

Unnamed: 0,sentences,pos_phrases
0,rather than pay the fee demand by this crss we...,agency_swat the_date large_swat the_demand the...
1,these expense include million of various profe...,relocation_cost professional_million employee_...
2,included in this one time cost result from the...,relocation_cost employee_cost this_result dupl...
3,the commitment fee be per annum\n,
5,landing fee and other rental per available_sea...,airport_credit a_credit


##### Pandas Apply

apply is an efficient and fast approach to 'apply' a function to every element in a row. applymap does the same to every element in the entire dataframe (e.g. convert all ints to floats)

Example: https://chrisalbon.com/python/data_wrangling/pandas_apply_operations_to_dataframes/

In [346]:
# create a small dataframe with example data
test_df = pd.DataFrame({'col1':range(0,3),'col2':range(3,6)})
test_df

Unnamed: 0,col1,col2
0,0,3
1,1,4
2,2,5


In [347]:
# apply a built-in function to each element in a column
test_df['col1'].apply(float)

0    0.0
1    1.0
2    2.0
Name: col1, dtype: float64

In [348]:
# apply a custom function to every element in a column
def add_five(row):
    return row + 5

test_df['col1'].apply(add_five)

0    5
1    6
2    7
Name: col1, dtype: int64

In [349]:
# apply an annonomous function to every element in a column
test_df['col1'].apply(lambda x: x+5)

0    5
1    6
2    7
Name: col1, dtype: int64

In [350]:
# apply a built-in function to every element in a dataframe 
test_df.applymap(float)  # applymap

Unnamed: 0,col1,col2
0,0.0,3.0
1,1.0,4.0
2,2.0,5.0


### Phrase (collocation) Detection

Phrase modeling is another approach to learning combinations of tokens that together represent meaningful multi-word concepts. We can develop phrase models by looping over the the words in our reviews and looking for words that co-occur (i.e., appear one after another) together much more frequently than you would expect them to by random chance. The formula our phrase models will use to determine whether two tokens $A$ and $B$ constitute a phrase is:

$$\frac{count(A\ B) - count_{min}}{count(A) * count(B)} * N > threshold$$

- $count(A)$ is the number of times token $A$ appears in the corpus
- $count(B)$ is the number of times token $B$ appears in the corpus
- $count(A\ B)$ is the number of times the tokens $A\ B$ appear in the corpus in order
- $N$ is the total size of the corpus vocabulary
- $count_{min}$ is a user-defined parameter to ensure that accepted phrases occur a minimum number of times
- $threshold$ is a user-defined parameter to control how strong of a relationship between two tokens the model requires before accepting them as a phrase

Once our phrase model has been trained on our corpus, we can apply it to new text. When our model encounters two tokens in new text that identifies as a phrase, it will merge the two into a single new token.

Phrase modeling is superficially similar to named entity detection in that you would expect named entities to become phrases in the model (so new york would become new_york). But you would also expect multi-word expressions that represent common concepts, but aren't specifically named entities (such as happy hour) to also become phrases in the model.

We turn to the indispensible gensim library to help us with phrase modeling — the Phrases class in particular.

SOURCE: https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb 

##### Scikit-learn API for Gensim

In [188]:
from gensim.sklearn_api.phrases import PhrasesTransformer

sklearn_phrases = PhrasesTransformer(min_count=3, threshold=3)
sklearn_phrases.fit(matched_sents)

PhrasesTransformer(delimiter=b'_', max_vocab_size=40000000, min_count=3,
          progress_per=10000, scoring='default', threshold=3)

In [200]:
# review phrase matches
phrases = []
for terms in sklearn_phrases.transform(matched_sents):
    for term in terms:
        if term.count('_') >= 2:
            phrases.append(term)
print(set(phrases))



{'federal_aviation_administration', 'available_seat_mile', 'per_available_seat_mile', 'available_seat_mile_increase', 'the_department_of_transportation', 'the_taxiway_safety_area', 'required_navigation_performance'}


##### Gensim API
A more complex API, though it is faster and has better integration with other gensim components (e.g. Phraser)

In [201]:
from gensim.models.phrases import Phrases
from gensim.models.phrases import Phraser

In [202]:
from spacy.lang.en.stop_words import STOP_WORDS
common_terms = list(STOP_WORDS)

**common_terms:** optionnal list of “stop words” that won’t affect frequency count of expressions containing them.
    - The common_terms parameter add a way to give special treatment to common terms (aka stop words) such that their presence between two words won’t prevent bigram detection. It allows to detect expressions like “bank of america” or “eye of the beholder”.


In [203]:
phrases = Phrases(
      matched_sents
    , common_terms=common_terms
    , min_count=3
    , threshold=3
    , scoring='default'
)

phrases

<gensim.models.phrases.Phrases at 0x1590a88b4a8>

### Phrases Params

- **scoring:** specifies how potential phrases are scored for comparison to the threshold setting. scoring can be set with either a string that refers to a built-in scoring function, or with a function with the expected parameter names. Two built-in scoring functions are available by setting scoring to a string:

    - ‘default’: from “Efficient Estimaton of Word Representations in Vector Space” by Mikolov, et. al.: 
    
$$\frac{count(AB) - count_{min}}{count(A) * count(B)} * N > threshold$$
    

    - where N is the total vocabulary size.
    - Thus, it is easier to exceed the threshold when the two words occur together often or when the two words are rare (i.e. small product)

In [204]:
bigram = Phraser(phrases)

bigram

<gensim.models.phrases.Phraser at 0x1590a88b198>

The phrases object still contains all the source text in memory. A gensim Phraser will remove this extra data to become smaller and somewhat faster than using the full Phrases model. To determine what data to remove, the Phraser ues the  results of the source model’s min_count, threshold, and scoring settings. (You can tamper with those & create a new Phraser to try other values.)

SOURCE: https://radimrehurek.com/gensim/models/phrases.html

In [205]:
def print_phrases(phraser, text_stream, num_underscores=2):
    """ identify phrases from a text stream by searching for terms that
    are separated by underscores and include at least num_underscores
    """
    
    phrases = []
    for terms in phraser[text_stream]:
        for term in terms:
            if term.count('_') >= num_underscores:
                phrases.append(term)
    print(set(phrases))

In [206]:
print_phrases(bigram, matched_sents)

{'item_such_a_seat', 'available_seat_mile', 'conspire_with_delta', 'service_be_provide', 'fare_be_refundable', 'delay_of_much_than_minute', 'follow_the_acquisition', 'subject_to_a_maximum', 'cancellation_or_diversion', 'required_navigation_performance', 'reservation_be_make_at_little', 'fee_must_apply', 'item_such_a_\ufeff1', 'disclose_all_potential', 'continue_to_be_the_only_major', 'importance_to_southwest', 'cancel_a_pay', 'cost_be_then_allocate', 'serve_to_which_various_leasehold', 'damage_for_the_amount_of_\ufeff1', 'initiative_a_previously', 'decision_that_create', 'primarily_due_to_high', 'delta_the_consolidated', 'promote_the_company', 'imposition_of_a_\ufeff1', 'difference_in_airfare', 'rate_and_charge', 'result_in_a_low', 'website_and_iv', 'impact_the_company', 'hold_a_reservation', 'relief_against_a_broad', 'cost_for_safety', 'congress_may_consider', 'customers_on_behalf', 'report_much_information', 'change_in_law', 'fee_for_permanently', 'extension_or_transfer', 'require_th

### Tri-gram phrase model

We can place the text from the first phrase model into another Phrases object to create n-term phrase models. We can repear this process multiple times.

In [207]:
phrases = Phrases(bigram[matched_sents], common_terms=common_terms, min_count=5,threshold=5)
trigram = Phraser(phrases)

print_phrases(trigram, bigram[matched_sents], num_underscores=2)

{'attempt_to_monopolize_air_travel', 'item_such_a_seat', 'available_seat_mile', 'effective_july_and_ii_eliminate', 'capacity_and_price_decision', 'include_in_aircraft_rental', 'conjurer_the_only_major_airline_that_doe', 'flight_ii_refund_any_check', 'airtran_currently_charge', 'fare_be_refundable', 'customer_change_in_flight', 'follow_the_acquisition', 'required_navigation_performance', 'amounts_collect_from_passenger', 'passenger_be_allow_to_cancel_a_pay', 'generate_much_net_federal_revenue', 'continue_to_be_the_only_major', 'southwest_be_conjurer_the_only_major', 'anticompetitive_activity_a_good', 'confirmation_and_viii_passenger_must_be_promptly', 'purchase_of_miles_rewards', 'initial_complaint_seek_treble', 'serve_to_which_various_leasehold', 'initiative_a_previously', 'decision_that_create', 'primarily_due_to_high', 'pay_for_ancillary_service_if_a_flight', 'promote_the_company', 'relief_against_a_broad_range_of_allege', 'addition_to_treble_damage_for_the_amount_of_\ufeff1', 'diffe

In [208]:
for doc_num in [5]:
    print('DOC NUMBER: {}\n'.format(doc_num))
    print('ORIGINAL SENTENT: {}\n'.format(' '.join(matched_sents[doc_num])))
    print('BIGRAM: {}\n'.format(' '.join(bigram[matched_sents[doc_num]])))
    print('TRIGRAM: {}'.format(' '.join(trigram[bigram[matched_sents[doc_num]]])))

DOC NUMBER: 5

ORIGINAL SENTENT: landing fee and other rental per available_seat_mile increase percent in compare to which include a airport credit of million

BIGRAM: landing_fee and other rental_per_available_seat_mile increase_percent in compare to which include a airport credit of million

TRIGRAM: landing_fee_and_other_rental_per_available_seat_mile increase_percent in compare to which include a airport credit of million


#### Export Cleaned Text

In [231]:
# write the cleaned text to a new file for later use
with open(AIRLINE_CLEANED_TEXT_PATH, 'w') as f:
    for line in bigram[matched_sents]:
        line = ' '.join(line) + '\n'
        line = line.encode('ascii', errors='ignore').decode('ascii')
        f.write(line)

# Delete below?

### Review Phrase Model

In [147]:
def clean_text(doc):
    ents = nlp(doc.text).ents

    # Add named entities, but only if they are a compound of more than word.
    IGNORE_ENTS = ('QUANTITY','ORDINAL','CARDINAL','DATE'
                   ,'PERCENT','MONEY','TIME')
    ents = [ent for ent in ents if 
             (ent.label_ not in IGNORE_ENTS) and (len(ent) > 2)]
    
    # add underscores to combine words in entities
    ents = [str(ent).strip().replace(' ','_') for ent in ents]
    
    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    doc = [token.lemma_ for token in doc if 
           token.is_alpha and not token.is_stop]
    
    doc.extend([entity for entity in ents])
    
    return [str(term) for term in doc]

    
# combined terms before phrase_model (entities and/or noun chunks)
before_phrase = []
for sent in doc.sents:
    text = clean_text(sent)
    for term in text:
        if '_' in term:
            before_phrase.append(term)

print(set(before_phrase))

{'the_Southwest_Airlines_Rapid_Rewards_Visa_Signature_Card', 'Company’s_Rapid_Rewards', 'A-List_Preferred', 'the_Freedom_Award', 'The_Rapid_Rewards', 'a_Companion_Pass', 'A-List_and', 'the_Rapid_Rewards', 'Rapid_Rewards_Partners', 'the_Companion_Pass'}


In [148]:
def clean_text(doc):
    ents = nlp(doc.text).ents

    # Add named entities, but only if they are a compound of more than word.
    IGNORE_ENTS = ('QUANTITY','ORDINAL','CARDINAL','DATE'
                   ,'PERCENT','MONEY','TIME')
    ents = [ent for ent in ents if 
             (ent.label_ not in IGNORE_ENTS) and (len(ent) > 2)]
    
    # add underscores to combine words in entities
    ents = [str(ent).strip().replace(' ','_') for ent in ents]
    
    # clean text for phrase model
    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    doc_ = [token.lemma_ for token in doc if token.is_alpha]
    phrase_text = [str(term) for term in doc_]
    sent = trigram[bigram[phrase_text]]
    phrases = []
    for term in sent:
        if '_' in term:
            phrases.append(term)

    # remove stops words - 
    # separate step as they are needed for the phrase model
    doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # add phrases and entities
    doc.extend([entity for entity in ents])
    clean_text = [str(term) for term in doc] + phrases

    return clean_text

# combined terms after phrase model
after_phrase = []
for sent in doc.sents:
    text = clean_text(sent)
    for term in text:
        if '_' in term:
            after_phrase.append(term)

print(set(after_phrase))

{'the_Southwest_Airlines_Rapid_Rewards_Visa_Signature_Card', 'Company’s_Rapid_Rewards', 'A-List_Preferred', 'the_Freedom_Award', 'The_Rapid_Rewards', 'a_Companion_Pass', 'A-List_and', 'way_trip', 'the_Rapid_Rewards', 'Rapid_Rewards_Partners', 'credit_card', 'the_Companion_Pass'}


In [357]:
ls ../raw_data/gensim/

airline.index      airline.mm         airline.mm.index   airline_dict.dict


In [227]:
# write the cleaned text to a new file for later use
with open(AIRLINE_CLEANED_TEXT_PATH, 'w') as f:
    for line in cleaned_text:
        line = ' '.join(line) + '\n'
        f.write(line)