# Phrase Models

###### Author: Alex Sherman | alsherman@deloitte.com

In [1]:
import spacy
import pandas as pd
from sqlalchemy import create_engine
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from collections import defaultdict
from spacy.lang.en.stop_words import STOP_WORDS
from IPython.core.display import display, HTML
from configparser import ConfigParser, ExtendedInterpolation

In [2]:
# configuration for data, acronyms, and gensim paths
config = ConfigParser(interpolation=ExtendedInterpolation())
config.read('../../config.ini')

DB_PATH = config['DATABASES']['PROJECT_DB_PATH']
AIRLINE_ACRONYMS_FILEPATH = config['NLP']['AIRLINE_ACRONYMS_FILEPATH']
GENSIM_DICTIONARY_PATH = config['NLP']['GENSIM_DICTIONARY_PATH']
GENSIM_CORPUS_PATH = config['NLP']['GENSIM_CORPUS_PATH']

#### Load data on airline fees

In [36]:
engine = create_engine(DB_PATH)
df = pd.read_sql("SELECT * FROM Sections", con=engine)

# the annual report from 1992 was scanned in poor quality
# and the text was not legible
df = df[df.filename != 'southwest-airlines-co_annual_report_1992.docx']

# filter to relevant sections
df = df[df['section_text'].str.contains('fee')][0:50]
df.head()

Unnamed: 0,section_id,filename,section_name,section_text,criteria,section_length
291,292,southwest-airlines-co_annual_report_1994.docx,DEPARTMENT OF TRANSPORTATION RANKINGS FOR 1994...,A multitude of challenges faced the People of ...,"<, f, u, n, c, t, i, o, n, , h, e, a, d, i, n...",2849
297,298,southwest-airlines-co_annual_report_1994.docx,RESULTS OF OPERATIONS,1994 COMPARED WITH 1993 The Company's consolid...,"<, f, u, n, c, t, i, o, n, , h, e, a, d, i, n...",13806
305,306,southwest-airlines-co_annual_report_1994.docx,ACQUISITION,"On December 31, 1993, Southwest exchanged 3,57...","<, f, u, n, c, t, i, o, n, , h, e, a, d, i, n...",2141
308,309,southwest-airlines-co_annual_report_1994.docx,ACCRUED LIABILITIES (IN THOUSANDS) LONG-TERM D...,"On March 1, 1993, the Company redeemed the $10...","<, f, u, n, c, t, i, o, n, , h, e, a, d, i, n...",1855
359,360,southwest-airlines-co_annual_report_1995.docx,SECRET NUMBER 1 STICK TO WHAT YOU’RE GOOD AT.,"Since 1971, Southwest Airlines has offered sin...","<, f, u, n, c, t, i, o, n, , s, t, y, l, e, ...",2566


In [37]:
text = ' '.join([section for section in df['section_text'].values[0:1]])
text[0:300]

'A multitude of challenges faced the People of Southwest Airlines in 1994. The mark of a true champion is the ability to “rise to the occasion” and meet challenges. We believe our Employees showed their true Southwest Spirit in 1994, accomplishing three- or four-fold what a normal year would  bring. '

### SpaCy - Preprocessing

In [38]:
%%time

# load spacy nlp model
# use 'en' if you don't have the lg model
nlp = spacy.load('en_core_web_lg')
doc = nlp(text,  disable=['ner'])

Wall time: 1min 28s


In [39]:
doc[0:100]

A multitude of challenges faced the People of Southwest Airlines in 1994. The mark of a true champion is the ability to “rise to the occasion” and meet challenges. We believe our Employees showed their true Southwest Spirit in 1994, accomplishing three- or four-fold what a normal year would  bring. First, Morris Air was acquired in December 1993. By October, an unprecedented seven new cities had been converted from Morris to the Southwest route system. Morris’ Utah Reservation Center was converted to Southwest in October. All

##### Text Preprocessing - Acronyms

In [40]:
# read csv with airline industry acronyms
airline_acronyms = pd.read_csv(AIRLINE_ACRONYMS_FILEPATH)
airline_acronyms.head()

Unnamed: 0,Acronym,Definition
0,A/C,Aircraft
1,A/G,Air to Ground
2,A/H,Altitude/Height
3,AAC,Mike Monroney Aeronautical Center
4,AAF,Army Air Field


### Exercise

To curate the acronyms, instead of manually reviewing the entire list, we could remove only the matches that do not make sense. However, we would reconsider this approach if the text data set was much larger.

1. Identify the acronyms in the document
2. Decide which acronyms to keep/remove

SOURCE: https://www.faa.gov/airports/resources/acronyms/

In [41]:
acronyms = {}

for ind, row in airline_acronyms.iterrows():
    acronym = row['Acronym'].lower()
    definition = row['Definition'].lower().strip().replace(' ','_')
    
    # ignore two character acronyms as they often match actual words
    # e.g. 'at' == 'air traffic'
    if len(acronym) > 2:
        acronyms[acronym] = definition

acronyms

{'a/c': 'aircraft',
 'a/g': 'air_to_ground',
 'a/h': 'altitude/height',
 'aac': 'mike_monroney_aeronautical_center',
 'aaf': 'army_air_field',
 'aai': 'arrival_aircraft_interval',
 'aap': 'advanced_automation_program',
 'aar': 'airport_acceptance_rate',
 'abdis': 'automated_data_interchange_system_service_b',
 'acais': 'air_carrier_activity_information_system',
 'acas': 'aircraft_collision_avoidance_system',
 'acc': 'area_control_center',
 'acct': 'accounting_records',
 'acd': 'automatic_call_distributor',
 'acdo': 'air_carrier_district_office',
 'acf': 'area_control_facility',
 'acfo': 'aircraft_certification_field_office',
 'acft': 'aircraft',
 'aci-na': 'airports_council_international_-_north_america',
 'acid': 'aircraft_identification',
 'acip': 'airport_capital_improvement_plan',
 'acls': 'automatic_carrier_landing_system',
 'aclt': 'actual_landing_time_calculated',
 'aco': 'office_of_airports_compliance_and_field_operations',
 'acrp': 'airport_cooperative_research_program',
 'ada

In [46]:
# review the acronyms
acronym_matches = []
for sent in doc.sents:
    
    for token in sent:
        token = token.text.lower()
        if token in acronyms:
            acronym_matches.append((token, acronyms[token]))

for match in set(acronym_matches):
    print(match)

A
multitude
of
challenges
faced
the
People
of
Southwest
Airlines
in
1994
.


In [43]:
acronyms_to_remove = ['basic','grade','self','cat','tsa','map','did','far']
for term in acronyms_to_remove:
    acronyms.pop(term)

In [15]:
acronyms = {}

# do not include acronyms that may have other common meanings
acronyms_to_remove = ['basic','grade','self','cat','tsa','map','did','far']

# add all acronyms to a dict
for ind, row in airline_acronyms.iterrows():
    acronym = row['Acronym'].lower()
    definition = row['Definition'].lower().strip().replace(' ','_')
    
    # ignore two character acronyms as they often match actual words
    # e.g. 'at' == 'air traffic'
    if (len(acronym) > 2) and (acronym not in acronyms_to_remove):
        acronyms[acronym] = definition

In [17]:
acronyms

{'a/c': 'aircraft',
 'a/g': 'air_to_ground',
 'a/h': 'altitude/height',
 'aac': 'mike_monroney_aeronautical_center',
 'aaf': 'army_air_field',
 'aai': 'arrival_aircraft_interval',
 'aap': 'advanced_automation_program',
 'aar': 'airport_acceptance_rate',
 'abdis': 'automated_data_interchange_system_service_b',
 'acais': 'air_carrier_activity_information_system',
 'acas': 'aircraft_collision_avoidance_system',
 'acc': 'area_control_center',
 'acct': 'accounting_records',
 'acd': 'automatic_call_distributor',
 'acdo': 'air_carrier_district_office',
 'acf': 'area_control_facility',
 'acfo': 'aircraft_certification_field_office',
 'acft': 'aircraft',
 'aci-na': 'airports_council_international_-_north_america',
 'acid': 'aircraft_identification',
 'acip': 'airport_capital_improvement_plan',
 'acls': 'automatic_carrier_landing_system',
 'aclt': 'actual_landing_time_calculated',
 'aco': 'office_of_airports_compliance_and_field_operations',
 'acrp': 'airport_cooperative_research_program',
 'ada

###### collect sentences about fees for phrase model

In [18]:
def collect_phrase_model_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start : end]

    # Keep only words, lemmatize tokens, remove punctuation
    sent = [str(token.lemma_).lower() 
            for token in span.sent if token.is_alpha]
    
    # replace acronyms
    sent = [acronyms[token] if token in acronyms else token for token in sent]

    matched_sents.append(sent)

matched_sents = []
pattern = [[{'POS': 'NOUN', 'OP': '+'},{'LOWER': 'fee'}]
         ,[{'POS': 'NOUN', 'OP': '+'},{'LOWER': 'fees'}]]
matcher = Matcher(nlp.vocab)
matcher.add('fees', collect_phrase_model_sents, *pattern)
matches = matcher(doc)

In [19]:
matched_sents

[]

### Phrase Model

Phrase modeling is another approach to learning combinations of tokens that together represent meaningful multi-word concepts. We can develop phrase models by looping over the the words in our reviews and looking for words that co-occur (i.e., appear one after another) together much more frequently than you would expect them to by random chance. The formula our phrase models will use to determine whether two tokens $A$ and $B$ constitute a phrase is:

$$\frac{count(A\ B) - count_{min}}{count(A) * count(B)} * N > threshold$$

- $count(A)$ is the number of times token $A$ appears in the corpus
- $count(B)$ is the number of times token $B$ appears in the corpus
- $count(A\ B)$ is the number of times the tokens $A\ B$ appear in the corpus in order
- $N$ is the total size of the corpus vocabulary
- $count_{min}$ is a user-defined parameter to ensure that accepted phrases occur a minimum number of times
- $threshold$ is a user-defined parameter to control how strong of a relationship between two tokens the model requires before accepting them as a phrase

Once our phrase model has been trained on our corpus, we can apply it to new text. When our model encounters two tokens in new text that identifies as a phrase, it will merge the two into a single new token.

Phrase modeling is superficially similar to named entity detection in that you would expect named entities to become phrases in the model (so new york would become new_york). But you would also expect multi-word expressions that represent common concepts, but aren't specifically named entities (such as happy hour) to also become phrases in the model.

We turn to the indispensible gensim library to help us with phrase modeling — the Phrases class in particular.

SOURCE: https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb 

In [41]:
from gensim.models.phrases import Phrases
from gensim.models.phrases import Phraser

In [49]:
common_terms = list(STOP_WORDS)
phrases = Phrases(matched_sents
                  , common_terms=common_terms
                  , min_count=5
                  , threshold=5)

### Phrases Params

- **scoring:** specifies how potential phrases are scored for comparison to the threshold setting. scoring can be set with either a string that refers to a built-in scoring function, or with a function with the expected parameter names. Two built-in scoring functions are available by setting scoring to a string:

    - ‘default’: from “Efficient Estimaton of Word Representations in Vector Space” by Mikolov, et. al.: 
    
$$\frac{count(AB) - count_{min}}{count(A) * count(B)} * N > threshold$$
    
    - where N is the total vocabulary size.
    - Thus, it is easier to exceed the threshold when the two words occur together often or when the two words are rare (i.e. small product)




- **common_terms:** optionnal list of “stop words” that won’t affect frequency count of expressions containing them.
    - The common_terms parameter add a way to give special treatment to common terms (aka stop words) such that their presence between two words won’t prevent bigram detection. It allows to detect expressions like “bank of america” or “eye of the beholder”.


In [50]:
bigram = Phraser(phrases)

The phrases object still contains all the source text in memory. A gensim Phraser will remove this extra data to become smaller and somewhat faster than using the full Phrases model. To determine what data to remove, the Phraser ues the  results of the source model’s min_count, threshold, and scoring settings. (You can tamper with those & create a new Phraser to try other values.)

SOURCE: https://radimrehurek.com/gensim/models/phrases.html

In [51]:
def print_phrases(phraser, text_stream, num_underscores=1):
    phrases = []
    for terms in phraser[text_stream]:
        for term in terms:
            if term.count('_') >= num_underscores:
                phrases.append(term)
    print(set(phrases))

In [53]:
print_phrases(bigram, matched_sents)

{'require_airline', 'flight_cancel', 'airtran_and_to_delta', 'luggage_iii', 'required_navigation_performance', 'deny_board', 'allege_anticompetitive', 'available_seat_mile', 'optional_service', 'difference_in_airfare', 'ancillary_service', 'sherman_act', 'department_of_transportation', 'permanently_lose', 'damage_for_the_amount_of_first_baggage', 'rental_expense', 'rules_also_require', 'prominently_disclose', 'check_bag', 'complaint_seek', 'injunctive_relief', 'violation_of_section', 'refund_any_check', 'bump_from_flight', 'passenger_protection', 'broad_range', 'consolidated_amended', 'unable_to_take_advantage', 'section_of_the_sherman', '-pron-_website', 'charge_a_change', 'passenger_be_allow', 'approximately_percent', 'addition_to_treble'}


### Tri-gram phrase model

We can place the text from the first phrase model into another Phrases object to create n-term phrase models. We can repear this process multiple times.

In [55]:
phrases = Phrases(bigram[matched_sents], common_terms=common_terms, min_count=5,threshold=5)
trigram = Phraser(phrases)

print_phrases(trigram, bigram[matched_sents], num_underscores=2)

{'refund_any_check_bag', 'flight_cancel_or_oversell', 'allege_anticompetitive_activity', 'injunctive_relief_against_a_broad_range', 'required_navigation_performance', 'passenger_protection_rules_also_require', 'consolidated_amended_complaint_seek', 'permanently_lose_luggage_iii', 'available_seat_mile', 'difference_in_airfare', 'department_of_transportation', 'pay_for_ancillary_service', 'addition_to_treble_damage_for_the_amount_of_first_baggage', 'bump_from_flight_ii', 'passenger_be_unable_to_take_advantage', 'bump_from_flight', 'optional_service_on_-pron-_website', 'pay_to_airtran_and_to_delta', 'section_of_the_sherman', 'violation_of_section_of_the_sherman_act', 'prominently_disclose_all_potential', 'deny_board_compensation', 'charge_a_change', 'passenger_be_allow'}


In [59]:
for doc_num in [6]:
    print('DOC NUMBER: {}\n'.format(doc_num))
    print('ORIGINAL SENTENT: {}\n'.format(' '.join(matched_sents[doc_num])))
    print('BIGRAM: {}\n'.format(' '.join(bigram[matched_sents[doc_num]])))
    print('TRIGRAM: {}'.format(' '.join(trigram[bigram[matched_sents[doc_num]]])))

DOC NUMBER: 6

ORIGINAL SENTENT: in august the department_of_transportation implement new rule expand the passenger protection rule by among other thing i increase the maximum deny board compensation airline must pay to passenger bump from flight from to ii require airline to refund any check bag fee for permanently lose luggage iii require airline to prominently disclose all potential fee for optional service on -pron- website and iv require airline to refund passenger fee pay for ancillary service if a flight cancel or oversell and a passenger be unable to take advantage of such service

BIGRAM: in august the department_of_transportation implement new rule expand the passenger_protection rule by among other thing i increase the maximum deny_board compensation airline must pay to passenger bump_from_flight from to ii require_airline to refund_any_check bag fee for permanently_lose luggage_iii require_airline to prominently_disclose all potential fee for optional_service on -pron-_webs

### Review Phrase Model

In [61]:
def clean_text(doc):
    ents = nlp(doc.text).ents

    # Add named entities, but only if they are a compound of more than word.
    IGNORE_ENTS = ('QUANTITY','ORDINAL','CARDINAL','DATE'
                   ,'PERCENT','MONEY','TIME')
    ents = [ent for ent in ents if 
             (ent.label_ not in IGNORE_ENTS) and (len(ent) > 2)]
    
    # add underscores to combine words in entities
    ents = [str(ent).strip().replace(' ','_') for ent in ents]
    
    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    doc = [token.lemma_ for token in doc if 
           token.is_alpha and not token.is_stop]
    
    doc.extend([entity for entity in ents])
    
    return [str(term) for term in doc]

    
# combined terms before phrase_model (entities and/or noun chunks)
before_phrase = []
for sent in doc.sents:
    text = clean_text(sent)
    for term in text:
        if '_' in term:
            before_phrase.append(term)

print(set(before_phrase))

{'Terminal_1_Modernization_Project', 'A-List_Preferred”_Rapid_Rewards_Members_booking_on', 'The_U.S._Department_of_Transportation', 'Initiatives_-_Business_Traveler_Amenities', 'The_Passenger_Protection_Rules', 'the_FAA_Modernization_and_Reform_Act', 'New_York_LaGuardia_Airport', 'Wanna_Get_Away', 'Facilities_Maintenance_Technicians', 'the_Wartime_Act', 'the_Company,_the_Company', 'the_Passenger_Protection_Rule', 'the_U.S._Congress', 'Amadeus_IT_Group', 'Hollywood_International_Airport', 'the_Northern_District', 'AirTran_A+_Rewards', 'Consolidated_Amended_Complaint', 'the_Customer_Service_Agents', '2013_the_Company', 'the_Department_of_Homeland_Security', 'Washington_Reagan_National_Airport', 'the_FAA_Modernization_and_Reform__Act', 'the_United_States_District_Court', 'Occupational_Safety_and_Health_Administration_and_Food_and_Drug_Administration', 'Hartsfield-_Jackson_Atlanta_International_Airport', '“Business_Select', 'A+_Miles_Rewards', 'the_Consolidated_Amended_Complaint', 'the_Air

In [62]:
def clean_text(doc):
    ents = nlp(doc.text).ents

    # Add named entities, but only if they are a compound of more than word.
    IGNORE_ENTS = ('QUANTITY','ORDINAL','CARDINAL','DATE'
                   ,'PERCENT','MONEY','TIME')
    ents = [ent for ent in ents if 
             (ent.label_ not in IGNORE_ENTS) and (len(ent) > 2)]
    
    # add underscores to combine words in entities
    ents = [str(ent).strip().replace(' ','_') for ent in ents]
    
    # clean text for phrase model
    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    doc_ = [token.lemma_ for token in doc if token.is_alpha]
    phrase_text = [str(term) for term in doc_]
    sent = trigram[bigram[phrase_text]]
    phrases = []
    for term in sent:
        if '_' in term:
            phrases.append(term)

    # remove stops words - 
    # separate step as they are needed for the phrase model
    doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # add phrases and entities
    doc.extend([entity for entity in ents])
    clean_text = [str(term) for term in doc] + phrases

    return clean_text

# combined terms after phrase model
after_phrase = []
for sent in doc.sents:
    text = clean_text(sent)
    for term in text:
        if '_' in term:
            after_phrase.append(term)

print(set(after_phrase))

{'Terminal_1_Modernization_Project', 'A-List_Preferred”_Rapid_Rewards_Members_booking_on', 'The_U.S._Department_of_Transportation', 'Initiatives_-_Business_Traveler_Amenities', 'The_Passenger_Protection_Rules', 'the_FAA_Modernization_and_Reform_Act', 'New_York_LaGuardia_Airport', 'Wanna_Get_Away', 'pay_for_ancillary_service', 'Facilities_Maintenance_Technicians', 'the_Wartime_Act', 'the_Company,_the_Company', 'addition_to_treble_damage_for_the_amount_of_first_baggage', 'the_U.S._Congress', 'passenger_be_unable_to_take_advantage', 'Amadeus_IT_Group', 'Hollywood_International_Airport', 'the_Northern_District', 'bump_from_flight', 'AirTran_A+_Rewards', 'Consolidated_Amended_Complaint', 'the_Customer_Service_Agents', '2013_the_Company', 'the_Department_of_Homeland_Security', 'Washington_Reagan_National_Airport', 'the_FAA_Modernization_and_Reform__Act', 'the_United_States_District_Court', 'Occupational_Safety_and_Health_Administration_and_Food_and_Drug_Administration', 'Hartsfield-_Jackson_

In [298]:
cleaned_text = []
for sent in matched_sents:
    text = clean_text(sent)
    cleaned_text.append(text)

print(cleaned_text[0:2])

[['no', 'show', 'policy', 'fund', 'apply', 'future', 'travel', 'southwest', 'change', 'fee', 'future_travel'], ['for', 'example', 'company', 'transfarencysm', 'campaign', 'emphasize', 'southwest', 'approach', 'treat', 'customer', 'fairly', 'honestly', 'respectfully', 'low', 'fare', 'unexpected', 'bag', 'fee', 'change', 'fee', 'hidden', 'fee', 'low_fare', 'bag_fee']]


In [357]:
ls ../raw_data/gensim/

airline.index      airline.mm         airline.mm.index   airline_dict.dict


In [359]:
# write the cleaned text to a new file for later use
with open(AIRLINE_CLEANED_TEXT_PATH, 'w') as f:
    for line in cleaned_text:
        line = ' '.join(line) + '\n'
        f.write(line)