In [1]:
import spacy
import pandas as pd
import numpy as np
import difflib
from itertools import combinations, chain, groupby
from operator import itemgetter

In [2]:
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

In [4]:
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
import pickle

## Load Previously Generated Rule Sets

In [18]:
with open('qterms.data', 'rb') as filehandle:
    qterms = pickle.load(filehandle)

In [19]:
with open('whatrules2.data', 'rb') as filehandle:
    what_rules2 = pickle.load(filehandle)

# Rule Creation Process

Previously, independent work had been done to manually extract the sections of text on a webinar page that define "What the webinar is about" or "What you will learn".  Additionally, a pre-defined set of search queue terms was developed to use as a marker that would help differntiate this section from the rest of the text.

The biggest challenge faced so far is that while the POS tags and rules can be created as described below, it may be missing text some of the time or it may be returning too much as it over generalizes certain sections.

1. **Search the Researcher outputs for exact string matches to the existing queues. If found, return the rule.**

### For Example:

Original Text:

**Join us to learn how to:**

- **Create a multi-touch promotion campaign that includes email and other paid promotions**
- **Design and execute a focused PR strategy that will get the attention of attendees and analysts**
- **Develop other creative ways to get the attention of your customers and prospects**


2. **The following reference queues would be extracted:**

['Join us to learn',
 'to learn',
 'Join us to learn how to',
 'Join us to learn how',
 'Join us']
 
 
3. **These would then be converted to the POS rules based on their original place in the text**
 
 [('VERB', 'VB', 'ROOT'),
  ('PRON', 'PRP', 'dobj'),
  ('PART', 'TO', 'aux'),
  ('VERB', 'VB', 'xcomp')],
  
 [('PART', 'TO', 'aux'), ('VERB', 'VB', 'xcomp')],
 
 [('VERB', 'VB', 'ROOT'),
  ('PRON', 'PRP', 'dobj'),
  ('PART', 'TO', 'aux'),
  ('VERB', 'VB', 'xcomp'),
  ('ADV', 'WRB', 'advmod'),
  ('PART', 'TO', 'aux')],
  
 [('VERB', 'VB', 'ROOT'),
  ('PRON', 'PRP', 'dobj'),
  ('PART', 'TO', 'aux'),
  ('VERB', 'VB', 'xcomp'),
  ('ADV', 'WRB', 'advmod')],
  
 [('VERB', 'VB', 'ROOT'), ('PRON', 'PRP', 'dobj')]

# Using Spacy Phrase Matching

For a more refined and specific approach we used Spacy's Phrase Matching utility. This did a great job of identifying more specific text. But did not provide much in the way of finding generalized patterns.

In [21]:
#qterms = [q for q in queWhat.key_queue if 1 < len(q.split(' ')) < 8]
#new_terms = ['will walk you through', 'In this presentation', 'Attend this webinar and learn']
#qterms.extend(new_terms)

In [312]:
# Save qterms for spacy phrase matching
#with open('qterms.data', 'wb') as filehandle:
    # store the data as binary data stream
#    pickle.dump(qterms, filehandle)
    

In [None]:
nlp = spacy.load('en_core_web_sm')
matcher = PhraseMatcher(nlp.vocab)
terms = qterms
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", None, *patterns)

In [None]:
# Function to return the sentence(s) containing search matches

def spacy_phrase_match(text):
    doc = nlp(text)
    matches = matcher(doc)
    matching_sents = []
    for match_id, start, end in matches:
        sent_span = doc[start:end].sent
        matching_sents.append(sent_span.text)
    
    return pd.unique(matching_sents)

# Functions

In [13]:
# Match rule patterns in text to POS rules
def get_qphrases_text(rules, text):
    fnd = []
    pat = get_pattern_tupples(text)
    for w in rules:
        rm = rule_matches(pat, w)
        if rm:
            fnd.append(text)
    
    return(fnd)

In [16]:
# Use Spacy Matcher to refine the output
def extract_relevant_chunk(about_text_sentences):
    ats = about_text_sentences.split('\n')
    refined_ats = [a for a in ats if spacy_phrase_match(a).size > 0]
    
    return '\n'.join(refined_ats)

In [17]:
def symmetric_difference(a, b):
    return list({*a} ^ {*b})

def symmetric_same(a,b):
    return list({*a} & {*b})

In [18]:
# Measure the similarity of two bodies of text
def find_sim(a, b):
    nlpa = nlp(str(a))
    nlpb = nlp(str(b))
    return(nlpa.similarity(nlpb))

In [5]:
# This function is used to create POS tupples with Spacy
def get_pattern_tupples(text):
    doc = nlp(text)
    pattern = [(t.pos_, t.tag_, t.dep_) for t in doc]
    
    return pattern

In [7]:
def find_matches(a, b):
    sm = difflib.SequenceMatcher(None, a, b).get_matching_blocks()
    seq_match = [tuple(a[s[0]:s[0]+s[2]]) for s in sm if s[2] > 1]
    return(seq_match)

In [8]:
# A function used to determine if a sequence of POS tupples in a document match the 
# sequence of tupples in the rules
def rule_matches(a, b):
    sm = difflib.SequenceMatcher(None, a, b).get_matching_blocks()
    seq_match = [[s[0],s[0]+s[2]] for s in sm if s[2] == len(b)]
    return(seq_match)

In [9]:
# Extract "What someone will learn about" in the text, if in listed form

def extract_listed_about(text, rules):
    ulist = text.split('\n')
    ulist = [u for u in ulist if u]
    list_idxs = []
    for i, x in enumerate(ulist):
        if x[:2] == '- ':
            list_idxs.append(i)
    
    list_groups = []
    for k, g in groupby(enumerate(list_idxs), lambda x: x[0]-x[1]):
         list_groups.append(list(map(itemgetter(1), g)))

    webinar_list_text = []
    new_list_idxs = []
    for g in list_groups:
        if g[0] != 0:
            a = min(g)-1
            list_text = '\n'.join(ulist[a:max(g)+1])
            list_idxs.append(a)
        else:
            list_text = ''
        webinar_list_text.append(list_text)
        new_list_idxs.append(g)  

    wlt = [get_qphrases_text(rules, s) for s in webinar_list_text]
    wlt = pd.unique(list(chain.from_iterable(wlt)))
    spacy_lt = [spacy_phrase_match(s) for s in webinar_list_text]
    spacy_lt = pd.unique(list(chain.from_iterable(spacy_lt)))
    if wlt.size > 1:
        list_max = np.argmax([find_sim(w, spacy_lt) for w in wlt])
        wlt = wlt[list_max]
    elif wlt.size != 0: 
        wlt = wlt[0]
    return wlt

In [10]:
# Extract "What someone will learn about" in the text, if in sentence form

def extract_sentence_about(text, listed_about, rules):
    if type(listed_about) == str:
        listed_about = listed_about.split('\n')
    ulist = text.split('\n')
    ulist = [u for u in ulist if u]
    if len(listed_about) == 0:  # Listed 'You will learn' doesn't exist
        nlt = [get_qphrases_text(rules, n) for n in ulist]
        nlt = pd.unique(list(chain.from_iterable(nlt)))
        sp_nlt = [a for a in nlt if spacy_phrase_match(a).size > 0]
    else:  # Listed 'You will learn' does exist 
        ss = symmetric_same(listed_about, ulist)
        sidxs = []
        for s in ss:
            sidxs.append(ulist.index(s))
        ulist = [u for i, u in enumerate(ulist) if i < min(sidxs)]
        nlt = [get_qphrases_text(rules, n) for n in ulist]
        nlt = pd.unique(list(chain.from_iterable(nlt)))
        sp_nlt = [a for a in nlt if spacy_phrase_match(a).size > 0]        
    return sp_nlt

In [11]:
# Find all remaining text that does not match the "What you will learn pattern"

def extract_introduction(text, listed_about, sentence_about):
    if type(listed_about) == str:
        listed_about = listed_about.split('\n')
    if type(sentence_about) == str:
        sentence_about = sentence_about.split('\n')
    ulist = text.split('\n')
    ulist = [u for u in ulist if u]
# Listed 'You will learn' does exist and Sentence does not
    if len(listed_about) != 0 and len(sentence_about) == 0:  
        ss = symmetric_same(listed_about, ulist)
        sidxs = []
        for s in ss:
            sidxs.append(ulist.index(s))
        ulist = [u for i, u in enumerate(ulist) if i < min(sidxs)]
        intro = [x for x in ulist if x[0] != '#']
        intro = [x for x in intro if x[:2] != '- ']
# Listed 'You will learn' does not exist and Sentence does         
    elif len(listed_about) == 0 and len(sentence_about) != 0: 
        ss = symmetric_same(sentence_about, ulist)
        sidxs = []
        for s in ss:
            sidxs.append(ulist.index(s))
        ulist = [u for i, u in enumerate(ulist) if i < min(sidxs)]
        intro = [x for x in ulist if x[0] != '#'] 
        intro = [x for x in intro if x[:2] != '- ']
# Both Listed and Sentence exist        
    elif len(listed_about) != 0 and len(sentence_about) != 0: 
        ss = symmetric_same(sentence_about, ulist)
        sidxs = []
        for s in ss:
            sidxs.append(ulist.index(s))
        ulist = [u for i, u in enumerate(ulist) if i < min(sidxs)]
        intro = [x for x in ulist if x[0] != '#']
        intro = [x for x in intro if x[:2] != '- ']
# Neither exist:
    elif len(listed_about) == 0 and len(sentence_about) == 0:
        intro = []
    
    return pd.unique(intro)

In [770]:
# A function written for our tentative interface

def extract_sections(text):
    ela = extract_listed_about(text, rules=what_rules2)
    est = extract_sentence_about(text, ela, rules=what_rules2)
    intro = extract_introduction(text, ela, est)
    
    l_list = ela.split('\n')
    split_text = text.split('\n')

    ela = split_text[split_text.index(l_list[0]):split_text.index(l_list[-1])+1] if len(l_list) != 0 else ''
    intro = split_text[split_text.index(intro[0]):split_text.index(intro[-1])+1] if len(intro) != 0 else ''
    est = split_text[split_text.index(est[0]):split_text.index(est[-1])+1] if len(est) != 0 else ''
    
    extract_output = {
        'introduction': '\n'.join(intro),
        'learn_about_sentence': '\n'.join(est),
        'learn_about_list': '\n'.join(ela)
    }
    
    with open('cware_extract_output.json', 'w') as json_file:
        json.dump(extract_output, json_file)

In [9]:
# Used to create initial POS rules based on pre-defined text queues

def create_rules(q_terms, text_to_search):
    rules = []
    pat, t = get_pattern_tupples(text_to_search)
    for q in q_terms:
        rm = rule_matches(t, q)
        if rm:
            rm = list(chain.from_iterable(rm))
            rules.append(pat[rm[0]:rm[1]])
    
    return rules

### General Process for Extracting Sections

1. Split Text by New Line
2. Identify the "What You'll Learn" Section
3. Identify the Introduction Section (always above the "What You'll Learn" section)
4. Identify the speaker information

## Example

In [15]:
df = pd.read_csv('sample_webinar_text.csv')

In [732]:
# Original Text
t = 31
print(df.useable_text[t])


- Homepage
- Uncategorized
- Feb 18 Webinar: Data Governance Reality Check# Feb 18 Webinar: Data Governance Reality Check

September 20, 2019

#### DATEFebruary 18, 2020, This webinar has passed. The recording will be made available On Demand within the next two US business days.

#### TIME: 2 PM Eastern / 11 AM Pacific

#### PRICE: Free to all attendees

# This webinar is sponsored by:

## About the Webinar

It’s been almost two years since the General Data Protection Regulation shook up how organizations manage data security and privacy, ushering in a new focus on Data Governance. This complex but critical practice still has most enterprises grappling to master it for a myriad of reasons.

In this webinar, we’ll examine how Data Governance attitudes and practices continue to evolve and discuss what new research reveals as the most predominant challenges. We’ll delve into technology trends, including how adding certain capabilities will benefit your organization in terms of data asse

In [731]:
# Parsed Sections

ela = extract_listed_about(df.useable_text[t], what_rules2)
est = extract_sentence_about(df.useable_text[t], what_rules2, ela)
intro = extract_introduction(df.useable_text[t], ela, est)
#print(intro, est, '\n\n', ela)
print('\n'.join(intro), '\n\n', '\n'.join(est), '\n\n', ela)

September 20, 2019 

 #### DATEFebruary 18, 2020, This webinar has passed. The recording will be made available On Demand within the next two US business days.
# This webinar is sponsored by:
In this webinar, we’ll examine how Data Governance attitudes and practices continue to evolve and discuss what new research reveals as the most predominant challenges. We’ll delve into technology trends, including how adding certain capabilities will benefit your organization in terms of data asset availability, quality, and usability, including data consumer literacy and confidence. 

 When you attend this webinar, you will learn about:
- The requirements for a successful and sustainable Data Governance program
- Increasing confidence in data analytics for faster speed to insights
- How to automate data preparation and intelligence and where to startAll registrants will receive a copy of the new erwin white paper, The 2020 State of Data Governance and Automation, which is based on a recent survey

# Rule Creation Process

In [165]:
queues = pd.read_csv('content_queues.csv')

In [166]:
queWhat = queues[queues['type'] == 'What'].reset_index()

In [230]:
udf = pd.read_csv('bart_output.csv')

In [231]:
utext = udf.useable_text
udesc = [x for x in udf['$description'] if x != 'None']
utext = [t.replace('\n\n\n', '') for t in utext]

In [232]:
text_to_remove = ['View cookie settings', 'We use cookies to','Cookies Policy', 'This website uses cookies to']
utext = [remove_unwanted_text(u, text_to_remove) for u in utext]

In [27]:
q_terms = []
q_pats = []
for q in queWhat.key_queue:
    p, terms = get_pattern_tupples(q)
    q_terms.append(terms)
    q_pats.append(p)

In [28]:
what_rules = []
for fb_post in df.FB_Extract:
    what_rules.extend(create_rules(q_terms, fb_post))

In [29]:
new_rules = []
for ud in udesc:
    new_rules.extend(create_rules(q_terms, ud))
what_rules.extend(new_rules)
what_rules = list(np.unique(what_rules))

# Rules that don't have single tokens
what_rules2 = [r for r in what_rules if len(r) > 1]

In [313]:
# Save what rules
with open('whatrules2.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(what_rules2, filehandle)