# Information Extraction

spaCy is used as the key library.

# 1)- Importing key Modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function
import warnings
warnings.filterwarnings('ignore')

In [2]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [3]:
nlp = spacy.load('en_core_web_sm')

# 2)- Extracting sentiment from sentence

In [4]:
text="In my opinion, Facebook was great organization. Now, Facebook is quite good."

In [5]:
print(text)

In my opinion, Facebook was great organization. Now, Facebook is quite good.


In [6]:
matcher = Matcher(nlp.vocab)

In [7]:
matched_sents = []

In [8]:
# Token sequence to match: the word "facebook" (compared in lowercase), any form of the
# verb "be" (matched by lemma), zero or more adverbs, and finally one adjective.
pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"}, {"POS": "ADJ"}]

In [9]:
spacy.explain('ADJ')

'adjective'

In [10]:
spacy.explain('ADV')

'adverb'

- So, we have our key entity, i.e., Facebook.

- Then we have a verb, which is matched by its lemma ("be").

- After this common pattern, we expect an adjective such as good, bad, etc.

- The adverb acts as a qualifier in case someone wants to add more meaning, e.g. very good, quite bad.

In [11]:
def callback_method_fb(matcher, doc, i, matches):
    """Record the sentence containing the i-th match for manual displaCy rendering.

    Appends one dict to the module-level ``matched_sents`` list. The dict holds
    the text of the sentence that contains the match and a single 'MATCH'
    entity whose character offsets are relative to the start of that sentence.

    Args:
        matcher: The matcher that fired the callback (not used here).
        doc: The document the matcher was run on.
        i: Index of the current match inside ``matches``.
        matches: List of ``(match_id, start, end)`` token-index triples.
    """
    _, tok_start, tok_end = matches[i]
    hit = doc[tok_start:tok_end]
    sentence = hit.sent
    # Shift document-level character offsets so they count from the sentence start.
    offset = sentence.start_char

    highlight = dict(
        start=hit.start_char - offset,
        end=hit.end_char - offset,
        label='MATCH',
    )
    matched_sents.append({'text': sentence.text, 'ents': [highlight]})

In [12]:
matcher.add("fb", callback_method_fb, pattern)

In [13]:
doc = nlp(text)

In [14]:
doc

In my opinion, Facebook was great organization. Now, Facebook is quite good.

In [15]:
matches = matcher(doc)
matches

[(8017838677478259815, 4, 7), (8017838677478259815, 11, 15)]

In [16]:
matched_sents

[{'text': 'In my opinion, Facebook was great organization.',
  'ents': [{'start': 15, 'end': 33, 'label': 'MATCH'}]},
 {'text': 'Now, Facebook is quite good.',
  'ents': [{'start': 5, 'end': 27, 'label': 'MATCH'}]}]

In [17]:
displacy.render(matched_sents, style='ent', manual = True)

# 3)-Extracting Phone numbers

In [18]:
Text="Our new office phone number is (49) 403 4560-7890"

In [19]:
# One dict per token, matching "(dd) ddd dddd-dddd": "(", two digits, ")", three digits,
# four digits, an optional "-", then four more digits.
pattern = [{"ORTH": "("}, {"SHAPE": "dd"}, {"ORTH": ")"},{"SHAPE": "ddd"}, {"SHAPE": "dddd"}, {"ORTH": "-", "OP": "?"}, {"SHAPE": "dddd"}]

In [20]:
matcher = Matcher(nlp.vocab)
matcher.add("PhoneNumber", None, pattern)

In [21]:
doc=nlp(Text)

In [22]:
print(doc)

Our new office phone number is (49) 403 4560-7890


In [23]:
print([t.text for t in doc])

['Our', 'new', 'office', 'phone', 'number', 'is', '(', '49', ')', '403', '4560', '-', '7890']


In [24]:
matches = matcher(doc)
matches

[(7978097794922043545, 6, 13)]

In [25]:
# Print the matched text for every (match_id, start, end) triple.
for _, first, last in matches:
    print(doc[first:last].text)

(49) 403 4560-7890


### Trying new pattern

In [26]:
Text="Old office number was 49 403 4560-7890. Our new office has moved to Germany and Phone is + 49 123 4567-8901"

In [27]:
# Same digit layout without parentheses. The leading "+" is optional, so a number written
# with "+" is matched twice (with and without the "+"), producing overlapping spans.
pattern = [{"ORTH": "+", "OP": "?"}, {"SHAPE": "dd"},{"SHAPE": "ddd"}, {"SHAPE": "dddd"}, {"ORTH": "-", "OP": "?"}, {"SHAPE": "dddd"}]

In [28]:
matcher = Matcher(nlp.vocab)
matcher.add("PhoneNumber", None, pattern)

In [29]:
doc=nlp(Text)

In [30]:
print(doc)

Old office number was 49 403 4560-7890. Our new office has moved to Germany and Phone is + 49 123 4567-8901


In [31]:
matches = matcher(doc)
matches

[(7978097794922043545, 4, 9),
 (7978097794922043545, 20, 26),
 (7978097794922043545, 21, 26)]

In [32]:
# Show the text of each match; overlapping matches are printed as-is.
for _, first, last in matches:
    print(doc[first:last].text)

49 403 4560-7890
+ 49 123 4567-8901
49 123 4567-8901


# 4)-Email Address Matching

In [33]:
text = "Email us at firstname@gmail.com and lastname@gmail.com"

In [34]:
# Single-token e-mail pattern. The "-" is placed last in each character class so it is a
# literal hyphen; written as "9-_" it formed a range from "9" to "_" that also accepted
# characters such as ":", ";", "<", "=", ">", "?" and "@".
pattern = [{"TEXT": {"REGEX": "[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.-]+"}}]

In [35]:
matcher = Matcher(nlp.vocab)
matcher.add("Email", None, pattern)

In [36]:
doc = nlp(text)

In [37]:
matches = matcher(doc)
matches

[(11010771136823990775, 3, 4), (11010771136823990775, 5, 6)]

In [38]:
# Print the text of every e-mail match.
for _, first, last in matches:
    print(doc[first:last].text)

firstname@gmail.com
lastname@gmail.com


# 5)-Hashtags and emoji on social media

In [39]:
my_tweet="Save Planet 😀 #PeaceForALL"

In [40]:
# list of positive emojis
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]

# Negative emoji
neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"]  

In [41]:
# Build one single-token pattern per emoji; each pattern matches exactly that emoji token
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]

In [42]:
pos_patterns

[[{'ORTH': '😀'}],
 [{'ORTH': '😃'}],
 [{'ORTH': '😂'}],
 [{'ORTH': '🤣'}],
 [{'ORTH': '😊'}],
 [{'ORTH': '😍'}]]

In [43]:
def label_sentiment(matcher, doc, i, matches):
    """Nudge ``doc.sentiment`` according to which rule produced the i-th match.

    A match from the 'HAPPY' rule adds 0.1 to ``doc.sentiment``; a match from
    the 'SAD' rule subtracts 0.1. Matches from any other rule leave it unchanged.

    Args:
        matcher: The matcher that fired the callback (not used here).
        doc: The document being matched; its ``sentiment`` is updated in place.
        i: Index of the current match inside ``matches``.
        matches: List of ``(match_id, start, end)`` triples.
    """
    rule_hash, _start, _end = matches[i]
    # Resolve the hash back to the rule name once, then branch on it.
    rule_name = doc.vocab.strings[rule_hash]
    if rule_name == 'HAPPY':
        doc.sentiment += 0.1
        return
    if rule_name == 'SAD':
        doc.sentiment -= 0.1

In [44]:
matcher = Matcher(nlp.vocab)

In [45]:
matcher.add("HAPPY", label_sentiment, *pos_patterns)
matcher.add('SAD', label_sentiment, *neg_patterns)

In [46]:
# A hashtag spans two tokens: a literal "#" followed by one token made up of ASCII characters.
# No callback is attached (None), so these matches are only returned, not acted upon.
matcher.add('HASHTAG', None, [{'TEXT': '#'}, {'IS_ASCII': True}])

In [47]:
doc1 = nlp("Save Planet 😀 #PeaceForALL")

In [48]:
matches = matcher(doc1)

In [49]:
# Report each match as "<rule name> <matched text>".
for rule_hash, first, last in matches:
    rule_name = doc1.vocab.strings[rule_hash]  # hash -> registered rule name
    print(rule_name, doc1[first:last].text)

HAPPY 😀
HASHTAG #PeaceForALL


In [50]:
doc2=nlp("One of the coldest day 😢 #WorstChristmasDay")

In [51]:
matches = matcher(doc2)

In [52]:
matches

[(17412815195067373849, 5, 6), (16536914698459818706, 6, 8)]

In [53]:
# Report each match as "<rule name> <matched text>".
for rule_hash, first, last in matches:
    rule_name = doc2.vocab.strings[rule_hash]  # hash -> registered rule name
    print(rule_name, doc2[first:last].text)

SAD 😢
HASHTAG #WorstChristmasDay
