# Semantic Processing

This tutorial features:
* part of speech tagging
* named entity recognition
    * address processing
    * phone number processing
* text classification
* emotional valence / sentiment analysis
* word embeddings


In [2]:
# Part of speech tagging
#
# Reference: https://nicschrading.com/project/Intro-to-NLP-with-spaCy/
sentence = "This isn't a dogs and ponies show, after all!  We are barely making any molla..."

# Build the spaCy English pipeline and run it over the sample text.
# NOTE(review): the `spacy.en` module path looks specific to older spaCy
# releases -- confirm it matches the spaCy version this notebook is pinned to.
from spacy.en import English
parser = English()
parsedData = parser(sentence)
print("Sentence parsed by Spacy")

# Only the first sentence is inspected, so stop after the first span from `.sents`.
for span in parsedData.sents:
    # Slice the document by the span's token offsets and materialise the tokens.
    sent = list(parsedData[span.start:span.end])
    break

# One line per token: its verbatim text followed by its coarse part-of-speech tag.
for token in sent:
    print(token.orth_, token.pos_)

Sentence parsed by Spacy
This DET
is VERB
n't ADV
a DET
dogs NOUN
and CCONJ
ponies NOUN
show VERB
, PUNCT
after ADV
all ADV
! PUNCT
  SPACE
We PRON
are VERB
barely ADV
making VERB
any DET
molla NOUN
... PUNCT


In [3]:
# Named entity recognition
#
# Reference: https://nicschrading.com/project/Intro-to-NLP-with-spaCy/
from spacy.en import English
parser = English()

# Parse an example sentence and look at its named entities.
example = "Apple's stocks dropped dramatically after the death of Steve Jobs in October."
parsedEx = parser(example)

# Per-token view: an empty entity type means the token is not part of any entity.
for token in parsedEx:
    print(token.orth_, token.ent_type_ or "(not an entity)")

print("-------------- entities only ---------------")
# Entity-level view: if you only want the entities, read the parsed example's
# `ents` property.
ents = list(parsedEx.ents)
for entity in ents:
    # Rebuild the entity's surface text from the tokens it spans.
    entity_text = ' '.join([tok.orth_ for tok in entity])
    print(entity.label, entity.label_, entity_text)


Apple ORG
's (not an entity)
stocks (not an entity)
dropped (not an entity)
dramatically (not an entity)
after (not an entity)
the (not an entity)
death (not an entity)
of (not an entity)
Steve PERSON
Jobs PERSON
in (not an entity)
October DATE
. (not an entity)
-------------- entities only ---------------
380 ORG Apple
377 PERSON Steve Jobs
387 DATE October


In [None]:
# Address and phone number parsing:

# https://github.com/EricSchles/investigator/blob/master/app/text_parser.py

In [9]:
# Text Classification

# Reference: https://github.com/EricSchles/csvconf_talk/blob/master/scraping_craigslist/app/text_classify.py

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

#http://scikit-learn.org/stable/modules/feature_extraction.html#customizing-the-vectorizer-classes
class LemmaTokenizer(object):
    """Callable tokenizer: split a document into word tokens and lemmatize each one.

    Instances are passed as the ``tokenizer`` argument of ``CountVectorizer``.
    """

    def __init__(self):
        # A single WordNet lemmatizer instance is reused for every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of ``doc``, in document order."""
        tokens = word_tokenize(doc)
        return list(map(self.wnl.lemmatize, tokens))

def _make_sgd_classifier():
    """Build the linear SVM (hinge loss, L2 penalty) used as the final pipeline step.

    The ``n_iter`` argument of ``SGDClassifier`` was replaced by ``max_iter``
    (together with ``tol``) in newer scikit-learn releases; ``max_iter=5, tol=None``
    requests the same fixed five passes over the data.  Releases that predate
    ``max_iter`` reject it with ``TypeError``, in which case the legacy spelling
    is used instead.
    """
    try:
        return SGDClassifier(loss='hinge', penalty='l2',
                             alpha=1e-3, max_iter=5, tol=None, random_state=42)
    except TypeError:
        return SGDClassifier(loss='hinge', penalty='l2',
                             alpha=1e-3, n_iter=5, random_state=42)


def train_classifier(text,labels):
    """Grid-search a bag-of-words -> tf-idf -> linear SVM pipeline and fit it.

    Parameters
    ----------
    text : sequence of str
        Training documents.
    labels : sequence
        One class label per document, in the same order as ``text``.

    Returns
    -------
    The fitted ``GridSearchCV`` object (the return value of its ``fit`` call).
    """
    #http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#parameter-tuning-using-grid-search
    # Keys follow the "<pipeline step>__<parameter>" convention of the steps below.
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'clf__alpha': (1e-2, 1e-3),}

    text_clf = Pipeline([('vect', CountVectorizer(tokenizer=LemmaTokenizer())),
                         ('tfidf', TfidfTransformer()),
                         ('clf', _make_sgd_classifier()),])

    # n_jobs=-1: evaluate the parameter grid using all available cores.
    gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
    return gs_clf.fit(text,labels)
        
def classify_text(classifier,input_data):
    """Return the label ``classifier`` predicts for the single item ``input_data``.

    The item is wrapped in a one-element list before being handed to
    ``classifier.predict``, and the first element of the result is returned.
    """
    single_item_batch = [input_data]
    predictions = classifier.predict(single_item_batch)
    return predictions[0]

# Toy training corpus, kept as one list per class so that the labels are
# derived from the data instead of from hand-maintained counts that could
# silently drift out of step with the examples.
greeting_texts = ["Hello","Hi","How are you?","How's it going","What's up?","How have you been?","hi!","Hi how are you?"]
goodbye_texts = ["Bye","Goodbye","see ya","See ya!","See you later!","See you again","Have fun!","later","Later","Later!"]

text = greeting_texts + goodbye_texts
labels = ["greeting"] * len(greeting_texts) + ["goodbye"] * len(goodbye_texts)

clf = train_classifier(text,labels)
# Last expression of the cell: the predicted label is shown as the cell output.
classify_text(clf,"Hello")

'greeting'

In [11]:
# Emotional Valence

#Also see: http://www.nltk.org/howto/sentiment.html (for nltk example)

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#note: depending on how you installed (e.g., using source code download versus pip install), you may need to import like this:
#from vaderSentiment import SentimentIntensityAnalyzer

# --- examples -------
# Each sentence is preceded by a note on what it is meant to demonstrate.
sentences = [
    # positive sentence example
    "VADER is smart, handsome, and funny.",
    # negation sentence example
    "VADER is not smart, handsome, nor funny.",
    # punctuation emphasis (sentiment intensity adjusted)
    "VADER is smart, handsome, and funny!",
    # booster words (sentiment intensity adjusted)
    "VADER is very smart, handsome, and funny.",
    # emphasis through ALLCAPS
    "VADER is VERY SMART, handsome, and FUNNY.",
    # combination of signals - intensity is adjusted accordingly
    "VADER is VERY SMART, handsome, and FUNNY!!!",
    # booster words & punctuation push this close to the ceiling for the score
    "VADER is VERY SMART, uber handsome, and FRIGGIN FUNNY!!!",
    # positive sentence
    "The book was good.",
    # qualified positive sentence (intensity adjusted)
    "The book was kind of good.",
    # mixed negation sentence
    "The plot was good, but the characters are uncompelling and the dialog is not great.",
    # negated negative sentence with contraction
    "At least it isn't a horrible book.",
    # emoticons
    "Make sure you :) or :D today!",
    # negative slang with capitalization emphasis
    "Today SUX!",
    # mixed sentiment example with slang and the contrastive conjunction "but"
    "Today only kinda sux! But I'll get by, lol",
]

analyzer = SentimentIntensityAnalyzer()
for sentence in sentences:
    vs = analyzer.polarity_scores(sentence)
    # Pad the sentence with '-' to 65 columns so the score dicts line up.
    print(sentence.ljust(65, "-"), vs)

VADER is smart, handsome, and funny.----------------------------- {'pos': 0.746, 'neg': 0.0, 'neu': 0.254, 'compound': 0.8316}
VADER is not smart, handsome, nor funny.------------------------- {'pos': 0.0, 'neg': 0.646, 'neu': 0.354, 'compound': -0.7424}
VADER is smart, handsome, and funny!----------------------------- {'pos': 0.752, 'neg': 0.0, 'neu': 0.248, 'compound': 0.8439}
VADER is very smart, handsome, and funny.------------------------ {'pos': 0.701, 'neg': 0.0, 'neu': 0.299, 'compound': 0.8545}
VADER is VERY SMART, handsome, and FUNNY.------------------------ {'pos': 0.754, 'neg': 0.0, 'neu': 0.246, 'compound': 0.9227}
VADER is VERY SMART, handsome, and FUNNY!!!---------------------- {'pos': 0.767, 'neg': 0.0, 'neu': 0.233, 'compound': 0.9342}
VADER is VERY SMART, uber handsome, and FRIGGIN FUNNY!!!--------- {'pos': 0.706, 'neg': 0.0, 'neu': 0.294, 'compound': 0.9469}
The book was good.----------------------------------------------- {'pos': 0.492, 'neg': 0.0, 'neu': 0.508, 'co

In [None]:
# Word Embeddings

# Explanation with pictures: http://colah.github.io/posts/2014-07-NLP-RNNs-Representations/
# How to implement: http://deeplearning.net/tutorial/rnnslu.html
# Worked example: http://vene.ro/blog/word-movers-distance-in-python.html