# Outline

* Collect some tweets
* Annotate the tweets 
* Calculate the accuracy

In [1]:
from pprint import pprint

# Collect some data

In [2]:
# we'll use data from an HPT job that collected tweets about parenting
# Each line of the file becomes one list entry (trailing newline included).
# The `with` block closes the file handle as soon as the lines are read.
with open('tweet_bodies.txt') as tweet_file:
    tweet_bodies = list(tweet_file)

In [3]:
# sanity check: how many tweet bodies were loaded?
pprint(len(tweet_bodies))

103


In [4]:
# sanity check: eyeball the first ten raw tweet bodies
pprint(tweet_bodies[:10])

['&gt;&gt;&gt;&gt;&gt;NOTHING THE LIBERAL PARTY HAS PRODUCED HAS CREDIBILITY '
 "IN IT, &amp; THEY'RE NOT GOING TO START NOW!!! https://t.co/oH5YfQcFCl\n",
 "Thank y'all @RRExpress 4 hosting our kids workshop,gave out abt 600 kits and "
 'had a ton of fun!!@jermeybotkin @Blair88833704 @sm6574 @bjp84\n',
 'About Trump digest (2/7): I started creating Trump digest playlists in early '
 "2016 to teach my kids about Trump, even though most can't vote\n",
 'And now a fantastic electric storm overhead.  The original and best son et '
 'lumiere. Other guests all dining indoors. They must be mad!\n',
 'Why Young Kids Learn Through Movement ----&gt;https://t.co/YyF7HAL04u via '
 '@TheAtlantic https://t.co/RiFHdl2ipy\n',
 'Love this story by @wrivey about a creative way of leaving notes for your '
 'kids. https://t.co/ZlmnLgQm0R\n',
 'Love this story by @wrivey about a creative way of leaving notes for your '
 'kids. https://t.co/nSBM2rejh0\n',
 'Love this story by @wrivey about a creative way o

In [5]:
# let's do some quick deduplication
from duplicate_filter import duplicateFilter

## set the similarity threshold at 90%
dup_filter = duplicateFilter(0.9)

deduped_tweet_bodies = []
# The index is named tweet_index rather than `id`: a loop variable at cell
# level is a notebook global, and `id` would replace the builtin id() for
# every later cell.
for tweet_index, tweet_body in enumerate(tweet_bodies):
    # Keep only the bodies the filter does not flag as duplicates.
    # NOTE(review): presumably isDup also remembers each body it is shown so
    # later tweets can be compared against it -- confirm in duplicate_filter.
    if not dup_filter.isDup(tweet_index, tweet_body):
        deduped_tweet_bodies.append(tweet_body)

# sanity check: first ten surviving tweet bodies
pprint(deduped_tweet_bodies[:10])

['&gt;&gt;&gt;&gt;&gt;NOTHING THE LIBERAL PARTY HAS PRODUCED HAS CREDIBILITY '
 "IN IT, &amp; THEY'RE NOT GOING TO START NOW!!! https://t.co/oH5YfQcFCl\n",
 "Thank y'all @RRExpress 4 hosting our kids workshop,gave out abt 600 kits and "
 'had a ton of fun!!@jermeybotkin @Blair88833704 @sm6574 @bjp84\n',
 'About Trump digest (2/7): I started creating Trump digest playlists in early '
 "2016 to teach my kids about Trump, even though most can't vote\n",
 'And now a fantastic electric storm overhead.  The original and best son et '
 'lumiere. Other guests all dining indoors. They must be mad!\n',
 'Why Young Kids Learn Through Movement ----&gt;https://t.co/YyF7HAL04u via '
 '@TheAtlantic https://t.co/RiFHdl2ipy\n',
 'Love this story by @wrivey about a creative way of leaving notes for your '
 'kids. https://t.co/ZlmnLgQm0R\n',
 'I love my kids, Man. These are just 3 out of 8. They all grown up now. '
 '#lippoldislegend… https://t.co/Z0IM0ITgRX\n',
 "Now through 8/31, you can be the change 

# Annotate the data

## Start by tokenizing

In [7]:
from nltk.tokenize import TweetTokenizer

# One tokenizer instance is reused for every tweet; the result is one list
# of token strings per deduplicated tweet body, in the same order.
tt = TweetTokenizer()
tokenized_deduped_tweet_bodies = list(map(tt.tokenize, deduped_tweet_bodies))

In [8]:
# sanity check: number of token lists (one per deduplicated tweet body)
len(tokenized_deduped_tweet_bodies)

98

In [9]:
# inspect the token lists of the first two tweets
pprint(tokenized_deduped_tweet_bodies[:2])

[['>',
  '>',
  '>',
  'NOTHING',
  'THE',
  'LIBERAL',
  'PARTY',
  'HAS',
  'PRODUCED',
  'HAS',
  'CREDIBILITY',
  'IN',
  'IT',
  ',',
  '&',
  "THEY'RE",
  'NOT',
  'GOING',
  'TO',
  'START',
  'NOW',
  '!',
  '!',
  '!',
  'https://t.co/oH5YfQcFCl'],
 ['Thank',
  "y'all",
  '@RRExpress',
  '4',
  'hosting',
  'our',
  'kids',
  'workshop',
  ',',
  'gave',
  'out',
  'abt',
  '600',
  'kits',
  'and',
  'had',
  'a',
  'ton',
  'of',
  'fun',
  '!',
  '!',
  '@jermeybotkin',
  '@Blair88833704',
  '@sm6574',
  '@bjp84']]


## Now tag the tokens with parts-of-speech labels

NLTK's `pos_tag` function uses the Greedy Averaged Perceptron tagger by default (see https://explosion.ai/blog/part-of-speech-pos-tagger-in-python for background).

In [10]:
# `pos_tagger` stays imported in case other cells tag single token lists.
from nltk.tag import pos_tag as pos_tagger
from nltk.tag import pos_tag_sents

# Tag all tweets with one batch call: pos_tag_sents sets up the tagger once
# for the whole list, whereas calling pos_tag per tweet sets it up again on
# every call. The result has the same shape as before: one list of
# (token, tag) tuples per tweet, in input order.
tagged_tokenized_deduped_tweet_bodies = pos_tag_sents(tokenized_deduped_tweet_bodies)

In [11]:
# inspect the (token, tag) pairs of the first two tweets
pprint(tagged_tokenized_deduped_tweet_bodies[:2])

[[('>', 'JJ'),
  ('>', 'NNP'),
  ('>', 'NNP'),
  ('NOTHING', 'NNP'),
  ('THE', 'NNP'),
  ('LIBERAL', 'NNP'),
  ('PARTY', 'NNP'),
  ('HAS', 'NNP'),
  ('PRODUCED', 'NNP'),
  ('HAS', 'NNP'),
  ('CREDIBILITY', 'NNP'),
  ('IN', 'NNP'),
  ('IT', 'NNP'),
  (',', ','),
  ('&', 'CC'),
  ("THEY'RE", 'NNP'),
  ('NOT', 'NNP'),
  ('GOING', 'NNP'),
  ('TO', 'NNP'),
  ('START', 'NNP'),
  ('NOW', 'NNP'),
  ('!', '.'),
  ('!', '.'),
  ('!', '.'),
  ('https://t.co/oH5YfQcFCl', 'NN')],
 [('Thank', 'NNP'),
  ("y'all", 'NN'),
  ('@RRExpress', 'NN'),
  ('4', 'CD'),
  ('hosting', 'VBG'),
  ('our', 'PRP$'),
  ('kids', 'NNS'),
  ('workshop', 'NN'),
  (',', ','),
  ('gave', 'VBD'),
  ('out', 'RP'),
  ('abt', 'JJ'),
  ('600', 'CD'),
  ('kits', 'NNS'),
  ('and', 'CC'),
  ('had', 'VBD'),
  ('a', 'DT'),
  ('ton', 'NN'),
  ('of', 'IN'),
  ('fun', 'NN'),
  ('!', '.'),
  ('!', '.'),
  ('@jermeybotkin', 'JJ'),
  ('@Blair88833704', 'NN'),
  ('@sm6574', 'NNP'),
  ('@bjp84', 'NN')]]


In [12]:
# Let's look at the taxonomy of tags; in our case it is derived from the Penn Treebank project
# (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.8216&rep=rep1&type=pdf)

import nltk
# Prints every tag with a short description and example words.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [13]:
# Peek at the tag dictionary of our tagger: show ten of its (word, tag) entries.

from nltk.tag.perceptron import PerceptronTagger

t = PerceptronTagger()
tagdict_sample = list(t.tagdict.items())[:10]
pprint(tagdict_sample)

[('Ortega', 'NNP'),
 ('Thomson', 'NNP'),
 ('Canada', 'NNP'),
 ('exchanges', 'NNS'),
 ('Prime', 'NNP'),
 ('natural', 'JJ'),
 ('went', 'VBD'),
 ('``', '``'),
 ('1986', 'CD'),
 ('night', 'NN')]


# Evaluate the annotations

We must choose which parts of speech to evaluate. Let's focus on adjectives, which are useful for sentiment analysis, and proper nouns, which provide a set of potential events and topics. 

* JJ: adjective or numeral, ordinal
* JJR: adjective, comparative
* JJS: adjective, superlative


* NNP: noun, proper, singular
* NNPS: noun, proper, plural

In [14]:
# Penn Treebank tags for the two part-of-speech groups we evaluate.
adjective_tags = ['JJ', 'JJR', 'JJS']
pn_tags = ['NNP', 'NNPS']

# (label, tag list) pairs; each pair refers to the list objects defined above.
tag_types = list(zip(('adj', 'PN'), (adjective_tags, pn_tags)))

In [15]:
# Print every token whose NLTK tag is in the selected group, next to the
# tweet it came from.
# print format: "POS: TOKEN --> TWEET TEXT"
#
# To inspect proper nouns instead of adjectives, set tags_of_interest = pn_tags.
tags_of_interest = adjective_tags

# The token-only lists are not needed here: the tagged lists already carry
# each token together with its tag.
for body, tagged_tokens in zip(deduped_tweet_bodies, tagged_tokenized_deduped_tweet_bodies):
    for token, tag in tagged_tokens:
        if tag in tags_of_interest:
            # body still ends with the newline read from the file, so every
            # printed record is followed by a blank line.
            print_str = '{}: {} --> {}'.format(tag, token, body)
            print(print_str)

JJ: > --> &gt;&gt;&gt;&gt;&gt;NOTHING THE LIBERAL PARTY HAS PRODUCED HAS CREDIBILITY IN IT, &amp; THEY'RE NOT GOING TO START NOW!!! https://t.co/oH5YfQcFCl

JJ: abt --> Thank y'all @RRExpress 4 hosting our kids workshop,gave out abt 600 kits and had a ton of fun!!@jermeybotkin @Blair88833704 @sm6574 @bjp84

JJ: @jermeybotkin --> Thank y'all @RRExpress 4 hosting our kids workshop,gave out abt 600 kits and had a ton of fun!!@jermeybotkin @Blair88833704 @sm6574 @bjp84

JJS: digest --> About Trump digest (2/7): I started creating Trump digest playlists in early 2016 to teach my kids about Trump, even though most can't vote

JJ: early --> About Trump digest (2/7): I started creating Trump digest playlists in early 2016 to teach my kids about Trump, even though most can't vote

JJS: most --> About Trump digest (2/7): I started creating Trump digest playlists in early 2016 to teach my kids about Trump, even though most can't vote

JJ: can't --> About Trump digest (2/7): I started creating Tru

These seem like dreadful results. Let's try a different NLP engine.

## Stanford CoreNLP

Download:

http://nlp.stanford.edu/software/stanford-corenlp-full-2016-10-31.zip

Then unzip. Start up the server from the unzipped directory:

`$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000`

In [18]:
from corenlp_pywrap import pywrap

# Client for the CoreNLP server started above on port 9000.
# Only the "pos" annotator is requested.
cn = pywrap.CoreNLP(
    url='http://localhost:9000',
    annotator_list=["pos"],
)

In [19]:
# Send every deduplicated tweet to the CoreNLP server and collect the parsed
# JSON responses, one entry per tweet and in the same order.
corenlp_results = []
# Positions (in deduped_tweet_bodies) of tweets that could not be processed.
corenlp_failed_indices = []

for tweet_index, tweet_body in enumerate(deduped_tweet_bodies):
    try:
        response = cn.basic(tweet_body, out_format='json')
        corenlp_results.append(response.json())
    except UnicodeEncodeError:
        # Append an empty placeholder so corenlp_results stays aligned with
        # deduped_tweet_bodies, and record the failure so it is not silent.
        corenlp_results.append({'sentences': []})
        corenlp_failed_indices.append(tweet_index)

# Report skipped tweets: they contribute no tokens to the comparison below.
if corenlp_failed_indices:
    print('CoreNLP skipped {} of {} tweets (UnicodeEncodeError) at indices: {}'.format(
        len(corenlp_failed_indices), len(deduped_tweet_bodies), corenlp_failed_indices))


In [20]:
def _word_pos_pairs(result):
    """Return the (word, POS tag) pairs of one CoreNLP result.

    The pairs of all sentences in ``result['sentences']`` are flattened into
    a single list, in sentence order and then token order.
    """
    return [
        (token['word'], token['pos'])
        for sentence in result['sentences']
        for token in sentence['tokens']
    ]


# One list of (word, POS tag) pairs per tweet, in the order of corenlp_results.
corenlp_tagged_tokenized_deduped_tweet_bodies = [
    _word_pos_pairs(result) for result in corenlp_results
]

In [21]:
# Show each adjective-tagged CoreNLP token next to the tweet it came from.
# Output format: "POS: TOKEN --> TWEET TEXT"
# (test against pn_tags instead of adjective_tags to list proper nouns)

for body, tagged_tokens in zip(deduped_tweet_bodies, corenlp_tagged_tokenized_deduped_tweet_bodies):
    for token, tag in tagged_tokens:
        if tag not in adjective_tags:
            continue
        print('{}: {} --> {}'.format(tag, token, body))

JJR: > --> &gt;&gt;&gt;&gt;&gt;NOTHING THE LIBERAL PARTY HAS PRODUCED HAS CREDIBILITY IN IT, &amp; THEY'RE NOT GOING TO START NOW!!! https://t.co/oH5YfQcFCl

JJR: > --> &gt;&gt;&gt;&gt;&gt;NOTHING THE LIBERAL PARTY HAS PRODUCED HAS CREDIBILITY IN IT, &amp; THEY'RE NOT GOING TO START NOW!!! https://t.co/oH5YfQcFCl

JJR: > --> &gt;&gt;&gt;&gt;&gt;NOTHING THE LIBERAL PARTY HAS PRODUCED HAS CREDIBILITY IN IT, &amp; THEY'RE NOT GOING TO START NOW!!! https://t.co/oH5YfQcFCl

JJR: > --> &gt;&gt;&gt;&gt;&gt;NOTHING THE LIBERAL PARTY HAS PRODUCED HAS CREDIBILITY IN IT, &amp; THEY'RE NOT GOING TO START NOW!!! https://t.co/oH5YfQcFCl

JJR: > --> &gt;&gt;&gt;&gt;&gt;NOTHING THE LIBERAL PARTY HAS PRODUCED HAS CREDIBILITY IN IT, &amp; THEY'RE NOT GOING TO START NOW!!! https://t.co/oH5YfQcFCl

JJ: LIBERAL --> &gt;&gt;&gt;&gt;&gt;NOTHING THE LIBERAL PARTY HAS PRODUCED HAS CREDIBILITY IN IT, &amp; THEY'RE NOT GOING TO START NOW!!! https://t.co/oH5YfQcFCl

JJ: early --> About Trump digest (2/7): I start

# Conclusions and next steps

For Tweet bodies:
* NLTK TweetTokenizer is pretty good
* NLTK default POS tagger is dreadful
* CoreNLP POS tagger is better than NLTK

Next steps:

* Make more careful accuracy measurements
* Get a better tokenizer into CoreNLP
* Look at other POS taggers
* Compare other tags, such as sentiment