In [1]:
fname = 'trump_new.json'

In [2]:
import json
 
with open(fname, 'r') as f:
    line = f.readline() # read only the first tweet/line
    tweet = json.loads(line) # load it as Python dict
    print(json.dumps(tweet, indent=4)) # pretty-print


{
    "created_at": "Fri Mar 24 13:00:23 +0000 2017",
    "place": null,
    "source": "<a href=\"https://path.com/\" rel=\"nofollow\">Path</a>",
    "favorited": false,
    "retweeted_status": {
        "created_at": "Fri Mar 24 12:59:58 +0000 2017",
        "place": null,
        "source": "<a href=\"http://www.thebeltwaypundit.com\" rel=\"nofollow\">PostFromMyMac</a>",
        "is_quote_status": false,
        "favorited": false,
        "text": "#TRUMP #REGRETS: SHOULD HAVE DONE #TAX CUTS FIRST... https://t.co/9B1gbsC2bW",
        "truncated": false,
        "retweet_count": 25,
        "in_reply_to_status_id_str": null,
        "id_str": "845258889549504512",
        "coordinates": null,
        "lang": "en",
        "id": 845258889549504512,
        "in_reply_to_status_id": null,
        "possibly_sensitive": false,
        "in_reply_to_user_id_str": null,
        "in_reply_to_user_id": null,
        "contributors": null,
        "in_reply_to_screen_name": null,
        "entities

In [4]:
!pip install nltk

[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
#Tokenise a tweet
from nltk.tokenize import word_tokenize
 
tweet = 'RT @callaghanmt: just an example! :D http://example.com #NLP'
print(word_tokenize(tweet))


LookupError: 
**********************************************************************
  Resource 'tokenizers/punkt/PY3/english.pickle' not found.
  Please use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - '/home/nbuser/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************

In [5]:
#A better tokeniser, taking into account twitter type tokens
import re
 
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

In [6]:
tweet = 'RT @callaghanmt: just an example! :D http://example.com #NLP'
print(preprocess(tweet))

['RT', '@callaghanmt', ':', 'just', 'an', 'example', '!', ':D', 'http://example.com', '#NLP']


In [7]:
#find common terms
import operator 
import json
from collections import Counter
 
with open(fname, 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        # Create a list with all the terms
        terms_all = [term for term in preprocess(tweet['text'])]
        # Update the counter
        count_all.update(terms_all)
    # Print the first 5 most frequent words
    print(count_all.most_common(5))

[(':', 1898), ('.', 1718), ('#Trump', 1478), ('RT', 1230), ('…', 1096)]


In [8]:
#remove some stopwords
from nltk.corpus import stopwords
import string
 
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via', '…', 'RT', 'amp']


In [9]:
#apply this
terms_stop = [term for term in preprocess(tweet['text']) if term not in stop]


In [10]:
#find common terms
import operator 
import json
from collections import Counter
 
with open(fname, 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        # Create a list with all the terms
        terms_stop = [term for term in preprocess(tweet['text']) if term not in stop]
        # Update the counter
        count_all.update(terms_stop)
    # Print the first 5 most frequent words
    print(count_all.most_common(5))

[('#Brexit', 624), ('#brexit', 153), ('EU', 144), ('Brexit', 126), ('The', 87)]


In [11]:
#now look at bigrams
from nltk import bigrams 
 
terms_bigram = bigrams(terms_stop)

In [12]:
import operator 
import json
from collections import Counter
from nltk import bigrams 
  
with open(fname, 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        # Create a list with all the terms
        terms_stop = [term for term in preprocess(tweet['text']) if term not in stop]
        terms_bigram = bigrams(terms_stop)
        # Update the counter
        count_all.update(terms_bigram)
    # Print the first 5 most frequent words
    print(count_all.most_common(20))

[(('#BREXIT', '#StrongerIn'), 49), (('#StrongerIn', '#No2EU'), 43), (('#No2EU', '#EUref'), 41), (('#EUref', '#LeaveEU'), 35), (('#Trump', '#LePen'), 29), (('leave', 'EU'), 28), (('Must-read', 'book'), 27), (('#iceberg', 'Must-read'), 27), (('Populist', '#iceberg'), 27), (('Just', 'times'), 27), (('#LePen', '#factcheck'), 27), (('book', '@jnpaquet'), 27), (('In', 'campaign'), 27), (('#Brexit', 'Tip'), 27), (('confirmed', 'If'), 27), (('Tip', 'Populist'), 27), (('PLEASE', 'Just'), 27), (('campaign', 'confirmed'), 27), (('If', 'leave'), 27), (('times', 'Stronger'), 27)]


In [13]:
#looking at term co-occurences
#removing # and @ tokens
from collections import defaultdict
# remember to include the other import from the previous post
 
com = defaultdict(lambda : defaultdict(int))

with open(fname, 'r') as f:
# f is the file pointer to the JSON data set
    for line in f: 
        count_terms_only = Counter()
        tweet = json.loads(line)
        terms_only = [term for term in preprocess(tweet['text']) 
                  if term not in stop 
                  and not term.startswith(('#', '@'))]
 
        # Build co-occurrence matrix
        for i in range(len(terms_only)-1):            
            for j in range(i+1, len(terms_only)):
                w1, w2 = sorted([terms_only[i], terms_only[j]])                
                if w1 != w2:
                    com[w1][w2] += 1


In [14]:
com_max = []
# For each term, look for the most common co-occurrent terms
for t1 in com:
    t1_max_terms = sorted(com[t1].items(), key=operator.itemgetter(1), reverse=True)[:5]
    for t2, t2_count in t1_max_terms:
        com_max.append(((t1, t2), t2_count))
# Get the most frequent co-occurrences
terms_max = sorted(com_max, key=operator.itemgetter(1), reverse=True)
print(terms_max[:15])

[(('Just', 'leave'), 54), (('Stronger', 'leave'), 54), (('If', 'leave'), 54), (('leave', 'times'), 54), (('PLEASE', 'leave'), 54), (('confirmed', 'leave'), 54), (('campaign', 'leave'), 54), (('In', 'leave'), 54), (('EU', 'leave'), 46), (('Mar', 'leave'), 36), (('Single', 'leave'), 36), (('boards', 'workers'), 30), (('cliff', 'edge'), 30), (('Brexit', 'May'), 29), (('means', 'workers'), 28)]


## Data viz

In [17]:
import vincent
from vincent import AxisProperties, PropertySet, ValueRef
 
word_freq = count_all.most_common(20)
labels, freq = zip(*word_freq)
data = {'data': freq, 'x': labels}
bar = vincent.Bar(data, iter_idx='x')
ax = AxisProperties(
         labels = PropertySet(angle=ValueRef(value=90)))
bar.axes[0].properties = ax
bar.to_json('term_freq.json')

In [None]:
# Now view in the web browser by running the python web server:
# python -m http.server 6789

# then in browser, do:
# http://localhost:6789/chart.html

## Sentiment analysis

Lots of approaches to sentiment analysis- take this one for example:

http://arxiv.org/abs/cs/0212032

*Thumbs Up or Thumbs Down? Semantic Orientation Applied to Unsupervised Classification of Reviews.*

Peter Turney
    
