In [1]:
# based on Andrew Hintermeier (andhint) preprocessing.py
# -*- coding: utf-8 -*-
# Import the packages used throughout the notebook
import re
import json
import operator
from collections import Counter
from nltk.corpus import stopwords
import string

In [2]:
# Regular expressions used to split tweet text into tokens.
# Emoticons such as :-) ;P =D are described by a verbose-mode pattern:
# eyes, an optional nose, then a mouth.

emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

# Alternatives are tried in list order at each position, so the more
# specific patterns must come before the catch-all ones at the end.
regex_str = [
    # emoticons
    emoticons_str,
    # HTML tags
    r'<[^>]+>',
    # @-mentions
    r'(?:@[\w_]+)',
    # hash-tags
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",
    # URLs
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',
    # numbers
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',
    # words containing a dash, underscore or apostrophe
    # NOTE: NBA-TV is a valid value; so are words with an apostrophe
    # NOTE: entities may need to be tagged
    r"(?:[a-z][a-z'\-_]+[a-z])",
    # other words
    r'(?:[\w_]+)',
    # anything else (any single non-whitespace character)
    r'(?:\S)',
]

# Matches a string only when the whole string is a single emoticon
emoticon_re = re.compile('^' + emoticons_str + '$', re.IGNORECASE | re.VERBOSE)

# Matches one token at a time; the outer group makes findall() return whole tokens
tokens_re = re.compile('(%s)' % '|'.join(regex_str), re.IGNORECASE | re.VERBOSE)


In [3]:
# Build the list of tokens to ignore when counting terms.
# Open questions:
#   - Should punctuation be used in sentiment analysis?
#   - Does punctuation contribute to content understanding?
#   - Does removing punctuation affect emoji analysis?

punctuation = list(string.punctuation)
stop = (
    stopwords.words('english')      # NLTK English stop words
    + punctuation                   # every single punctuation character
    + ['RT', 'via', 'The']          # extra words excluded from the counts
    + list('1234567890')            # the single digits '1'..'9', then '0'
    + ['10']
)
# NOTE: stop words contain letters after apostrophes; a different solution is needed

In [4]:
# Define useful functions to tokenize and preprocess tweets
def tokenize(s):
    """Return the list of tokens that the module-level ``tokens_re`` finds in ``s``."""
    matches = tokens_re.findall(s)
    return matches
 
def preprocess(s, lowercase=False):
    """Tokenize ``s`` and optionally lower-case the resulting tokens.

    Parameters:
        s: the text to tokenize (passed straight to ``tokenize``).
        lowercase: when true, every token is lower-cased except those that
            ``emoticon_re`` recognises as an emoticon, which are kept as-is.

    Returns:
        The list of tokens.
    """
    tokens = tokenize(s)
    if not lowercase:
        return tokens

    normalised = []
    for tok in tokens:
        if emoticon_re.search(tok):
            # Emoticons are case-sensitive (e.g. ":D"), so leave them untouched
            normalised.append(tok)
        else:
            normalised.append(tok.lower())
    return normalised


In [5]:
# Select a file to preprocess (one JSON-encoded tweet per line)
fileName = 'AtlantaDream052916.json'

# Running tallies accumulated over the whole file
countAll = Counter()      # every token that is not a stop word (mentions/hashtags included)
countTerms = Counter()    # non-stop-word tokens, excluding mentions and hashtags
countHash = Counter()     # tokens starting with '#'
countMention = Counter()  # tokens starting with '@'

# A set gives constant-time membership tests instead of scanning the list per token
stopSet = set(stop)

with open(fileName, 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # skip blank lines, which json.loads would reject

        tweet = json.loads(line)  # load it as a Python dict
        text = tweet.get('text')
        if text is None:
            continue  # skip records that carry no text instead of raising KeyError

        # Drop non-ASCII characters, then decode back to text: the tokenizer's
        # patterns are str patterns and cannot be applied to bytes on Python 3.
        asciiText = text.encode('ascii', 'ignore').decode('ascii')
        tokenList = preprocess(asciiText)  # list of tokens created from this tweet

        # all terms with stop words removed
        termsAll = [term for term in tokenList if term not in stopSet]
        # the same terms with mentions and hashtags removed as well
        termsTerms = [term for term in termsAll if not term.startswith(('#', '@'))]
        termsHash = [term for term in tokenList if term.startswith('#')]
        termsMention = [term for term in tokenList if term.startswith('@')]

        # update the counter variables
        countAll.update(termsAll)
        countTerms.update(termsTerms)
        countHash.update(termsHash)
        countMention.update(termsMention)

In [7]:
# Print the most common entries of each counter.
# The headings are built from topN so they always agree with the number of
# entries actually requested from most_common().
topN = 10

sections = [
    ("terms (stopwords removed)", countAll),
    ("terms with stopwords, mentions, and hashtags removed", countTerms),
    ("Hashtags", countHash),
    ("Mentions", countMention),
]

for position, (label, counter) in enumerate(sections):
    if position:
        # Blank separator line; print("") behaves the same on Python 2 and 3,
        # whereas a bare `print` does nothing on Python 3.
        print("")
    print("Top %d %s" % (topN, label))
    print(counter.most_common(topN))


Top 5 terms (stopwords removed)
[('@AtlantaDream', 858), ('@IndianaFever', 372), ('@NBATV', 239), ('lead', 187), ('#RWTD', 184), ('@WNBA', 162), ('#WatchMeWork', 101), ('#WNBA20', 98), ('@Catchin24', 94), ('NBA', 88)]

Top 5 terms with stopwords, mentions, and hashtags removed
[('lead', 187), ('NBA', 88), ('TV', 88), ('Dream', 86), ('travel', 78), ('battle', 77), ('NEXT', 77), ('South', 77), ('points', 76), ('today', 75)]

Top 5 Hashtags
[('#RWTD', 184), ('#WatchMeWork', 101), ('#WNBA20', 98), ('#WNBA', 54), ('#AtlantaDream', 52), ('#GameDay', 17), ('#keepitup', 12), ('#IndianaFever', 9), ('#ATLDream', 7), ('#WhatIfTimes', 7)]

Top 5 Mentions
[('@AtlantaDream', 858), ('@IndianaFever', 372), ('@NBATV', 239), ('@WNBA', 162), ('@Catchin24', 94), ('@TiffMitch25', 66), ('@8Cortijo', 61), ('@angel_35', 54), ('@tiphayes3', 49), ('@FOXSportsSE', 47)]
