In [1]:
import re
import json

import pandas as pd

In [2]:
%%time
# Ceremony year under analysis; the tweet dump is read from gg<year>.json
# (e.g. set year = 2015 to load gg2015.json instead).
year = 2013
tweets_path = 'gg{0}.json'.format(year)
df = pd.read_json(path_or_buf=tweets_path)

CPU times: user 747 ms, sys: 107 ms, total: 854 ms
Wall time: 854 ms


In [3]:
# sample data if necessary
# Only the tweet text column is analysed; corpora larger than `sample_size`
# are down-sampled to that many rows.
data = df['text']
sample_size = 200000
# Fixed seed so that Restart & Run All draws the same sample every time.
sample_seed = 42
if len(df) > sample_size:
    data = data.sample(n=sample_size, random_state=sample_seed)
len(data)

174643

In [4]:
%%time

# Tally of retweet prefixes seen so far, filled as a side effect of
# remove_retweet_prefix. Keys are the matched prefix minus its leading
# 'RT @', so they keep the trailing ': ' (e.g. 'abc: ').
retweets_freq_dict = {}

def remove_retweet_prefix(line):
    """Replace every 'RT @handle: ' prefix in `line` with a single space.

    Only the first prefix found in the line is counted in
    `retweets_freq_dict`; all occurrences are removed from the text.

    Returns the line with the prefixes blanked out.
    """
    # 'RT @abc: ' where the handle 'abc' may have any length
    prefix_re = re.compile(r'\bRT @([\w\'/]*)\b: ')
    first_hit = prefix_re.search(line)
    if first_hit is not None:
        # strip the 4 characters of 'RT @' to build the dictionary key
        key = first_hit.group()[4:]
        retweets_freq_dict[key] = retweets_freq_dict.get(key, 0) + 1
    return prefix_re.sub(' ', line)

# Hashtag text (without the leading '#') -> number of occurrences,
# filled as a side effect of remove_hashtag.
hashtag_freq_dict = {}

def remove_hashtag(line):
    """Replace every '#hashtag' in `line` with a single space.

    Each hashtag found is counted once per occurrence in
    `hashtag_freq_dict`, keyed by the text after the '#'.

    Returns the line with the hashtags blanked out.
    """
    tag_re = re.compile(r'#([\w\'/]*)\b')
    # findall yields only the captured group, i.e. the tag without '#'
    for tag in tag_re.findall(line):
        hashtag_freq_dict[tag] = hashtag_freq_dict.get(tag, 0) + 1
    return tag_re.sub(' ', line)

# Mentioned handle (without the leading '@') -> number of occurrences,
# filled as a side effect of remove_at.
at_freq_dict = {}

def remove_at(line):
    """Replace every '@mention' in `line` with a single space.

    Each mention found is counted once per occurrence in `at_freq_dict`,
    keyed by the text after the '@'.

    Returns the line with the mentions blanked out.
    """
    mention_re = re.compile(r'@([\w\'/]*)\b')
    # findall yields only the captured group, i.e. the handle without '@'
    for handle in mention_re.findall(line):
        at_freq_dict[handle] = at_freq_dict.get(handle, 0) + 1
    return mention_re.sub(' ', line)

def remove_url(line):
    """Replace every http:// or https:// URL in `line` with a single space."""
    url_re = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\b')
    return url_re.sub(' ', line)

def cleanse(line):
    """Blank out punctuation in `line`.

    Every character that is not a word character, whitespace, an
    apostrophe or '#' is replaced by one space, so the length of the
    string is preserved.
    """
    unwanted_re = re.compile(r'[^\w\s\'#]')
    return unwanted_re.sub(' ', line)

# Run every tweet through the cleaning steps. The retweet prefix is removed
# first: remove_at would otherwise strip its '@handle' and the prefix pattern
# would no longer match.
CLEANSED_DATA = []
for tweet in data:
    line = remove_retweet_prefix(tweet)
    line = remove_hashtag(line)
    line = remove_at(line)
    line = remove_url(line)
    line = cleanse(line)
    CLEANSED_DATA.append(line)

# remove redundancies after processing retweets
print(len(CLEANSED_DATA))
# dict.fromkeys keeps the first occurrence of each line in its original order,
# so the de-duplicated corpus is identical on every run. list(set(...)) yields
# a run-dependent order, which changes which matches later cells look at.
CLEANSED_DATA = list(dict.fromkeys(CLEANSED_DATA))
print(len(CLEANSED_DATA))

# Later cells read the corpus under the lower-case name; bind it here so the
# notebook survives Restart & Run All instead of relying on stale kernel state.
cleansed_data = CLEANSED_DATA

174643
125585
CPU times: user 3.46 s, sys: 16.6 ms, total: 3.48 s
Wall time: 3.48 s


In [5]:
# Peek at the de-duplicated corpus. The cleaning cell binds it to
# CLEANSED_DATA; the lower-case name used here before is not defined on a
# fresh kernel.
print(len(CLEANSED_DATA))
CLEANSED_DATA[:5]

125585


['Julianne Moore representing Tom Ford  Best dressed of the night  hands down   ',
 'LINCOLN    Como siempre Steven Spielberg haciendo Historia   ',
 'amo amo amo amo amo os velhinhos do     hsuahsuahsua',
 "       excuse but I need about 6 minutes to Masterbate to JLO's ass in this dress",
 'I went 21 25   ']

In [6]:
# hard-coded awards 2013-2015
awards = [
    'cecil b. demille award',
    'best motion picture - drama',
    'best performance by an actress in a motion picture - drama',
    'best performance by an actor in a motion picture - drama',
    'best motion picture - comedy or musical',
    'best performance by an actress in a motion picture - comedy or musical',
    'best performance by an actor in a motion picture - comedy or musical',
    'best animated feature film',
    'best foreign language film',
    'best performance by an actress in a supporting role in a motion picture',
    'best performance by an actor in a supporting role in a motion picture',
    'best director - motion picture',
    'best screenplay - motion picture',
    'best original score - motion picture',
    'best original song - motion picture',
    'best television series - drama',
    'best performance by an actress in a television series - drama',
    'best performance by an actor in a television series - drama',
    'best television series - comedy or musical',
    'best performance by an actress in a television series - comedy or musical',
    'best performance by an actor in a television series - comedy or musical',
    'best mini-series or motion picture made for television',
    'best performance by an actress in a mini-series or motion picture made for television',
    'best performance by an actor in a mini-series or motion picture made for television',
    'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television',
    'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television',
]

# Every individual word of every award title, after punctuation is blanked out.
awards_words = {word for award in awards for word in cleanse(award).split()}

# Stop words = award-title words plus ceremony-related terms and the year
# being analysed (as a string). Built as a new set, separate from awards_words.
stop_words = awards_words | {
    'award', 'awards',
    'tv',
    'movie', 'movies',
    'win', 'wins', 'winner', 'winners',
    'congrats', 'congratulations',
    'golden', 'globe', 'globes',
    'rt',
    '{0}'.format(year),
}
# stop_words

In [7]:
import nltk
# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer 

def find_sentiment_score(line, verbose=False):
    """Score the sentiment of `line` with NLTK's VADER analyzer.

    Args:
        line: the text to score.
        verbose: when true, print the full score dictionary returned by
            the analyzer.

    Returns:
        A tuple ``(neg, neu, pos)`` taken from the analyzer's score
        dictionary.
    """
    # Score the `line` argument. The previous code passed `sentence`, a name
    # that is not defined anywhere, so every call raised NameError.
    sentiment_dict = SentimentIntensityAnalyzer().polarity_scores(line)
    if verbose:
        print(sentiment_dict)
    return sentiment_dict['neg'], sentiment_dict['neu'], sentiment_dict['pos']

In [8]:
# retrieve hosts from hosts.ipynb
# retrieve nominees from nominees.ipynb
# retrieve winners from winners.ipynb
# retrieve presenters from presenters.ipynb


# example subject list; a longer variant is
# ['Les Miserables', 'Tina Fey', 'Jay Leno']
subjects = ['Tina Fey']

In [9]:
import spacy
# python3 -m spacy download en
# Load the English pipeline once; `nlp` is the shared, module-level parser
# that the cells below call.
# NOTE(review): 'en' is a shortcut model name — confirm the installed spaCy
# version still resolves it (newer releases expect a full package name).
nlp = spacy.load('en')

# reference: https://spacy.io/api/annotation
def identify_entities_pos(text, stop_words):
    """Map each kept token of `text` to a one-element list with its tag.

    `text` is parsed with the module-level `nlp` pipeline. A token is
    skipped when its stripped text is already a key of the result, is at
    most one character long, consists only of stop words (compared in
    lower case), or is the lone word 'the'.

    Args:
        text: the sentence to analyse.
        stop_words: container of lower-case words to drop.

    Returns:
        dict mapping token text -> ``[token.tag_]``.
    """
    tags = {}
    for token in nlp(text):
        word = token.text.strip()
        # ignore repeats and single-character tokens (e.g. punctuation)
        if word in tags or len(word) <= 1:
            continue

        # remove stopwords
        kept_parts = [part for part in word.split() if part.lower() not in stop_words]
        if not kept_parts:
            continue

        word = ' '.join(kept_parts)
        is_lone_the = len(word.split()) == 1 and word.lower() == 'the'
        if is_lone_the or len(word) == 1:
            continue

        tags[word] = [token.tag_]

    return tags

In [10]:
# test
# Print, for each token of a sample sentence, its text, coarse POS, fine-grained
# tag and dependency label as reported by spaCy.
doc = nlp(u'Apple is so pretty')
for token in doc:
    print(token.text, token.pos_, token.tag_, token.dep_)
    
# Exercise the helper: the result maps each kept token to a one-element tag
# list, so [0] reads the tag itself. The '!' is absent from the result because
# one-character tokens are skipped.
tags = identify_entities_pos('Exam is so difficult!', stop_words)
print(tags)
print(tags['difficult'][0])

Apple PROPN NNP nsubj
is VERB VBZ ROOT
so ADV RB advmod
pretty ADJ JJ acomp
{'Exam': ['NN'], 'is': ['VBZ'], 'so': ['RB'], 'difficult': ['JJ']}
JJ


In [15]:
def find_sentiments(subject, verbose=False, lines=None, max_matches=500, top_n=3):
    """Return the adjectives most often seen in lines that mention `subject`.

    Each line containing `subject` as a whole word (case-insensitive) is
    tagged with `identify_entities_pos`; tokens tagged 'JJ' are counted.
    Scanning stops after `max_matches` matching lines.

    Args:
        subject: literal text to look for, e.g. 'tina fey'.
        verbose: when true, print the first 10 matching lines.
        lines: iterable of cleaned tweets to scan. Defaults to the
            notebook-level CLEANSED_DATA list.
        max_matches: positive cap on the number of matching lines to
            analyse, or None for no cap.
        top_n: how many adjectives to return.

    Returns:
        List of up to `top_n` adjectives, most frequent first; ties keep
        the order in which the adjectives were first seen.
    """
    if lines is None:
        # the cleaning cell binds the de-duplicated corpus to this name
        lines = CLEANSED_DATA

    # Escape the subject so characters such as '.', '(' or '+' are matched
    # literally instead of being read as regex syntax.
    pattern = re.compile(r'\b{0}\b'.format(re.escape(subject)), re.IGNORECASE)
    num = 0
    sentiment_freq_dict = {}

    for line in lines:
        # IGNORECASE already makes the search case-insensitive
        if not pattern.search(line):
            continue

        tags = identify_entities_pos(line, stop_words)
        for entity, tag_list in tags.items():
            if tag_list[0] == 'JJ':
                sentiment_freq_dict[entity] = sentiment_freq_dict.get(entity, 0) + 1

        # print the first 10 occurrences
        if verbose and num < 10:
            print(line)

        num += 1
        if max_matches is not None and num >= max_matches:
            break

    print('subject:', subject)
    print('num of matches:', num)

    # sorted() is stable, so equally frequent adjectives keep insertion order
    ranked = sorted(sentiment_freq_dict.items(), key=lambda pair: pair[1], reverse=True)
    return [entity for entity, _ in ranked[:top_n]]

In [16]:
%%time
# Most frequent 'JJ'-tagged words in cleaned tweets that mention the subject.
find_sentiments('tina fey', verbose=False)

subject: tina fey
num of matches: 500
CPU times: user 16.9 s, sys: 1.88 s, total: 18.8 s
Wall time: 3.15 s


['hilarious', 'funny', 'amazing']