In [4]:
import re

import pandas as pd

In [5]:
%%time
year = 2013
df = pd.read_json(path_or_buf='gg{0}.json'.format(year))
# df = pd.read_json(path_or_buf='gg2015.json')

CPU times: user 749 ms, sys: 122 ms, total: 871 ms
Wall time: 871 ms


In [6]:
# sample data if necessary
data = df['text']
sample_size = 200000
if len(df) > sample_size:
    data = data.sample(n=sample_size)
len(data)

174643

In [7]:
retweets_freq_dict = {}

def remove_retweet_prefix(line):
    # find 'RT @abc: ' where abc's length is arbitrary
    pattern = re.compile(r'\bRT @([\w\'/]*)\b: ') 
    match = re.search(pattern, line)
    if match:
        # store corresponding retweet without 'RT @' prefix
        string = match.group()[4:]
        if string in retweets_freq_dict:
            retweets_freq_dict[string] += 1
        else:
            retweets_freq_dict[string] = 1
    return re.sub(pattern, ' ', line)

In [8]:
hashtag_freq_dict = {}

def remove_hashtag(line):
    pattern = re.compile(r'#([\w\'/]*)\b')
    matches = re.findall(pattern, line)
    if matches:
        # store corresponding hashtag
        for match in matches:
            if match in hashtag_freq_dict:
                hashtag_freq_dict[match] += 1
            else:
                hashtag_freq_dict[match] = 1
    return re.sub(pattern, ' ', line)

In [9]:
at_freq_dict = {}

def remove_at(line):
    pattern = re.compile(r'@([\w\'/]*)\b')
    matches = re.findall(pattern, line)
    if matches:
        # store corresponding hashtag
        for match in matches:
            if match in at_freq_dict:
                at_freq_dict[match] += 1
            else:
                at_freq_dict[match] = 1
    return re.sub(pattern, ' ', line)

In [10]:
def remove_url(line):
    pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\b')
    return re.sub(pattern, ' ', line)

In [11]:
def cleanse(line):
    # replace everything to ' ' except whitespace, alphanumeric character, apostrophe, hashtag, @
    return re.sub(r'[^\w\s\'#@]', ' ', line)

In [12]:
def remove_apostrophe(text):
    pattern = re.compile(r'\'s\b')
    return re.sub(pattern, ' ', text)

In [13]:
# hard-coded awards 2013-2015
awards = ['cecil b. demille award', 'best motion picture - drama', 
          'best performance by an actress in a motion picture - drama', 
          'best performance by an actor in a motion picture - drama', 
          'best motion picture - comedy or musical', 
          'best performance by an actress in a motion picture - comedy or musical', 
          'best performance by an actor in a motion picture - comedy or musical', 
          'best animated feature film', 
          'best foreign language film', 
          'best performance by an actress in a supporting role in a motion picture', 
          'best performance by an actor in a supporting role in a motion picture', 
          'best director - motion picture', 
          'best screenplay - motion picture', 
          'best original score - motion picture', 
          'best original song - motion picture', 
          'best television series - drama', 
          'best performance by an actress in a television series - drama', 
          'best performance by an actor in a television series - drama', 
          'best television series - comedy or musical', 
          'best performance by an actress in a television series - comedy or musical', 
          'best performance by an actor in a television series - comedy or musical', 
          'best mini-series or motion picture made for television', 
          'best performance by an actress in a mini-series or motion picture made for television', 
          'best performance by an actor in a mini-series or motion picture made for television', 
          'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television', 
          'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television']

In [21]:
stop_words = set()
awards_words = set()
for award in awards:
    awards_words |= set(cleanse(award).split())

stop_words |= awards_words
stop_words |= {'award'}
stop_words |= {'awards'}
stop_words |= {'tv'}
stop_words |= {'movie'}
stop_words |= {'movies'}
stop_words |= {'win'}
stop_words |= {'wins'}
stop_words |= {'winner'}
stop_words |= {'winners'}
stop_words |= {'congrats'}
stop_words |= {'congratulations'}
stop_words |= {'golden'}
stop_words |= {'globe'}
stop_words |= {'globes'}
stop_words |= {'rt'}

In [22]:
import spacy
# python3 -m spacy download en
nlp = spacy.load('en')

# reference for token attributes
# https://spacy.io/api/token#attributes

def identify_entities(text, stop_words):
    tags = {}
    for ent in nlp(text).ents:
        entity = ent.text.strip()
        if entity not in tags and len(entity) > 1:
            # remove stopwords
            entity_split = [w for w in entity.split() if w.lower() not in stop_words]            
            if len(entity_split) != 0:
                entity = ' '.join(entity_split)
                if len(entity.split()) == 1 and entity.lower() == 'the':
                    pass
                else:
                    tags[entity]=[ent.label_]
    return tags

# performance may be compared with nltk.tag.stanford.StanfordTagger if we have time
# http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford

In [23]:
%%time
# generate cleansed_data beforehand

cleansed_data = []
for tweet in data:
    line = remove_retweet_prefix(tweet)
    line = remove_hashtag(line)
    line = remove_at(line)
    line = remove_url(line)
    line = cleanse(line)
    cleansed_data.append(line)
    
# remove redundancies after processing retweets
print(len(cleansed_data))
cleansed_data = list(set(cleansed_data))
print(len(cleansed_data))

174643
125587
CPU times: user 3.39 s, sys: 11.8 ms, total: 3.4 s
Wall time: 3.4 s


In [24]:
cleansed_data[:5]

['Love Tina Fays comment bout taylor swift   ',
 ' I bet the Life of Pi wins 3 14 awards tonight   ',
 'Tremendo  ',
 ' Best TV movie or miniseries actor goes to   for         ',
 'Catherine  cariño  para cantar una frase mejor no hacer nada   ']

In [27]:
def find_host(cleansed_data, awards, verbose=False):
    pattern = re.compile(r'\bhost')
    entity_freq_dict = {}
    
    num = 0
    max_entity_len = 0
    max_entity = None
    for line in cleansed_data:
        match = re.search(pattern, line.lower())
        if match:
            tags = identify_entities(line, stop_words)

            if verbose:
                # print the first 10 occurrences
                if num < 10:
                    print(tweet)
                    print(line)
                    print(tags)
                    print()

            for entity in tags.keys():
                # identify the entity with maximum length
                entity_len = len(entity)
                if entity_len > max_entity_len:
                    max_entity_len = entity_len
                    max_entity = entity
                
                if entity not in entity_freq_dict:
                    entity_freq_dict[entity] = 1 # tried adding more weights to 'PERSON' tags but results are not good
                else:
                    entity_freq_dict[entity] += 1
            num += 1           
    print('num of matches:', num)
    print('max_entity_len:', max_entity_len)
    print('max_entity:', max_entity)
    return entity_freq_dict

In [55]:
%%time

# note that for 2015 tweets, it takes more than 4min to run
# processing time may be long for large data
# consider picking the longest 150000 tweets

entity_freq_dict = find_host(cleansed_data, awards, verbose=True)

Says @BenAffleck: "I also didn't get the Acting Nomination...no one's saying I got snubbed there!" #pressroom #GoldenGlobes
Tina and Amy hosting the   was just genius    
{'Tina': ['GPE'], 'Amy': ['PERSON']}

Says @BenAffleck: "I also didn't get the Acting Nomination...no one's saying I got snubbed there!" #pressroom #GoldenGlobes
It's kind of uppsetting that Tina Fey and Amy Poehler are hosting the Golden Globes 
{'Amy Poehler': ['PERSON']}

Says @BenAffleck: "I also didn't get the Acting Nomination...no one's saying I got snubbed there!" #pressroom #GoldenGlobes
I can't wait to do my all pirate hostage rescue film  YARGO   
{}

Says @BenAffleck: "I also didn't get the Acting Nomination...no one's saying I got snubbed there!" #pressroom #GoldenGlobes
AGREE   Great     amusingly hosted  memorable speeches  mostly correct winners  Thoroughly enjoyed it 
{'Thoroughly': ['NORP']}

Says @BenAffleck: "I also didn't get the Acting Nomination...no one's saying I got snubbed there!" #pressroom

In [56]:
# top retweets (each as an entity)
sorted(retweets_freq_dict.items(), key=lambda pair: pair[1], reverse=True)[:50]

[('goldenglobes: ', 6918),
 ('eonline: ', 5782),
 ('PerezHilton: ', 4602),
 ('TheEllenShow: ', 3392),
 ('EmWatson: ', 2294),
 ('VanityFair: ', 1640),
 ('nbcsnl: ', 1486),
 ('CNNshowbiz: ', 1398),
 ('CiudadBizarra: ', 1020),
 ('BuzzFeed: ', 982),
 ('EW: ', 970),
 ('nbc: ', 930),
 ('vulture: ', 892),
 ('piersmorgan: ', 882),
 ('MARLONLWAYANS: ', 768),
 ('HuffingtonPost: ', 738),
 ('buckhollywood: ', 706),
 ('MarilynMonroeES: ', 668),
 ('TVGuide: ', 638),
 ('THR: ', 630),
 ('DavidSpade: ', 624),
 ('MTVNews: ', 614),
 ('PimpBillClinton: ', 562),
 ('cinema21: ', 562),
 ('washingtonpost: ', 550),
 ('ninagarcia: ', 516),
 ('Cosmopolitan: ', 478),
 ('peopleenespanol: ', 438),
 ('RichardCrouse: ', 434),
 ('peoplemag: ', 426),
 ('prodigalsam: ', 424),
 ('kumailn: ', 422),
 ('DannyZuker: ', 414),
 ('jianghomeshi: ', 402),
 ('HuffPostWomen: ', 400),
 ('DamienFahey: ', 400),
 ('girlsHBO: ', 396),
 ('MHarrisPerry: ', 388),
 ('DougBenson: ', 384),
 ('rogergzz: ', 382),
 ('InStyle: ', 382),
 ('Eonline

In [57]:
# top hashtags
sorted(hashtag_freq_dict.items(), key=lambda pair: pair[1], reverse=True)[:50]

[('GoldenGlobes', 220270),
 ('goldenglobes', 66226),
 ('Argo', 1684),
 ('GetGlue', 1538),
 ('Homeland', 1392),
 ('redcarpet', 1350),
 ('GoldenGlobe', 1142),
 ('Goldenglobes', 1120),
 ('JodieFoster', 972),
 ('Girls', 902),
 ('killingit', 894),
 ('GOLDENGLOBES', 840),
 ('AlfombraRojaE', 770),
 ('Skyfall', 720),
 ('GlobosdeOro', 704),
 ('RedCarpet', 696),
 ('Lincoln', 696),
 ('LesMis', 692),
 ('GIRLS', 664),
 ('Adele', 578),
 ('LesMiserables', 564),
 ('JenniferLawrence', 516),
 ('GG2013', 508),
 ('homeland', 502),
 ('GoldenGlobes2013', 498),
 ('TinaFey', 452),
 ('ERedCarpet', 448),
 ('AmyPoehler', 418),
 ('eredcarpet', 414),
 ('jodiefoster', 408),
 ('DowntonAbbey', 386),
 ('AnneHathaway', 382),
 ('girls', 376),
 ('DjangoUnchained', 368),
 ('Oscars', 352),
 ('LoveThoseLadies', 326),
 ('Django', 316),
 ('BenAffleck', 308),
 ('Globes', 298),
 ('GameChange', 292),
 ('LOVEHER', 292),
 ('goldenGlobes', 286),
 ('fb', 286),
 ('BillClinton', 284),
 ('fashion', 276),
 ('Oscar', 276),
 ('skyfall', 2

In [58]:
# top @
sorted(at_freq_dict.items(), key=lambda pair: pair[1], reverse=True)[:50]

[('goldenglobes', 6300),
 ('OfficialAdele', 2244),
 ('lenadunham', 1960),
 ('GoldenGlobes', 1726),
 ('BenAffleck', 1052),
 ('RealHughJackman', 772),
 ('PerezHilton', 722),
 ('SHO_Homeland', 714),
 ('girlsHBO', 704),
 ('SofiaVergara', 606),
 ('eonline', 580),
 ('taylorswift13', 540),
 ('TNTLA', 510),
 ('JLo', 470),
 ('PaulEpworth', 446),
 ('GirlsHBO', 364),
 ('msleamichele', 334),
 ('SelenaGomez', 328),
 ('VanessuHudgens', 326),
 ('LenaDunham', 312),
 ('steph_hart', 312),
 ('RyanGosling', 274),
 ('kerrywashington', 250),
 ('PixarBrave', 240),
 ('LesMiserables', 226),
 ('nbc', 224),
 ('Chanel', 224),
 ('NathanFillion', 210),
 ('KevalBaxi', 194),
 ('Burberry', 172),
 ('Lewis_Damian', 172),
 ('jessicaalba', 168),
 ('CHANEL', 166),
 ('tomandlorenzo', 156),
 ('nbcsnl', 154),
 ('iamdoncheadle', 152),
 ('piersmorgan', 138),
 ('CiudadBizarra', 134),
 ('CNNshowbiz', 128),
 ('BuzzFeed', 128),
 ('LeoDiCaprio', 124),
 ('EvaLongoria', 122),
 ('azizansari', 120),
 ('gabrielledoug', 120),
 ('Sarah_Hyl

In [59]:
top_100 = sorted(entity_freq_dict.items(), key=lambda pair: pair[1], reverse=True)[:100]
top_100

[('Amy Poehler', 420),
 ('Amy', 318),
 ('Tina', 287),
 ('Tina Fey', 199),
 ('Will Ferrell', 151),
 ('next year', 137),
 ('Kristen Wiig', 121),
 ('two', 46),
 ('2014', 41),
 ('tonight', 40),
 ("next year's", 32),
 ('SNL', 28),
 ('every year', 25),
 ('Oscars next year', 22),
 ('Amy Pohler', 20),
 ('Opening Monologue', 20),
 ('Poehler', 19),
 ('this year', 19),
 ('Hollywood', 18),
 ('Hosts Tina', 17),
 ('Amy Poelher', 17),
 ('Love', 15),
 ('Wiig', 14),
 ('Oscars', 14),
 ('2013', 13),
 ('Hosts', 13),
 ('Ferrell', 12),
 ('first', 12),
 ('Cohen', 12),
 ('Kristin Wiig', 11),
 ('Ricky Gervais', 11),
 ("Amy Poehler's", 11),
 ('next years', 11),
 ('Kristen Wiig and', 10),
 ('Paul Rudd', 10),
 ('Can Tina Fey', 10),
 ('Thoroughly', 9),
 ('Kristen Wig', 8),
 ('one', 8),
 ('Host', 8),
 ('Great', 8),
 ('Can', 8),
 ('year', 7),
 ('Jodie Foster', 7),
 ('Adele', 7),
 ('Jennifer Lawrence', 7),
 ('Will Ferrel', 7),
 ('Kristin', 7),
 ('Amy Poehler on Hosting', 6),
 ('Bill Clinton', 6),
 ('Will Farrell', 6)

In [60]:
# pip3 install python-Levenshtein for 4-10x speedup
from fuzzywuzzy import fuzz

# test
print(fuzz.ratio('Tina Fey', 'Tina'))
print(fuzz.ratio('Amy Poehler', 'Amy'))
print(fuzz.ratio('Golden Globes', 'golden globes'))

67
43
85


In [61]:
# remove golden globes from names
import pprint
names = [pair[0] for pair in top_100]
golden_globes = [name for name in names if fuzz.ratio(name.lower(), 'golden globes') > 70]
pprint.pprint(golden_globes)

[]


In [50]:
# we also consider dropping all lower cases examples or examples that contain digit(s), which are not names
def filter_names(pair_list):
    filtered_results = []
    for pair in pair_list:
        string = ''.join(pair[0].split())
        if not all(char.islower() for char in string) and not any(char.isdigit() for char in string):
            filtered_results.append(pair)
        else:
            if pair[0] in entity_freq_dict:
                del entity_freq_dict[pair[0]]
    return filtered_results

In [51]:
for name in golden_globes:
    if name in entity_freq_dict:
        del entity_freq_dict[name]
top_results = sorted(entity_freq_dict.items(), key=lambda pair: pair[1], reverse=True)[:20]
top_results = filter_names(top_results)
top_results

[('Amy Poehler', 420),
 ('Amy', 318),
 ('Tina', 287),
 ('Tina Fey', 199),
 ('Will Ferrell', 151),
 ('Kristen Wiig', 121),
 ("next year's", 32),
 ('SNL', 28),
 ('Oscars next year', 22),
 ('Amy Pohler', 20),
 ('Opening Monologue', 20),
 ('Poehler', 19),
 ('Hollywood', 18),
 ('Hosts Tina', 17)]

In [52]:
names = [pair[0] for pair in top_results]
names_clusters = []

for name in names:
    # each name starts as a cluster
    cluster = [name]
    names_to_reduce = names[:]
    names_to_reduce.remove(name)
    
    # one vs. all comparisons
    for i in names_to_reduce:
        ratio = fuzz.ratio(name.lower(), i.lower())
        # if similarity is larger than 75 or one name is contained in the other name
        if ratio > 75 or re.search(name, i, flags=re.IGNORECASE) or re.search(i, name, flags=re.IGNORECASE):
            cluster.append(i)
    
    # if multiple names are identified in one cluster
    if len(cluster) > 1:
        names_clusters.append(cluster)
    
    print(cluster)

# find names clusters that should merge
# ['Amy Poehler', 'Amy', 'Amy Poelher']
# ['Tina', 'Tina Fey']

# sort clusters
names_clusters.sort()
# sort within each cluster
names_clusters = ['|'.join(sorted(cluster)) for cluster in names_clusters]
# remove overlaps
names_clusters_reduced = [line.split('|') for line in list(set(names_clusters))]
# sort by length from shortest to longest (merge from the shortest)
names_clusters_reduced.sort(key=len)
print('\nnames clusters to merge:')
pprint.pprint(names_clusters_reduced)

['Amy Poehler', 'Amy', 'Amy Pohler', 'Poehler']
['Amy', 'Amy Poehler', 'Amy Pohler']
['Tina', 'Tina Fey', 'Hosts Tina']
['Tina Fey', 'Tina']
['Will Ferrell']
['Kristen Wiig']
["next year's"]
['SNL']
['Oscars next year']
['Amy Pohler', 'Amy Poehler', 'Amy']
['Opening Monologue']
['Poehler', 'Amy Poehler']
['Hollywood']
['Hosts Tina', 'Tina']

names clusters to merge:
[['Tina', 'Tina Fey'],
 ['Hosts Tina', 'Tina'],
 ['Amy Poehler', 'Poehler'],
 ['Amy', 'Amy Poehler', 'Amy Pohler'],
 ['Hosts Tina', 'Tina', 'Tina Fey'],
 ['Amy', 'Amy Poehler', 'Amy Pohler', 'Poehler']]


In [53]:
# weighted frequency of an entity is defined by its frequency multiplied by its string length
def weighted_freq(element):
    return entity_freq_dict[element] * len(element)

e = entity_freq_dict.copy()
for cluster in names_clusters_reduced:
    # select the entity name with highest weighted frequency
    selected_entity_name = max(cluster, key=weighted_freq)
    cluster.remove(selected_entity_name)
    # for names to be merged to the selected entity name
    for name in cluster:
        # if not deleted in previous cases, cumulate frequencies to the selected entity
        if name in e and selected_entity_name in e:
            e[selected_entity_name] += e[name]
            # reward merging
#             e[selected_entity_name] += round(e[selected_entity_name]*1/(names.index(selected_entity_name)+1-0.8) / e[name])
#             e[selected_entity_name] += round(e[selected_entity_name]*1/(names.index(selected_entity_name)+1-0.8) * (1 + 1/e[name]))
            del e[name]

In [54]:
top_10 = sorted(e.items(), key=lambda pair: pair[1], reverse=True)[:10]
top_10

[('Amy Poehler', 777),
 ('Tina Fey', 503),
 ('Will Ferrell', 151),
 ('Kristen Wiig', 121),
 ("next year's", 32),
 ('SNL', 28),
 ('Oscars next year', 22),
 ('Opening Monologue', 20),
 ('Hollywood', 18),
 ('Amy Poelher', 17)]

In [42]:
import json

# top 2 inferences for hosts
best_host_prediction = [name[0] for name in top_10][:2]
json.dumps({'Host': best_host_prediction})

'{"Host": ["Amy Poehler", "Amy"]}'