In [17]:
import pandas as pd

df = pd.read_json(path_or_buf='gg2013.json')
df.head(10)

data = df['text']

In [18]:
import re
def cleanse(line):
    # replace everything to ' ' except whitespace, alphanumeric character, apostrophe, hashtag, @
    return re.sub(r'[^\w\s\'#@]', ' ', line)

In [19]:
import spacy
# python -m spacy download en
nlp = spacy.load('en')
doc = nlp(cleanse(data[0]))
print([(w.text, w.pos_) for w in doc])

[('JLo', 'PROPN'), ("'s", 'PART'), ('dress', 'NOUN'), (' ', 'SPACE'), ('#', 'SYM'), ('eredcarpet', 'NOUN'), ('#', 'SYM'), ('GoldenGlobes', 'PROPN')]


In [155]:
def remove_retweet_prefix(line):
    # find 'RT @abc: ' where abc's length is arbitrary
    pattern = re.compile(r'RT @([\w\'/]*)\b ') 
    match = re.search(pattern, line)

    return re.sub(pattern, ' ', line).strip()

def remove_at(line):
    pattern = re.compile(r'@([\w\'/]*)\b')
    matches = re.findall(pattern, line)
        
    line = re.sub(pattern, ' ', line)
    return line

In [185]:
def print_top(n, num):
    sorted_nominees = sorted(n.items(), key=lambda e: e[1], reverse=True)
    pprint.pprint(sorted_nominees[0:num])
#     names = [pair[0] for pair in sorted_nominees]
#     pprint.pprint(names[0:num])
    print('Nominee list length:', len(n))

In [253]:
%%time
from fuzzywuzzy import fuzz

def find_nominees():
    pattern = re.compile("(win)|(won)|(lose)|(lost)|(nominat)", re.IGNORECASE)
    # pattern1 = re.compile("(\saward\s)|(\sbest\s)", re.IGNORECASE)
    persons_dic = {}

    for line in data:
        line = cleanse(line)
        line = remove_retweet_prefix(line)
        line = remove_at(line)
        match = pattern.search(line)

        # add weight
        w = 1
        if match:
            w = 5

        doc = nlp(line)
        for ent in doc.ents:
            if ent.label_ == 'PERSON':
                name = ent.text.strip()
                # not add to list if the entity is null or golden globe related
                if name == '' or fuzz.ratio(name.lower(), 'golden globes') > 60:
                    continue

                if name in persons_dic:
                    persons_dic[name] += w
                else:
                    persons_dic[name] = w

    # filter frequency <= 10
    nominees = {key: value for key, value in persons_dic.items() if value > 10}
    # nominees = sorted(nominees.items(), key=lambda e: e[1], reverse=True)
    return nominees

nominees = find_nominees()
print_top(nominees, 200)

[('Jodie Foster', 6710),
 ('Amy Poehler', 5832),
 ('Anne Hathaway', 5520),
 ('Taylor Swift', 5506),
 ('Ben Affleck', 5352),
 ('Amy', 5320),
 ('Jennifer Lawrence', 5223),
 ('Maggie Smith', 4439),
 ('Bill Clinton', 3493),
 ('Hugh Jackman', 3226),
 ('Daniel Day Lewis', 2893),
 ('Claire Danes', 2748),
 ('Will Ferrell', 2485),
 ('Drama', 2326),
 ('Jessica Chastain', 2275),
 ('Kristen Wiig', 2216),
 ('Maggie', 2187),
 ('Skyfall', 2027),
 ('Christoph Waltz', 1965),
 ('Lena Dunham', 1882),
 ('Oscar', 1599),
 ("Jodie Foster's", 1505),
 ('Taylor', 1433),
 ('Mel Gibson', 1383),
 ('Quentin Tarantino', 1349),
 ('Tommy Lee Jones', 1347),
 ('Julianne Moore', 1330),
 ('Tina Fey', 1319),
 ('Damian Lewis', 1287),
 ('Kevin Costner', 1135),
 ('George Clooney', 1085),
 ('Adele', 1061),
 ('Clinton', 823),
 ('Ben', 813),
 ('Daniel Craig', 796),
 ('Amanda Seyfried', 790),
 ('Meryl Streep', 740),
 ('Don Cheadle', 729),
 ('Bradley Cooper', 724),
 ('Meryl', 698),
 ('Robert Downey Jr', 672),
 ('Michael J', 648),


In [260]:
def get_name_to_reduce():
    names_clusters = []
    names = list(nominees.keys())

    for name in names:
        # each name starts as a cluster
        cluster = [name]
        names_to_reduce = names[:]
        names_to_reduce.remove(name)

        # one vs. all comparisons
        for i in names_to_reduce:
            ratio = fuzz.ratio(name.lower(), i.lower())
            # if similarity is larger than 75 or one name is contained in the other name
            if ratio > 75 or re.search(name, i, flags=re.IGNORECASE) or re.search(i, name, flags=re.IGNORECASE):
                cluster.append(i)

        # if multiple names are identified in one cluster
        if len(cluster) > 1:
            names_clusters.append(cluster)

    #     print(cluster)


    # sort clusters
    names_clusters.sort()
    # sort within each cluster
    names_clusters = ['|'.join(sorted(cluster)) for cluster in names_clusters]
    # remove overlaps
    names_clusters_reduced = [line.split('|') for line in list(set(names_clusters))]
    # sort by length from shortest to longest (merge from the shortest)
    names_clusters_reduced.sort(key=len)
    print('\nnames clusters to merge:')
    pprint.pprint(names_clusters_reduced)
    print('\n')
    return names_clusters_reduced

In [263]:
# merge names using cluster
reduced_nominees = nominees.copy()
names_clusters_reduced = get_name_to_reduce()

def weighted_freq(element):
    return reduced_nominees[element] * len(element)

for cluster in names_clusters_reduced:
    # select the longest entity name
    selected_entity_name = max(cluster, key=weighted_freq)
    cluster.remove(selected_entity_name)
    # for names to be merged to the selected entity name
    for name in cluster:
        # if not deleted in previous cases, cumulate frequencies to the selected entity
        if name in reduced_nominees and selected_entity_name in reduced_nominees:
            reduced_nominees[selected_entity_name] += reduced_nominees[name]
            del reduced_nominees[name]


names clusters to merge:
[['Rick', 'Ricky Gervais'],
 ['Kushner', 'Tony Kushner'],
 ['Ah', 'Amooah'],
 ['D', 'Wildcard'],
 ['Man', 'Mani Cam'],
 ['D', 'Ditto'],
 ['Snooki', 'Snookie'],
 ['Ah', 'Ahora'],
 ['Si', 'Sorpresivo'],
 ['Wreck', 'WreckItRalph'],
 ['D', 'Edward'],
 ['Anna Karenina', 'Karen'],
 ['Kat', 'Katniss'],
 ['Su', 'Susan Boyle'],
 ['Candie', 'D'],
 ['Ah', 'Hurrah'],
 ['D', 'XazIyd7W'],
 ['J', 'Jason Statham'],
 ['Cray', 'Crazy'],
 ['Benedict Cumberbatch', 'Cumberbatch'],
 ['Atriz', 'Atriz Coadjuvante'],
 ['Saw', 'Saw Spielberg'],
 ['Isla Fisher', 'La'],
 ['D', 'Heidi Klum'],
 ['Victor Garber', "Victor Garber's"],
 ['J', 'Jacob'],
 ['Woo', 'Woot'],
 ['Kim', 'Kim Kardashian'],
 ['D', 'Daughter'],
 ['D', 'Schmidt'],
 ['Se', 'Sexy'],
 ['Best', 'Best Song'],
 ['Liz', 'Liz Lemon'],
 ['Ah', 'Woah'],
 ['Lea', 'Learn'],
 ['Honey', 'Honey Boo Boo'],
 ['Harvey Weinstein', 'Weinstein'],
 ['El', 'Feel'],
 ['D', 'Drink'],
 ['J', 'Joan Rivers'],
 ['D', 'Daisy'],
 ['Ill', 'Nathan Fillio

  'de Anne Hathaway'],
 ['D',
  'De',
  'J',
  'Je',
  'Jen',
  'Jenn',
  'Jennifer',
  'YAY Jennifer',
  'Yay Jennifer',
  'de',
  'de Jennifer'],
 ['Amy',
  'Amy Phoeler',
  'Amy Po',
  'Amy Poehler',
  "Amy Poehler's",
  'Amy Poeler',
  'Amy Poelher',
  'Amy Pohler',
  "Amy Pohler's",
  'AmyPoehler',
  'El'],
 ['D',
  'Foster',
  'J',
  'Jodi',
  'Jodi Fosters',
  'Jodie',
  'Jodie Foster',
  'Jodie Foster Speech',
  'Jodie Foster da',
  'Jodie Foster s',
  "Jodie Foster's"],
 ['J',
  'Jones',
  'Lee',
  'MM',
  'Tom',
  'Tommy',
  'Tommy Lee',
  'Tommy Lee Jones',
  "Tommy Lee Jones'",
  "Tommy Lee Jones's",
  'Ánimo Tommy Lee Jones'],
 ['Huge Jackman',
  'Hugh',
  'Hugh Jackman',
  "Hugh Jackman's",
  'J',
  'Jack',
  'Man',
  'YAY',
  'Ya',
  'Yay',
  'Yay Hugh Jackman'],
 ['J',
  'Me',
  'Mejor',
  'Mejor Actor de Reparto',
  'Mejor Actriz',
  'Mejor Actriz de Reparto',
  'Mejor Guión',
  'Mejor Película',
  'Mejor Película de Comedia',
  'al Mejor',
  'de Mejor'],
 ['D',
  'Dan

  'Yay Ben Affleck',
  'de Ben Affleck',
  'de Benjamín Button'],
 ['Ah',
  'Congrats Jennifer Lawrence',
  'J',
  'JENNIFER LAWRENCE',
  'JENNIFER LAWRENCE WON',
  'Je',
  'Jen',
  'Jenifer Lawrence',
  'Jenn',
  'Jennifer',
  'Jennifer Laurence',
  'Jennifer Lawerence',
  'Jennifer Lawrence',
  "Jennifer Lawrence '",
  'Jennifer Lawrence en la',
  'Jennifer Lawrence s',
  "Jennifer Lawrence's",
  'LAWRENCE',
  'La',
  'Law',
  'Lawrence',
  'Stunned Jennifer Lawrence',
  'Yay Jennifer Lawrence',
  'Yea',
  'Yeah Jennifer Lawrence'],
 ['Congrats Jennifer Lawrence',
  'J',
  'JENNIFER LAWRENCE',
  'JENNIFER LAWRENCE WON',
  'Je',
  'Jen',
  'Jenifer Lawrence',
  'Jenn',
  'Jennifer',
  'Jennifer Garner',
  "Jennifer Garner's",
  'Jennifer Laurence',
  'Jennifer Lawerence',
  'Jennifer Lawrence',
  'Jennifer Lawrence  Calling',
  "Jennifer Lawrence '",
  'Jennifer Lawrence en la',
  'Jennifer Lawrence s',
  "Jennifer Lawrence's",
  'LAWRENCE',
  'La',
  'Law',
  'Lawrence',
  'Stunned J

KeyError: 'D'

In [304]:
# remove names with short length
for name in list(reduced_nominees.keys()):
    if len(name) < 5:
#         print(name)
        del reduced_nominees[name]
        
print_top(reduced_nominees, 50)

[('Jodie Foster', 6710),
 ('Amy Poehler', 5832),
 ('Anne Hathaway', 5520),
 ('Taylor Swift', 5506),
 ('Ben Affleck', 5352),
 ('Jennifer Lawrence', 5223),
 ('Maggie Smith', 4439),
 ('Bill Clinton', 3493),
 ('Hugh Jackman', 3226),
 ('Daniel Day Lewis', 2893),
 ('Claire Danes', 2748),
 ('Will Ferrell', 2485),
 ('Jessica Chastain', 2275),
 ('Kristen Wiig', 2216),
 ('Christoph Waltz', 1965),
 ('Lena Dunham', 1882),
 ("Jodie Foster's", 1505),
 ('Mel Gibson', 1383),
 ('Quentin Tarantino', 1349),
 ('Tommy Lee Jones', 1347),
 ('Julianne Moore', 1330),
 ('Damian Lewis', 1287),
 ('Kevin Costner', 1135),
 ('George Clooney', 1085),
 ('Daniel Craig', 796),
 ('Amanda Seyfried', 790),
 ('Meryl Streep', 740),
 ('Don Cheadle', 729),
 ('Bradley Cooper', 724),
 ('Robert Downey Jr', 672),
 ('Jennifer Lopez', 631),
 ('Les Miserables', 583),
 ('James Cameron', 573),
 ('Jodi Foster', 549),
 ('Best Actor', 531),
 ('Kate Hudson', 520),
 ('Robert Pattinson', 519),
 ("Taylor Swift's", 486),
 ('Sarah Palin', 478),

# Link nominees to award

In [305]:
# hard-coded awards
awards = ['Best Motion Picture - Drama',
         'Best Motion Picture - Musical or Comedy',
         'Best Performance by an Actress in a Motion Picture - Drama',
         'Best Performance by an Actor in a Motion Picture - Drama',
         'Best Performance by an Actress in a Motion Picture - Musical or Comedy',
         'Best Performance by an Actor in a Motion Picture - Musical or Comedy',
         'Best Performance by an Actress in a Supporting Role in any Motion Picture',
         'Best Performance by an Actor in a Supporting Role in any Motion Picture',
         'Best Director - Motion Picture',
         'Best Screenplay - Motion Picture',
         'Best Motion Picture - Animated',
         'Best Motion Picture - Foreign Language',
         'Best Original Score - Motion Picture',
         'Best Original Song - Motion Picture',
         'Best Television Series - Drama',
         'Best Television Series - Musical or Comedy',
         'Best Television Limited Series or Motion Picture Made for Television',
         'Best Performance by an Actress in a Limited Series or a Motion Picture Made for Television',
         'Best Performance by an Actor in a Limited Series or a Motion Picture Made for Television',
         'Best Performance by an Actress In A Television Series - Drama',
         'Best Performance by an Actor In A Television Series - Drama',
         'Best Performance by an Actress in a Television Series - Musical or Comedy',
         'Best Performance by an Actor in a Television Series - Musical or Comedy',
         'Best Performance by an Actress in a Supporting Role in a Series, Limited Series or Motion Picture Made for Television',
         'Best Performance by an Actor in a Supporting Role in a Series, Limited Series or Motion Picture Made for Television',
         'Cecil B. deMille Award']

In [306]:
def reduce(line):
    # remove words "performance", "picture", "in", "a", "an", "any", "made", "for", "by", and all punctuations
    pattern = r'\bperformance\b|\bpicture\b|\bin\b|\ba\b|\ban\b|\bany\b|\bmade\b|\bfor\b|\bby\b|[^\w\s]'
    return re.sub(pattern, ' ', line.lower()).split()

awards_reduced = [sorted(set(reduce(award)), key=lambda word: reduce(award).index(word)) for award in awards]

# test
for award in awards:
    ls = reduce(award)
    print(sorted(set(ls), key=lambda word: ls.index(word)))

['best', 'motion', 'drama']
['best', 'motion', 'musical', 'or', 'comedy']
['best', 'actress', 'motion', 'drama']
['best', 'actor', 'motion', 'drama']
['best', 'actress', 'motion', 'musical', 'or', 'comedy']
['best', 'actor', 'motion', 'musical', 'or', 'comedy']
['best', 'actress', 'supporting', 'role', 'motion']
['best', 'actor', 'supporting', 'role', 'motion']
['best', 'director', 'motion']
['best', 'screenplay', 'motion']
['best', 'motion', 'animated']
['best', 'motion', 'foreign', 'language']
['best', 'original', 'score', 'motion']
['best', 'original', 'song', 'motion']
['best', 'television', 'series', 'drama']
['best', 'television', 'series', 'musical', 'or', 'comedy']
['best', 'television', 'limited', 'series', 'or', 'motion']
['best', 'actress', 'limited', 'series', 'or', 'motion', 'television']
['best', 'actor', 'limited', 'series', 'or', 'motion', 'television']
['best', 'actress', 'television', 'series', 'drama']
['best', 'actor', 'television', 'series', 'drama']
['best', 'actr

In [307]:
def generate_award_num_keywords_map(awards):
    awards_reduced = []
    award_num_keywords_map = {}
    
    for i, award in enumerate([sorted(set(reduce(award)), key=lambda word: reduce(award).index(word)) for award in awards]):
        if 'or' in award:
            num_words = len(award) - 1 - award.index('or') # number of words from "or" to the end of award name
            # test
            # print(len(award)-1, award.index('or'), num_words)
            award.remove('or')
            award_num_keywords_map[i] = len(award) - 1 - num_words # do not count "best" and words from "or" to the end of award name
        else:
            award_num_keywords_map[i] = len(award) - 1 # do not count "best"
        
#         if 'best' in award:
#             award.remove('best')

        awards_reduced.append(award)
    return award_num_keywords_map, awards_reduced

In [308]:
award_num_keywords_map, awards_reduced = generate_award_num_keywords_map(awards)

sorted(award_num_keywords_map.items(), key=lambda pair: pair[0])

[(0, 2),
 (1, 2),
 (2, 3),
 (3, 3),
 (4, 3),
 (5, 3),
 (6, 4),
 (7, 4),
 (8, 2),
 (9, 2),
 (10, 2),
 (11, 3),
 (12, 3),
 (13, 3),
 (14, 3),
 (15, 3),
 (16, 3),
 (17, 3),
 (18, 3),
 (19, 4),
 (20, 4),
 (21, 4),
 (22, 4),
 (23, 5),
 (24, 5),
 (25, 3)]

In [309]:
hashtag_freq_dict = {}

def remove_hashtag(line):
    pattern = re.compile(r'#([\w\'/]*)\b')
    matches = re.findall(pattern, line)
    if matches:
        # store corresponding hashtag
        for match in matches:
            if match in hashtag_freq_dict:
                hashtag_freq_dict[match] += 1
            else:
                hashtag_freq_dict[match] = 1
        
    line = re.sub(pattern, ' ', line)
    return line

def remove_url(line):
    pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\b')
    matches = re.findall(pattern, line)
    for match in matches:
        line = re.sub(match, ' ', line)
    return line

In [310]:
sorted_nominees = sorted(reduced_nominees.items(), key=lambda e: e[1], reverse=True)
# nominees = sorted_nominees[0:500]
names = [pair[0] for pair in sorted_nominees]
# print(reduced_nominees['best'])

def find_award_nominees(awards_reduced, award_index, verbose=False):
    print('Predicting for:', awards[award_index])    
    award = awards_reduced[award_index]
#     print('award name reduced:', award)
    num_keywords_to_match = award_num_keywords_map[award_index]
#     print('num keywords to match:', num_keywords_to_match)
    
    # add word bounary '\b' to prevent grabbing examples like "showing" and "wonder"
#     pattern = re.compile(r'(\bwin)|(\bwon\b)|(\bbest\b)|(\b{0}\b)'.format(award[1]), re.IGNORECASE)
    entity_dict = {}
    
    num = 0
    for tweet in data:
        # remove_retweet_prefix 
        line = remove_retweet_prefix(tweet)
        # remove hashtag
        line = remove_hashtag(line)
        # remove @...
        line = remove_at(line)
        # remove url
        line = remove_url(line)
        # remove punctuations
        line = cleanse(line)
        
        pattern = re.compile(r'(\bnominate)|(\bnominee\b)'.format(award[1]), re.IGNORECASE)
        if len(set(award).intersection(set(line.split()))) > num_keywords_to_match:
            # accumulate counts when there is a match for the award
            # add weight if certain keywords show up
            match = re.findall(pattern, line.lower())
            w = 1
            if match:
                w = 5
            for name in names:
                if re.search(name, line, flags=re.IGNORECASE):
                    if name in entity_dict:
                        entity_dict[name] += w
                    else:
                        entity_dict[name] = w
    return entity_dict

In [313]:
entity_dict = find_award_nominees(awards_reduced, 0, verbose=True)
sorted_ents = sorted(entity_dict.items(), key=lambda e: e[1], reverse=True)
tops = sorted_ents[0:10]
# print(tops)
pprint.pprint( [pair[0] for pair in tops])

Predicting for: Best Motion Picture - Drama
['Julia Roberts',
 'JULIA ROBERTS',
 'Ben Affleck',
 'Jessica Chastain',
 'Daniel Day Lewis',
 'Best Actor',
 'Daniel Day',
 'Daniel day',
 'Lewis Wins',
 'Best Drama']


In [None]:
entity_dict = find_award_nominees(awards_reduced, 1, verbose=True)
sorted_ents = sorted(entity_dict.items(), key=lambda e: e[1], reverse=True)
tops = sorted_nominees[0:10]
# print(tops)
pprint.pprint( [pair[0] for pair in tops])

Predicting for: Best Motion Picture - Musical or Comedy
