In [31]:
import re
import json

import pandas as pd

In [32]:
%%time
year = 2015
df = pd.read_json(path_or_buf='gg{0}.json'.format(year))
# df = pd.read_json(path_or_buf='gg2015.json')

CPU times: user 9.85 s, sys: 1.37 s, total: 11.2 s
Wall time: 11.2 s


In [34]:
# sample data if necessary
data = df['text']
sample_size = 200000
if len(df) > sample_size:
    data = data.sample(n=sample_size)
len(data)

200000

In [35]:
%%time

retweets_freq_dict = {}

def remove_retweet_prefix(line):
    # find 'RT @abc: ' where abc's length is arbitrary
    pattern = re.compile(r'\bRT @([\w\'/]*)\b: ') 
    match = re.search(pattern, line)
    if match:
        # store corresponding retweet without 'RT @' prefix
        string = match.group()[4:]
        if string in retweets_freq_dict:
            retweets_freq_dict[string] += 1
        else:
            retweets_freq_dict[string] = 1
    return re.sub(pattern, ' ', line)

hashtag_freq_dict = {}

def remove_hashtag(line):
    pattern = re.compile(r'#([\w\'/]*)\b')
    matches = re.findall(pattern, line)
    if matches:
        # store corresponding hashtag
        for match in matches:
            if match in hashtag_freq_dict:
                hashtag_freq_dict[match] += 1
            else:
                hashtag_freq_dict[match] = 1
    return re.sub(pattern, ' ', line)

at_freq_dict = {}

def remove_at(line):
    pattern = re.compile(r'@([\w\'/]*)\b')
    matches = re.findall(pattern, line)
    if matches:
        # store corresponding hashtag
        for match in matches:
            if match in at_freq_dict:
                at_freq_dict[match] += 1
            else:
                at_freq_dict[match] = 1
    return re.sub(pattern, ' ', line)

def remove_url(line):
    pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\b')
    return re.sub(pattern, ' ', line)

def cleanse(line):
    # replace everything to ' ' except alphanumeric character, whitespace, apostrophe, hashtag
    return re.sub(r'[^\w\s\'#]', ' ', line)

cleansed_data = []
for tweet in data:
    line = remove_retweet_prefix(tweet)
    line = remove_hashtag(line)
    line = remove_at(line)
    line = remove_url(line)
    line = cleanse(line)
    cleansed_data.append(line)
    
# remove redundancies after processing retweets
print(len(cleansed_data))
cleansed_data = list(set(cleansed_data))
print(len(cleansed_data))

200000
111967
CPU times: user 4.43 s, sys: 14.3 ms, total: 4.45 s
Wall time: 4.45 s


In [36]:
with open('gg-project-master/gg{0}answers.json'.format(year)) as file:
    ground_truth = json.load(file)
file.closed

True

In [7]:
# test
true_award_winners = ground_truth['award_data']

# for name in awards_old:
#     print(name)
#     print('true winner:', true_award_winners[name]['winner'])
#     print()

In [8]:
# hard-coded awards 2013-2015
awards = ['cecil b. demille award', 'best motion picture - drama', 
          'best performance by an actress in a motion picture - drama', 
          'best performance by an actor in a motion picture - drama', 
          'best motion picture - comedy or musical', 
          'best performance by an actress in a motion picture - comedy or musical', 
          'best performance by an actor in a motion picture - comedy or musical', 
          'best animated feature film', 
          'best foreign language film', 
          'best performance by an actress in a supporting role in a motion picture', 
          'best performance by an actor in a supporting role in a motion picture', 
          'best director - motion picture', 
          'best screenplay - motion picture', 
          'best original score - motion picture', 
          'best original song - motion picture', 
          'best television series - drama', 
          'best performance by an actress in a television series - drama', 
          'best performance by an actor in a television series - drama', 
          'best television series - comedy or musical', 
          'best performance by an actress in a television series - comedy or musical', 
          'best performance by an actor in a television series - comedy or musical', 
          'best mini-series or motion picture made for television', 
          'best performance by an actress in a mini-series or motion picture made for television', 
          'best performance by an actor in a mini-series or motion picture made for television', 
          'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television', 
          'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television']

In [9]:
# hard-coded awards 2018-2019
awards_new = ['Best Motion Picture - Drama',
             'Best Motion Picture - Musical or Comedy',
             'Best Performance by an Actress in a Motion Picture - Drama',
             'Best Performance by an Actor in a Motion Picture - Drama',
             'Best Performance by an Actress in a Motion Picture - Musical or Comedy',
             'Best Performance by an Actor in a Motion Picture - Musical or Comedy',
             'Best Performance by an Actress in a Supporting Role in any Motion Picture',
             'Best Performance by an Actor in a Supporting Role in any Motion Picture',
             'Best Director - Motion Picture',
             'Best Screenplay - Motion Picture',
             'Best Motion Picture - Animated',
             'Best Motion Picture - Foreign Language',
             'Best Original Score - Motion Picture',
             'Best Original Song - Motion Picture',
             'Best Television Series - Drama',
             'Best Television Series - Musical or Comedy',
             'Best Television Limited Series or Motion Picture Made for Television',
             'Best Performance by an Actress in a Limited Series or a Motion Picture Made for Television',
             'Best Performance by an Actor in a Limited Series or a Motion Picture Made for Television',
             'Best Performance by an Actress in A Television Series - Drama',
             'Best Performance by an Actor in A Television Series - Drama',
             'Best Performance by an Actress in a Television Series - Musical or Comedy',
             'Best Performance by an Actor in a Television Series - Musical or Comedy',
             'Best Performance by an Actress in a Supporting Role in a Series, Limited Series or Motion Picture Made for Television',
             'Best Performance by an Actor in a Supporting Role in a Series, Limited Series or Motion Picture Made for Television',
             'Cecil B. deMille Award']
print(len(awards))

26


In [10]:
def reduce(line):
    # test
    # replace "motion picture" to "movie"
#     pattern = r'\bmotion picture\b'
#     line = re.sub(pattern, 'movie', line.lower())
    
    # replace "television" to "tv"
    pattern = r'\btelevision\b'
    line = re.sub(pattern, 'tv', line.lower())

    # remove words "best", "performance", "motion", "picture", "limited", "language", "role", in", "a", "an", "any", "made", "for", "by", "b.", "award", and all punctuations
#     pattern = r'\bbest\b|\bperformance\b|\bmotion\b|\bpicture\b|\blimited\b|\blanguage\b|\brole\b|\bin\b|\ba\b|\ban\b|\bany\b|\bmade\b|\bfor\b|\bby\b|\bb\b|\baward\b|[^\w\s]'
    pattern = r'\bbest\b|\bperformance\b|\blanguage\b|\brole\b|\bin\b|\ba\b|\ban\b|\bany\b|\bmade\b|\bfor\b|\bby\b|\bb\b|\baward\b|[^\w\s]'
    return re.sub(pattern, ' ', line.lower()).split()

In [11]:
# test reduce function on old awards
for award in awards:
    ls = reduce(award)
    print(sorted(set(ls), key=lambda word: ls.index(word)))

['cecil', 'demille']
['motion', 'picture', 'drama']
['actress', 'motion', 'picture', 'drama']
['actor', 'motion', 'picture', 'drama']
['motion', 'picture', 'comedy', 'or', 'musical']
['actress', 'motion', 'picture', 'comedy', 'or', 'musical']
['actor', 'motion', 'picture', 'comedy', 'or', 'musical']
['animated', 'feature', 'film']
['foreign', 'film']
['actress', 'supporting', 'motion', 'picture']
['actor', 'supporting', 'motion', 'picture']
['director', 'motion', 'picture']
['screenplay', 'motion', 'picture']
['original', 'score', 'motion', 'picture']
['original', 'song', 'motion', 'picture']
['tv', 'series', 'drama']
['actress', 'tv', 'series', 'drama']
['actor', 'tv', 'series', 'drama']
['tv', 'series', 'comedy', 'or', 'musical']
['actress', 'tv', 'series', 'comedy', 'or', 'musical']
['actor', 'tv', 'series', 'comedy', 'or', 'musical']
['mini', 'series', 'or', 'motion', 'picture', 'tv']
['actress', 'mini', 'series', 'or', 'motion', 'picture', 'tv']
['actor', 'mini', 'series', 'or', '

In [12]:
def generate_award_num_keywords_map(awards):
    awards_reduced = []
    award_num_keywords_map = {}
    for i, award in enumerate([sorted(set(reduce(award)), key=lambda word: reduce(award).index(word)) for award in awards]):
        if 'or' in award:
            # find number of words from "or" to the end of award name
            num_words = len(award) - 1 - award.index('or')
            award.remove('or')
            # do not count words from "or" to the end of award name
            award_num_keywords_map[i] = len(award) - num_words             
        else:
            award_num_keywords_map[i] = len(award)            
        awards_reduced.append(award)
    return award_num_keywords_map, awards_reduced

award_num_keywords_map, awards_reduced = generate_award_num_keywords_map(awards)

In [13]:
awards_reduced

[['cecil', 'demille'],
 ['motion', 'picture', 'drama'],
 ['actress', 'motion', 'picture', 'drama'],
 ['actor', 'motion', 'picture', 'drama'],
 ['motion', 'picture', 'comedy', 'musical'],
 ['actress', 'motion', 'picture', 'comedy', 'musical'],
 ['actor', 'motion', 'picture', 'comedy', 'musical'],
 ['animated', 'feature', 'film'],
 ['foreign', 'film'],
 ['actress', 'supporting', 'motion', 'picture'],
 ['actor', 'supporting', 'motion', 'picture'],
 ['director', 'motion', 'picture'],
 ['screenplay', 'motion', 'picture'],
 ['original', 'score', 'motion', 'picture'],
 ['original', 'song', 'motion', 'picture'],
 ['tv', 'series', 'drama'],
 ['actress', 'tv', 'series', 'drama'],
 ['actor', 'tv', 'series', 'drama'],
 ['tv', 'series', 'comedy', 'musical'],
 ['actress', 'tv', 'series', 'comedy', 'musical'],
 ['actor', 'tv', 'series', 'comedy', 'musical'],
 ['mini', 'series', 'motion', 'picture', 'tv'],
 ['actress', 'mini', 'series', 'motion', 'picture', 'tv'],
 ['actor', 'mini', 'series', 'motion'

In [14]:
def describe_awards(awards, awards_reduced, award_num_keywords_map):
    pairs = sorted(award_num_keywords_map.items(), key=lambda pair: pair[0])
    for pair in pairs:
        print(pair[0])
        print(awards[pair[0]])
        print(awards_reduced[pair[0]])
        if pair[1] < len(awards_reduced[pair[0]]):
            print('******')
        print(pair) # (index, num_keywords_to_match)    
        print()

In [15]:
# tweaking magic
for i, award in enumerate(awards_reduced):
    if 'tv' in award:
        awards_reduced[i].append('television')

describe_awards(awards, awards_reduced, award_num_keywords_map)

0
cecil b. demille award
['cecil', 'demille']
(0, 2)

1
best motion picture - drama
['motion', 'picture', 'drama']
(1, 3)

2
best performance by an actress in a motion picture - drama
['actress', 'motion', 'picture', 'drama']
(2, 4)

3
best performance by an actor in a motion picture - drama
['actor', 'motion', 'picture', 'drama']
(3, 4)

4
best motion picture - comedy or musical
['motion', 'picture', 'comedy', 'musical']
******
(4, 3)

5
best performance by an actress in a motion picture - comedy or musical
['actress', 'motion', 'picture', 'comedy', 'musical']
******
(5, 4)

6
best performance by an actor in a motion picture - comedy or musical
['actor', 'motion', 'picture', 'comedy', 'musical']
******
(6, 4)

7
best animated feature film
['animated', 'feature', 'film']
(7, 3)

8
best foreign language film
['foreign', 'film']
(8, 2)

9
best performance by an actress in a supporting role in a motion picture
['actress', 'supporting', 'motion', 'picture']
(9, 4)

10
best performance by a

In [16]:
# some ideas to improve spaCy's Name Entity Recognizer
# https://github.com/explosion/spaCy/issues/1655
# https://github.com/explosion/spacy/blob/master/examples/training/train_new_entity_type.py
# https://spacy.io/usage/training#section-ner

In [17]:
def check_target_words(awards_reduced):
    target_words = ['actor', 'actress', 'director', 'score', 'song', 'foreign', 'tv', 'screenplay']
    for word in target_words:
        if word in awards_reduced:
            return word
    return False

# def check_primary_target_words(awards_reduced):
#     target_words = ['actor', 'actress', 'director', 'score', 'song']
#     for word in target_words:
#         if word in awards_reduced:
#             return word
#     return False

check_target_words(awards_reduced[12])

'screenplay'

In [18]:
# def filter_awards_stopwords(top_results, awards_words):
#     filtered_top_results = []
#     for result in top_results:
#         if any(word in awards_words for word in cleanse(result[0]).lower().split()):
#             pass
#         else:
#             filtered_top_results.append(result)
#     return filtered_top_results

# def remove_awards_stopwords(top_results, awards_words):
#     filtered_top_results = []
#     for result in top_results:
#         word = [w for w in re.findall(r"[\w'-]+|[^\s\w]", result[0]) if not_awards_stopwords(w, awards_words)]
#         if len(word) != 0:
#             filtered_top_results.append((' '.join(word), result[1]))
#     return filtered_top_results


def not_awards_stopwords(w, stop_words):
#     if any(fuzz.ratio(w.lower(), word) > 90 for word in awards_words):
    if w.lower() in stop_words:
        return False
    else:
        return True

In [19]:
from nltk.corpus import stopwords
# stop_words = set(stopwords.words('english'))
# print(stop_words)

stop_words = set()
awards_words = set()
for award in awards:
    awards_words |= set(cleanse(award).split())

stop_words |= awards_words
stop_words |= {'award'}
stop_words |= {'awards'}
stop_words |= {'tv'}
stop_words |= {'movie'}
stop_words |= {'movies'}
stop_words |= {'win'}
stop_words |= {'wins'}
stop_words |= {'winner'}
stop_words |= {'winners'}
stop_words |= {'congrats'}
stop_words |= {'congratulations'}
stop_words |= {'golden'}
stop_words |= {'globe'}
stop_words |= {'globes'}
# stop_words |= {'the'}
stop_words |= {'rt'}

# stop_words.remove('of')
# stop_words.remove('don')
# stop_words.remove('how')
# stop_words.remove('to')

stop_words

{'a',
 'actor',
 'actress',
 'an',
 'animated',
 'award',
 'awards',
 'b',
 'best',
 'by',
 'cecil',
 'comedy',
 'congrats',
 'congratulations',
 'demille',
 'director',
 'drama',
 'feature',
 'film',
 'for',
 'foreign',
 'globe',
 'globes',
 'golden',
 'in',
 'language',
 'made',
 'mini',
 'motion',
 'movie',
 'movies',
 'musical',
 'or',
 'original',
 'performance',
 'picture',
 'role',
 'rt',
 'score',
 'screenplay',
 'series',
 'song',
 'supporting',
 'television',
 'tv',
 'win',
 'winner',
 'winners',
 'wins'}

In [20]:
import spacy
# python3 -m spacy download en
nlp = spacy.load('en')

def identify_entities(text, stop_words):
    tags = {}
    for ent in nlp(text).ents:
        entity = ent.text.strip()
        if entity not in tags and len(entity) > 1:
            # remove stopwords
            entity_split = [w for w in entity.split() if w.lower() not in stop_words]            
            if len(entity_split) != 0:
                entity = ' '.join(entity_split)
                # if entity is a single word that is 'the' or in stop_words
    #             if len(entity.split()) == 1 and (entity.lower() == 'the' or entity.lower() in stop_words):
                if len(entity.split()) == 1 and entity.lower() == 'the':
                    pass
                else:
                    tags[entity]=[ent.label_]
    return tags

In [21]:
test = 'Best Animated Feature Film goes to How to Train Your Dragon 2'
identify_entities(test, stop_words)

{}

In [22]:
len("'")

1

In [27]:
def find_award_winner(awards, award_num_keywords_map, awards_reduced, award_index, stop_words, verbose=False):    
    print(award_index)
    print('Predicting for:', awards[award_index])
    # when using 2018-2019 awards names
#     print('true winner:', winners[award_index])
    
    # when using 2013-2015 awards names
    print('true winner:', true_award_winners[awards[award_index]]['winner'])
    award = awards_reduced[award_index]
    print('award name reduced:', award)
    num_keywords_to_match = award_num_keywords_map[award_index]
    print("num keywords to match:", num_keywords_to_match)
    
    # add word boundary '\b' to prevent grabbing examples like "showing" and "wonder"    
    pattern = re.compile(r'\bwin|\bwon\b|\bbest\b|\bcongrat', re.IGNORECASE)
        
    entity_freq_dict = {}
    
    num = 0
    flag = 0
    
    # find target word pattern to match for sure
    target_word = check_target_words(award)  
    if target_word:
        target_word_pattern = re.compile(r'\b{0}\b'.format(target_word), re.IGNORECASE)
        print("matching target word '{0}'".format(target_word))
    # else if primary keyword is not found, match for secondary keyword
    elif 'tv' in award:
        target_word_pattern = re.compile(r'\b{0}\b'.format('tv'), re.IGNORECASE)
        print("matching target word 'tv'")
    elif 'screenplay' in award:
        target_word_pattern = re.compile(r'\b{0}\b'.format('screenplay'), re.IGNORECASE)
        print("matching target word 'screenplay'")
    else:
        target_word_pattern = re.compile(r'\b{0}\b'.format(award[0]), re.IGNORECASE)
        print("matching target word '{0}'".format(award[0])) 

    # if len(award) != num_keywords_to_match (awards that have 'or' options)
    # and len(award) != 2 (for the case of ['musical', 'comedy']), 
    # in addition, if target word in award, target word must be matched
    if len(award) != num_keywords_to_match:
        flag = 1
        for line in cleansed_data:
            match = re.findall(pattern, line.lower())
            match_target_word = re.findall(target_word_pattern, line.lower())
            
            # if line contains at least number keywords to match and pattern is found
            num_keywords_matched = len(set(award).intersection(set(line.lower().split())))
            if (match and num_keywords_matched >= num_keywords_to_match) and match_target_word:
                # reward longer match
                weight = 10 if any('win' in tup for tup in match) else 1
                ratio = num_keywords_matched**5
                weight *= ratio

                tags = identify_entities(line, stop_words)

                if verbose:
                    # print the first 5 occurrences
                    if num < 5:
                        print(match)
                        print(line)
                        print(tags.keys())
                        print()

                for entity in tags.keys():
                    # add more weights for appropriate entity classification
                    if target_word in ['actor', 'actress', 'director'] and tags[entity] == 'PERSON':
                        weight += 10
                    if target_word not in ['actor', 'actress', 'director'] and tags[entity] == 'PERSON':
                        weight -= 10
                    if entity not in entity_freq_dict:
                        entity_freq_dict[entity] = weight
                    else:
                        entity_freq_dict[entity] += weight
                num += 1
    else:
        for line in cleansed_data:
            match = re.findall(pattern, line.lower())
            
            # if line contains at least number keywords to match and pattern is found
            num_keywords_matched = len(set(award).intersection(set(line.lower().split())))
            if match and num_keywords_matched == num_keywords_to_match:
                weight = 10 if any('win' in tup for tup in match) else 1
                weight *= num_keywords_matched
                 
                tags = identify_entities(line, stop_words)

                if verbose:
                    # print the first 5 occurrences
                    if num < 5:
                        print(match)
                        print(line)
                        print(tags.keys())
                        print()

                for entity in tags.keys():
                    # add more weights for appropriate entity classification
                    if target_word in ['actor', 'actress', 'director'] and tags[entity] == 'PERSON':
                        weight += 10
                    if target_word not in ['actor', 'actress', 'director'] and tags[entity] == 'PERSON':
                        weight -= 10
                    if entity not in entity_freq_dict:
                        entity_freq_dict[entity] = weight
                    else:
                        entity_freq_dict[entity] += weight
                num += 1
    
    # if no results found, recursively add more keywords and reduce num_keywords_to_match
    while(num == 0 or num < 10):
        print('no results found or too few matches! add more keywords and reduce num_keywords_to_match!')
                
        num_keywords_to_match -= 1
        print("num keywords to match:", num_keywords_to_match)
        if num_keywords_to_match == 0:
            print('break loop!')
            break
        
#         print('award before', award)
        # add alternative word for 'tv'
#         if 'tv' in award and 'television' not in award:
#             award.append('television')
        # add alternative word for 'Motion Picture'
#         if 'motion picture' in awards[award_index] and 'movie' not in award:
#             award.append('movie')
#         print('expanded award with more keywards', award)
        
        if flag == 1:
            if target_word:
                print("matching target word '{0}'".format(target_word))
            for line in cleansed_data:
                match = re.findall(pattern, line.lower())
                match_target_word = re.findall(target_word_pattern, line.lower())
                
                # if line contains at least number keywords to match and pattern is found
                num_keywords_matched = len(set(award).intersection(set(line.lower().split())))
                if num_keywords_matched >= num_keywords_to_match and match and match_target_word:
                    weight = 10 if any('win' in tup for tup in match) else 1
                    ratio = num_keywords_matched**5
                    weight *= ratio
                    
                    tags = identify_entities(line, stop_words)

                    if verbose:
                        # print the first 5 occurrences
                        if num < 5:
                            print(match)
                            print(line)
                            print(tags.keys())
                            print()

                    for entity in tags.keys():
                        # add more weights for appropriate entity classification
                        if target_word in ['actor', 'actress', 'director'] and tags[entity] == 'PERSON':
                            weight += 10
                        if target_word not in ['actor', 'actress', 'director'] and tags[entity] == 'PERSON':
                            weight -= 10
                        if entity not in entity_freq_dict:
                            entity_freq_dict[entity] = weight
                        else:
                            entity_freq_dict[entity] += weight
                    num += 1
        else:
            for line in cleansed_data:
                match = re.findall(pattern, line.lower())
                # if line contains at least number keywords to match and pattern is found
                num_keywords_matched = len(set(award).intersection(set(line.lower().split())))
                if num_keywords_matched == num_keywords_to_match and match:
                    
                    weight = 10 if any('win' in tup for tup in match) else 1
                    weight *= num_keywords_matched
                             
                    tags = identify_entities(line, stop_words)

                    if verbose:
                        # print the first 5 occurrences
                        if num < 5:
                            print(match)
                            print(line)
                            print(tags.keys())
                            print()

                    for entity in tags.keys():
                        # add more weights for appropriate entity classification
                        if target_word in ['actor', 'actress', 'director'] and tags[entity] == 'PERSON':
                            weight += 10
                        if target_word not in ['actor', 'actress', 'director'] and tags[entity] == 'PERSON':
                            weight -= 10
                        if entity not in entity_freq_dict:
                            entity_freq_dict[entity] = weight
                        else:
                            entity_freq_dict[entity] += weight
                    num += 1
    print('num of matches:', num)
    return entity_freq_dict

# Predictions for Award Winners

In [28]:
from fuzzywuzzy import fuzz

def remove_goldeb_globes(top_results, entity_freq_dict):
#     golden_globes = [name for name in [pair[0] for pair in top_results] if fuzz.ratio(name.lower(), 'goldenglobes') > 60]
#     for name in golden_globes:
#         if name in entity_freq_dict:
#             del entity_freq_dict[name]
    
    for name in [pair[0] for pair in top_results]:
        if fuzz.ratio(name.lower(), 'goldenglobes') > 60:
            if name in entity_freq_dict:
                del entity_freq_dict[name]
    
    for name in [pair[0] for pair in top_results]:
        if len(name) < 2 or name.lower == 'the':
            if name in entity_freq_dict:
                del entity_freq_dict[name]
    return entity_freq_dict


def filter_names(pair_list, entity_freq_dict):
    stop_words = set(stopwords.words('english'))
    filtered_results = []
    for pair in pair_list:
        string = ''.join(pair[0].split())
        # remove all lower cases examples or examples that contain digit(s), which are not names
        if not all(char.islower() for char in string) and not any(char.isdigit() for char in string):
            # remove single stopwords
            if len(pair[0].split()) == 1 and pair[0].lower() in stop_words and pair[0] in entity_freq_dict:
                del entity_freq_dict[pair[0]]
            # remove single character
            elif len(pair[0]) == 1 and pair[0] in entity_freq_dict:
                del entity_freq_dict[pair[0]]
            else:
                filtered_results.append(pair)
        else:
            if pair[0] in entity_freq_dict:
                del entity_freq_dict[pair[0]]
    return filtered_results, entity_freq_dict


def merge_names(top_results, entity_freq_dict):
    names = [pair[0] for pair in top_results]
    names_clusters = []
    # form clusters from the top candidates
    for name in names:
        # each name starts as a cluster
        cluster = [name]
        names_to_reduce = names[:]
        names_to_reduce.remove(name)
        # one vs. all comparisons
        for i in names_to_reduce:
            ratio = fuzz.ratio(name.lower(), i.lower())
            # if similarity is larger than 75 or one name is contained in the other name
            if ratio > 75 or re.search(name, i, flags=re.IGNORECASE) or re.search(i, name, flags=re.IGNORECASE):
                cluster.append(i)
        # if multiple names are identified in one cluster
        if len(cluster) > 1:
            names_clusters.append(cluster)

    # find names clusters that should merge
    # ['Amy Poehler', 'Amy', 'Amy Poelher']
    # ['Tina', 'Tina Fey']

    # sort clusters
    names_clusters.sort()
    # sort within each cluster
    names_clusters = ['|'.join(sorted(cluster)) for cluster in names_clusters]
    # remove overlaps
    names_clusters_reduced = [line.split('|') for line in list(set(names_clusters))]
    # sort by length from shortest to longest (merge from the shortest)
    names_clusters_reduced.sort(key=len)

    # weighted frequency of an entity is defined by its frequency multiplied by its string length
    def weighted_freq(element):
        return entity_freq_dict[element] * len(element)

    e = entity_freq_dict.copy()
    for cluster in names_clusters_reduced:
        # select the entity name with highest weighted frequency
        selected_entity_name = max(cluster, key=weighted_freq)
        cluster.remove(selected_entity_name)
        # for names to be merged to the selected entity name
        for name in cluster:
            # if not deleted in previous cases, cumulate frequencies to the selected entity
            if name in e and selected_entity_name in e:
                e[selected_entity_name] += e[name]
                # reward merging
#                 e[selected_entity_name] += round(e[selected_entity_name]*1/(names.index(selected_entity_name)+1-0.88))
#                 if top_results[0][1] < 2*top_results[1][1]:
#                     selected_name_i = names.index(selected_entity_name)
#                     name_i = names.index(name)
#                     if selected_name_i < 4:
#                         if name_i >= 4:
#                             # reward strength is proportional to indices' distance
#                             e[selected_entity_name] += e[name]*abs(selected_name_i - name_i)*1.5
#                         else:
#                             e[selected_entity_name] += e[name]*0.5
                del e[name]

    top_10 = sorted(e.items(), key=lambda pair: pair[1], reverse=True)[:10]
    return top_10

In [29]:
%%time

# test by changing key
if year == 2013:
    key = 14
    key = 25
elif year == 2015:
    key = 25
    key = 21
    
awards_reduced_copy = awards_reduced[:]

entity_freq_dict = find_award_winner(awards, award_num_keywords_map, awards_reduced_copy, key, stop_words, verbose=True)   
top_results = sorted(entity_freq_dict.items(), key=lambda pair: pair[1], reverse=True)[:100]
# remove 'golden globes' from identified host names
entity_freq_dict = remove_goldeb_globes(top_results, entity_freq_dict)

top_results = sorted(entity_freq_dict.items(), key=lambda pair: pair[1], reverse=True)[:20]
# filter for names if necessary
if 'actor' in awards_reduced[key] or 'actress' in awards_reduced[key] or 'director' in awards_reduced[key]:
    top_results, entity_freq_dict = filter_names(top_results, entity_freq_dict)
print('top results:')
print(top_results)
print('\ntop results after merging:')
top_10 = merge_names(top_results, entity_freq_dict)
print(top_10)

21
Predicting for: best mini-series or motion picture made for television
true winner: fargo
award name reduced: ['mini', 'series', 'motion', 'picture', 'tv', 'television']
num keywords to match: 2
matching target word 'tv'
['best']
  Best Actor in a TV Series  Musical or Comedy  Jeffrey Tambor  Transparent 
dict_keys(['Jeffrey Tambor Transparent'])

['best', 'congrat']
The Affair Snags The Golden Globe For Best TV Drama Series   Congrats to The Affair 
The fabulou         
dict_keys(['The Affair Snags The', 'The Affair'])

['congrat', 'win', 'best']
Congratulations to      Winner for Best Supporting Actor in a Series  Mini Series or TV Movie  So well deserved  
dict_keys([])

['best']
Best tv series drama goes to         
dict_keys([])

['best']
Best Actress in a TV Series  Comedy    Gina Rodriguez    TAKE THAT GOLDEN GLOBE HOME YAAAAAAAAS
dict_keys(['Gina Rodriguez', 'HOME'])

num of matches: 949
top results:
[('Maggie Gyllenhaal', 63185), ('Fargo', 49032), ('Matt Bomer', 46428), ('B

In [30]:
%%time

award_winner_dict = {}
for key in award_num_keywords_map.keys():
    entity_freq_dict = find_award_winner(awards, award_num_keywords_map, awards_reduced, key, stop_words, verbose=False)
    top_results = sorted(entity_freq_dict.items(), key=lambda pair: pair[1], reverse=True)[:100]
    # remove 'golden globes' from identified host names
    entity_freq_dict = remove_goldeb_globes(top_results, entity_freq_dict)
    top_results = sorted(entity_freq_dict.items(), key=lambda pair: pair[1], reverse=True)[:30]
    # filter for names if necessary
    if 'actor' in awards_reduced[key] or 'actress' in awards_reduced[key] or 'director' in awards_reduced[key]:
        top_results, entity_freq_dict = filter_names(top_results, entity_freq_dict)
    print('top results:')
    print(top_results)
    print('\ntop results after merging:')
    top_10 = merge_names(top_results, entity_freq_dict)
    print(top_10)
    
    if len(top_10) != 0:
        award_winner_dict[awards[key].lower()] = top_10[0][0]
    else:
        award_winner_dict[awards[key].lower()] = ''
    print()

for item in award_winner_dict.items():
    print(item)

0
Predicting for: cecil b. demille award
true winner: george clooney
award name reduced: ['cecil', 'demille']
num keywords to match: 2
matching target word 'cecil'
num of matches: 31
top results:
[('George Clooney', 256), ('Clooney', 42), ('King', 40), ('80s', 40), ('tonight', 40), ('2015', 40), ('Lifetime Achievement', 20), ('Hollywood Press', 20), ('the 2015', 20), ('The ha', 20), ('second', 20), ('Walt Disney', 20), ('1953', 20), ('NBC Photo Bank', 20), ('Somehow', 20), ("George Clooney's", 20), ('Facts of Life', 20), ('Roseanne', 20), ('Hollywood', 20), ('Fun', 20), ('Righteous', 20), ('today', 20), ('George', 4), ('Bruce Wayne', 2), ('Brains', 2), ('Leslie', 2), ('Ideal', 2), ('Jim Carrey', 2)]

top results after merging:
[('George Clooney', 322), ('2015', 60), ('King', 40), ('80s', 40), ('Hollywood Press', 40), ('tonight', 40), ('Lifetime Achievement', 20), ('The ha', 20), ('second', 20), ('Walt Disney', 20)]

1
Predicting for: best motion picture - drama
true winner: boyhood
awa

num of matches: 147
top results:
[('Leviathan', 862), ('Russia', 322), ('to Leviathan', 80), ('Russian', 70), ('2015', 46), ("Leviathan'", 40), ('Putin', 40), ("Leviathan' ABC News", 40), ("Masha Gessen's", 40), ('tonight', 24), ('IDA', 22), ('first', 20), ('Brutal', 20), ('Eager', 20), ("Russia's '", 20), ('Damning', 20), ('Trailer', 20), ('Me', 20), ('Welll', 20), ('one', 20), ('U S', 20), ('News', 20), ('Russians', 20), ('George P', 20), ('Russia⁰', 20), ('Free', 20), ('Leviathan Crushes', 20), ('World', 20), ('READ', 20), ('Richard Linklater', 20)]

top results after merging:
[('Leviathan', 1062), ('Russia', 452), ('2015', 46), ('Putin', 40), ("Masha Gessen's", 40), ('tonight', 24), ('IDA', 22), ('first', 20), ('Brutal', 20), ('Eager', 20)]

9
Predicting for: best performance by an actress in a supporting role in a motion picture
true winner: patricia arquette
award name reduced: ['actress', 'supporting', 'motion', 'picture']
num keywords to match: 4
matching target word 'actress'


num of matches: 330
top results:
[('Gina Rodriguez', 237848), ('Transparent', 196124), ('Jeffrey Tambor', 134424), ('Amazon', 118735), ('Transparent The', 40960), ('first', 35580), ('Jane the Virgin', 35146), ('Los Angeles Times', 30720), ('her 1st', 20480), ('Transparent the', 20480), ('to Gina Rodriguez', 20480), ('2nd', 20480), ('HOLY', 20480), ('Transparent Amazon Instant', 20480), ('The Virgin', 16627), ('Jane', 16454), ('Jane The Virgin', 14336), ('the Virgin', 13764), ('CW', 11264), ('VIRGIN', 11264), ('Kudos', 10240), ('Woohoo', 10240), ('PerezHilton', 10240), ('tonight', 10240), ('WOW', 10240), ("Associated Press '", 10240), ('to', 10240), ('year', 10240), ('Sun', 10240), ('Powe', 10240)]

top results after merging:
[('Transparent', 396779), ('Gina Rodriguez', 258328), ('Jeffrey Tambor', 134424), ('Jane the Virgin', 107591), ('first', 35580), ('Los Angeles Times', 30720), ('her 1st', 20480), ('2nd', 20480), ('tonight', 20480), ('HOLY', 20480)]

19
Predicting for: best performa

num of matches: 244
top results:
[('Matt Bomer', 733072), ('Billy Bob Thornton', 263765), ('The Normal Heart', 222174), ('Fargo', 122400), ('Miniseries', 88064), ('Billy Bob Thorton', 86560), ('J K Simmons', 76800), ('Whiplash', 56320), ('JK Simmons', 34816), ('NORMAL', 16807), ('K Simmons', 11264), ("'The Normal Heart", 10848), ('First', 10240), ('Live', 10240), ('J K', 10240), ('EW', 10240), ("'Fargo'", 10240), ('One MIniseries', 10240), ('Gay', 10240), ('Jeremy Renner', 10240), ('J Lo', 10240), ('State Farm', 10240), ('Sup', 7776), ('PHOTO', 6250)]

top results after merging:
[('Matt Bomer', 733072), ('Billy Bob Thornton', 350325), ('The Normal Heart', 249829), ('J K Simmons', 133120), ('Fargo', 132640), ('Miniseries', 98304), ('Whiplash', 56320), ('First', 10240), ('Live', 10240), ('EW', 10240)]

('cecil b. demille award', 'George Clooney')
('best motion picture - drama', 'Boyhood')
('best performance by an actress in a motion picture - drama', 'Julianne Moore')
('best performance 