In [38]:
import pandas as pd
import re
import nltk
from fuzzywuzzy import fuzz
import json

In [39]:
# Extra tokens appended to NLTK's English stopword list below: punctuation marks,
# URL fragments ('http', '://', 'co') and words such as 'Golden' / 'Globes'.
punctuation_stopword = [".", '"', ",", "?", "!", "/", "'", "-", "_", ";", ":", "&", ',"', '",', ")", "(", "Golden", "Globes", "@", "GoldenGlobes", "I", "we", "http", "://", "/", "co", "Hollywood", "Hooray"]
# Substrings that isHumor() looks for; the check is a plain, case-sensitive substring test.
humor_keywords = ['haha', 'lol','hh','233','funny','joke','hilarious','comedian', 'best joke', 'hysterical']
# NLTK English stopwords plus the custom tokens above.
# NOTE(review): `stopwords` is not referenced anywhere else in the visible cells — confirm it is still needed.
stopwords = nltk.corpus.stopwords.words('english') + punctuation_stopword

In [40]:
retweets_freq_dict = {}
def remove_retweet_prefix(line):
    """Replace every 'RT @user: ' marker in ``line`` with a single space.

    Side effect: the first marker found (if any) is tallied in the
    module-level ``retweets_freq_dict``. The key is the matched text with
    its first four characters ('RT @') cut off, so it still carries the
    trailing ': ' (e.g. 'abc: ').

    :param line: raw tweet text
    :return: the text with all retweet markers replaced by ' '
    """
    # 'RT @abc: ' where abc is any run of word characters, apostrophes or slashes
    rt_regex = re.compile(r'\bRT @([\w\'/]*)\b: ')

    found = rt_regex.search(line)
    if found is not None:
        key = found.group(0)[4:]
        retweets_freq_dict[key] = retweets_freq_dict.get(key, 0) + 1

    return rt_regex.sub(' ', line)

In [41]:
hashtag_freq_dict = {}
def remove_hashtag(line):
    """Replace every '#tag' in ``line`` with a single space.

    Side effect: each tag found (text after the '#') is counted in the
    module-level ``hashtag_freq_dict``.

    :param line: tweet text
    :return: the text with all hashtags replaced by ' '
    """
    tag_regex = re.compile(r'#([\w\'/]*)\b')

    # findall yields only the captured group, i.e. the tag without its '#'
    for tag in tag_regex.findall(line):
        hashtag_freq_dict[tag] = hashtag_freq_dict.get(tag, 0) + 1

    return tag_regex.sub(' ', line)

In [42]:
at_freq_dict = {}
def remove_at(line):
    """Replace every '@mention' in ``line`` with a single space.

    Side effect: each mentioned handle (text after the '@') is counted in
    the module-level ``at_freq_dict``.

    :param line: tweet text
    :return: the text with all mentions replaced by ' '
    """
    mention_regex = re.compile(r'@([\w\'/]*)\b')

    # findall yields only the captured group, i.e. the handle without its '@'
    for handle in mention_regex.findall(line):
        at_freq_dict[handle] = at_freq_dict.get(handle, 0) + 1

    return mention_regex.sub(' ', line)


In [43]:
def cleanse(line):
    """Blank out punctuation in ``line``.

    Every character that is not a word character, whitespace, an
    apostrophe, '#' or '@' is replaced by a single space.
    """
    disallowed = re.compile(r'[^\w\s\'#@]')
    return disallowed.sub(' ', line)

In [44]:
def remove_apostrophe(text):
    """Drop a trailing possessive "'s" from ``text``.

    When the suffix is present the remainder is also stripped of
    surrounding whitespace; otherwise ``text`` is returned untouched.
    """
    return text[:-2].strip() if text.endswith("'s") else text

In [45]:
def remove_url(line):
    """Replace every http/https URL in ``line`` with a single space.

    The substitution is done with the compiled URL pattern itself. The
    matched URL text is never re-used as a regular expression, because URLs
    routinely contain regex metacharacters ('?', '+', '(', '[', ...) that
    would either raise ``re.error`` or fail to match the literal URL.

    :param line: tweet text
    :return: the text with all URLs replaced by ' '
    """
    pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\b')
    return pattern.sub(' ', line)

In [46]:
import spacy
# Load the spaCy English pipeline once at module level; identify_entities() calls this `nlp` object.
# NOTE(review): the 'en' shortcut name is presumably only accepted by older spaCy releases —
# confirm against the installed version before upgrading.
nlp = spacy.load('en')

In [47]:
def identify_entities(data):
    """Run the module-level spaCy pipeline over ``data`` and collect its entities.

    :param data: text to analyse
    :return: dict mapping each entity's text (its tokens joined by single
        spaces, then stripped) to a one-element list holding the entity label.
        When the same text occurs more than once, the label of the first
        occurrence is kept.
    """
    tags = {}
    for entity in nlp(data).ents:
        # The dict is keyed by text, so the duplicate check must use the text
        # too; testing the span object against string keys never matches.
        text = ' '.join(t.orth_ for t in entity).strip()
        if text not in tags:
            tags[text] = [entity.label_]
    return tags

In [48]:
def isHumor(text):
    """Return True when ``text`` contains any entry of ``humor_keywords`` as a substring.

    The check is case-sensitive and matches anywhere in the text, including
    inside longer words.
    """
    return any(keyword in text for keyword in humor_keywords)

In [49]:
def find_entity(data,verbose=False):
    """Count named entities appearing in humor-related tweets.

    Each tweet is stripped of retweet markers, hashtags, mentions, URLs and
    punctuation (in that order). Tweets whose cleaned text passes isHumor()
    are run through identify_entities(); every distinct entity in a tweet
    (after removing a trailing "'s") that is longer than one character adds
    one to its count.

    :param data: iterable of tweet texts
    :param verbose: accepted for compatibility; currently has no effect
    :return: dict mapping entity text to the number of humor tweets it appeared in
    """
    entity_freq_dict = {}
    num = 0  # number of tweets that matched a humor keyword

    for tweet in data:
        line = remove_retweet_prefix(tweet)
        line = remove_hashtag(line)
        line = remove_at(line)
        line = remove_url(line)
        line = cleanse(line)

        if not isHumor(line):
            continue

        for raw_entity in identify_entities(line).keys():
            entity = remove_apostrophe(raw_entity)
            if len(entity) <= 1:
                continue
            # tried adding more weights to 'PERSON' tags but results are not good
            entity_freq_dict[entity] = entity_freq_dict.get(entity, 0) + 1
        num += 1

    print('num of matches:', num)
    return entity_freq_dict

In [50]:
# Load gg2013.json and run the humor-entity extraction over its 'text' column.
df = pd.read_json(path_or_buf='gg2013.json')
data = df['text']
# entity_freq_dict maps entity text -> count; later cells read and mutate it in place.
entity_freq_dict = find_entity(data,verbose=True)

num of matches: 6458


In [51]:
import pprint
# ten most frequent entities, highest count first
top_10 = sorted(entity_freq_dict.items(), key=lambda item: item[1], reverse=True)[:10]
names = [label for label, _count in top_10]
# entities whose text is a fuzzy match (ratio above 50) for the ceremony title itself
golden_globes = [
    candidate
    for candidate in names
    if fuzz.ratio(candidate.lower(), 'golden globes') > 50
]
pprint.pprint(golden_globes)

[]


In [52]:
# we also consider dropping all lower cases examples or examples that contain digit(s), which are not names
def filter_names(entity_freq_dict,pair_list):
    """Keep only the pairs whose text could plausibly be a name.

    A pair is rejected when its text, with all whitespace removed, consists
    solely of lowercase characters (this includes the empty string) or
    contains any digit. Rejected texts are also deleted from
    ``entity_freq_dict`` in place when present.

    :param entity_freq_dict: dict of entity text -> count; mutated
    :param pair_list: iterable of (text, count) pairs
    :return: list of the pairs that were kept, in their original order
    """
    kept = []
    for pair in pair_list:
        label = pair[0]
        compact = ''.join(label.split())
        is_all_lower = all(ch.islower() for ch in compact)
        has_digit = any(ch.isdigit() for ch in compact)
        if is_all_lower or has_digit:
            entity_freq_dict.pop(label, None)
            continue
        kept.append(pair)
    return kept

In [53]:
# drop entities that merely echo the ceremony title
for name in golden_globes:
    entity_freq_dict.pop(name, None)

# re-rank after the removal, then discard all-lowercase / digit-bearing entries
top_results = filter_names(
    entity_freq_dict,
    sorted(entity_freq_dict.items(), key=lambda pair: pair[1], reverse=True)[:10],
)

In [54]:
# Group the surviving top entities into clusters of names that look like variants
# of one another (e.g. a first name and the matching full name).
names = [pair[0] for pair in top_results]
names_clusters = []

for name in names:
    # each name starts as a cluster
    cluster = [name]
    names_to_reduce = names[:]
    names_to_reduce.remove(name)

    # one vs. all comparisons
    for i in names_to_reduce:
        ratio = fuzz.ratio(name.lower(), i.lower())
        # if similarity is larger than 75 or one name is contained in the other name
        # NOTE(review): `name` and `i` are passed to re.search unescaped, i.e. as regex patterns.
        if ratio > 75 or re.search(name, i, flags=re.IGNORECASE) or re.search(i, name, flags=re.IGNORECASE):
            cluster.append(i)

    # if multiple names are identified in one cluster
    if len(cluster) > 1:
        names_clusters.append(cluster)

    # printed for every name, including single-name clusters
    print(cluster)

# find names clusters that should merge

# sort clusters
names_clusters.sort()
# sort within each cluster and serialise it to a '|'-joined string so it becomes hashable
names_clusters = ['|'.join(sorted(cluster)) for cluster in names_clusters]
# remove duplicate clusters; set iteration order is not defined, so the order of
# same-size clusters can differ between runs
names_clusters_reduced = [line.split('|') for line in list(set(names_clusters))]
# sort by number of names per cluster, smallest first (merge from the smallest)
names_clusters_reduced.sort(key=len)
print('\nnames clusters to merge:')
pprint.pprint(names_clusters_reduced)

['Amy Poehler', 'Amy']
['Tina Fey', 'Tina']
['Tina', 'Tina Fey']
['Amy', 'Amy Poehler']
['Will Ferrell']
['Kristen Wiig']
['Taylor Swift']
['Tonight']
['Congratulations Lena Dunham']

names clusters to merge:
[['Tina', 'Tina Fey'], ['Amy', 'Amy Poehler']]


In [56]:
# weighted frequency of an entity is defined by its frequency multiplied by its string length
def weighted_freq(element):
    """Return the count of ``element`` in ``entity_freq_dict`` times the length of its text."""
    return entity_freq_dict[element] * len(element)

# Merge each cluster into a single representative entry. `e` is rebuilt from
# entity_freq_dict and the clusters are left unmodified, so this cell can be
# re-run without changing its result.
e = entity_freq_dict.copy()
for cluster in names_clusters_reduced:
    # the representative is the name with the highest weighted frequency
    selected_entity_name = max(cluster, key=weighted_freq)
    # fold every other name of the cluster into the representative
    for name in cluster:
        if name == selected_entity_name:
            continue
        # if not deleted in previous cases, cumulate frequencies to the selected entity
        if name in e and selected_entity_name in e:
            e[selected_entity_name] += e[name]
            del e[name]

In [62]:
# Three entries of the merged dict `e` with the highest counts, as (name, count) pairs, highest first.
h1 = sorted(e.items(), key=lambda pair: pair[1], reverse=True)[:3]
print(h1)

[('Amy Poehler', 1918), ('Tina Fey', 1691), ('Will Ferrell', 236)]


In [30]:
# top 2 inferences for humors, taken from the ranked (name, count) pairs in h1
best_host_prediction = [name for name, _freq in h1][:2]
json.dumps({'Humor': best_host_prediction})

'{"Humor": ["Amy Poehler", "Tina Fey"]}'

In [64]:
def _cluster_similar_names(names):
    """Group names that look like variants of one another.

    Two names are linked when their lowercase fuzz.ratio exceeds 75 or when
    one occurs inside the other (case-insensitive, literal match). Every
    per-name cluster is printed; clusters with more than one member are
    de-duplicated and returned smallest first, in a deterministic order.
    """
    clusters = []
    for name in names:
        # each name starts as a cluster
        cluster = [name]
        for other in names:
            if other == name:
                continue
            ratio = fuzz.ratio(name.lower(), other.lower())
            # names are escaped so they are matched literally, not as regex patterns
            if (ratio > 75
                    or re.search(re.escape(name), other, flags=re.IGNORECASE)
                    or re.search(re.escape(other), name, flags=re.IGNORECASE)):
                cluster.append(other)

        if len(cluster) > 1:
            clusters.append(cluster)

        print(cluster)

    # sorted tuples make clusters hashable; sorting the set fixes the order
    unique_clusters = sorted({tuple(sorted(cluster)) for cluster in clusters})
    reduced = [list(cluster) for cluster in unique_clusters]
    # merge from the smallest cluster (stable sort keeps the order above for ties)
    reduced.sort(key=len)
    return reduced


def _merge_clusters(entity_freq_dict, clusters):
    """Return a copy of ``entity_freq_dict`` with each cluster folded into one entry.

    The surviving name of a cluster is the one with the highest weighted
    frequency (count multiplied by text length); the counts of the other
    names are added to it and their entries removed.
    """
    def weighted_freq(element):
        return entity_freq_dict[element] * len(element)

    merged = entity_freq_dict.copy()
    for cluster in clusters:
        selected_entity_name = max(cluster, key=weighted_freq)
        for name in cluster:
            if name == selected_entity_name:
                continue
            # if not deleted in previous cases, cumulate frequencies to the selected entity
            if name in merged and selected_entity_name in merged:
                merged[selected_entity_name] += merged[name]
                del merged[name]
    return merged


def getHumor(year):
    """Return the three entities most associated with humor tweets for ``year``.

    Reads 'gg<year>.json', counts entities in humor-related tweets, removes
    entities that fuzzy-match 'golden globes', filters out non-name entries,
    merges name variants and ranks the result.

    :param year: value appended to 'gg' to form the JSON file name
    :return: list of up to three (name, count) pairs, highest count first
    """
    df = pd.read_json(path_or_buf='gg'+str(year)+'.json')
    data = df['text']
    entity_freq_dict = find_entity(data,verbose=True)

    # drop top entities that merely echo the ceremony title
    top_10 = sorted(entity_freq_dict.items(), key=lambda pair: pair[1], reverse=True)[:10]
    for name, _count in top_10:
        if fuzz.ratio(name.lower(), 'golden globes') > 50:
            entity_freq_dict.pop(name, None)

    # re-rank, then discard all-lowercase / digit-bearing entries
    top_results = sorted(entity_freq_dict.items(), key=lambda pair: pair[1], reverse=True)[:10]
    top_results = filter_names(entity_freq_dict,top_results)

    names = [pair[0] for pair in top_results]
    names_clusters_reduced = _cluster_similar_names(names)
    merged = _merge_clusters(entity_freq_dict, names_clusters_reduced)

    top_3 = sorted(merged.items(), key=lambda pair: pair[1], reverse=True)[:3]
    return top_3

In [63]:
# Print only the names from the ranked (name, count) pairs in h1.
print("The best joke in 2013 is said by:")
for item in h1:
    print(item[0])

The best joke in 2013 is said by:
Amy Poehler
Tina Fey
Will Ferrell


In [65]:
# Run the full pipeline for 2015 (reads 'gg2015.json') and print the top names.
h2 = getHumor(2015)
print("The best joke in 2015 is said by:")
for item in h2:
    print(item[0])

num of matches: 40196
['Amy', 'Amy Poehler']
['Tina', 'Tina Fey']
['Cosby', 'Bill Cosby']
['Jeremy Renner']
['Bill Cosby', 'Cosby']
['Amy Poehler', 'Amy']
['Tina Fey', 'Tina']
['North Korea']
['Oprah']
The best joke in 2015 is said by:
Bill Cosby
Amy Poehler
Tina
