In [3]:
import pandas as pd
import re
from functools import partial
from ourtypes.category import Category
from ourtypes.award import Award
import pickle 
import numpy
from rank_bm25 import BM25Okapi
import spacy
with open("common_words_hosts.pickle", "rb") as pckl:   
    common_words_host_list = pickle.load(pckl)


Set up tweets df

In [6]:
preprocessed_tweets = "gg2013_preprocessed.json"
tweets = pd.read_json(preprocessed_tweets, orient='records', lines=True)[["text","is_english"]]
tweets = tweets[tweets["is_english"]] #only keep english tweets
del tweets["is_english"] #get rid of this row
tweets = tweets.dropna(subset=["text"])

tweets = tweets.map(lambda x: x.lower())

In [7]:
tweets

Unnamed: 0,text
0,whats making sofia vergaras boobs stay like th...
1,rt fabsugar kerry washington is everything dyi...
2,anne hathaway has got me living goldenglobes
3,rt billmc wait whats that smell everyone being...
4,hugh jackman is so awesome goldenglobes
...,...
89416,ben affleck celebrates his win backstage golde...
89417,rt authorvivianna i was sad that mandy patinki...
89418,golden globes lots of fashion messesbut glad a...
89419,did they have mug shots at the golden globes d...


In [3]:
#get hosts for entire ceremony
#get award names for the ceremony

In [8]:
def remove_common_words(text, word_list):
    #negative positive golden globes
    words = [word for word in text.split(" ") if word not in word_list]
    return " ".join(words)

In [9]:
def remove_rt(text):
    text = text.split(" ")
    
    if "rt" in text:
        i = text.index("rt")
        return remove_rt(" ".join(text[0:i]) + " " + " ".join(text[i+2:]))
    return " ".join(text)

function to get hosts

In [10]:
def hosts_helper(text, host_funcs, cohost_funcs, single_cohost_funcs, hosts):
    #check for cohosts first
    for func in cohost_funcs:
        m = func.search(text)
        if m is not None:
            name1 = remove_common_words(m["name1"], common_words_host_list)
            name2 = remove_common_words(m["name2"], common_words_host_list)
            hosts.vote_contender(name1, cocontender=name2)
            hosts.vote_contender(name2, cocontender=name1)
            return name1, name2
    #check for hosts
    for func in host_funcs:
        m = func.search(text)
        if m is not None:
            name = remove_common_words(m["name"], common_words_host_list)
            hosts.vote_contender(name)
            return name
    for func in single_cohost_funcs:
        m = func.search(text)
        if m is not None:
            name = remove_common_words(m["name"], common_words_host_list)
            hosts.vote_contender(name)
            return name
    return None

In [11]:
def get_hosts(tweets):
    hosts = Category(type="hosts") #our host storage object
    host_tweets = tweets[tweets["text"].str.contains("host")] #get tweets with the word host in it
    #del host_tweets["timestamp"] #we don't need this 
    host_tweets = host_tweets.map(remove_rt) #get rid of retweets
    #handle single names or no spaces?
    host_patterns = ['host (?P<name>[a-z]+ [a-z]+)', '(?P<name>[a-z]+ [a-z]+) (hosting|is hosting|will host|hosts|hosted)']
    cohost_patterns = ['(co-?|)hosts (?P<name1>[a-z]+ [a-z]+)( and | )(?P<name2>[a-z]+ [a-z]+)',
                    '(?P<name1>[a-z]+ [a-z]+)( and | )(?P<name2>[a-z]+ [a-z]+) (are (co-?|)hosting|will (co-?|)host|(co-?|)host|(co-?|)hosting|hosted)[^s]']
    single_cohost_patterns = ['cohost (?P<name>[a-z]+ [a-z]+)',
                    '(?P<name>[a-z]+ [a-z]+) (is cohosting|will cohost|cohosts|cohosting|cohosted)[^s]']
    #compile regex functions
    host_funcs = [re.compile(pat) for pat in host_patterns]
    cohost_funcs = [re.compile(pat) for pat in cohost_patterns]
    single_cohost_funcs = [re.compile(pat) for pat in single_cohost_patterns]

    #apply helper function 
    host_tweets = host_tweets.map(partial(hosts_helper, host_funcs=host_funcs, cohost_funcs=cohost_funcs, single_cohost_funcs=single_cohost_funcs, hosts=hosts))
    del host_tweets
    return hosts

In [12]:
h = get_hosts(tweets)
vc = h.total_votes()
#match misspelling and single name 

In [13]:
vc

[('amy poehler', 295.5, ('tinafey', 131)),
 ('tina fey', 253.5, ('amypoehler', 129)),
 ('kristen wiig', 79.0, ('willferrell', 55)),
 ('will ferrell', 69.5, ('kristenwiig', 54)),
 ('should', 49, ('', 19)),
 ('amp amy', 38, ('poehler', 18)),
 ('will', 26, ('', 4)),
 ('poehler should', 26, ('amy', 20)),
 ('amy should', 14, ('tina', 11)),
 ('fey amp', 13, ('amypoehler', 11)),
 ('amy pohler', 12, ('tinafey', 8)),
 ('people', 12, ('', 6)),
 ('killing', 10.5, ('', 6)),
 ('doing', 8.0, ('', 9)),
 ('quite', 8, ('charming', 6)),
 ('rickygervais', 7.5, ('', 2)),
 ('her', 7, ('reviews', 2)),
 ('ladies', 6.5, ('', 3)),
 ('tina amp', 6, ('amy', 3)),
 ('please', 6, ('', 4)),
 ('fey should', 6, ('tina', 6)),
 ('wiig should', 6, ('kristen', 4)),
 ('grace', 6, ('', 5)),
 ('pohler', 5.5, ('', 3)),
 ('amy poehlers', 5, ('tinafey', 5)),
 ('only', 5, ('heroes', 1)),
 ('tina should', 5, ('amy', 4)),
 ('theyre', 5, ('elmo', 2)),
 ('poehler please', 5, ('amy', 3)),
 ('poehler amp', 5, ('tinafey', 5)),
 ('going

## Part Two
- have awards as input
- get presenters for awards
- get nominees for awards
- get winners for awards

In [9]:
awards_list = ["best performance by an actor in a television series - comedy or musical", 
          "best performance by an actor in a television series - drama",
          "best performance by an actor in a motion picture - drama",
          "best performance by an actress in a mini-series or motion picture made for television",
          "best original song - motion picture", 
          "best animated feature film",
"best television series - comedy or musical",
          "best performance by an actor in a mini-series or motion picture made for television", 
          "best television series - drama", 
          "best performance by an actress in a supporting role in a motion picture",
          "best performance by an actor in a supporting role in a series, mini-series or motion picture made for television", 
          "best motion picture - drama",
          "best performance by an actor in a motion picture - comedy or musical", 
          "cecil b. demille award", 
          "best performance by an actress in a motion picture - drama",
          "best performance by an actress in a television series - drama", 
          "best original score - motion picture", 
          "best mini-series or motion picture made for television",
          "best performance by an actress in a motion picture - comedy or musical", 
          "best motion picture - comedy or musical", 
          "best performance by an actress in a supporting role in a series, mini-series or motion picture made for television",
          "best performance by an actor in a supporting role in a motion picture",
          "best foreign language film", 
          "best performance by an actress in a television series - comedy or musical",
          "best director - motion picture", "best screenplay - motion picture"]

"best performance by an actress in a mini-series or motion picture made for television",
"best original song - motion picture", 
          "best animated feature film",
"best television series - comedy or musical",
          "best performance by an actor in a mini-series or motion picture made for television", 
          "best television series - drama", 
          "best performance by an actress in a supporting role in a motion picture",
          "best performance by an actor in a supporting role in a series, mini-series or motion picture made for television", 
          "best motion picture - drama",
          "best performance by an actor in a motion picture - comedy or musical", 
          "cecil b. demille award", 
          "best performance by an actress in a motion picture - drama",
          "best performance by an actress in a television series - drama", 
          "best original score - motion picture", 
          "best mini-series or motion picture made for television",
          "best performance by an actress in a motion picture - comedy or musical", 
          "best motion picture - comedy or musical", 
          "best performance by an actress in a supporting role in a series, mini-series or motion picture made for television",
          "best performance by an actor in a supporting role in a motion picture",
          "best foreign language film", 
          "best performance by an actress in a television series - comedy or musical",
          "best director - motion picture", "best screenplay - motion picture"]

Format awards 

In [10]:
def format_awards(awards): 
    results = {}
    person_patterns = "actor|cecil|actress|direction|director|choreo|perfomance"
    thing_patterns = "series|mini-series|movie|film|music|comedy|song|score|motion picture"
    person_func = re.compile(person_patterns)
    thing_func = re.compile(thing_patterns)
    for award in awards:
        m = person_func.search(award)
        if m is not None:
            results[award] = Award(award, winner_type="Person")
            continue
        m = thing_func.search(award)
        if m is not None:
            results[award] = Award(award, winner_type="Thing")
        else:
            results[award] = Award(award)
    return results
        

In [11]:
def award_aliases(key, val):
    #remove performance by an
    #television -> tv
    #television series -> tv
    #remove feature
    #in a supporting role - supporting actress/actor 
    #film -> movie
    #motion picture -> movie
    key = key.replace(".", "")
    #key = key.replace("made for television", "tv")
    #key = key.replace("movie", "movie film")
    aliases = [key]
    if "performance by an" in key:
        aliases += list(map(lambda x: x.replace("performance by an", ""), aliases))
    if "foreign language" in key:
        aliases += list(map(lambda x: x.replace("foreign language", "foreign"), aliases))
    if "television series" in key:
        aliases += list(map(lambda x: x.replace("television series", "tv"), aliases))
    if "television" in key:
        aliases += list(map(lambda x: x.replace("television", "tv"), aliases))
    if "motion picture" in key:
        aliases += list(map(lambda x: x.replace("motion picture", "movie"), aliases))
    if "feature" in key:
        aliases += list(map(lambda x: x.replace("feature", ""), aliases))
    if "original" in key:
        aliases += list(map(lambda x: x.replace("original", ""), aliases))
    
    if "actor in a supporting role" in key:
        aliases += list(map(lambda x: x.replace("actor in a supporting role", "supporting actor"),aliases))
    elif "actress in a supporting role" in key:
        aliases += list(map(lambda x: x.replace("actress in a supporting role", "supporting actress"), aliases))
    if "motion picture made for television" in key:
        aliases += list(map(lambda x: x.replace("motion picture made for television", "tv movie"),aliases))
    if "director" in key:
        aliases += list(map(lambda x: x[:x.find("-")], aliases))

    aliases += list(map(lambda x: x.replace("-", ""), aliases))
    aliases = list(map(lambda x: x.replace("  ", " "), aliases))
    
    val.add_alias(aliases)

In [12]:
def awards_struct(awards_list):
    awards = format_awards(awards_list)
    for k, v in awards.items():
        award_aliases(k,v)
    return awards

### Winners

add bm25 search as well

In [13]:
tweets = tweets.map(remove_rt)
corpus = tweets.to_numpy()
tokenized_corpus = [str(tweet).split(" ") for tweet in corpus]
bm25 = BM25Okapi(tokenized_corpus)

KeyboardInterrupt: 

In [None]:
spacy_model = spacy.load("en_core_web_sm")

In [None]:
def bm25_search(award_name):
    q = award_name.split(" ")
    scores = bm25.get_scores(q)
    indices = numpy.argsort(scores)
    relevant = tweets.iloc[indices]
    return relevant[-500:]

In [None]:
def winner_stop_words(tweet1, list2=False):
    tweet = str(tweet1)
    toreplace = ["goldenglobes", "anclerts", "award for", "just", "goes to", "mr president", "love him", "for winning",
                 "has known", "finally", "yes", "at the", " is ", "first", " for ", "bazinga rs", "amen",
                 "no surpres", "well deserved", "no surprises", "this generation", " to ", "goldenglobe", "goldenglobes"]
    if list2: toreplace = ["golden", "live", "blog", "globes", "annual", "award for", "goldenglobes", "award", "awards"]
    for ele in toreplace:
        tweet = tweet.replace(ele, " ")
    
    if "http" in tweet:
        tweetList = tweet.split(" ")
        for ele in tweetList:
            if "http" in ele:
                tweetList.remove(ele)
        tweet = " ".join(tweetList)
    if tweet == tweet1:
        tweet = tweet.replace(".", "").replace(" - "," ")
        return tweet.replace("  "," ").replace("  "," ")
    else:
        return winner_stop_words(tweet)

In [None]:
def winner_helper(tweet, v, won_funcs):
    if v.winner_type == "Person": tweet = winner_stop_words(tweet)
    else: tweet = winner_stop_words(tweet, list2=True)
        
    for f in won_funcs:
        m = f.findall(tweet)
        if m != []:
            if isinstance(m[0], tuple):
                for ele in m[0]:
                    if v.winner_type != "Person":
                        i = ele.find("by")
                        if i != -1:
                            ele = ele[:i]
                    else:
                        if "win" in ele or "won" in ele:
                            continue

                    v.add_winner(ele)
                    v.add_nominee(ele)
                    if v.winner_type != "Person": break
            else:
                ele = m[0]
                if v.winner_type != "Person":
                    i = ele.find("by")
                    if i != -1:
                        ele = ele[:i]
                else:
                    if "win" in ele or "won" in ele:
                        continue
                v.add_winner(ele)
                v.add_nominee(ele) 
                if v.winner_type != "Person": break
    return

In [None]:
def nominee_helper(tweet, v, won_funcs):
    if v.winner_type == "Person": tweet = winner_stop_words(tweet)
    else: tweet = winner_stop_words(tweet, list2=True)
        
    for f in won_funcs:
        m = f.findall(tweet)
        if m != []:
            if isinstance(m[0], tuple):
                for ele in m[0]:
                    if v.winner_type != "Person":
                        i = ele.find("by")
                        if i != -1:
                            ele = ele[:i]
                    else:
                        if "win" in ele or "won" in ele:
                            continue

                    v.add_nominee(ele)
                    if v.winner_type != "Person": break
            else:
                ele = m[0]
                if v.winner_type != "Person":
                    i = ele.find("by")
                    if i != -1:
                        ele = ele[:i]
                else:
                    if "win" in ele or "won" in ele:
                        continue
                v.add_nominee(ele) 
                if v.winner_type != "Person": break
    return

In [None]:
def extra_winner_helper(tweet, v):
    spacy_output = spacy_model(tweet)
    for entity in spacy_output.ents:
        if entity.label_ == "PERSON":
            #v.add_winner(entity.text)
            v.add_presenter(entity.text)
    return

In [None]:
"""awards = awards_struct(awards_list)
for k, v in awards.items():
    award_names = v.aliases
    
    won_patterns = [f"{award} (?P<name>[a-z]+ ?[a-z-]+)" for award in award_names]
    won_patterns += [f"(?P<name>[a-z]+ ?[a-z-]+)( wins? | on winning | has? won | got| wins the | ){award}" for award in award_names]
    won_funcs = [re.compile(ele) for ele in won_patterns]
    #presenter_patterns = [f"(?P<name>[a-z]+ ?[a-z]+)( presents? | presenting | are presenting ){award}" for award in award_names]
    #presenter_funcs = [re.compile(ele) for ele in presenter_patterns]
    relevant = bm25_search(award_names[0])
    if v.winner_type == "Person":
        r = relevant.map(partial(winner_helper, v=v, won_funcs=won_funcs))
        if v.winners.contenders == {}:
            relevant = bm25_search(award_names[-1])
            r = relevant.map(partial(extra_winner_helper, v=v))
    else:
        won_patterns = [f"{award} [for ]?(?P<name>[a-z ]+)" for award in award_names]
        won_patterns += [f"(?P<name>[a-z ]+)( wins? | on winning | has? won | got| wins the | ){award}" for award in award_names]
        won_funcs = [re.compile(ele) for ele in won_patterns]
        r = relevant.map(partial(winner_helper, v=v, won_funcs=won_funcs))"""


'awards = awards_struct(awards_list)\nfor k, v in awards.items():\n    award_names = v.aliases\n    \n    won_patterns = [f"{award} (?P<name>[a-z]+ ?[a-z-]+)" for award in award_names]\n    won_patterns += [f"(?P<name>[a-z]+ ?[a-z-]+)( wins? | on winning | has? won | got| wins the | ){award}" for award in award_names]\n    won_funcs = [re.compile(ele) for ele in won_patterns]\n    #presenter_patterns = [f"(?P<name>[a-z]+ ?[a-z]+)( presents? | presenting | are presenting ){award}" for award in award_names]\n    #presenter_funcs = [re.compile(ele) for ele in presenter_patterns]\n    relevant = bm25_search(award_names[0])\n    if v.winner_type == "Person":\n        r = relevant.map(partial(winner_helper, v=v, won_funcs=won_funcs))\n        if v.winners.contenders == {}:\n            relevant = bm25_search(award_names[-1])\n            r = relevant.map(partial(extra_winner_helper, v=v))\n    else:\n        won_patterns = [f"{award} [for ]?(?P<name>[a-z ]+)" for award in award_names]\n     

In [None]:
awards = awards_struct(awards_list)
for k, v in awards.items():
    award_names = v.aliases

   
    relevant = bm25_search(award_names[-1])
    relevant = relevant[relevant["text"].str.contains("present")]
    r = relevant.map(partial(extra_winner_helper, v=v))
    presenter = re.compile("(?P<name>[a-z]+ a-z]+)")
    

In [None]:
for k in awards.keys():
    print(awards[k].presenters)

Name: jay leno, Votes: 1
Name: jimmy, Votes: 1
Name: jennifer lawrence, Votes: 1
Name: jennifer lawrence, Votes: 1
Name: downton abbey, Votes: 2
Name: dennis quaid, Votes: 1
Name: kerry washington, Votes: 1

Name: george w bush, Votes: 1
Name: cohen, Votes: 1
Name: jay leno, Votes: 1
Name: jimmy, Votes: 1
Name: downton abbey, Votes: 2
Name: dennis quaid, Votes: 1
Name: kerry washington, Votes: 1
Name: lea michele, Votes: 2
Name: anne hathaway, Votes: 1
Name: jennifer lawrence, Votes: 1
Name: downton abbey, Votes: 1
Name: dennis quaid, Votes: 1
Name: kerry washington, Votes: 1
Name: jennifer lawrence, Votes: 1
Name: downton abbey, Votes: 2
Name: dennis quaid, Votes: 1
Name: kerry washington, Votes: 1
Name: jennifer lawrence, Votes: 1
Name: jay leno, Votes: 1
Name: jimmy, Votes: 1
Name: jay leno, Votes: 1
Name: jennifergarner, Votes: 1
Name: tommy lee jones, Votes: 1
Name: julia roberts, Votes: 1
Name: rdj, Votes: 1
Name: jodie, Votes: 2
Name: robert downey jr, Votes: 2
Name: robert down

## Get Award Names

Function to get awards

def clean_and_sort_text(text):
    # function to group award names, even if they are missing a key word or a "-"
    regex_pattern =  r"(motion+| original+|)"
    text = text.replace(regex_pattern, ' ')
    words = text.replace("-", ' ').split()
    # sort words and join back into a string
    return ' '.join(sorted(words))


def extract_award_names(df, text_column='text'):
    extracted_award_names = []
    # indentify which tweets might contain an award name
    keywords = ['wins', 'awarded', 'goes to']
    pattern = '|'.join(keywords)
    candidate_tweets = df[df[text_column].str.contains(pattern, regex=True)]

    # matching regex to all the tweets
    regex_pattern = r"(wins|awarded|goes to)(.*?)(award)?($|[,.!?:;])"
    for tweet in candidate_tweets[text_column]:
        matches = re.findall(regex_pattern, tweet)
        for match in matches:
            award_name = match[1].strip()
            if award_name:
                extracted_award_names.append(award_name)

    # clean up the results so we can rank them
    delimiters = r"( http+| for +| at +| goldenglobes +| odds-on +| and +| goes +)"
    cleaned_award_names = [re.split(delimiters, name, 1)[0] for name in extracted_award_names]
    cleaned_award_names = [name for name in cleaned_award_names if name.lower().startswith('best') and len(name.split()) > 1]
    cleaned_award_df = pd.DataFrame(cleaned_award_names, columns=['Cleaned_Award_Name'])
    cleaned_award_counts = cleaned_award_df['Cleaned_Award_Name'].value_counts().reset_index()
    cleaned_award_counts.columns = ['Cleaned_Award_Name', 'Frequency']

    cleaned_award_counts['Sorted_Award_Name'] = cleaned_award_counts['Cleaned_Award_Name'].apply(clean_and_sort_text)
    
    # chooses the longest award name of the grouping
    def prefer_longer(series):
        return max(series, key=len)

    # group by the cleaned and sorted award names and add the frequencies together
    grouped_award_counts = cleaned_award_counts.groupby('Sorted_Award_Name').agg({
        'Cleaned_Award_Name': prefer_longer,  # keep the longest version
        'Frequency': 'sum'  # add up their frequencies
    }).reset_index(drop=True)

    grouped_award_counts = grouped_award_counts[grouped_award_counts['Frequency'] > 1]
    grouped_award_counts = grouped_award_counts.sort_values(by='Frequency', ascending=False).reset_index(drop=True)
    return grouped_award_counts

#award_names = extract_award_names(tweets)
#print(award_names)
