In [1]:
import pandas as pd
import re
from functools import partial
from ourtypes.host import Hosts
from english_words import get_english_words_set
web2lowerset = get_english_words_set(['gcide'], lower=True)
import pickle 
with open("common_words_hosts.pickle", "rb") as pckl:   
    common_words_host_list = pickle.load(pckl)

Set up tweets df

In [2]:
preprocessed_tweets = "gg2013_preprocessed.json"
tweets = pd.read_json(preprocessed_tweets, orient='records', lines=True)[["text","timestamp","is_english"]]
tweets = tweets[tweets["is_english"]] #only keep english tweets
del tweets["is_english"] #get rid of this row
tweets = tweets.dropna(subset=["text"])

In [3]:
#get hosts for entire ceremony
#get award names for the ceremony

In [4]:
def remove_common_words(text):
    #negative positive golden globes
    words = [word for word in text.split(" ") if word not in common_words_host_list]
    return " ".join(words)

In [5]:
def remove_rt(text):
    if text[0:4] == "rt ":
        text = text.split(" ")
        text = text[2:]
        return " ".join(text)
    return text

function to get hosts

In [6]:
def hosts_helper(text, host_funcs, cohost_funcs, single_cohost_funcs, hosts):
    #check for cohosts first
    for func in cohost_funcs:
        m = func.search(text)
        if m is not None:
            name1 = remove_common_words(m["name1"])
            name2 = remove_common_words(m["name2"])
            hosts.vote_host(name1, cohost=name2)
            hosts.vote_host(name2, cohost=name1)
            return name1, name2
    #check for hosts
    for func in host_funcs:
        m = func.search(text)
        if m is not None:
            name = remove_common_words(m["name"])
            hosts.vote_host(name)
            return name
    return None

In [7]:
def get_hosts(tweets):
    hosts = Hosts() #our host storage object
    host_tweets = tweets[tweets["text"].str.contains("host")] #get tweets with the word host in it
    del host_tweets["timestamp"] #we don't need this 
    host_tweets = host_tweets.map(remove_rt) #get rid of retweets
    #handle single names or no spaces?
    host_patterns = ['host (?P<name>[a-z]+ [a-z]+)', '(?P<name>[a-z]+ [a-z]+) (hosting|is hosting|will host|hosts|hosted)']
    cohost_patterns = ['(co-?|)hosts (?P<name1>[a-z]+ [a-z]+) and (?P<name2>[a-z]+ [a-z]+)',
                    '(?P<name1>[a-z]+ [a-z]+) and (?P<name2>[a-z]+ [a-z]+) (are (co-?|)hosting|will (co-?|)host|(co-?|)host|(co-?|)hosting|hosted)[^s]']
    single_cohost_patterns = ['cohost (?P<name>[a-z]+ [a-z]+)',
                    '(?P<name>[a-z]+ [a-z]+) (is cohosting|will cohost|cohosts|cohosting|cohosted)[^s]']
    #compile regex functions
    host_funcs = [re.compile(pat) for pat in host_patterns]
    cohost_funcs = [re.compile(pat) for pat in cohost_patterns]
    single_cohost_funcs = [re.compile(pat) for pat in single_cohost_patterns]

    #apply helper function 
    host_tweets = host_tweets.map(partial(hosts_helper, host_funcs=host_funcs, cohost_funcs=cohost_funcs, single_cohost_funcs=single_cohost_funcs, hosts=hosts))
    host_tweets = host_tweets.dropna(subset=["text"])
    return hosts

In [8]:
h = get_hosts(tweets)
vc = h.total_votes()
#match misspelling and single name 

tina fey,amypoehler
amy poehler,tinafey
tina fey,amypoehler
Name: tina fey, Votes: 2, Cohost: amypoehler, Cohost Votes: 2
amy poehler,tinafey
Name: amy poehler, Votes: 2, Cohost: tinafey, Cohost Votes: 2
tina fey,amypoehler
Name: tina fey, Votes: 3.5, Cohost: amypoehler, Cohost Votes: 3
amy poehler,tinafey
Name: amy poehler, Votes: 3, Cohost: tinafey, Cohost Votes: 3
tina fey,amypoehler
Name: tina fey, Votes: 5.0, Cohost: amypoehler, Cohost Votes: 4
amy poehler,tinafey
Name: amy poehler, Votes: 4, Cohost: tinafey, Cohost Votes: 4
tina fey,amypoehler
Name: tina fey, Votes: 6.0, Cohost: amypoehler, Cohost Votes: 5
amy poehler,tinafey
Name: amy poehler, Votes: 5, Cohost: tinafey, Cohost Votes: 5
tina fey,amypoehler
Name: tina fey, Votes: 7.5, Cohost: amypoehler, Cohost Votes: 6
amy poehler,tinafey
Name: amy poehler, Votes: 6.5, Cohost: tinafey, Cohost Votes: 6
tina fey,amypoehler
Name: tina fey, Votes: 8.5, Cohost: amypoehler, Cohost Votes: 7
amy poehler,tinafey
Name: amy poehler, Votes: 

In [10]:
vc

[('tina fey', 667.0, ('amypoehler', 126)),
 ('amy poehler', 199.0, ('tinafey', 126)),
 ('kristen wiig', 65.0, ('willferrell', 51)),
 ('will ferrell', 64.5, ('kristenwiig', 51)),
 ('amy should', 9.5, ('tina', 3)),
 ('amy pohler', 9, ('tinafey', 8)),
 ('amp amy', 5, None),
 ('open post', 4, None),
 ('ricky gervais', 4, None),
 ('tina should', 3, ('amy', 3)),
 ('kristin wiig', 3, ('willferrell', 3)),
 ('misses rickygervais', 2.5, None),
 ('tina feys', 2, ('amypoehler', 1)),
 ('her scar', 2.0, ('tinafey', 1)),
 ('while im', 2.0, None),
 ('girl tina', 2, None),
 ('schwarzenegger should', 2, ('stallone', 2)),
 ('tonights dual', 1.5, None),
 ('brilliant comedianswritersactresses', 1.5, None),
 ('charming people', 1.5, None),
 ('amy poheler', 1.5, ('tinafey', 1)),
 ('maggie smith', 1.5, None),
 ('tweets whos', 1.5, None),
 ('amy poehlers', 1, ('tinafey', 1)),
 ('tine fey', 1, ('amypoehler', 1)),
 ('miss norberry', 1, None),
 ('little mermaid', 1, ('princessjasmine', 1)),
 ('princess jasmine', 

Function to get awards

In [11]:
def clean_and_sort_text(text):
    # function to group award names, even if they are missing a key word or a "-"
    regex_pattern =  r"(motion+| original+|)"
    text = text.replace(regex_pattern, ' ')
    words = text.replace("-", ' ').split()
    # sort words and join back into a string
    return ' '.join(sorted(words))


def extract_award_names(df, text_column='text'):
    extracted_award_names = []
    # indentify which tweets might contain an award name
    keywords = ['wins', 'awarded', 'goes to']
    pattern = '|'.join(keywords)
    candidate_tweets = df[df[text_column].str.contains(pattern, regex=True)]

    # matching regex to all the tweets
    regex_pattern = r"(wins|awarded|goes to)(.*?)(award)?($|[,.!?:;])"
    for tweet in candidate_tweets[text_column]:
        matches = re.findall(regex_pattern, tweet)
        for match in matches:
            award_name = match[1].strip()
            if award_name:
                extracted_award_names.append(award_name)

    # clean up the results so we can rank them
    delimiters = r"( http+| for +| at +| goldenglobes +| odds-on +| and +| goes +)"
    cleaned_award_names = [re.split(delimiters, name, 1)[0] for name in extracted_award_names]
    cleaned_award_names = [name for name in cleaned_award_names if name.lower().startswith('best') and len(name.split()) > 1]
    cleaned_award_df = pd.DataFrame(cleaned_award_names, columns=['Cleaned_Award_Name'])
    cleaned_award_counts = cleaned_award_df['Cleaned_Award_Name'].value_counts().reset_index()
    cleaned_award_counts.columns = ['Cleaned_Award_Name', 'Frequency']

    cleaned_award_counts['Sorted_Award_Name'] = cleaned_award_counts['Cleaned_Award_Name'].apply(clean_and_sort_text)
    
    # chooses the longest award name of the grouping
    def prefer_longer(series):
        return max(series, key=len)

    # group by the cleaned and sorted award names and add the frequencies together
    grouped_award_counts = cleaned_award_counts.groupby('Sorted_Award_Name').agg({
        'Cleaned_Award_Name': prefer_longer,  # keep the longest version
        'Frequency': 'sum'  # add up their frequencies
    }).reset_index(drop=True)

    grouped_award_counts = grouped_award_counts[grouped_award_counts['Frequency'] > 1]
    grouped_award_counts = grouped_award_counts.sort_values(by='Frequency', ascending=False).reset_index(drop=True)
    return grouped_award_counts

award_names = extract_award_names(tweets)
print(award_names)


                                    Cleaned_Award_Name  Frequency
0                                       best director          62
1                                           best actor         46
2                                   best original song         37
3                                        best actress          36
4                                            best song         34
..                                                 ...        ...
103                best actor in a mini-seriestv movie          2
104  best performance by an actress in a tv musical...          2
105                          best actor in a tv series          2
106                                      best tv drama          2
107  best actor in a comedy or musical congrats hug...          2

[108 rows x 2 columns]


## Part Two
- have awards as input
- get presenters for awards
- get nominees for awards
- get winners for awards

In [12]:
awards = ["best performance by an actor in a television series - comedy or musical", "best performance by an actor in a television series - drama",
          "best television series - comedy or musical", "best performance by an actor in a motion picture - drama",
          "best original song - motion picture", "best animated feature film", "best performance by an actress in a mini-series or motion picture made for television",
          "best performance by an actor in a mini-series or motion picture made for television", "best television series - drama", "best performance by an actress in a supporting role in a motion picture",
          "best performance by an actor in a supporting role in a series, mini-series or motion picture made for television", "best motion picture - drama",
          "best performance by an actor in a motion picture - comedy or musical", "cecil b. demille award", "best performance by an actress in a motion picture - drama",
          "best performance by an actress in a television series - drama", "best original score - motion picture", "best mini-series or motion picture made for television",
          "best performance by an actress in a motion picture - comedy or musical", "best motion picture - comedy or musical", "best performance by an actress in a supporting role in a series, mini-series or motion picture made for television",
          "best performance by an actor in a supporting role in a motion picture", "best foreign language film", "best performance by an actress in a television series - comedy or musical",
          "best director - motion picture", "best screenplay - motion picture"]
print(len(awards))

26
