In [1]:
import pandas as pd
import re
from functools import partial
from ourtypes.category import Category
from ourtypes.award import Award
import pickle 
import numpy
from rank_bm25 import BM25Okapi
with open("common_words_hosts.pickle", "rb") as pckl:   
    common_words_host_list = pickle.load(pckl)
with open("common_words_wins.pickle", "rb") as pckl:
    common_words_winners_list = pickle.load(pckl)

Set up tweets df

In [2]:
preprocessed_tweets = "gg2013_preprocessed.json"
tweets = pd.read_json(preprocessed_tweets, orient='records', lines=True)[["text","is_english"]]
tweets = tweets[tweets["is_english"]] #only keep english tweets
del tweets["is_english"] #get rid of this row
tweets = tweets.dropna(subset=["text"])

In [3]:
#get hosts for entire ceremony
#get award names for the ceremony

In [4]:
def remove_common_words(text, word_list):
    #negative positive golden globes
    words = [word for word in text.split(" ") if word not in word_list]
    return " ".join(words)

In [5]:
def remove_rt(text):
    text = text.split(" ")
    
    if "rt" in text:
        i = text.index("rt")
        return remove_rt(" ".join(text[0:i]) + " " + " ".join(text[i+2:]))
    return " ".join(text)

function to get hosts

In [6]:
def hosts_helper(text, host_funcs, cohost_funcs, single_cohost_funcs, hosts):
    #check for cohosts first
    for func in cohost_funcs:
        m = func.search(text)
        if m is not None:
            name1 = remove_common_words(m["name1"], common_words_host_list)
            name2 = remove_common_words(m["name2"], common_words_host_list)
            hosts.vote_contender(name1, cocontender=name2)
            hosts.vote_contender(name2, cocontender=name1)
            return name1, name2
    #check for hosts
    for func in host_funcs:
        m = func.search(text)
        if m is not None:
            name = remove_common_words(m["name"], common_words_host_list)
            hosts.vote_contender(name)
            return name
    for func in single_cohost_funcs:
        m = func.search(text)
        if m is not None:
            name = remove_common_words(m["name"], common_words_host_list)
            hosts.vote_contender(name)
            return name
    return None

In [7]:
def get_hosts(tweets):
    hosts = Category(type="hosts") #our host storage object
    host_tweets = tweets[tweets["text"].str.contains("host")] #get tweets with the word host in it
    #del host_tweets["timestamp"] #we don't need this 
    host_tweets = host_tweets.map(remove_rt) #get rid of retweets
    #handle single names or no spaces?
    host_patterns = ['host (?P<name>[a-z]+ [a-z]+)', '(?P<name>[a-z]+ [a-z]+) (hosting|is hosting|will host|hosts|hosted)']
    cohost_patterns = ['(co-?|)hosts (?P<name1>[a-z]+ [a-z]+)( and | )(?P<name2>[a-z]+ [a-z]+)',
                    '(?P<name1>[a-z]+ [a-z]+)( and | )(?P<name2>[a-z]+ [a-z]+) (are (co-?|)hosting|will (co-?|)host|(co-?|)host|(co-?|)hosting|hosted)[^s]']
    single_cohost_patterns = ['cohost (?P<name>[a-z]+ [a-z]+)',
                    '(?P<name>[a-z]+ [a-z]+) (is cohosting|will cohost|cohosts|cohosting|cohosted)[^s]']
    #compile regex functions
    host_funcs = [re.compile(pat) for pat in host_patterns]
    cohost_funcs = [re.compile(pat) for pat in cohost_patterns]
    single_cohost_funcs = [re.compile(pat) for pat in single_cohost_patterns]

    #apply helper function 
    host_tweets = host_tweets.map(partial(hosts_helper, host_funcs=host_funcs, cohost_funcs=cohost_funcs, single_cohost_funcs=single_cohost_funcs, hosts=hosts))
    del host_tweets
    return hosts

In [8]:
h = get_hosts(tweets)
vc = h.total_votes()
#match misspelling and single name 

## Part Two
- have awards as input
- get presenters for awards
- get nominees for awards
- get winners for awards

In [9]:
awards = ["best performance by an actor in a television series - comedy or musical", 
          "best performance by an actor in a television series - drama",
          "best television series - comedy or musical", 
          "best performance by an actor in a motion picture - drama",
          "best original song - motion picture", 
          "best animated feature film", 
          "best performance by an actress in a mini-series or motion picture made for television",
          "best performance by an actor in a mini-series or motion picture made for television", 
          "best television series - drama", 
          "best performance by an actress in a supporting role in a motion picture",
          "best performance by an actor in a supporting role in a series, mini-series or motion picture made for television", 
          "best motion picture - drama",
          "best performance by an actor in a motion picture - comedy or musical", 
          "cecil b. demille award", 
          "best performance by an actress in a motion picture - drama",
          "best performance by an actress in a television series - drama", 
          "best original score - motion picture", 
          "best mini-series or motion picture made for television",
          "best performance by an actress in a motion picture - comedy or musical", 
          "best motion picture - comedy or musical", 
          "best performance by an actress in a supporting role in a series, mini-series or motion picture made for television",
          "best performance by an actor in a supporting role in a motion picture",
          "best foreign language film", 
          "best performance by an actress in a television series - comedy or musical",
          "best director - motion picture", "best screenplay - motion picture"]

Format awards 

In [10]:
def format_awards(awards): 
    results = {}
    person_patterns = "actor|actress|direct(ion|or)|choreo|perfomance"
    thing_patterns = "series|mini-series|movie|film|music|comedy|song|score|motion picture"
    person_func = re.compile(person_patterns)
    thing_func = re.compile(thing_patterns)
    for award in awards:
        m = person_func.search(award)
        if m is not None:
            results[award] = Award(award, winner_type="Person")
            continue
        m = thing_func.search(award)
        if m is not None:
            results[award] = Award(award, winner_type="Thing")
        else:
            results[award] = Award(award)
    return results
        

In [11]:
awards = format_awards(awards)

### Winners

add bm25 search as well

In [12]:
tweets = tweets.map(remove_rt)
corpus = tweets.to_numpy()
tokenized_corpus = [str(tweet).split(" ") for tweet in corpus]
bm25 = BM25Okapi(tokenized_corpus)

In [13]:
def bm25_search(award_name):
    q = award_name.split(" ")
    scores = bm25.get_scores(q)
    indices = numpy.argsort(scores).tolist()
    relevant = corpus[indices][::-1][:500]
    return relevant

In [None]:
won_patterns = [f"{award}( goes to | is | )(?P<name>[a-z]+ ?[a-z]+)", f"(?P<name>[a-z]+ ?[a-z]+)[just]?( wins? | on winning | won | has[ just]? won | got| )[ award for ]?{award}"]
won_funcs = [re.compile(ele) for ele in won_patterns]
presenter_patterns = [f"(?P<name>[a-z]+ ?[a-z]+)( presents | is presenting | are presenting | present )[ award for ]?"]
presenter_funcs = [re.compile(ele) for ele in presenter_patterns]

In [14]:
for award in awards.keys():
    relevant = bm25_search(award)
    for tweet in relevant:
        for f in won_funcs:
            tweet = str(tweet)
            m = f.search(tweet)
            if m is not None:
                name = m["name"]
                if awards[award].winner_type =="Person":
                    print(name)
                    name = remove_common_words(name, common_words_winners_list)
                awards[award].add_winner(name)
        for f in presenter_funcs:
            tweet = str(tweet)
            m = f.search(tweet)
            if m is not None:
                """name = remove_common_words(m["name"], common_words_winners_list)"""
                #name = m["name"]
                awards[award].add_presenter(name)

In [15]:
for v in awards.values():
    print(v)

Award: best performance by an actor in a television series - comedy or musical
Nominees: 
Winners: 
Presenters: 

Award: best performance by an actor in a television series - drama
Nominees: 
Winners: Name: damian lewis, Votes: 1
Presenters: Name: damian lewis, Votes: 2

Award: best television series - comedy or musical
Nominees: 
Winners: Name: girls hbo, Votes: 1
Name: deserved girlshbo, Votes: 1
Name: caraasrt goldenglobes, Votes: 1.5
Name: from uk, Votes: 1
Presenters: 

Award: best performance by an actor in a motion picture - drama
Nominees: 
Winners: Name: for lincoln, Votes: 1
Name: has just, Votes: 1
Name: daniel day, Votes: 1
Name: lincolns first, Votes: 1
Name: at the, Votes: 1
Name: of lincoln, Votes: 1
Presenters: Name: of lincoln, Votes: 2

Award: best original song - motion picture
Nominees: 
Winners: Name: goldenglobes httptconlbblqq, Votes: 3.5
Name: wins for, Votes: 5
Name: skyfall by, Votes: 2
Name: with skyfall, Votes: 1
Name: for winning, Votes: 1
Name: skyfall ade

In [16]:
"""l = []
with open("common_words_winner_nominees_presenters.txt", "w+") as f:
    for k in awards.keys():
        a = awards[k].winners
        c = a.contenders
        for v in c.values():
            for name in v.name.split(" "):
                if name not in l:
                    f.write(name + "\n")
                    l.append(name)"""

'l = []\nwith open("common_words_winner_nominees_presenters.txt", "w+") as f:\n    for k in awards.keys():\n        a = awards[k].winners\n        c = a.contenders\n        for v in c.values():\n            for name in v.name.split(" "):\n                if name not in l:\n                    f.write(name + "\n")\n                    l.append(name)'

In [17]:
"""with open("common_words_wins.pickle", "wb") as f:
    pickle.dump(l, f)
"""

'with open("common_words_wins.pickle", "wb") as f:\n    pickle.dump(l, f)\n'

In [18]:
def get_winners(tweets, awards):
    #awards = list of str
    awardNames = awards.keys()
    
        

## Get Award Names

Function to get awards

In [19]:
def clean_and_sort_text(text):
    # function to group award names, even if they are missing a key word or a "-"
    regex_pattern =  r"(motion+| original+|)"
    text = text.replace(regex_pattern, ' ')
    words = text.replace("-", ' ').split()
    # sort words and join back into a string
    return ' '.join(sorted(words))


def extract_award_names(df, text_column='text'):
    extracted_award_names = []
    # indentify which tweets might contain an award name
    keywords = ['wins', 'awarded', 'goes to']
    pattern = '|'.join(keywords)
    candidate_tweets = df[df[text_column].str.contains(pattern, regex=True)]

    # matching regex to all the tweets
    regex_pattern = r"(wins|awarded|goes to)(.*?)(award)?($|[,.!?:;])"
    for tweet in candidate_tweets[text_column]:
        matches = re.findall(regex_pattern, tweet)
        for match in matches:
            award_name = match[1].strip()
            if award_name:
                extracted_award_names.append(award_name)

    # clean up the results so we can rank them
    delimiters = r"( http+| for +| at +| goldenglobes +| odds-on +| and +| goes +)"
    cleaned_award_names = [re.split(delimiters, name, 1)[0] for name in extracted_award_names]
    cleaned_award_names = [name for name in cleaned_award_names if name.lower().startswith('best') and len(name.split()) > 1]
    cleaned_award_df = pd.DataFrame(cleaned_award_names, columns=['Cleaned_Award_Name'])
    cleaned_award_counts = cleaned_award_df['Cleaned_Award_Name'].value_counts().reset_index()
    cleaned_award_counts.columns = ['Cleaned_Award_Name', 'Frequency']

    cleaned_award_counts['Sorted_Award_Name'] = cleaned_award_counts['Cleaned_Award_Name'].apply(clean_and_sort_text)
    
    # chooses the longest award name of the grouping
    def prefer_longer(series):
        return max(series, key=len)

    # group by the cleaned and sorted award names and add the frequencies together
    grouped_award_counts = cleaned_award_counts.groupby('Sorted_Award_Name').agg({
        'Cleaned_Award_Name': prefer_longer,  # keep the longest version
        'Frequency': 'sum'  # add up their frequencies
    }).reset_index(drop=True)

    grouped_award_counts = grouped_award_counts[grouped_award_counts['Frequency'] > 1]
    grouped_award_counts = grouped_award_counts.sort_values(by='Frequency', ascending=False).reset_index(drop=True)
    return grouped_award_counts

award_names = extract_award_names(tweets)
print(award_names)


                                    Cleaned_Award_Name  Frequency
0                                       best director          62
1                                           best actor         46
2                                   best original song         37
3                                        best actress          36
4                              best supporting actress         34
..                                                 ...        ...
103                best actor in a mini-seriestv movie          2
104  best performance by an actress in a tv musical...          2
105                          best actor in a tv series          2
106                                      best tv drama          2
107  best actor in a comedy or musical congrats hug...          2

[108 rows x 2 columns]
