In [15]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

In [16]:
import pandas as pd
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
from nltk.tokenize import RegexpTokenizer

In [18]:
df = pd.read_json(path_or_buf='gg2013.json')

# Method 1: get award names from the official website of Golden Globes

In [19]:
def get_awards_url(year):
    Html = urlopen('https://www.goldenglobes.com/winners-nominees/'+year+'/all').read()
    soup = BeautifulSoup(Html,"html.parser")
    webAll = soup.findAll("a", {"href": re.compile("/winners-nominees/[0-9]+/all#category-")})
    award_names = []
    count = 0
    for match in webAll:
        award_names.append(match.string)
        count = count+1
    print('Award names from the official website of Golden Globes in '+year+':')
    print(award_names)
    return award_names

Award names from the official website of Golden Globes in 2013

In [26]:
get_awards_url('2013')

['Best Motion Picture - Drama',
 'Best Motion Picture - Musical or Comedy',
 'Best Performance by an Actress in a Motion Picture - Drama',
 'Best Performance by an Actor in a Motion Picture - Drama',
 'Best Performance by an Actress in a Motion Picture - Musical or Comedy',
 'Best Performance by an Actor in a Motion Picture - Musical or Comedy',
 'Best Performance by an Actress in a Supporting Role in any Motion Picture',
 'Best Performance by an Actor in a Supporting Role in any Motion Picture',
 'Best Director - Motion Picture',
 'Best Screenplay - Motion Picture',
 'Best Motion Picture - Animated',
 'Best Motion Picture - Foreign Language',
 'Best Original Score - Motion Picture',
 'Best Original Song - Motion Picture',
 'Best Television Series - Drama',
 'Best Television Series - Musical or Comedy',
 'Best Television Limited Series or Motion Picture Made for Television',
 'Best Performance by an Actress in a Limited Series or a Motion Picture Made for Television',
 'Best Performanc

Award names from the official website of Golden Globes in 2015

In [30]:
get_awards_url('2015')

['Best Motion Picture - Drama',
 'Best Motion Picture - Musical or Comedy',
 'Best Performance by an Actress in a Motion Picture - Drama',
 'Best Performance by an Actor in a Motion Picture - Drama',
 'Best Performance by an Actress in a Motion Picture - Musical or Comedy',
 'Best Performance by an Actor in a Motion Picture - Musical or Comedy',
 'Best Performance by an Actress in a Supporting Role in any Motion Picture',
 'Best Performance by an Actor in a Supporting Role in any Motion Picture',
 'Best Director - Motion Picture',
 'Best Screenplay - Motion Picture',
 'Best Motion Picture - Animated',
 'Best Motion Picture - Foreign Language',
 'Best Original Score - Motion Picture',
 'Best Original Song - Motion Picture',
 'Best Television Series - Drama',
 'Best Television Series - Musical or Comedy',
 'Best Television Limited Series or Motion Picture Made for Television',
 'Best Performance by an Actress in a Limited Series or a Motion Picture Made for Television',
 'Best Performanc

# Method 2: get award names from the json file

In [31]:

def cleanse(line):
    # replace everything to ' ' except whitespace, alphanumeric character, apostrophe, hashtag, @
    return re.sub(r'[^\w\s\'#@]', ' ', line)

In [32]:
def get_tweets(year):
    twitterHandle = '@goldenglobes'
    twitterHandleText = twitterHandle[1:]

    df = pd.read_json(path_or_buf='gg' + year + '.json')
    data = df['text']

    res = []
    tokenized_tweets = []

    for tweet in data:
        if twitterHandle in tweet and 'RT' not in tweet:
            temp_tweet = RegexpTokenizer(r'\w+').tokenize(tweet)
            if twitterHandleText == temp_tweet[0]:
                res.append(temp_tweet)
        tweet = RegexpTokenizer(r'\w+').tokenize(tweet)
        tokenized_tweets.append(tweet)

    return res


In [45]:
def get_awards(year):

    keyWords = ['Best', 'best', 'Motion', 'motion', 'Picture', 'picture', 'Drama', 'drama', 'Performance', 'performance', 'Actress', 'actress', 'Actor', 'actor','Comedy', 'comedy', 'Musical', 'musical', 'Animated', 'animated', 'Feature', 'feature', 'Film', 'film', 'Foreign', 'foreign', 'Language', 'language', 'Supporting', 'supporting', 'Role', 'role', 'Director', 'director', 'Screenplay', 'screenplay', 'Original', 'orginal', 'Score', 'score', 'Song', 'song', 'Television', 'television', 'Series', 'series', 'Mini-series',  'mini-series', 'mini', 'Mini']
    connectWords = ['by','By','An','an','In','in','A','a','For','for','-',':','Or','or']

    Tweets = get_tweets(year)
    print('The number of tweets includes award names is '+ str(len(Tweets)))
    num = 0
    # awards' names
    awards = []
    # tweets include words > 3
    awardTweets = []

    # if exists award name
    for item in Tweets:
        if len(set(keyWords).intersection(set(item))) > 3:
            num = num + 1
            awardTweets.append(sorted(set(item), key=lambda x: item.index(x)))
    print('The number of award names got from the json file is '+ str(num))
    
    for item in awardTweets:
        signIndex = len(item)-1
        for word in keyWords:
            if word in item:
                index = item.index(word)
                if index < signIndex:
                    signIndex = index
        flag = True
        temp = []
        for word in item:
            if word not in keyWords and word not in connectWords and item.index(word) >= signIndex:
                flag = False
            if word in keyWords or word in connectWords and item.index(word) >= signIndex and flag:
                # if word not in helper_words:
                temp.append(word.lower())
        awardString = ' '.join(sorted(set(temp), key=lambda x: temp.index(x)))
        if awardString not in awards:
            awards.append(awardString)
    for x in awards:
        if x.split()[0] != 'best':
            awards.remove(x)

    set_awards = set(awards)
    awards = []
    encoded_awards = []

    print('\n')
    for x in set_awards:
        x_encoded = x.encode("utf-8")
        encoded_awards.append(x_encoded)
    awards = encoded_awards

    return awards, num

Award names from the json file of Golden Globes in 2013

In [46]:
get_awards('2013')

The number of tweets includes award names is 618
The number of award names got from the json file is 87




([b'best actress in a motion picture comedy or musical',
  b'best television series actor drama',
  b'best actress in a motion picture musical or comedy',
  b'best supporting actor in a motion picture',
  b'best supporting actor actress',
  b'best supporting actress in a motion picture',
  b'best actor in a motion picture comedy or musical',
  b'best motion picture comedy or musical',
  b'best motion picture drama',
  b'best television series comedy or musical',
  b'best supporting actor in a television series',
  b'best motion picture television',
  b'best actor in a motion picture drama',
  b'best actor in a motion picture drama performance',
  b'best actress in a television series drama',
  b'best television series drama',
  b'best supporting actress in a series',
  b'best actor in a television series comedy or musical',
  b'best actor in a comedy or musical',
  b'best animated feature film',
  b'best actor comedy or musical',
  b'best actress in a motion picture drama'],
 87)

Award names from the json file of Golden Globes in 2013

In [48]:
get_awards('2015')

The number of tweets includes award names is 8557
The number of award names got from the json file is 875




([b'best actor in a television series comedy or musical performance',
  b'best series actress comedy musical',
  b'best actress in a motion picture comedy or musical',
  b'best television series actor drama',
  b'best actress in a motion musical',
  b'best supporting actor in a motion picture',
  b'best supporting actress in a motion picture',
  b'best actor in a motion picture comedy or musical',
  b'best motion picture comedy or musical',
  b'best animated motion picture',
  b'best actor drama performance',
  b'best supporting actor in a motion picture performance',
  b'best actor in a mini series or',
  b'best series actress comedy musical original',
  b'best motion picture comedy actor performance',
  b'best supporting actor performance',
  b'best picture comedy musical',
  b'best motion picture drama',
  b'best actress in comedy musical for',
  b'best television series comedy or musical',
  b'best actor in a motion picture comedy or musical performance',
  b'best supporting actor 