# INIT

In [273]:
import requests as rq
import re
import traceback
import datetime
from collections import Counter
from bs4 import BeautifulSoup


from storage import session_scope, MovieAward
from constants import GLOBES_BEST_DRAMA, GLOBES_BEST_COMEDY, CRITICS_BEST_FILM, \
    PGA_BEST_FILM, DGA_BEST_FILM, BAFTA_BEST_FILM, CANNES_BEST_FILM, SAG_BEST_FILM

In [83]:
WIKI_BASE = 'https://en.wikipedia.org'

In [3]:
def save_awards(category, awards):
    with session_scope() as session:
        movie_awards = [
             MovieAward(award_category=category,
                        movie_title=title,
                        movie_wiki_url=wiki_url,
                        winner=winner,
                        award_date=datetime.date(int(year), 1, 1))
             for year, title, wiki_url, winner in awards
        ]
        session.bulk_save_objects(movie_awards)

# Cannes

In [202]:
winners_soup = BeautifulSoup(rq.get('https://en.wikipedia.org/wiki/Palme_d%27Or').text, 'lxml')

In [208]:
winner_elements = winners_soup.find('div', {'id': 'Palme_d&#039;Or_winning_films'}).findNext('ul').find_all('li')

In [210]:
winner_elements = winners_soup.find('div', {'id': 'Palme_d&#039;Or_winning_films'}).findNext('ul').find_all('li')
winners = dict()
for wel in winner_elements:
    year = int(re.search('[\d]{4}', wel.text).group(0))
    a = wel.find('a')
    href = a.get('href')
    title = a.get('title')
    winners[href] = (year, title)

In [232]:
table_years = set([1991, 1993, 1994] + list(range(2007, 2019)))

In [255]:
cannes_results = []
for year in range(1970, 2019):
    print(year)
    soup = BeautifulSoup(rq.get(f'https://en.wikipedia.org/wiki/{year}_Cannes_Film_Festival').text, 'lxml')
    tag = next(x for x in soup.find_all('span', {'class': 'mw-headline'}) if x.text.lower().startswith('in competition'))
    if not tag:
        raise
    if year in table_years:
        elements = tag.findNext('tbody').find_all('tr')[1:]
    else:
        elements = tag.findNext('ul').find_all('li')
    for el in elements:
        a = el.findNext('a')
        href, title = a.get('href'), a.get('title')
        winner = href in winners
        cannes_results.append((year, title, href, winner))

1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018


In [266]:
save_awards(CANNES_BEST_FILM, list(set(cannes_results)))

# SAG

In [267]:
sag_soup = BeautifulSoup(rq.get('https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award_for_Outstanding_Performance_by_a_Cast_in_a_Motion_Picture').text, 'lxml')

In [269]:
sag_results = []
current_year = 1
for table in sag_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) == 3:
            current_year = int(re.search('[\d]{4}', columns[0].text).group(0))
            film_col = columns[1]
        elif len(columns) == 2:
            film_col = columns[0]
        else:
            print(f"Wrong number of columns in {row}")
            
        winner = film_col.get('style') == 'background:#FAEB86;'
        try:
            a = film_col.find('a')
            sag_results.append((current_year, a.get('title'), a.get('href'), winner))
        except:
            print(f"Problem with {row}")
            traceback.print_exc()
            

In [274]:
save_awards(SAG_BEST_FILM, sag_results)

# BAFTA

In [187]:
bafta_soup = BeautifulSoup(rq.get('https://en.wikipedia.org/wiki/BAFTA_Award_for_Best_Film').text, 'lxml')

In [199]:
bafta_results = []
current_year = 1
for table in bafta_soup.find_all('table', {'class': 'wikitable'})[2:]:
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) == 1:
            current_year = int(re.search('[\d]{4}', columns[0].text).group(0))
            continue
        elif len(columns) == 5:
            film_col = columns[1]
        elif len(columns) == 4:
            film_col = columns[0]
        else:
            print(f"Wrong number of columns in {row}")
            
        winner = film_col.get('style') == 'background:#ccc;'
        try:
            a = film_col.find('a')
            bafta_results.append((current_year, a.get('title'), a.get('href'), winner))
        except:
            print(f"Problem with {row}")
            traceback.print_exc()
            

In [201]:
save_awards(BAFTA_BEST_FILM, bafta_results)

# DGA

In [176]:
dga_soup = BeautifulSoup(rq.get('https://en.wikipedia.org/wiki/Directors_Guild_of_America_Award_for_Outstanding_Directing_%E2%80%93_Feature_Film').text, 'lxml')

In [180]:
dga_results = []
current_year = 1
for table in dga_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) == 4:
            current_year = int(re.search('[\d]{4}', columns[0].text).group(0))
            film_col = columns[2]
        else:
            film_col = columns[1]
        if columns[1].get('style') == 'background:#FAEB86;':
            winner = True
        else:
            winner = False
        try:
            a = film_col.find('i').find('a')
            dga_results.append((current_year, a.get('title'), a.get('href'), winner))
        except:
            print(f"Problem with {row}")
            traceback.print_exc()

In [185]:
save_awards(DGA_BEST_FILM, dga_results)

# PGA

In [170]:
pga_soup = BeautifulSoup(rq.get('https://en.wikipedia.org/wiki/Producers_Guild_of_America_Award_for_Best_Theatrical_Motion_Picture').text, 'lxml')

In [172]:
pga_results = []
current_year = 1
for table in pga_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) == 4:
            current_year = int(re.search('[\d]{4}', columns[0].text).group(0))
            film_col = columns[1]
        else:
            film_col = columns[0]
        if columns[1].get('style') == 'background:#FAEB86;':
            winner = True
        else:
            winner = False
        try:
            a = film_col.find('i').find('a')
            pga_results.append((current_year, a.get('title'), a.get('href'), winner))
        except:
            print(f"Problem with {row}")
            traceback.print_exc()

Problem with <tr>
<td rowspan="5" style="text-align:center">2
</td>
<td>7
</td>
<td><a href="/wiki/Steven_Spielberg" title="Steven Spielberg">Steven Spielberg</a> (<b>1993</b>, 1997, <b>1998</b>, 2011, 2012, 2015, 2017)
</td></tr>
Problem with <tr>
<td>4
</td>
<td><a href="/wiki/Jeremy_Kleiner" title="Jeremy Kleiner">Jeremy Kleiner</a> (<b>2013</b>, <b>2015</b>, 2016, 2018)
</td></tr>
Problem with <tr>
<td>4
</td>
<td><a href="/wiki/Dede_Gardner" title="Dede Gardner">Dede Gardner</a> (<b>2013</b>, <b>2015</b>, 2016, 2018)
</td></tr>
Problem with <tr>
<td>3
</td>
<td><a href="/wiki/Brad_Pitt" title="Brad Pitt">Brad Pitt</a> (2011, <b>2013</b>, <b>2015</b>)
</td></tr>
Problem with <tr>
<td>2
</td>
<td><a href="/wiki/Branko_Lustig" title="Branko Lustig">Branko Lustig</a> (<b>1993</b>, <b>2000</b>)
</td></tr>
Problem with <tr>
<td rowspan="32" style="text-align:center">1
</td>
<td>9
</td>
<td><a href="/wiki/Scott_Rudin" title="Scott Rudin">Scott Rudin</a> (<b>2007</b>, 2010, 2011, 2012, 20

Traceback (most recent call last):
  File "<ipython-input-172-0b009993f56c>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-172-0b009993f56c>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-172-0b009993f56c>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-172-0b009993f56c>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-172-0b009993f56c>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call la

In [175]:
save_awards(PGA_BEST_FILM, pga_results)

# Golden Globes DRAMA

In [2]:
soup = BeautifulSoup(rq.get('https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Motion_Picture_%E2%80%93_Drama').text, 'lxml')

In [4]:
globe_drama_results = []
current_year = 1
for table in soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) == 4:
            current_year = columns[0].text.split('[')[0]
            film_col = columns[1]
        else:
            film_col = columns[0]
        if columns[1].get('style'):
            winner = True
        else:
            winner = False
        a = film_col.find('i').find('a')
        globe_drama_results.append((current_year, a.get('title'), a.get('href'), winner))
save_awards(GLOBES_BEST_DRAMA, globe_drama_results)

IntegrityError: (sqlite3.IntegrityError) UNIQUE constraint failed: movie_awards.award_category, movie_awards.movie_wiki_url, movie_awards.award_date, movie_awards.winner [SQL: 'INSERT INTO movie_awards (award_category, movie_wiki_url, award_date, winner, movie_title) VALUES (?, ?, ?, ?, ?)'] [parameters: (('globes_drama', '/wiki/The_Song_of_Bernadette_(film)', '1943-01-01', 1, 'The Song of Bernadette (film)'), ('globes_drama', '/wiki/Going_My_Way', '1944-01-01', 1, 'Going My Way'), ('globes_drama', '/wiki/The_Lost_Weekend_(film)', '1945-01-01', 1, 'The Lost Weekend (film)'), ('globes_drama', '/wiki/The_Best_Years_of_Our_Lives', '1946-01-01', 1, 'The Best Years of Our Lives'), ('globes_drama', '/wiki/Gentleman%27s_Agreement', '1947-01-01', 1, "Gentleman's Agreement"), ('globes_drama', '/wiki/Johnny_Belinda_(1948_film)', '1948-01-01', 1, 'Johnny Belinda (1948 film)'), ('globes_drama', '/wiki/The_Treasure_of_the_Sierra_Madre_(film)', '1948-01-01', 1, 'The Treasure of the Sierra Madre (film)'), ('globes_drama', '/wiki/All_the_King%27s_Men_(1949_film)', '1949-01-01', 1, "All the King's Men (1949 film)")  ... displaying 10 of 360 total bound parameter sets ...  ('globes_drama', '/wiki/If_Beale_Street_Could_Talk_(film)', '2018-01-01', 0, 'If Beale Street Could Talk (film)'), ('globes_drama', '/wiki/A_Star_Is_Born_(2018_film)', '2018-01-01', 0, 'A Star Is Born (2018 film)'))] (Background on this error at: http://sqlalche.me/e/gkpj)

# Golden Globes COMEDY

In [7]:
globes_comedy_soup = BeautifulSoup(rq.get('https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Motion_Picture_%E2%80%93_Musical_or_Comedy').text, 'lxml')

In [10]:
globe_comedy_results = []
current_year = 1
for table in globes_comedy_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) == 4:
            current_year = columns[0].text.split('[')[0]
            film_col = columns[1]
        else:
            film_col = columns[0]
        if columns[1].get('style'):
            winner = True
        else:
            winner = False
        try:
            a = film_col.find('i').find('a')
            globe_comedy_results.append((current_year, a.get('title'), a.get('href'), winner))
        except:
            print(f"Problem with {row}")
            traceback.print_exc()

Problem with <tr>
<td style="text-align:center;"><a href="/wiki/11th_Golden_Globe_Awards" title="11th Golden Globe Awards">1953</a>
</td>
<td colspan="3" style="text-align:center;"><b>No Award given.</b>
</td></tr>
Problem with <tr>
<td rowspan="5" style="text-align:center;"><a href="/wiki/16th_Golden_Globe_Awards" title="16th Golden Globe Awards">1958</a></td>
<td style="background:#b0c4de; text-align:left;"><i><b><a href="/wiki/Auntie_Mame_(film)" title="Auntie Mame (film)">Auntie Mame</a></b></i> *</td>
<td style="background:#B0C4DE;"><b><a href="/wiki/Morton_DaCosta" title="Morton DaCosta">Morton DaCosta</a></b></td>
<td style="background:#B0C4DE;"><b>Morton DaCosta</b></td>
<td style="background:#90ee90; text-align:left;"><i><b><a href="/wiki/Gigi_(1958_film)" title="Gigi (1958 film)">Gigi</a></b></i> †</td>
<td style="background:#90EE90;"><b><a href="/wiki/Vincente_Minnelli" title="Vincente Minnelli">Vincente Minnelli</a></b></td>
<td style="background:#90EE90;"><b>Arthur Freed</

Traceback (most recent call last):
  File "<ipython-input-10-dd730b002cbe>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-10-dd730b002cbe>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-10-dd730b002cbe>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-10-dd730b002cbe>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-10-dd730b002cbe>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):


In [14]:
save_awards(GLOBES_BEST_COMEDY, globe_comedy_results)

# Oscar Best Film

In [35]:
oscar_soup = BeautifulSoup(rq.get('https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture').text, 'lxml')

In [63]:
oscar_results = []
current_year = 1
for table in oscar_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) == 1:
            current_year = int(re.search('[\d]{4}', columns[0].text).group(0))
        elif len(columns) == 2:
            film_col = columns[0]
            if row.get('style') == 'background:#FAEB86':
                winner = True
            else:
                winner = False
            try:
                a = film_col.find('i').find('a')
                oscar_results.append((current_year, a.get('title'), a.get('href'), winner))
            except:
                print(f"Problem with {row}")
                traceback.print_exc()
        else:
            print(f"Wrong number of columns in {row}")

Wrong number of columns in <tr>
<td><a href="/wiki/20th_Century_Fox" title="20th Century Fox">20th Century Fox</a>
</td>
<td>60
</td>
<td>8
</td></tr>
Wrong number of columns in <tr>
<td><a href="/wiki/Columbia_Pictures" title="Columbia Pictures">Columbia Pictures</a>
</td>
<td>56
</td>
<td>12
</td></tr>
Wrong number of columns in <tr>
<td><a href="/wiki/Metro-Goldwyn-Mayer" title="Metro-Goldwyn-Mayer">Metro-Goldwyn-Mayer</a>
</td>
<td>40
</td>
<td>9
</td></tr>
Wrong number of columns in <tr>
<td><a href="/wiki/Universal_Pictures" title="Universal Pictures">Universal Pictures</a>
</td>
<td>34
</td>
<td>8
</td></tr>
Wrong number of columns in <tr>
<td><a href="/wiki/Warner_Bros." title="Warner Bros.">Warner Bros. Pictures</a>
</td>
<td>25
</td>
<td>9
</td></tr>
Wrong number of columns in <tr>
<td><a href="/wiki/Paramount_Pictures" title="Paramount Pictures">Paramount Pictures</a>
</td>
<td>20
</td>
<td>11
</td></tr>
Wrong number of columns in <tr>
<td><a href="/wiki/Fox_Searchlight_Pict

In [69]:
save_awards('oscars', oscar_results)

# Critics Choice

In [88]:
cc_all_soup = BeautifulSoup(rq.get('https://en.wikipedia.org/wiki/Critics%27_Choice_Movie_Awards').text, 'lxml')

In [89]:
cc_urls = [li.find('a').get('href')
           for li in cc_all_soup.find_all('td', {'class': 'navbox-list navbox-odd hlist'})[1].find_all('li')][1:]

In [129]:
results_1996_2000 = []
for year, cc_year_url in list(zip(range(1996, 2001), cc_urls[:5])):
    cc_soup = BeautifulSoup(rq.get(WIKI_BASE + cc_year_url).text, 'lxml')
    winner_url = [x for x in cc_soup.find_all('b') if x.text.startswith('Best Picture')][0].findNext('a').get('href')
    for x in cc_soup.find('ol').find_all('li'):
        a = x.find('a')
        href, title = a.get('href'), a.get('title')
        winner = href == winner_url
        results_1996_2000.append((year, title, href, winner))

In [147]:
results_2001_2013 = []
for year, cc_year_url in list(zip(range(2001, 2014), cc_urls[5:18])):
    print(year)
    cc_soup = BeautifulSoup(rq.get(WIKI_BASE + cc_year_url).text, 'lxml')
    winner_tag = cc_soup.find('span', {'class': 'mw-headline', 'id': 'Best_Picture'}).findNext('p').find('a')
    results_2001_2013.append((year, winner_tag.get('title'), winner_tag.get('href'), True))
    for a in winner_tag.findNext('ul').find_all('a'):
        results_2001_2013.append((year, a.get('title'), a.get('href'), False)) 

2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013


In [161]:
results_2014_2018 = []
for year, cc_year_url in list(zip(range(2014, 2019), cc_urls[18:])):
    print(year)
    cc_soup = BeautifulSoup(rq.get(WIKI_BASE + cc_year_url).text, 'lxml')
    winner = True
    for a in next(x for x in cc_soup.find_all('div') if x.text.startswith('Best Picture')).findNext('ul').find_all('a'):
        results_2014_2018.append((year, a.get('title'), a.get('href'), winner))
        winner = False

2014
2015
2016
2017
2018


In [167]:
save_awards(CRITICS_BEST_FILM, results_1996_2000 + results_2001_2013 + results_2014_2018)