# INIT

In [None]:
import requests as rq
import re
import traceback
import datetime
from collections import Counter
from bs4 import BeautifulSoup

from storage import session_scope, MovieAward
from constants import GLOBES_BEST_DRAMA, GLOBES_BEST_COMEDY, CRITICS_BEST_FILM, \
    PGA_BEST_FILM, DGA_BEST_FILM, BAFTA_BEST_FILM, CANNES_BEST_FILM, SAG_BEST_FILM

In [None]:
# Scheme and host prepended to the relative article hrefs scraped from Wikipedia pages.
WIKI_BASE = 'https://en.wikipedia.org'

In [None]:
def save_awards(category, awards):
    """Store one award category's films as ``MovieAward`` rows.

    ``awards`` is an iterable of ``(year, title, wiki_url, winner)`` tuples.
    ``year`` is passed through ``int()``, so ints and numeric strings both
    work. The stored ``award_date`` is always 1 January of that year.
    All rows are written with a single ``bulk_save_objects`` call inside one
    ``session_scope()`` block.
    """
    with session_scope() as session:
        rows = []
        for year, title, wiki_url, winner in awards:
            award_day = datetime.date(int(year), 1, 1)
            rows.append(
                MovieAward(
                    award_category=category,
                    movie_title=title,
                    movie_wiki_url=wiki_url,
                    winner=winner,
                    award_date=award_day,
                )
            )
        session.bulk_save_objects(rows)

# Cannes

In [None]:
# Download and parse the Palme d'Or overview page.
# A timeout keeps the cell from hanging forever, and raise_for_status() makes an
# HTTP error fail here instead of an error page being parsed silently.
winners_response = rq.get('https://en.wikipedia.org/wiki/Palme_d%27Or', timeout=30)
winners_response.raise_for_status()
winners_soup = BeautifulSoup(winners_response.text, 'lxml')

In [None]:
# Find the div with the id below, then take every <li> of the first <ul> that follows it.
# (The next cell repeats this lookup before building the winners mapping.)
winning_films_box = winners_soup.find('div', {'id': 'Palme_d&#039;Or_winning_films'})
winner_elements = winning_films_box.findNext('ul').find_all('li')

In [None]:
# Build a mapping {article href: (year, title)} for every listed Palme d'Or winner.
winner_elements = winners_soup.find('div', {'id': 'Palme_d&#039;Or_winning_films'}).find_next('ul').find_all('li')
winners = {}
for wel in winner_elements:
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    year = int(re.search(r'\d{4}', wel.text).group(0))
    a = wel.find('a')
    href = a.get('href')
    title = a.get('title')
    # Keyed by href so later cells can test membership with `href in winners`.
    winners[href] = (year, title)

In [None]:
# Festival years whose "In competition" section is read from a table rather than a bullet list.
table_years = {1991, 1993, 1994, *range(2007, 2019)}

In [None]:
# For every festival year, collect the films listed under "In competition" as
# (year, title, href, winner) tuples; winner is True when the href is a key of `winners`.
cannes_results = []
for year in range(1970, 2019):
    print(year)
    response = rq.get(f'https://en.wikipedia.org/wiki/{year}_Cannes_Film_Festival', timeout=30)
    # Fail on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    # A default of None lets a missing section be reported with context;
    # next() without a default would raise a bare StopIteration.
    tag = next(
        (x for x in soup.find_all('span', {'class': 'mw-headline'})
         if x.text.lower().startswith('in competition')),
        None,
    )
    if tag is None:
        raise ValueError(f"No 'In competition' section found for {year}")
    if year in table_years:
        # Table layout: skip the first row of the table body.
        elements = tag.find_next('tbody').find_all('tr')[1:]
    else:
        elements = tag.find_next('ul').find_all('li')
    for el in elements:
        # Search inside the element only; find_next() would continue past it
        # and could return a link that belongs to a later element.
        a = el.find('a')
        if a is None:
            print(f"Problem with {el}")
            continue
        href, title = a.get('href'), a.get('title')
        winner = href in winners
        cannes_results.append((year, title, href, winner))

In [None]:
# De-duplicate the scraped tuples before saving; going through set() leaves them in arbitrary order.
save_awards(CANNES_BEST_FILM, list(set(cannes_results)))

# SAG

In [None]:
# Download and parse the SAG cast-award page.
# The timeout and raise_for_status() make network and HTTP failures surface here.
sag_response = rq.get('https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award_for_Outstanding_Performance_by_a_Cast_in_a_Motion_Picture', timeout=30)
sag_response.raise_for_status()
sag_soup = BeautifulSoup(sag_response.text, 'lxml')

In [None]:
# Walk every wikitable on the SAG page and record (year, title, href, winner) per film row.
sag_results = []
current_year = 1  # placeholder until the first row that carries a year cell
for table in sag_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) == 3:
            # Rows with three cells carry the year in the first cell.
            current_year = int(re.search(r'\d{4}', columns[0].text).group(0))
            film_col = columns[1]
        elif len(columns) == 2:
            film_col = columns[0]
        else:
            print(f"Wrong number of columns in {row}")
            # Skip the row: falling through would reuse the previous row's
            # film cell (or hit a NameError on the very first row).
            continue

        # A film counts as the winner when its cell has exactly this inline style.
        winner = film_col.get('style') == 'background:#FAEB86;'
        try:
            a = film_col.find('a')
            sag_results.append((current_year, a.get('title'), a.get('href'), winner))
        except AttributeError:
            # find() returned None: the cell has no link.
            print(f"Problem with {row}")
            traceback.print_exc()
            

In [None]:
# Store the rows collected in sag_results under the SAG_BEST_FILM category.
save_awards(SAG_BEST_FILM, sag_results)

# BAFTA

In [None]:
# Download and parse the BAFTA Best Film page.
# The timeout and raise_for_status() make network and HTTP failures surface here.
bafta_response = rq.get('https://en.wikipedia.org/wiki/BAFTA_Award_for_Best_Film', timeout=30)
bafta_response.raise_for_status()
bafta_soup = BeautifulSoup(bafta_response.text, 'lxml')

In [None]:
# Walk the BAFTA wikitables (the first two are skipped) and record
# (year, title, href, winner) per film row.
bafta_results = []
current_year = 1  # placeholder until the first single-cell year row
for table in bafta_soup.find_all('table', {'class': 'wikitable'})[2:]:
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) == 1:
            # A single-cell row only announces the year for the rows after it.
            current_year = int(re.search(r'\d{4}', columns[0].text).group(0))
            continue
        elif len(columns) == 5:
            film_col = columns[1]
        elif len(columns) == 4:
            film_col = columns[0]
        else:
            print(f"Wrong number of columns in {row}")
            # Skip the row: falling through would reuse the previous row's
            # film cell (or hit a NameError if no film row came before).
            continue

        # A film counts as the winner when its cell has exactly this inline style.
        winner = film_col.get('style') == 'background:#ccc;'
        try:
            a = film_col.find('a')
            bafta_results.append((current_year, a.get('title'), a.get('href'), winner))
        except AttributeError:
            # find() returned None: the cell has no link.
            print(f"Problem with {row}")
            traceback.print_exc()
            

In [None]:
# Store the rows collected in bafta_results under the BAFTA_BEST_FILM category.
save_awards(BAFTA_BEST_FILM, bafta_results)

# DGA

In [None]:
# Download and parse the DGA feature-film directing award page.
# The timeout and raise_for_status() make network and HTTP failures surface here.
dga_response = rq.get('https://en.wikipedia.org/wiki/Directors_Guild_of_America_Award_for_Outstanding_Directing_%E2%80%93_Feature_Film', timeout=30)
dga_response.raise_for_status()
dga_soup = BeautifulSoup(dga_response.text, 'lxml')

In [None]:
# Walk every wikitable on the DGA page and record (year, title, href, winner) per film row.
dga_results = []
current_year = 1  # placeholder until the first row that carries a year cell
for table in dga_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) < 2:
            # columns[1] is read below; skip rows too short to index
            # instead of aborting the scrape with an IndexError.
            print(f"Wrong number of columns in {row}")
            continue
        if len(columns) == 4:
            # Four-cell rows carry the year in the first cell and the film in the third.
            current_year = int(re.search(r'\d{4}', columns[0].text).group(0))
            film_col = columns[2]
        else:
            film_col = columns[1]
        # The winner flag is read from the second cell's inline style in both layouts.
        winner = columns[1].get('style') == 'background:#FAEB86;'
        try:
            a = film_col.find('i').find('a')
            dga_results.append((current_year, a.get('title'), a.get('href'), winner))
        except AttributeError:
            # One of the find() calls returned None: no <i> or no link in the film cell.
            print(f"Problem with {row}")
            traceback.print_exc()

In [None]:
# Store the rows collected in dga_results under the DGA_BEST_FILM category.
save_awards(DGA_BEST_FILM, dga_results)

# PGA

In [None]:
# Download and parse the PGA theatrical motion picture award page.
# The timeout and raise_for_status() make network and HTTP failures surface here.
pga_response = rq.get('https://en.wikipedia.org/wiki/Producers_Guild_of_America_Award_for_Best_Theatrical_Motion_Picture', timeout=30)
pga_response.raise_for_status()
pga_soup = BeautifulSoup(pga_response.text, 'lxml')

In [None]:
# Walk every wikitable on the PGA page and record (year, title, href, winner) per film row.
pga_results = []
current_year = 1  # placeholder until the first row that carries a year cell
for table in pga_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) < 2:
            # columns[1] is read below; skip rows too short to index
            # instead of aborting the scrape with an IndexError.
            print(f"Wrong number of columns in {row}")
            continue
        if len(columns) == 4:
            # Four-cell rows carry the year in the first cell and the film in the second.
            current_year = int(re.search(r'\d{4}', columns[0].text).group(0))
            film_col = columns[1]
        else:
            film_col = columns[0]
        # NOTE(review): the winner flag is always read from columns[1], which is not
        # the film cell in the non-four-cell layout — confirm this is intended.
        winner = columns[1].get('style') == 'background:#FAEB86;'
        try:
            a = film_col.find('i').find('a')
            pga_results.append((current_year, a.get('title'), a.get('href'), winner))
        except AttributeError:
            # One of the find() calls returned None: no <i> or no link in the film cell.
            print(f"Problem with {row}")
            traceback.print_exc()

In [None]:
# Store the rows collected in pga_results under the PGA_BEST_FILM category.
save_awards(PGA_BEST_FILM, pga_results)

# Golden Globes DRAMA

In [None]:
# Download and parse the Golden Globe Best Motion Picture (Drama) page.
# The timeout and raise_for_status() make network and HTTP failures surface here.
drama_response = rq.get('https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Motion_Picture_%E2%80%93_Drama', timeout=30)
drama_response.raise_for_status()
soup = BeautifulSoup(drama_response.text, 'lxml')

In [None]:
# Scrape the Golden Globe (Drama) tables into (year, title, href, winner) tuples and save them.
globe_drama_results = []
current_year = 1  # placeholder until the first row that carries a year cell
for table in soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) < 2:
            # columns[1] is read below; skip rows too short to index
            # instead of aborting the scrape with an IndexError.
            print(f"Wrong number of columns in {row}")
            continue
        if len(columns) == 4:
            # Keep only the text before the first '[' (presumably a footnote
            # marker — confirm). The year stays a string at this point.
            current_year = columns[0].text.split('[')[0]
            film_col = columns[1]
        else:
            film_col = columns[0]
        # Any inline style on the second cell marks the row as the winner.
        winner = bool(columns[1].get('style'))
        # Handle rows without <i><a> the same way the other table scrapers in
        # this file do, so one malformed row no longer aborts the whole run.
        try:
            a = film_col.find('i').find('a')
            globe_drama_results.append((current_year, a.get('title'), a.get('href'), winner))
        except AttributeError:
            print(f"Problem with {row}")
            traceback.print_exc()
save_awards(GLOBES_BEST_DRAMA, globe_drama_results)

# Golden Globes COMEDY

In [None]:
# Download and parse the Golden Globe Best Motion Picture (Musical or Comedy) page.
# The timeout and raise_for_status() make network and HTTP failures surface here.
comedy_response = rq.get('https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Motion_Picture_%E2%80%93_Musical_or_Comedy', timeout=30)
comedy_response.raise_for_status()
globes_comedy_soup = BeautifulSoup(comedy_response.text, 'lxml')

In [None]:
# Scrape the Golden Globe (Musical or Comedy) tables into (year, title, href, winner) tuples.
globe_comedy_results = []
current_year = 1  # placeholder until the first row that carries a year cell
for table in globes_comedy_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) < 2:
            # columns[1] is read below; skip rows too short to index
            # instead of aborting the scrape with an IndexError.
            print(f"Wrong number of columns in {row}")
            continue
        if len(columns) == 4:
            # Keep only the text before the first '[' (presumably a footnote
            # marker — confirm). The year stays a string at this point.
            current_year = columns[0].text.split('[')[0]
            film_col = columns[1]
        else:
            film_col = columns[0]
        # Any inline style on the second cell marks the row as the winner.
        winner = bool(columns[1].get('style'))
        try:
            a = film_col.find('i').find('a')
            globe_comedy_results.append((current_year, a.get('title'), a.get('href'), winner))
        except AttributeError:
            # One of the find() calls returned None: no <i> or no link in the film cell.
            print(f"Problem with {row}")
            traceback.print_exc()

In [None]:
# Store the rows collected in globe_comedy_results under the GLOBES_BEST_COMEDY category.
save_awards(GLOBES_BEST_COMEDY, globe_comedy_results)

# Oscar Best Film

In [None]:
# Download and parse the Academy Award for Best Picture page.
# The timeout and raise_for_status() make network and HTTP failures surface here.
oscar_response = rq.get('https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture', timeout=30)
oscar_response.raise_for_status()
oscar_soup = BeautifulSoup(oscar_response.text, 'lxml')

In [None]:
# Walk every wikitable on the Best Picture page and record (year, title, href, winner) per film row.
oscar_results = []
current_year = 1  # placeholder until the first single-cell year row
for table in oscar_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) == 1:
            # A single-cell row only announces the year for the rows after it.
            current_year = int(re.search(r'\d{4}', columns[0].text).group(0))
        elif len(columns) == 2:
            film_col = columns[0]
            # Here the winner marker is an inline style on the row itself, not on a cell.
            winner = row.get('style') == 'background:#FAEB86'
            try:
                a = film_col.find('i').find('a')
                oscar_results.append((current_year, a.get('title'), a.get('href'), winner))
            except AttributeError:
                # One of the find() calls returned None: no <i> or no link in the film cell.
                print(f"Problem with {row}")
                traceback.print_exc()
        else:
            print(f"Wrong number of columns in {row}")

In [None]:
# Store the rows collected in oscar_results.
# NOTE(review): the category is the string literal 'oscars', while every other section
# passes a constant imported from `constants` — confirm this value is the intended one.
save_awards('oscars', oscar_results)

# Critics Choice

In [None]:
# Download and parse the Critics' Choice Movie Awards overview page.
# The timeout and raise_for_status() make network and HTTP failures surface here.
cc_all_response = rq.get('https://en.wikipedia.org/wiki/Critics%27_Choice_Movie_Awards', timeout=30)
cc_all_response.raise_for_status()
cc_all_soup = BeautifulSoup(cc_all_response.text, 'lxml')

In [None]:
# Take the second matching navbox cell, read the first link of each of its list
# items, and drop the first href; later cells pair the rest with years from 1996 on.
navbox_cells = cc_all_soup.find_all('td', {'class': 'navbox-list navbox-odd hlist'})
navbox_items = navbox_cells[1].find_all('li')
navbox_hrefs = [item.find('a').get('href') for item in navbox_items]
cc_urls = navbox_hrefs[1:]

In [None]:
# 1996-2000 pages: every film in the page's first ordered list is recorded, and the
# one whose href matches the link following the "Best Picture" bold text is the winner.
results_1996_2000 = []
for year, cc_year_url in zip(range(1996, 2001), cc_urls[:5]):
    response = rq.get(WIKI_BASE + cc_year_url, timeout=30)
    # Fail on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    cc_soup = BeautifulSoup(response.text, 'lxml')
    # Report a missing heading with context instead of an opaque IndexError.
    best_picture = next(
        (x for x in cc_soup.find_all('b') if x.text.startswith('Best Picture')),
        None,
    )
    if best_picture is None:
        raise ValueError(f"No 'Best Picture' entry found for {year}")
    winner_url = best_picture.find_next('a').get('href')
    for x in cc_soup.find('ol').find_all('li'):
        a = x.find('a')
        if a is None:
            # List item without a link: report and skip it rather than crash.
            print(f"Problem with {x}")
            continue
        href, title = a.get('href'), a.get('title')
        winner = href == winner_url
        results_1996_2000.append((year, title, href, winner))

In [None]:
# 2001-2013 pages: the winner is the first link in the paragraph after the
# "Best Picture" headline; every link in the following <ul> is recorded as a non-winner.
results_2001_2013 = []
for year, cc_year_url in zip(range(2001, 2014), cc_urls[5:18]):
    print(year)
    response = rq.get(WIKI_BASE + cc_year_url, timeout=30)
    # Fail on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    cc_soup = BeautifulSoup(response.text, 'lxml')
    winner_tag = cc_soup.find('span', {'class': 'mw-headline', 'id': 'Best_Picture'}).find_next('p').find('a')
    results_2001_2013.append((year, winner_tag.get('title'), winner_tag.get('href'), True))
    for a in winner_tag.find_next('ul').find_all('a'):
        results_2001_2013.append((year, a.get('title'), a.get('href'), False))

In [None]:
# 2014-2018 pages: take the first <ul> after the div whose text starts with
# "Best Picture"; its first link is recorded as the winner, the rest as nominees.
results_2014_2018 = []
for year, cc_year_url in zip(range(2014, 2019), cc_urls[18:]):
    print(year)
    response = rq.get(WIKI_BASE + cc_year_url, timeout=30)
    # Fail on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    cc_soup = BeautifulSoup(response.text, 'lxml')
    # A default of None lets a missing block be reported with context;
    # next() without a default would raise a bare StopIteration.
    best_picture = next(
        (x for x in cc_soup.find_all('div') if x.text.startswith('Best Picture')),
        None,
    )
    if best_picture is None:
        raise ValueError(f"No 'Best Picture' block found for {year}")
    winner = True
    for a in best_picture.find_next('ul').find_all('a'):
        results_2014_2018.append((year, a.get('title'), a.get('href'), winner))
        # Only the first link is flagged as the winner.
        winner = False

In [None]:
# Concatenate the three per-era result lists and store them under the CRITICS_BEST_FILM category.
save_awards(CRITICS_BEST_FILM, results_1996_2000 + results_2001_2013 + results_2014_2018)