# 1-Scraping Wikipedia for Various Awards Show Winners
- Using the theory that previous winners may be indicative of Oscar wins, I look at various awards shows to find winners
- This code is heavily inspired by GitHub user Buzdygan, and I owe my scraping success to him
- The code is merged into my ML table in table_assembling.ipynb

In [1]:
import requests as rq
import re
import datetime
import traceback
from collections import Counter
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
import codecs
import lxml

## Oscar DataFrame (Best Picture Only)

In [2]:
# Download the Best Picture page. The timeout keeps the request from hanging
# forever, and raise_for_status() stops an HTTP error page from being parsed
# (which would silently write an empty CSV).
oscar_response = rq.get('https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture', timeout=30)
oscar_response.raise_for_status()
oscar_soup = BeautifulSoup(oscar_response.text, 'lxml')

oscar_results = []
current_year = 1  # placeholder until the first year row is parsed
# iterate through all table tags in the html that have class 'wikitable'
for table in oscar_soup.find_all('table', {'class': 'wikitable'}):
    # iterate through the rows, skipping the table header
    for row in table.find_all('tr')[1:]:
        # each table data element in a row represents a column
        columns = row.find_all('td')
        if len(columns) == 1:
            # a row with a single cell contains the year the following rows belong to
            current_year = int(re.search(r'\d{4}', columns[0].text).group(0))
        elif len(columns) == 2:
            # a two-cell row describes a single film; its name is in the first cell
            film_col = columns[0]
            # this is the background used by wiki to indicate a winner
            winner = row.get('style') == 'background:#FAEB86'
            # extract title and url from the <a> element; rows without the
            # expected <i><a title=...> markup are reported and skipped
            try:
                a = film_col.find('i').find('a')
                title = a.get('title')
                # drop a trailing disambiguation such as "(film)"
                title_bare = title.split('(')[0]
                oscar_results.append((current_year, title_bare, a.get('href'), winner))
            except Exception:
                traceback.print_exc()
        # rows with any other number of cells carry no nominee and are ignored

pd.DataFrame(oscar_results, columns=['year', 'film', 'wiki', 'winner']).to_csv('./data/scraping_results/osc_bp.csv', index=False)

# Oscar Nomination Count

In [23]:
from io import StringIO

# This table only includes films which have won at least one award
url = 'https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films'
# The second positional argument of requests.get is `params` (the query string),
# not a parser name, so the parser is given to BeautifulSoup instead.
page = requests.get(url, timeout=30)
page.raise_for_status()  # do not parse an HTTP error page as if it were the article
nom_soup = BeautifulSoup(page.content, 'lxml')
tables = nom_soup.find_all("table", {"class": "wikitable"})

# Extract wikipedia names
# NOTE(review): `t` and the three lists are rebuilt for every table, so only the
# last wikitable on the page is carried forward -- confirm this is intended.
for table in tables:
    links = table.find_all('a')
    # read_html is given a file-like object; passing literal HTML is deprecated in recent pandas
    t = pd.read_html(StringIO(table.prettify()))
    # title attribute, visible text and target of every link in the table
    titles = [link.get('title') for link in links]
    titles_text = [link.text for link in links]
    hrefs = [link.get('href') for link in links]
        
# Build the link DataFrame (visible text, title attribute, href) used for the merge.
# Links whose title contains no_no point to year pages such as "2022 in film"
# rather than to a film, so they are left out.
no_no = 'in film'
col_names = ['Film', 'film', 'wiki']
to_scrape_df = pd.DataFrame(
    [
        (text, title, href)
        for text, title, href in zip(titles_text, titles, hrefs)
        if no_no not in str(title)
    ],
    columns=col_names,
)
# keep one row per displayed film name (removes repeats)
to_scrape_df = to_scrape_df.groupby('Film').max().reset_index()
# Prepare pd table for merge
def remove_parens(x):
    """Return the part of *x* before its first '(' with surrounding whitespace removed."""
    head, _, _ = x.partition('(')
    return head.strip()

# Clean the 'Film' column of t[0] (the DataFrame pandas parsed from the table):
# 'film_clean' is the name without any parenthesised suffix, 'film_dirty' keeps the
# original text, and 'Film' is replaced by the clean name before collapsing to one
# row per film. assign() evaluates its arguments in order, so 'film_dirty' still
# sees the original 'Film' values.
t[0] = (
    t[0]
    .assign(
        film_clean=lambda df: df['Film'].apply(remove_parens),
        film_dirty=lambda df: df['Film'],
        Film=lambda df: df['film_clean'],
    )
    .groupby('Film')
    .max()
    .reset_index()
)

def remove_bracks(x):
    """Return the integer written before any '[...]' marker in *x*.

    *x* is converted to text first, so a cell that is already a number
    (rather than a string such as '11[2]') is accepted as well.

    Raises:
        ValueError: if the text before the first '[' is not an integer.
    """
    return int(str(x).split('[')[0].strip())

# Join the scraped links with the nominations table on the displayed film name,
# keeping only the columns needed for the export.
ml_df = to_scrape_df.merge(t[0], how='inner', on='Film')[
    ['Year', 'film', 'wiki', 'Nominations', 'Film']
]
# strip the bracketed markers from the nomination counts and convert them to int
ml_df['Nominations'] = ml_df['Nominations'].apply(remove_bracks)
# Output order. 'film_dirty' (the link's title attribute) comes last because it is
# dropped right after the reorder; 'film' is the cleaned name from the table.
col_list = ['year', 'film', 'wiki', 'nominations', 'film_dirty']
ml_df = ml_df.rename(
    columns={
        'Year': 'year',
        'film': 'film_dirty',
        'Nominations': 'nominations',
        'Film': 'film',
    }
)
ml_df = ml_df[col_list].drop(columns='film_dirty')
ml_df.to_csv('./data/scraping_results/noms.csv', index=False)

# Directors Guild Awards

In [10]:
# Download the DGA feature-film award page. The timeout keeps the request from
# hanging forever, and raise_for_status() stops an HTTP error page from being
# parsed as if it were the article.
dga_response = rq.get('https://en.wikipedia.org/wiki/Directors_Guild_of_America_Award_for_Outstanding_Directing_%E2%80%93_Feature_Film', timeout=30)
dga_response.raise_for_status()
dga_soup = BeautifulSoup(dga_response.text, 'lxml')

In [11]:
dga_results = []
current_year = 1  # placeholder until the first row carrying a year is parsed
for table in dga_soup.find_all('table', {'class': 'wikitable'}):
    # skip the header row of every table
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        # the code below reads the second cell, so rows with fewer than two
        # data cells cannot describe a nominee and are skipped
        if len(columns) < 2:
            continue
        if len(columns) == 4:
            # a four-cell row starts a new year: year in the first cell, film in the third
            current_year = int(re.search(r'\d{4}', columns[0].text).group(0))
            film_col = columns[2]
        else:
            film_col = columns[1]
        # the winner flag is read from the style of the second cell
        winner = columns[1].get('style') == 'background:#FAEB86;'
        # rows without the expected <i><a> markup are reported and skipped
        try:
            a = film_col.find('i').find('a')
            dga_results.append((current_year, a.get('title'), a.get('href'), winner))
        except Exception:
            print(f"Problem with {row}")
            traceback.print_exc()
pd.DataFrame(dga_results, columns=['year', 'film', 'wiki', 'winner']).to_csv('./data/scraping_results/dgas.csv', index=False)

# BAFTAs

In [12]:
# Download the BAFTA Best Film page; fail on a timeout or HTTP error instead of
# parsing an error page.
bafta_response = rq.get('https://en.wikipedia.org/wiki/BAFTA_Award_for_Best_Film', timeout=30)
bafta_response.raise_for_status()
bafta_soup = BeautifulSoup(bafta_response.text, 'lxml')
# Here the code is a bit different: besides <tr> rows with a single <td>, the year
# can also be stored in the first <td> of a <tr> that has five <td> cells.
bafta_results = []
current_year = 1  # placeholder until the first row carrying a year is parsed
# the first two wikitables on the page are skipped
for table in bafta_soup.find_all('table', {'class': 'wikitable'})[2:]:
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) == 1:
            # a lone cell only announces the year of the rows that follow
            current_year = int(re.search(r'\d{4}', columns[0].text).group(0))
            continue
        elif len(columns) == 5:
            # the year is in the first cell (in bold when a <b> tag is present)
            year_col = columns[0]
            bold = year_col.find('b')
            year_text = (bold if bold is not None else year_col).text
            # store the year as int, like the single-cell branch does; the raw
            # text is kept only if it contains no four-digit number
            year_match = re.search(r'\d{4}', year_text)
            current_year = int(year_match.group(0)) if year_match else year_text
            film_col = columns[1]
        # it is 4 when the row also contains a country and 3 if the country is specified above
        elif len(columns) in (3, 4):
            film_col = columns[0]
        else:
            print(f"Wrong number of columns in {row}", len(columns))
            # no film cell was identified; do not reuse the one from the previous row
            continue

        winner = film_col.get('style') == 'background:#ccc;'
        # rows whose film cell holds no link are reported and skipped
        try:
            a = film_col.find('a')
            bafta_results.append((current_year, a.get('title'), a.get('href'), winner))
        except Exception:
            print(f"Problem with {row}")
            traceback.print_exc()
pd.DataFrame(bafta_results, columns=['year', 'film', 'wiki', 'winner']).to_csv('./data/scraping_results/bafta.csv', index=False)

# Producers Guild Awards

In [14]:
# Download the PGA theatrical motion picture award page. The timeout keeps the
# request from hanging forever, and raise_for_status() stops an HTTP error page
# from being parsed as if it were the article.
pga_response = rq.get('https://en.wikipedia.org/wiki/Producers_Guild_of_America_Award_for_Best_Theatrical_Motion_Picture', timeout=30)
pga_response.raise_for_status()
pga_soup = BeautifulSoup(pga_response.text, 'lxml')

In [15]:
pga_results = []
current_year = 1  # placeholder until the first row carrying a year is parsed
for table in pga_soup.find_all('table', {'class': 'wikitable'}):
    # skip the header row of every table
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        # the winner check below reads the second cell, so rows with fewer than
        # two data cells are skipped
        if len(columns) < 2:
            continue
        if len(columns) == 4:
            # a four-cell row starts a new year: year in the first cell, film in the second
            current_year = int(re.search(r'\d{4}', columns[0].text).group(0))
            film_col = columns[1]
        else:
            film_col = columns[0]
        # NOTE(review): the style is always read from the second cell, even when the
        # film is in the first one -- confirm the page highlights that cell too.
        winner = columns[1].get('style') == 'background:#FAEB86;'
        # rows whose film cell has no italic title are skipped without a message
        italic = film_col.find('i')
        if italic is None:
            continue
        try:
            a = italic.find('a')
            pga_results.append((current_year, a.get('title'), a.get('href'), winner))
        except Exception:
            traceback.print_exc()

pd.DataFrame(pga_results, columns=['year', 'film', 'wiki', 'winner']).to_csv('./data/scraping_results/pga.csv', index=False)

## Screen Actors Guild Awards (Ensemble Only)

In [16]:
# Download the SAG ensemble award page. The timeout keeps the request from
# hanging forever, and raise_for_status() stops an HTTP error page from being
# parsed as if it were the article.
sag_response = rq.get('https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award_for_Outstanding_Performance_by_a_Cast_in_a_Motion_Picture', timeout=30)
sag_response.raise_for_status()
sag_soup = BeautifulSoup(sag_response.text, 'lxml')

In [17]:
sag_results = []
current_year = 1  # placeholder until the first row carrying a year is parsed
for table in sag_soup.find_all('table', {'class': 'wikitable'}):
    # skip the header row of every table
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) == 3:
            # a three-cell row starts a new year: year in the first cell, film in the second
            current_year = int(re.search(r'\d{4}', columns[0].text).group(0))
            film_col = columns[1]
        elif len(columns) == 2:
            film_col = columns[0]
        else:
            print(f"Wrong number of columns in {row}")
            # no film cell was identified; do not reuse the one from the previous row
            continue

        winner = film_col.get('style') == 'background:#FAEB86;'
        # rows whose film cell holds no link are reported and skipped
        try:
            a = film_col.find('a')
            sag_results.append((current_year, a.get('title'), a.get('href'), winner))
        except Exception:
            print(f"Problem with {row}")
            traceback.print_exc()

sag_df = pd.DataFrame(sag_results, columns=['year', 'film', 'wiki', 'winner'])
sag_df.to_csv('./data/scraping_results/sag_ensemble.csv', index=False)

## Golden Globes (Two-Parter: Drama and Comedy)

In [18]:
# Download the Golden Globe (Drama) page; fail on a timeout or HTTP error instead
# of parsing an error page.
globe_drama_response = rq.get('https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Motion_Picture_%E2%80%93_Drama', timeout=30)
globe_drama_response.raise_for_status()
soup = BeautifulSoup(globe_drama_response.text, 'lxml')

globe_drama_results = []
current_year = 1  # placeholder until the first row carrying a year is parsed
pattern = re.compile(r"\bGolden Globe Awards\b")
for table in soup.find_all('table', {'class': 'wikitable'}):
    # skip the header row of every table
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        # the winner check below reads the second cell, so rows with fewer than
        # two data cells are skipped
        if len(columns) < 2:
            continue
        if len(columns) == 4:
            # a four-cell row starts a new year; the text before any '[' marker is kept
            current_year = columns[0].text.split('[')[0]
            film_col = columns[1]
        else:
            # when the first cell links to a page titled "... Golden Globe Awards"
            # the film is in the second cell, otherwise it is in the first one
            first_link = columns[0].find('a')
            first_title = first_link.get('title') if first_link is not None else None
            if first_title and pattern.search(first_title):
                film_col = columns[1]
            else:
                film_col = columns[0]
        # any style attribute on the second cell marks the row as the winner
        winner = bool(columns[1].get('style'))

        # rows without the expected <i><a title=...> markup are reported and
        # skipped instead of aborting the whole scrape
        try:
            a = film_col.find('i').find('a')
            title = a.get('title')
            # drop a trailing disambiguation such as "(film)"
            title_bare = title.split('(')[0]
            globe_drama_results.append((current_year, title_bare, a.get('href'), winner))
        except Exception:
            print(f"Problem with {row}")
            traceback.print_exc()

pd.DataFrame(globe_drama_results, columns=['year', 'film', 'wiki', 'winner']).to_csv('./data/scraping_results/gg_drama.csv', index=False)

In [19]:
# Download the Golden Globe (Musical or Comedy) page; fail on a timeout or HTTP
# error instead of parsing an error page.
globes_comedy_response = rq.get('https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Motion_Picture_%E2%80%93_Musical_or_Comedy', timeout=30)
globes_comedy_response.raise_for_status()
globes_comedy_soup = BeautifulSoup(globes_comedy_response.text, 'lxml')

globe_comedy_results = []
current_year = 1  # placeholder until the first row carrying a year is parsed
for table in globes_comedy_soup.find_all('table', {'class': 'wikitable'}):
    # skip the header row of every table
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        # the winner check below reads the second cell, so rows with fewer than
        # two data cells are skipped
        if len(columns) < 2:
            continue
        if len(columns) == 4:
            # a four-cell row starts a new year; the text before any '[' marker is kept
            current_year = columns[0].text.split('[')[0]
            film_col = columns[1]
        else:
            film_col = columns[0]
        # any style attribute on the second cell marks the row as the winner
        winner = bool(columns[1].get('style'))
        # rows whose film cell has no italic title are skipped without a message
        italic = film_col.find('i')
        if italic is None:
            continue
        try:
            a = italic.find('a')
            title = a.get('title')
            # drop a trailing disambiguation such as "(film)"
            title_bare = title.split('(')[0]
            globe_comedy_results.append((current_year, title_bare, a.get('href'), winner))
        except Exception:
            traceback.print_exc()

pd.DataFrame(globe_comedy_results, columns=['year', 'film', 'wiki', 'winner']).to_csv('./data/scraping_results/gg_comedy.csv', index=False)

# Results
Scraping the above Wikipedia pages has given us one DataFrame for each awards show (two for the Golden Globes) and a DataFrame with the nominations for every Oscar-winning film. These will all be merged together in the [table assembling notebook](https://github.com/njparker1993/oscars_predictions/blob/master/table_assembling.ipynb) to become ML ready.
Since the scraping was all done on Wikipedia, the film titles and links stay consistent across the tables. Below is an example of what one awards show DataFrame looks like.

In [15]:
# preview the first five rows (head() returns five rows by default)
sag_df.head()

Unnamed: 0,year,film,wiki,winner
0,1995,Apollo 13 (film),/wiki/Apollo_13_(film),True
1,1995,Get Shorty (film),/wiki/Get_Shorty_(film),False
2,1995,How to Make an American Quilt,/wiki/How_to_Make_an_American_Quilt,False
3,1995,Nixon (film),/wiki/Nixon_(film),False
4,1995,Sense and Sensibility (film),/wiki/Sense_and_Sensibility_(film),False
