# 2-Table Assembling

This notebook assembles the results of the [scraping](https://github.com/njparker1993/oscars_predictions/blob/master/scraping.ipynb) into a single machine-learning-ready DataFrame.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# All scraped tables sit in one folder, one CSV per source.
SCRAPE_DIR = './data/scraping_results'


def read_scrape(stem):
    """Load `<SCRAPE_DIR>/<stem>.csv` as a DataFrame."""
    return pd.read_csv(f'{SCRAPE_DIR}/{stem}.csv')


# Best Picture Oscar nominees and winners, from 1927 (year, title, wiki, winner T/F)
osc_scrape = read_scrape('osc_bp')
# Oscar nomination counts per film (year, film, link, number of nominations, film_text)
noms = read_scrape('noms')
# Golden Globes (drama), same layout as osc_scrape, from 1943
gg_drama = read_scrape('gg_drama')
# Golden Globes (comedy), same layout as osc_scrape, from 1951
gg_comedy = read_scrape('gg_comedy')
# Producers Guild (year, film, wiki, winner), from 1989
pga = read_scrape('pga')
# BAFTA, from 1960
bafta = read_scrape('bafta')
# Directors Guild, from 1948
dga = read_scrape('dgas')
# Screen Actors Guild, from 1995
sag = read_scrape('sag_ensemble')

# Join on Nominations

In [3]:
# Attach each film's nomination count by title. Titles with no match in
# `noms` are filled with the mean of the known counts, truncated to an int.
noms_merge = noms[['film', 'nominations']].drop_duplicates()
osc_scrape = osc_scrape.merge(noms_merge, how='left', on='film')

known_counts = osc_scrape['nominations'].dropna()
avg_noms = int(known_counts.mean())
osc_scrape['nominations'] = osc_scrape['nominations'].fillna(avg_noms)

## Join on the other awards shows

In [4]:
def table_assemble(main_df, to_add_df, show_name):
    """
    Add nomination / win flags for one awards show to the Oscar table.

    Films are matched between the two tables by title (the ``film``
    column). Two 0/1 integer columns are written to the result:
    ``nom_<show_name>`` (the title appears in the show table with a
    non-missing winner value) and ``winner_<show_name>`` (the title
    appears with winner equal to True).

    Parameters
    ----------
    main_df : pandas.DataFrame
        Table to enrich; must contain a ``film`` column. Not modified.
    to_add_df : pandas.DataFrame
        Scraped show table with exactly four columns, interpreted
        positionally as (year, film, wiki, winner). Not modified.
    show_name : str
        Suffix used to build the two flag column names.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly one row per row of ``main_df`` (index
        reset by the merge) plus the two flag columns.

    Raises
    ------
    ValueError
        If ``to_add_df`` does not have exactly four columns.
    """
    nom_col = 'nom_' + show_name
    win_col = 'winner_' + show_name

    # Rename on a copy so the caller's scraped frame keeps its own headers.
    show_df = to_add_df.copy()
    show_df.columns = ['year', 'film', 'wiki', 'winner_add']

    # Collapse the show table to one row per title *before* joining.
    # A left merge against repeated titles would otherwise duplicate the
    # matching rows of main_df and silently inflate the dataset.
    per_film = pd.DataFrame({
        'film': show_df['film'],
        '_nom_flag': show_df['winner_add'].notna().astype('int64'),
        # `== True` (not `is True`/truthiness) keeps NaN and non-bool values at 0.
        '_win_flag': (show_df['winner_add'] == True).astype('int64'),
    }).groupby('film', as_index=False).max()

    merged = main_df.merge(per_film, on='film', how='left')
    # Titles absent from the show table come back as NaN -> flag 0.
    merged[nom_col] = merged['_nom_flag'].fillna(0).astype('int64')
    merged[win_col] = merged['_win_flag'].fillna(0).astype('int64')
    return merged.drop(columns=['_nom_flag', '_win_flag'])


In [5]:
# Response column: 1 where the `winner` flag is True, 0 everywhere else.
osc_scrape['Oscar_win'] = (osc_scrape['winner'] == True).astype('int64')

In [7]:
# Fold every scraped awards show into the Oscar table, one pair of
# nominee/winner flag columns per show.
scraped_names = ['gg_drama','gg_comedy','pga', 'bafta', 'dga', 'sag']
scraped_dfs = [gg_drama, gg_comedy, pga, bafta, dga, sag]

for show_df, show_label in zip(scraped_dfs, scraped_names):
    osc_scrape = table_assemble(osc_scrape, show_df, show_label)

In [8]:
# Earliest year found in each scraped table; the awards start in different years.
for show_label, show_df in zip(scraped_names, scraped_dfs):
    print(show_label, show_df['year'].min())

gg_drama 1943
gg_comedy 1951
pga 1989
bafta 1960
dga 1948
sag 1995


In [9]:
# Write the assembled table to disk. With default to_csv arguments the
# DataFrame index is written as the first column.
processed_path = './data/processed_results/osc_df'
osc_scrape.to_csv(processed_path)

# Results
The DataFrame is now machine-learning ready: for each awards show, 0/1 flags record whether a given Oscar-nominated film was nominated for, and whether it won, that show's award. The response column is `Oscar_win`. A sample of the final DataFrame is shown below.

In [10]:
# Preview a few rows of the final table, leaving out the two columns at
# positions 2 and 3. A fixed random_state keeps the displayed sample the
# same on every Restart & Run All.
cols = list(osc_scrape.columns)
display_cols = cols[0:2] + cols[4:]
osc_scrape[display_cols].sample(5, random_state=42)

Unnamed: 0,year,film,nominations,Oscar_win,nom_gg_drama,winner_gg_drama,nom_gg_comedy,winner_gg_comedy,nom_pga,winner_pga,nom_bafta,winner_bafta,nom_dga,winner_dga,nom_sag,winner_sag
328,1981,Raiders of the Lost Ark,8.0,0,0,0,0,0,0,0,1,0,1,0,0,0
539,2017,Darkest Hour,7.0,0,0,0,0,0,0,0,0,0,0,0,0,0
433,2002,The Lord of the Rings: The Two Towers,6.0,0,1,0,0,0,1,0,1,0,1,0,1,0
417,1999,The Green Mile,7.0,0,0,0,0,0,0,0,0,0,0,0,0,0
580,2021,West Side Story,7.0,0,0,0,1,1,0,0,0,0,0,0,0,0


In [11]:
# Number of rows in the assembled table.
len(osc_scrape)

591

In [12]:
# Column names included in the preview above.
display_cols

['year',
 'film',
 'nominations',
 'Oscar_win',
 'nom_gg_drama',
 'winner_gg_drama',
 'nom_gg_comedy',
 'winner_gg_comedy',
 'nom_pga',
 'winner_pga',
 'nom_bafta',
 'winner_bafta',
 'nom_dga',
 'winner_dga',
 'nom_sag',
 'winner_sag']