In [108]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import json

In [92]:
def get_legend(soup: list) -> dict:
    """Build a ``{"<color>-<LABEL>": description}`` mapping from a page legend.

    The entries are read from the ``<dd>`` children of the first ``<dl>``
    returned by ``soup.find_all('dl')``. For each entry the colour is the text
    following ``background-color:`` (up to the next ``;``) in the entry's
    markup, the label is the text inside the first pair of parentheses, and the
    description is everything after the closing parenthesis.

    Args:
        soup: Parsed page. Despite the ``list`` annotation, the code only
            requires an object with ``find_all``; the scraping loop passes a
            ``BeautifulSoup`` document.

    Returns:
        dict mapping ``color + '-' + label`` to the stripped description.
        Empty when the page has no ``<dl>``. Entries lacking a
        ``background-color:`` declaration or a parenthesised label are skipped.
    """
    definition_lists = soup.find_all('dl')
    if not definition_lists:
        return {}

    legend_dict = {}
    for item in definition_lists[0].find_all('dd'):
        style_parts = str(item).split('background-color:', 1)
        label_parts = item.text.split('(', 1)
        if len(style_parts) < 2 or len(label_parts) < 2:
            # Not a colour-coded legend entry; skip instead of raising IndexError.
            continue
        # Split on the FIRST ')' only, so a description that itself contains
        # parentheses is kept whole instead of being cut at its first ')'.
        label, closing, desc = label_parts[1].partition(')')
        if not closing:
            continue
        # get_evaluations lower-cases cell colours; do the same here so the
        # legend keys can be looked up with the evaluation keys.
        color = style_parts[1].split(';')[0].lower()
        legend_dict[color + '-' + label.strip()] = desc.strip()
    return legend_dict

In [93]:
def get_names(people_table: list) -> list:
    """Return the cleaned text of the first ``<td>`` of every data row.

    Rows without any ``<td>`` (e.g. header rows) are skipped. Bracketed
    footnote markers such as ``[1]``, ``[a]`` or ``[note 2]`` are removed as a
    whole, then any leftover bracket characters and digits are dropped and the
    result is stripped of surrounding whitespace.

    Args:
        people_table: Table element. Despite the ``list`` annotation, the code
            only requires an object with ``find_all``; the scraping loop
            passes a BeautifulSoup table.

    Returns:
        list of str, one name per data row, in table order.
    """
    name_list = []
    for row in people_table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) == 0:
            continue
        # Remove complete footnote markers first; stripping only the bracket
        # characters would leave the marker text glued to the name
        # ("Kim[a]" -> "Kima").
        name = re.sub(r'\[[^\]]*\]', '', cells[0].text)
        # Unbalanced brackets and digits are removed exactly as before. The
        # pattern is a raw string: '\d' in a plain string is an invalid escape.
        name = name.replace('[', '').replace(']', '')
        name = re.sub(r'\d', '', name)
        name_list.append(name.strip())
    return name_list

In [102]:
def get_homes(people_table: list) -> list:
    """Return the stripped text of the third ``<td>`` of every data row.

    Rows that contain no ``<td>`` at all are ignored. A row that has cells but
    fewer than three raises ``IndexError``.

    Args:
        people_table: Table element. Despite the ``list`` annotation, the code
            only requires an object with ``find_all``.

    Returns:
        list of str, one entry per data row, in table order.
    """
    cells_by_row = (tr.find_all('td') for tr in people_table.find_all('tr'))
    return [tds[2].text.strip() for tds in cells_by_row if len(tds) != 0]

In [100]:
def get_ages(people_table: list) -> list:
    """Return the integer parsed from the second ``<td>`` of every data row.

    Rows without any ``<td>`` are skipped. Bracketed footnote markers
    (``[1]``, ``[a]``, ...) are removed before conversion so that a cell such
    as ``"29[a]"`` parses as ``29`` instead of raising.

    Args:
        people_table: Table element. Despite the ``list`` annotation, the code
            only requires an object with ``find_all``.

    Returns:
        list of int, one age per data row, in table order.

    Raises:
        ValueError: if a cell is still not an integer after footnote removal.
        IndexError: if a data row has fewer than two cells.
    """
    age_list = []
    for row in people_table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) == 0:
            continue
        # Drop footnote markers; int() tolerates surrounding whitespace only.
        age_text = re.sub(r'\[[^\]]*\]', '', cells[1].text)
        age_list.append(int(age_text.strip()))
    return age_list

In [104]:
def get_occupations(people_table: list) -> list:
    """Return the stripped text of the fourth ``<td>`` of every data row.

    Rows that contain no ``<td>`` at all are ignored. A row that has cells but
    fewer than four raises ``IndexError``.

    Args:
        people_table: Table element. Despite the ``list`` annotation, the code
            only requires an object with ``find_all``.

    Returns:
        list of str, one entry per data row, in table order.
    """
    occupations = []
    for table_row in people_table.find_all('tr'):
        row_cells = table_row.find_all('td')
        if len(row_cells) > 0:
            occupation_text = row_cells[3].text
            occupations.append(occupation_text.strip())
    return occupations

In [198]:
def _emit_pending_spans(pending: list, position: int, history: list) -> None:
    """Append to ``history`` every pending rowspan key that is due at ``position``.

    Each entry of ``pending`` is ``[position, key, rows_left]``. A matching
    entry has its key appended and its counter decremented; entries whose
    counter reaches zero are removed. The list is rebuilt rather than popped
    from while being iterated, so consecutive matching entries are not skipped.
    """
    still_open = []
    for span in pending:
        if span[0] == position:
            history.append(span[1])
            span[2] -= 1
            if span[2] == 0:
                continue
        still_open.append(span)
    pending[:] = still_open


def get_evaluations(evaluation_table: list, names: list = None) -> list:
    """Return one list of ``"<color>-<TEXT>"`` keys per contestant row.

    A row is treated as a contestant row when the first word of its first
    ``<td>`` equals any space-separated token of any entry in ``names``
    (double quotes removed). Every following cell becomes a key:

    * text ``IN``                  -> colour ``white``
    * markup containing ``darkgrey`` -> cell skipped (no key)
    * otherwise                    -> the lower-cased text after ``background:``
      up to the next ``;`` (or ``"`` when the markup has no ``;``)

    A cell carrying ``rowspan="N"`` is replayed in the following N-1 contestant
    rows, inserted after the same number of real cells that preceded it in the
    row where it was declared.

    Args:
        evaluation_table: Table element. Despite the ``list`` annotation, the
            code only requires an object with ``find_all``.
        names: Contestant names used to recognise contestant rows. When
            omitted, the module-level ``name_list`` is used, so existing
            one-argument calls behave as before.

    Returns:
        list of lists of str, in table order. Names are NOT included; callers
        pairing the result with a name list by index rely on both tables
        listing contestants in the same order.

    Raises:
        IndexError: if a non-``IN``, non-``darkgrey`` cell has no
            ``background:`` declaration.
    """
    if names is None:
        names = name_list  # module-level list set by the scraping loop

    # Union of all name tokens: equivalent to testing each name in turn, but
    # computed once instead of once per row.
    known_tokens = set()
    for name in names:
        known_tokens.update(name.replace('"', '').split(' '))

    pending_spans = []  # entries: [cells_before, key, rows_left]
    histories = []
    for row in evaluation_table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) == 0:
            continue
        if cells[0].text.strip().split(' ')[0] not in known_tokens:
            continue

        history = []
        # A span declared on the first evaluation cell has no cell before it,
        # so it has to be replayed before the row's own cells are read.
        _emit_pending_spans(pending_spans, 0, history)
        for i, cell in enumerate(cells[1:]):
            markup = str(cell)
            phrase = cell.text.strip()
            if phrase == 'IN':
                color = 'white'
            elif 'darkgrey' in markup:
                continue
            elif ';' in markup:
                color = markup.lower().split('background:')[1].split(';')[0]
            else:
                color = markup.lower().split('background:')[1].split('"')[0]
            key = color + '-' + phrase
            history.append(key)

            _emit_pending_spans(pending_spans, i + 1, history)

            if 'rowspan="' in markup:
                rows_left = int(markup.split('rowspan="')[1].split('"')[0].strip()) - 1
                # rowspan="1" covers no further rows; recording it would
                # create an entry that never expires.
                if rows_left > 0:
                    # Record this cell's own key (not one replayed just above).
                    pending_spans.append([i, key, rows_left])
        histories.append(history)
    return histories
    

In [202]:
# Scrape every season page and accumulate plain, JSON-serialisable records:
#   dta_list - one dict per contestant (season, name, age, home, occupation,
#              evaluations)
#   leg_list - one dict per season holding that page's colour legend
# NOTE(review): Wikimedia asks automated clients to send a descriptive
# User-Agent header - consider adding one to the request below.
leg_list = []
dta_list = []
for season in range(1, 14):
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(
        f'https://en.wikipedia.org/wiki/MasterChef_(American_season_{season})',
        timeout=30,
    )
    # Stop on HTTP errors instead of parsing an error page and failing later
    # with an unrelated IndexError on tables[0].
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    tables = soup.find_all('table', 'wikitable')
    # The first wikitable is read as the contestant table, the second as the
    # evaluation table.
    name_list = get_names(tables[0])  # also read as a global by get_evaluations
    age_list = get_ages(tables[0])
    home_list = get_homes(tables[0])
    occ_list = get_occupations(tables[0])
    hist_list = get_evaluations(tables[1])
    print(season)
    # Contestant details and evaluation histories are paired purely by index.
    for i in range(len(name_list)):
        dta_list.append({
            'season': season,
            'name': name_list[i],
            'age': age_list[i],
            'home': home_list[i],
            'occupation': occ_list[i],
            'evaluations': hist_list[i],
        })
    legend_dict = get_legend(soup=soup)
    leg_list.append({'season': season, 'legend': legend_dict})

1
2
3
4
5
6
7
8
9
10
11
12
13


In [200]:
# Write the accumulated contestant records (dta_list) to disk as one JSON document.
with open(file="masterchef_us.json", mode="w") as out_file:
    json.dump(obj=dta_list, fp=out_file)

In [203]:
# Write the per-season legend dictionaries (leg_list) to disk as one JSON document.
with open(file="masterchef_us_legends.json", mode="w") as legends_file:
    json.dump(obj=leg_list, fp=legends_file)