In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import json

In [96]:
def get_legend(leg_table: list, season: int) -> dict:
    """Build a colour -> description mapping from a season's legend markup.

    Despite the ``list`` annotation, ``leg_table`` is used as a single parsed
    element that exposes ``find_all`` (it is called with BeautifulSoup
    elements elsewhere in this notebook).

    Args:
        leg_table: Element holding the legend. For ``season < 5`` the ``td``
            cells inside its ``tr`` rows are read; otherwise the ``li`` items
            inside its ``div`` elements of class ``plainlist`` are read.
        season: Season number; only used to choose between the two layouts.

    Returns:
        Dict mapping the raw text found between ``background-color:`` and the
        next ``;`` in an item's markup to that item's stripped text. Items
        without a ``background-color`` declaration are skipped. When the same
        colour appears more than once, the last description wins.
    """
    # The two page layouts differ only in which tags hold the legend entries.
    if season < 5:
        containers = leg_table.find_all('tr')
        item_tag = 'td'
    else:
        containers = leg_table.find_all('div', 'plainlist')
        item_tag = 'li'

    marker = 'background-color:'
    legend_dict = {}
    for container in containers:
        for item in container.find_all(item_tag):
            markup = str(item)
            if marker not in markup:
                # Spacer/unstyled entries carry no colour; indexing the split
                # result would raise IndexError for them.
                continue
            color = markup.split(marker)[1].split(';')[0]
            legend_dict[color] = item.text.strip()
    return legend_dict

In [123]:
def get_names(people_table: list) -> list:
    """Collect contestant names from the first column of a table.

    Despite the ``list`` annotation, ``people_table`` is used as a single
    parsed element that exposes ``find_all``.

    Rows without ``td`` cells (e.g. header rows) are skipped, as are rows
    whose first cell contains the text ``Returned``. Square brackets and all
    digits are removed from the remaining names (this strips footnote
    markers such as ``[1]``) and surrounding whitespace is trimmed.

    NOTE(review): because ``Returned`` rows are dropped here, the result may
    be shorter than per-row lists built from the same table without that
    filter - confirm the lists stay aligned where they are combined by index.

    Args:
        people_table: Table element whose ``tr`` rows are scanned.

    Returns:
        List of cleaned name strings, in row order.
    """
    name_list = []
    for row in people_table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) == 0:
            continue
        raw_name = cells[0].text
        if 'Returned' in raw_name:
            continue
        name = raw_name.replace('[', '').replace(']', '')
        # Raw string: a backslash-d escape in a plain literal is an invalid
        # escape sequence and triggers a warning on current Python versions.
        name = re.sub(r'\d', '', name)
        name_list.append(name.strip())
    return name_list

In [81]:
def get_homes(people_table: list) -> list:
    """Return the stripped text of the third cell of every data row.

    ``people_table`` is used as a single parsed element exposing ``find_all``
    (the ``list`` annotation notwithstanding). Rows that contain no ``td``
    cells are ignored; a data row with fewer than three cells raises
    ``IndexError``.
    """
    # Lazily pair each row with its data cells so header rows can be dropped.
    cells_per_row = (tr.find_all('td') for tr in people_table.find_all('tr'))
    return [tds[2].text.strip() for tds in cells_per_row if len(tds) != 0]

In [82]:
def get_ages(people_table: list) -> list:
    """Return the integer age found in the second cell of every data row.

    ``people_table`` is used as a single parsed element exposing ``find_all``
    (the ``list`` annotation notwithstanding). Rows without ``td`` cells are
    ignored. When the cell text contains a hyphen, only the part before the
    first hyphen is converted; text that is not a valid integer raises
    ``ValueError``.
    """
    age_list = []
    for tr in people_table.find_all('tr'):
        tds = tr.find_all('td')
        if tds:
            age_text = tds[1].text.strip()
            # Keep only what precedes the first '-' (if any).
            leading_part = age_text.partition('-')[0]
            age_list.append(int(leading_part))
    return age_list

In [83]:
def get_occupations(people_table: list) -> list:
    """Return the stripped occupation text of every data row.

    ``people_table`` is used as a single parsed element exposing ``find_all``
    (the ``list`` annotation notwithstanding). Rows without ``td`` cells are
    ignored.

    The column is chosen from the first ``tr`` of the table: when the text of
    its fourth ``th`` is exactly ``Previous Occupation`` the fifth ``td`` of
    each row is used, otherwise the fourth ``td``.

    Args:
        people_table: Table element whose ``tr`` rows are scanned.

    Returns:
        List of occupation strings, in row order.

    Raises:
        IndexError: If the table has data rows but its first row has fewer
            than four ``th`` cells, or a data row is too short.
    """
    rows = people_table.find_all('tr')
    occ_list = []
    occ_col = None  # resolved once, on the first data row
    for row in rows:
        cells = row.find_all('td')
        if len(cells) == 0:
            continue
        if occ_col is None:
            # The header does not change between rows, so inspect it a single
            # time; doing it lazily keeps tables without data rows from
            # touching the header at all.
            fourth_header = rows[0].find_all('th')[3].text.strip()
            occ_col = 4 if fourth_header == 'Previous Occupation' else 3
        occ_list.append(cells[occ_col].text.strip())
    return occ_list

In [127]:
def get_evaluations(evaluation_table: list, names: list = None) -> list:
    """Extract one list of ``colour-RESULT`` keys per contestant row.

    Despite the ``list`` annotation, ``evaluation_table`` is used as a single
    parsed element that exposes ``find_all``.

    A row is treated as a contestant row when the first space-separated word
    of its first ``td`` equals any word of any entry in ``names`` (after
    removing double quotes and replacing ``Returned`` with a space). Other
    rows, and rows without ``td`` cells, are ignored.

    For every remaining cell of a contestant row:

    * text ``IN`` yields the key ``white-IN``;
    * markup containing ``darkgrey``, ``CCCCCC`` or ``A9A9A9`` is skipped;
    * otherwise the key is the lower-cased ``bgcolor`` attribute value, a
      hyphen, and the stripped cell text.

    A cell carrying ``rowspan="N"`` is remembered and its key is re-emitted
    in the following rows, right after the cell that precedes its column.

    Args:
        evaluation_table: Table element whose ``tr`` rows are scanned.
        names: Contestant names used to recognise contestant rows. When
            omitted, the notebook-level ``name_list`` variable is used, which
            is what this function always relied on before the parameter
            existed.

    Returns:
        List with one list of keys per recognised contestant row, in row
        order.

    Raises:
        IndexError: If a cell that is neither ``IN`` nor greyed out has no
            ``bgcolor`` attribute in its markup.
        NameError: If ``names`` is omitted and no global ``name_list`` exists.
    """
    if names is None:
        # Backward-compatible fallback to the notebook global.
        names = name_list

    # Every word of every known name; computed once instead of per row.
    known_words = set()
    for name in names:
        known_words.update(name.replace('"', '').replace('Returned', ' ').split(' '))

    # Each entry: [column index, key to repeat, rows still to fill].
    active_spans = []
    hist_list_outer = []
    for row in evaluation_table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) == 0:
            continue
        first_word = cells[0].text.strip().split(' ')[0]
        if first_word not in known_words:
            continue

        hist_list = []
        for i, cell in enumerate(cells[1:]):
            markup = str(cell)
            phrase = cell.text.strip()
            if phrase == 'IN':
                color = 'white'
            elif 'darkgrey' in markup or 'CCCCCC' in markup or 'A9A9A9' in markup:
                continue
            else:
                color = markup.lower().split('bgcolor="')[1].split('"')[0]
            key = color + '-' + phrase
            hist_list.append(key)

            # Re-emit results carried over from earlier rows. The survivors
            # are collected into a new list: removing entries from the list
            # being iterated would skip the entry after each removal.
            still_active = []
            for span in active_spans:
                if i + 1 == span[0]:
                    hist_list.append(span[1])
                    span[2] -= 1
                    if span[2] == 0:
                        continue  # fully consumed
                still_active.append(span)
            active_spans = still_active

            if 'rowspan="' in markup:
                remaining = int(markup.split('rowspan="')[1].split('"')[0].strip()) - 1
                # rowspan="1" covers no further rows; storing it would repeat
                # the key on every later row because its counter never
                # returns to zero. The span stores this cell's own key, not
                # one carried over from a matched span.
                if remaining > 0:
                    active_spans.append([i, key, remaining])
        hist_list_outer.append(hist_list)
    return hist_list_outer
    

In [125]:
# Scrape a single season end-to-end as a quick check of the parsers above.
season = 1  # drives both the page URL and the legend layout chosen below
r = requests.get(
    f'https://en.wikipedia.org/wiki/MasterChef_(Brazilian_season_{season})',
    timeout=30,
)
r.raise_for_status()  # stop here rather than parse an HTTP error page
soup = BeautifulSoup(r.text, 'html.parser')
tables = soup.find_all('table', 'wikitable')
print(len(tables))
# NOTE(review): pages with five wikitables are assumed to carry one extra
# table ahead of the contestant table - confirm against the live pages.
if len(tables) == 5:
    tables.pop(0)

# The legend lives in the third wikitable for early seasons and in a
# separate 'multicol' table from season 5 on.
if season < 5:
    legend_dict = get_legend(tables[2], season)
else:
    leg_tables = soup.find_all('table', 'multicol')
    legend_dict = get_legend(leg_tables[0], season)

# name_list must be assigned before get_evaluations runs: that function
# reads the notebook-level name_list to recognise contestant rows.
name_list = get_names(tables[0])
age_list = get_ages(tables[0])
home_list = get_homes(tables[0])
occ_list = get_occupations(tables[0])
hist_list = get_evaluations(tables[1])
print(len(hist_list))
print(name_list)

5
Elisa:  ['white-IN', '6495ed-HIGH', '959ffd-WIN', 'white-IN', '6495ed-HIGH', '959ffd-WIN', 'ff35f3-LOW', 'ffa500-LOW', '6495ed-HIGH', 'white-IN', '6495ed-HIGH', 'ff35f3-LOW', '32cd32-WIN', '32cd32-WIN', 'white-IN', '6495ed-HIGH', '32cd32-WIN', 'white-IN', '32cd32-WIN', 'white-IN', '32cd32-WIN', '32cd32-WIN', 'ffd700-WINNER']
Helena:  ['white-IN', 'white-IN', '959ffd-WIN', 'ff35f3-LOW', 'white-IN', '959ffd-WIN', 'white-IN', '6495ed-HIGH', '32cd32-WIN', '6495ed-HIGH', 'ffc0cb-LOW', '6495ed-HIGH', 'white-IN', '959ffd-WIN', 'white-IN', 'ffa500-LOW', '959ffd-WIN', '6495ed-HIGH', 'ffa500-LOW', '32cd32-WIN', 'ffff00-IMM', 'ffa500-LOW']
Luis:  ['white-IN', '6495ed-HIGH', '32cd32-WIN', 'white-IN', 'white-IN', 'ffe4e1-PT', 'white-IN', '6495ed-HIGH', 'ffc0cb-LOW', 'ffc0cb-LOW', 'ffa500-LOW', '32cd32-WIN', 'ffff00-IMM', 'ffa500-LOW', 'white-IN', '32cd32-WIN', '959ffd-WIN', 'white-IN', 'ffc0cb-LOW', '6495ed-HIGH', 'ffa500-LOW', 'dc143c-ELIM']
Mohamad:  ['ff35f3-LOW', 'white-IN', '959ffd-WIN', 'ff

In [128]:
# Scrape every season into two flat lists:
#   dta_list - one record per contestant
#   leg_list - one colour legend per season
leg_list = []
dta_list = []
for season in range(1, 11):   # seasons 1 through 10
    print(season)
    if season == 7:       # Skip season 7, it's wild man.
        continue
    r = requests.get(
        f'https://en.wikipedia.org/wiki/MasterChef_(Brazilian_season_{season})',
        timeout=30,
    )
    r.raise_for_status()  # stop here rather than parse an HTTP error page
    soup = BeautifulSoup(r.text, 'html.parser')
    tables = soup.find_all('table', 'wikitable')
    # NOTE(review): pages with five wikitables are assumed to carry one extra
    # table ahead of the contestant table - confirm against the live pages.
    if len(tables) == 5:
        tables.pop(0)
    # name_list must be assigned before get_evaluations runs: that function
    # reads the notebook-level name_list to recognise contestant rows.
    name_list = get_names(tables[0])
    age_list = get_ages(tables[0])
    home_list = get_homes(tables[0])
    occ_list = get_occupations(tables[0])
    hist_list = get_evaluations(tables[1])

    # Every name needs a value in each parallel list; say which season is
    # short instead of failing with a bare IndexError.
    parallel_lists = (age_list, home_list, occ_list, hist_list)
    if any(len(values) < len(name_list) for values in parallel_lists):
        raise ValueError(
            f'Season {season}: parsed columns are shorter than the '
            f'{len(name_list)} contestant names'
        )

    for name, age, home, occupation, evaluations in zip(
        name_list, age_list, home_list, occ_list, hist_list
    ):
        dta_list.append({
            'season': season,
            'name': name,
            'age': age,
            'home': home,
            'occupation': occupation,
            'evaluations': evaluations,
        })

    # The legend lives in the third wikitable for early seasons and in a
    # separate 'multicol' table from season 5 on.
    if season < 5:
        legend_dict = get_legend(tables[2], season)
    else:
        leg_tables = soup.find_all('table', 'multicol')
        legend_dict = get_legend(leg_tables[0], season)
    leg_list.append({'season': season, 'legend': legend_dict})
        
# dta_list

1
2
3
4
5
6
7
8
9
10


In [129]:
# Write the per-contestant records gathered above to disk as JSON.
with open("masterchef_brazil.json", "w") as contestants_file:
    json.dump(obj=dta_list, fp=contestants_file)

In [130]:
# Write the per-season colour legends gathered above to disk as JSON.
with open("masterchef_brazil_legend.json", "w") as legend_file:
    json.dump(obj=leg_list, fp=legend_file)