In [None]:
from sklearn.naive_bayes import CategoricalNB
import json
import seaborn as sns
import pandas as pd


In [None]:
# Load the cleaned MasterChef data. JSON text is UTF-8, so name the encoding
# explicitly instead of relying on the platform default, which may differ and
# mis-decode any non-ASCII text in the file.
with open('masterchef_data_cleaned.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
data.keys()

In [None]:
# Build one DataFrame per nation from the loaded JSON mapping.
US, Canada, Brazil = (
    pd.DataFrame(data[nation_key])
    for nation_key in ('United States', 'Canada', 'Brazil')
)


In [None]:
df = pd.concat([US, Canada, Brazil])

In [None]:
def winner(row):
    """Return True when the last entry of ``row['evaluations']`` is 'WINNER'.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide an 'evaluations' sequence; its last entry is treated as
        the final outcome.

    Returns
    -------
    bool
        False when the sequence is empty, otherwise whether the last entry
        equals 'WINNER'.
    """
    evaluations = row['evaluations']
    # An empty sequence has no final outcome; indexing [-1] on it would raise
    # IndexError and abort a whole DataFrame.apply over the rows.
    if len(evaluations) == 0:
        return False
    return evaluations[-1] == 'WINNER'



In [None]:
# Work on a copy so later column additions leave the concatenated frame `df`
# untouched. (A US season-1 evaluations frame used to be built before this
# line, but its result was discarded because only a cell's last expression is
# displayed, so it has been dropped.)
new_df = df.copy()
new_df

In [None]:
new_df['won'] = new_df.apply(winner,axis=1)

In [None]:
winners = new_df[new_df['won']]

def generate_season_id(row):
    """Build a season identifier by appending the season number to the nation.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'nation' (a string) and 'season'.

    Returns
    -------
    str
        ``row['nation']`` followed by ``str(row['season'])``, e.g. 'Canada3'.
    """
    return row['nation'] + str(row['season'])




# Add the season identifier to the full frame and to the winners frame.
new_df = new_df.assign(seasonID=new_df.apply(generate_season_id, axis=1))
winners = winners.assign(seasonID=winners.apply(generate_season_id, axis=1))


In [None]:
new_df['seasonID'].unique()

In [None]:
winners['seasonID'].unique()

In [None]:
new_df

In [None]:
# get number of evaluations in each season

def num_episodes(row, evaluations):
    """Store the length of a winning row's evaluation list under its season ID.

    Rows whose 'won' value is falsy are ignored. ``evaluations`` is updated in
    place; nothing is returned.
    """
    if not row['won']:
        return
    episode_count = len(row['evaluations'])
    evaluations[row['seasonID']] = episode_count

# Map each seasonID to the length of its winner's evaluation list.
# num_episodes only writes an entry for rows whose 'won' value is truthy, so
# every other row is a no-op here.
evaluations = {}
for i, row in new_df.iterrows():
    num_episodes(row, evaluations)
print(evaluations)


def add_num_eval(row, evaluations):
    """Look up the value stored in ``evaluations`` for this row's 'seasonID'.

    Raises KeyError when the season ID has no entry in the mapping.
    """
    season_key = row['seasonID']
    return evaluations[season_key]

# Attach each season's stored evaluation count to every row, then keep only the
# rows that did not win.
# NOTE(review): new_df is rebound to the non-winner subset here, so re-running
# this cell rebuilds `evaluations` from a frame without winners and the lookup
# in add_num_eval fails.
new_df = new_df.assign(
    numEval=new_df.apply(add_num_eval, args=(evaluations,), axis=1)
)
new_df = new_df.loc[~new_df['won']]
new_df



In [None]:
new_df['age'].mean()

In [None]:
winners['age'].mean()

In [None]:
from scipy.stats import ttest_ind

ttest_ind(a=new_df['age'], b=winners['age'], equal_var=False)

In [None]:
# Some facts about the winners
import numpy as np
import matplotlib.pyplot as plt

def eval_count(df):
    """Count how often each evaluation label occurs across all rows.

    Parameters
    ----------
    df : pandas.DataFrame or mapping
        ``df['evaluations']`` must iterate over per-row sequences of labels.

    Returns
    -------
    pandas.DataFrame
        Columns 'counts' and 'values' (in that order), one row per distinct
        label, sorted by label.
    """
    # Flatten in a single pass. Concatenating onto a growing array once per row
    # is quadratic, and seeding that array with an empty (float) list relies on
    # implicit number-to-string dtype promotion.
    labels = [label for item in df['evaluations'] for label in item]
    uniques, counts = np.unique(np.asarray(labels, dtype=str), return_counts=True)
    return pd.DataFrame({'counts': counts, 'values': uniques})
# Label frequencies for the winners frame and for new_df, in that order.
winner_values, all_values = (eval_count(frame) for frame in (winners, new_df))

# One horizontal bar chart of label counts per group. hue duplicates the y
# variable only to colour the bars, so the legend is switched off with the
# documented value False. Both figures end with plt.show() so neither leaves a
# stray object repr as cell output.
plt.figure(figsize=(12, 5))
sns.barplot(data=winner_values, x='counts', y='values', hue='values', legend=False)
plt.title('Winner evaluations')
plt.show()

plt.figure(figsize=(12, 5))
sns.barplot(data=all_values, x='counts', y='values', hue='values', legend=False)
plt.title('All other evaluations')
plt.show()

In [None]:
barplot_data=(winner_values['counts'] + all_values['counts']).reset_index().fillna(0)

In [None]:
all_values

In [None]:
def proportion(row, total_no_elim):
    """Return the row's count as a fraction of ``total_no_elim``.

    Rows whose 'values' label is 'ELIM', 'WINNER' or 'RUNNER_UP(S)' always
    yield 0 instead of a fraction.
    """
    excluded_labels = ('ELIM', 'WINNER', 'RUNNER_UP(S)')
    is_excluded = row['values'] in excluded_labels
    return 0 if is_excluded else row['counts'] / total_no_elim

# Denominators: summed counts per group, leaving out the 'ELIM', 'WINNER' and
# 'RUNNER_UP(S)' rows.
win_total = winner_values.loc[
    ~winner_values['values'].isin(['ELIM', 'WINNER', 'RUNNER_UP(S)']), 'counts'
].sum()
all_total = all_values.loc[
    ~all_values['values'].isin(['ELIM', 'WINNER', 'RUNNER_UP(S)']), 'counts'
].sum()

# Per-label share of each group's denominator (see proportion()).
all_values = all_values.assign(
    proportion=all_values.apply(proportion, args=(all_total,), axis=1)
)
winner_values = winner_values.assign(
    proportion=winner_values.apply(proportion, args=(win_total,), axis=1)
)




In [None]:
# winner_values lacks the labels that never occur for a winner (originally
# noted as runner up, WDR and BAN). Pad it with zero rows for every label
# present in all_values but absent here, so both tables cover the same labels.
# Deriving the missing labels instead of hard-coding them makes the cell safe
# to re-run, and ignore_index=True avoids repeating row labels after the concat.
missing_labels = sorted(set(all_values['values']) - set(winner_values['values']))
if missing_labels:
    winner_values = pd.concat(
        [
            winner_values,
            pd.DataFrame({'counts': 0, 'values': missing_labels, 'proportion': 0.0}),
        ],
        ignore_index=True,
    )

In [None]:
winner_values

In [None]:
# Join the winner and non-winner label tables on the label itself, then take
# the difference in proportions. (A sorted-Series subtraction used to precede
# this; its result was discarded, and it aligned on row labels rather than on
# the evaluation label, so it has been removed.)
combined = pd.merge(winner_values, all_values, on='values', suffixes=('_win', '_all'))
combined['prop_diff'] = combined['proportion_win'] - combined['proportion_all']

In [None]:
def prop_ratio_signed(row):
    """Return ``proportion_all / prop_diff``, or 0 when ``prop_diff`` equals 0."""
    diff = row['prop_diff']
    if diff != 0:
        return row['proportion_all'] / diff
    return 0
    
combined['prop_ratio'] = 1-(combined['proportion_all']/combined['proportion_win'])

In [None]:
combined

In [None]:
combined['total'] = combined['counts_all'] + combined['counts_win']

In [None]:
# Horizontal bars of the combined count per label; the title is set on the
# axes that barplot returns.
sns.barplot(
    data=combined,
    x='total',
    y='values',
    hue='values',
).set_title('All value counts')

In [None]:
# Bars of prop_ratio per label. hue only colours the bars, so the legend is
# disabled with the documented value False (None is not a listed option).
sns.barplot(data=combined, x='prop_ratio', y='values', hue='values', legend=False)
# Caption in axes coordinates, centred just below the plot area.
plt.text(0.5, -.18, "<-Non-winners had more | Winners had more ->", ha='center', fontsize=10, transform=plt.gca().transAxes)
plt.title('Winner vs all contestants proportion differences')

The clearest indication of a winner is definitely performance in individual challenges. Winners had individual wins as 18% of their evaluations, while the average competitor (including winners) only had 10% of their evaluations as individual wins. This means winners were nearly twice as likely as all competitors to win challenges.

Interestingly, team performance differed much less. If anything, winners performed slightly worse than average in team challenges, with a slightly lower proportion of wins.

high_i_w
0.123167   0.091639

win i w
0.180352   0.100248

We want to predict who will win after each new evaluation. We can do this a few ways:

1. Train a model for each unique number of people.
2. 

In [None]:
winners.shape

In [None]:
new_df.shape

In [None]:
new_df

In [None]:
def num_ind(row):
    """Return the entries of ``row['evaluations']`` whose label ends in 'I'.

    Presumably these are the individual-challenge results such as 'WIN_I'
    (TODO confirm against the label scheme). Despite the name, the result is
    the list of matching labels, not a count.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide an 'evaluations' sequence of strings.

    Returns
    -------
    list of str
        Matching labels in their original order.
    """
    # str.endswith also copes with an empty-string label, where indexing [-1]
    # would raise IndexError.
    return [label for label in row['evaluations'] if label.endswith('I')]

winners['ind'] = winners.apply(num_ind, axis=1)

def win_prop(row):
    """Return the fraction of entries in ``row['ind']`` that equal 'WIN_I'.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide an 'ind' sequence of labels.

    Returns
    -------
    float
        Share of 'WIN_I' entries, or NaN when 'ind' is empty (the proportion is
        undefined there, and dividing would raise ZeroDivisionError).
    """
    individual = row['ind']
    if len(individual) == 0:
        return float('nan')
    wins = sum(1 for label in individual if label == 'WIN_I')
    return wins / len(individual)

winners['win_prop'] = winners.apply(win_prop, axis=1)

In [None]:
# Renumber the winners 0..n-1, keeping the previous labels in an 'index'
# column. The guard makes the cell safe to re-run: a second reset_index() would
# add a 'level_0' column and a third would raise because it already exists.
if 'index' not in winners.columns:
    winners = winners.reset_index()
# Number of entries in each winner's 'ind' list.
k = winners['ind'].apply(len)

In [None]:
winners['win_prop']

In [None]:
winners['win_prop'].mean()

# index 14 won 6/11
# index 27 won 8/16

In [None]:
winners.loc[[14, 27]]

In [None]:
def imm_counts(row):
    """Count the entries of ``row['evaluations']`` whose first three characters are "IMM"."""
    return sum(1 for label in row['evaluations'] if label[:3] == "IMM")

immunity_counts = winners.apply(imm_counts, axis=1)

In [None]:
def eval_len(row):
    """Return the number of entries in ``row['evaluations']``."""
    evaluations = row['evaluations']
    return len(evaluations)
eval_length = winners.apply(eval_len, axis=1)

In [None]:
immunity_counts

In [None]:
eval_length

In [None]:
from statistics import mean
# Per-winner share of evaluations counted by imm_counts ("IMM"-prefixed
# labels), then its average. Series.mean() matches the other summary cells and
# skips NaN entries (e.g. from a 0/0 on an empty evaluations list) instead of
# returning nan for the whole column.
win_imm_prop = immunity_counts/eval_length
win_imm_prop.mean()

In [None]:
winners

In [None]:
# Same immunity share as for the winners, computed over the rows of new_df.
all_imm_counts = new_df.apply(imm_counts, axis=1)
all_eval_counts = new_df.apply(eval_len, axis=1)
all_imm_prop = all_imm_counts/all_eval_counts

# Series.mean() matches the other summary cells and skips NaN entries
# (e.g. from a 0/0 on an empty evaluations list).
all_imm_prop.mean()


In [None]:
ttest_ind(all_imm_prop, win_imm_prop, equal_var=False)