In [4]:
import pandas as pd
import numpy as np
from storage import get_relevant_df
from utils import get_top_critics_df
from termcolor import colored
from IPython.display import display, Markdown
from sklearn.metrics import classification_report, roc_auc_score
from sklearn import preprocessing

In [None]:
# Standard plotly imports
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot
# Switch cufflinks to offline plotting
import cufflinks

cufflinks.go_offline()

# Global cufflinks configuration: 'pearl' theme, offline rendering
cufflinks.set_config_file(
    world_readable=True,
    theme='pearl',
    offline=True,
)

def disp(x):
    """Render the Markdown source ``x`` as rich output in the notebook."""
    rendered = Markdown(x)
    display(rendered)

In [2]:
# Keep only rows whose year is 2000 or later, then pass the filtered
# frame to get_top_critics_df.
df = get_relevant_df().loc[lambda frame: frame['year'] >= 2000]
top_df = get_top_critics_df(df)

# Normalize critic scores

In [18]:
# Mean score per critic (keyed by reviewer_url), sorted ascending by that mean.
mean_df = (
    top_df.groupby('reviewer_url')['score']
    .mean()
    .reset_index(name='mean')
    .sort_values('mean')
)

In [19]:
# Attach each critic's mean score (column 'mean') to every one of their reviews.
top_df = top_df.merge(mean_df, on='reviewer_url')

In [20]:
# Rescale every review by its critic's mean so that each critic averages 0.5.
# Plain column arithmetic replaces the row-wise apply(axis=1): same values,
# without a Python-level call per row.
# NOTE: 'score' is overwritten in place, so re-running this cell without
# rebuilding top_df rescales the already-normalized values again.
top_df['score'] = top_df['score'] * 0.5 / top_df['mean']

In [21]:
# Correlation between winning and a film's mean normalized critic score.
# The two numeric columns are selected explicitly: 'title' is a string column
# and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 instead
# of silently dropping them.
(
    top_df.groupby(['winner', 'title'])['score']
    .mean()
    .reset_index()[['winner', 'score']]
    .corr()
)

Unnamed: 0,winner,score
winner,1.0,0.267805
score,0.267805,1.0


# Normalizing helped to increase correlation

# Predict by average score

In [74]:
# One row per (year, winner, title) carrying the film's mean score.
ydf = top_df.groupby(['year', 'winner', 'title'], as_index=False)['score'].mean()

In [75]:
# Best mean score per year among winners ...
winner_max = (
    ydf.loc[ydf['winner'] == True]
    .groupby('year')['score']
    .max()
    .rename('winner_max')
    .reset_index()
)
# ... and among the non-winning films.
loser_max = (
    ydf.loc[ydf['winner'] == False]
    .groupby('year')['score']
    .max()
    .rename('loser_max')
    .reset_index()
)

In [76]:
# Join the two per-year maxima; 'year' is the only column the frames share.
mx_df = winner_max.merge(loser_max, on='year')
mx_df

Unnamed: 0,year,winner_max,loser_max
0,2000,0.543341,0.520931
1,2001,0.496348,0.567802
2,2002,0.488229,0.532716
3,2003,0.534963,0.527886
4,2004,0.546634,0.540654
5,2005,0.539262,0.533522
6,2006,0.467866,0.524034
7,2007,0.513114,0.52905
8,2008,0.546472,0.510667
9,2009,0.514233,0.521251


In [77]:
# Share of years in which the winner's mean score beats every other film's.
mx_df.query('winner_max > loser_max').shape[0] / mx_df.shape[0]

0.47368421052631576

In [40]:
# Fraction of distinct titles that are winners.
df.loc[df['winner'] == True, 'title'].nunique() / df['title'].nunique()

0.1450381679389313

# <font color=red> After normalizing and restricting to top critics, I can predict the winner in 47% of years just by picking the film with the highest average critic score </font>

In [11]:
# Per-film maximum of the two Rotten Tomatoes scores.
# The columns are selected with a list ([[...]]): selecting several groupby
# columns with a bare tuple was deprecated and is an error in pandas >= 2.0.
# .agg(['max']) is kept so the result still has two-level column labels.
ss_df = (
    top_df.groupby(['title', 'winner'])[['rt_tomato_score', 'rt_audience_score']]
    .agg(['max'])
    .reset_index()
)

In [12]:
# Preview the first five films.
ss_df.head(n=5)

Unnamed: 0_level_0,title,winner,rt_tomato_score,rt_audience_score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,max,max
0,12 Years a Slave,True,95.0,90.0
1,127 Hours,False,93.0,85.0
2,A Beautiful Mind,True,75.0,93.0
3,A Serious Man,False,89.0,68.0
4,American Beauty,True,88.0,93.0


In [14]:
# Pearson correlation between winning and the Tomatometer score.
ss_df.loc[:, ['winner', 'rt_tomato_score']].corr(method='pearson')

Unnamed: 0_level_0,Unnamed: 1_level_0,winner,rt_tomato_score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,max
winner,,1.0,0.091842
rt_tomato_score,max,0.091842,1.0


In [15]:
# Pearson correlation between winning and the audience score.
ss_df.loc[:, ['winner', 'rt_audience_score']].corr(method='pearson')

Unnamed: 0_level_0,Unnamed: 1_level_0,winner,rt_audience_score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,max
winner,,1.0,0.204685
rt_audience_score,max,0.204685,1.0


# <font color=red> There's a noticeable correlation between audience score and Oscar winning (r ≈ 0.20, versus r ≈ 0.09 for the Tomatometer score) </font>

In [17]:
# Pearson correlation between the Tomatometer score and the audience score.
ss_df.loc[:, ['rt_tomato_score', 'rt_audience_score']].corr(method='pearson')

Unnamed: 0_level_0,Unnamed: 1_level_0,rt_tomato_score,rt_audience_score
Unnamed: 0_level_1,Unnamed: 1_level_1,max,max
rt_tomato_score,max,1.0,0.243544
rt_audience_score,max,0.243544,1.0


In [23]:
# Combined audience + critic score for every review row.
# Plain column arithmetic replaces the row-wise apply(axis=1): same values,
# without a Python-level call per row.
# NOTE(review): the 0.6 weight is applied to the raw rt_audience_score;
# confirm it is on the intended scale relative to 'score'.
top_df['ac_score'] = top_df['score'] + 0.6 * top_df['rt_audience_score']

In [28]:
# Per-film means of the normalized critic score and the audience score.
# The columns are selected with a list ([[...]]): selecting several groupby
# columns with a bare tuple was deprecated and is an error in pandas >= 2.0.
mean_score_df = (
    top_df.groupby(['year', 'winner', 'title'])[['score', 'rt_audience_score']]
    .mean()
    .reset_index()
)

In [96]:
# Blend the mean critic score with the mean audience score (weight 0.0033).
# Plain column arithmetic replaces the row-wise apply(axis=1): same values,
# without a Python-level call per row.
mean_score_df['ac_score'] = (
    mean_score_df['score'] + 0.0033 * mean_score_df['rt_audience_score']
)

In [97]:
# Pearson correlation between winning and the blended ac_score.
mean_score_df.loc[:, ['winner', 'ac_score']].corr(method='pearson')

Unnamed: 0,winner,ac_score
winner,1.0,0.31819
ac_score,0.31819,1.0


In [87]:
# Preview the per-film scores; only the first rows are shown so the whole
# frame is not dumped into the notebook.
mean_score_df.head(10)

Unnamed: 0,year,winner,title,score,rt_audience_score,ac_score
0,2000,False,The Cider House Rules,0.497955,77.0,0.805955
1,2000,False,The Green Mile,0.483414,94.0,0.859414
2,2000,False,The Insider,0.520931,90.0,0.880931
3,2000,False,The Sixth Sense,0.456385,90.0,0.816385
4,2000,True,American Beauty,0.543341,93.0,0.915341
5,2001,False,Chocolat,0.417064,83.0,0.749064
6,2001,False,"Crouching Tiger, Hidden Dragon",0.567802,86.0,0.911802
7,2001,False,Erin Brockovich,0.465824,81.0,0.789824
8,2001,False,Traffic,0.526506,85.0,0.866506
9,2001,True,Gladiator,0.496348,87.0,0.844348


In [83]:
# One row per (year, winner, title) carrying the film's ac_score.
ac_ydf = mean_score_df.groupby(
    ['year', 'winner', 'title'], as_index=False
)['ac_score'].mean()

In [84]:
# Best ac_score per year among winners ...
winner_max = (
    ac_ydf.loc[ac_ydf['winner'].eq(True)]
    .groupby('year')['ac_score']
    .max()
    .rename('winner_max')
    .reset_index()
)
# ... and among the non-winning films.
loser_max = (
    ac_ydf.loc[ac_ydf['winner'].eq(False)]
    .groupby('year')['ac_score']
    .max()
    .rename('loser_max')
    .reset_index()
)

In [85]:
# Join the two per-year maxima; 'year' is the only column the frames share.
mx_df = winner_max.merge(loser_max, on='year')
mx_df

Unnamed: 0,year,winner_max,loser_max
0,2000,0.915341,0.880931
1,2001,0.844348,0.911802
2,2002,0.860229,0.881332
3,2003,0.866963,0.907886
4,2004,0.890634,0.880654
5,2005,0.899262,0.845522
6,2006,0.819866,0.848034
7,2007,0.889114,0.87305
8,2008,0.890472,0.862667
9,2009,0.874233,0.877251


In [86]:
# Share of years in which the winner's ac_score beats every other film's.
len(mx_df.loc[mx_df['winner_max'].gt(mx_df['loser_max'])]) / len(mx_df)

0.42105263157894735

In [3]:
# Preview the per-review frame; only the first rows are shown so the whole
# frame is not dumped into the notebook.
top_df.head(10)

Unnamed: 0,winner,title,reviewer_url,fresh,original_score,rt_tomato_score,rt_audience_score,year,score,mean
2091,True,The Shape of Water,/critic/john-beifuss/,True,0.875,92.0,73.0,2018,0.238472,0.834596
2092,False,Dunkirk,/critic/john-beifuss/,True,0.875,92.0,81.0,2018,0.238472,0.834596
2093,False,Lady Bird,/critic/john-beifuss/,True,1.000,99.0,79.0,2018,0.272540,0.834596
2094,False,Get Out,/critic/john-beifuss/,True,1.000,98.0,86.0,2018,0.272540,0.834596
2095,True,The Lord of the Rings: The Return of the King,/critic/john-beifuss/,True,0.875,93.0,86.0,2004,0.238472,0.834596
2096,False,Lost in Translation,/critic/john-beifuss/,True,0.750,95.0,85.0,2004,0.204405,0.834596
2097,False,Mystic River,/critic/john-beifuss/,True,0.875,88.0,89.0,2004,0.238472,0.834596
2098,True,Million Dollar Baby,/critic/john-beifuss/,True,1.000,91.0,90.0,2005,0.272540,0.834596
2099,False,The Aviator,/critic/john-beifuss/,True,0.750,86.0,79.0,2005,0.204405,0.834596
2100,False,Sideways,/critic/john-beifuss/,True,0.875,96.0,78.0,2005,0.238472,0.834596


In [9]:
def trans(x):
    """Return ``x['score']`` after passing it through ``preprocessing.normalize``.

    The scores are wrapped in a one-element list so they are normalized
    together as a single row; that row is then unwrapped and returned.
    """
    single_row = [x['score']]
    normalized_rows = preprocessing.normalize(single_row)
    return normalized_rows[0]

In [10]:
# Normalize each critic's score vector with trans().
# Only the 'score' column is handed to apply: trans() reads nothing else, and
# recent pandas versions deprecate applying over the grouping column as well.
top_df.groupby('reviewer_url')[['score']].apply(trans)

reviewer_url
/critic/brian-orndorf/                    [0.09859577477734466, 0.09859577477734466, 0.0...
/critic/david-edelstein/                  [0.0, 0.105999788000636, 0.0, 0.10599978800063...
/critic/dennis-schwartz/                  [0.09535700599967267, 0.0866881872724297, 0.10...
/critic/dustin-putman/                    [0.11223368936410152, 0.12826707355897315, 0.0...
/critic/eric-d-snider/                    [0.09724914439245035, 0.09724914439245035, 0.0...
/critic/frank-swietek/                    [0.10124574941356598, 0.09280860362910215, 0.0...
/critic/frederic-and-mary-ann-brussat/    [0.10608318351713639, 0.08486654681370912, 0.0...
/critic/harvey-s-karten/                  [0.09021697036716231, 0.09923866740387853, 0.0...
/critic/james-berardinelli/               [0.09166199015381171, 0.07856742013183861, 0.0...
/critic/james-kendrick/                   [0.10416851207190303, 0.08928729606163117, 0.1...
/critic/jeffrey-m-anderson/               [0.10354500232014505, 0.0

In [50]:
# Standardize the scores within each critic using sklearn's `scale`.
top_df['normalized_score'] = (
    top_df
    .groupby('reviewer_url')['score']
    .transform(preprocessing.scale)
)

In [53]:
# Per-critic spread of the standardized scores. pandas uses the sample
# standard deviation (ddof=1) by default, spelled out here, which is why the
# values sit slightly above 1 rather than exactly at 1.
top_df.groupby('reviewer_url')['normalized_score'].std(ddof=1)

reviewer_url
/critic/brian-orndorf/                    1.004619
/critic/david-edelstein/                  1.004706
/critic/dennis-schwartz/                  1.004193
/critic/dustin-putman/                    1.004662
/critic/eric-d-snider/                    1.004662
/critic/frank-swietek/                    1.003960
/critic/frederic-and-mary-ann-brussat/    1.004577
/critic/harvey-s-karten/                  1.004796
/critic/james-berardinelli/               1.003929
/critic/james-kendrick/                   1.004938
/critic/jeffrey-m-anderson/               1.004796
/critic/john-beifuss/                     1.005089
/critic/josh-larsen/                      1.004494
/critic/laura-clifford/                   1.004338
/critic/mark-dujsik/                      1.004662
/critic/maryann-johanson/                 1.004662
/critic/matt-brunson/                     1.004890
/critic/matthew-turner/                   1.004843
/critic/mick-lasalle/                     1.004988
/critic/mike-mcgra

In [47]:
# Sanity check: scaling a constant vector yields all zeros.
# A float array is passed so scale() does not emit the int64 -> float64
# conversion warning.
preprocessing.scale(np.ones(3))


Data with input dtype int64 was converted to float64 by the scale function.



array([0., 0., 0.])

In [56]:
# Correlation between winning and a film's mean standardized critic score.
# The two numeric columns are selected explicitly: 'title' is a string column
# and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 instead
# of silently dropping them.
(
    top_df.groupby(['winner', 'title'], as_index=False)['normalized_score']
    .mean()[['winner', 'normalized_score']]
    .corr()
)

Unnamed: 0,winner,normalized_score
winner,1.0,0.283294
normalized_score,0.283294,1.0
