In [75]:
import pandas as pd
import numpy as np
from storage import get_relevant_df
from termcolor import colored
from IPython.display import display, Markdown
from sklearn.metrics import classification_report, roc_auc_score

In [44]:
# Load the working DataFrame from the project's storage helper. Later cells
# read its reviewer_url, fresh, original_score and winner columns.
relevant_df = get_relevant_df()

In [61]:
def compute_add_score(rows):
    """Average the known original scores of fresh and of rotten reviews.

    Parameters
    ----------
    rows : pandas.DataFrame
        Must have a ``fresh`` column and an ``original_score`` column.
        Rows whose ``original_score`` is null are ignored.

    Returns
    -------
    pandas.Series
        Two values, ``[fresh_score, rotten_score]``. ``fresh_score`` falls
        back to 1 and ``rotten_score`` to 0 when there is no scored review
        of the corresponding kind.
    """
    known = rows.loc[rows['original_score'].notnull()]

    # Explicit comparisons (rather than truthiness) so that only real
    # True / False flags are counted on either side.
    fresh_values = known.loc[known['fresh'] == True, 'original_score']
    rotten_values = known.loc[known['fresh'] == False, 'original_score']

    fresh_score = 1 if fresh_values.empty else fresh_values.mean()
    rotten_score = 0 if rotten_values.empty else rotten_values.mean()
    return pd.Series([fresh_score, rotten_score])

In [64]:
# One row per reviewer_url: apply compute_add_score to each critic's reviews,
# then move the group key out of the index into a regular column.
critic_score_df = (
    relevant_df
    .groupby('reviewer_url')
    .apply(compute_add_score)
    .reset_index()
)

In [66]:
# Name the columns: the group key restored by reset_index(), followed by the
# two values returned by compute_add_score.
critic_score_df.columns = ['reviewer_url', 'fresh_score', 'rotten_score']

In [67]:
# Attach each critic's fresh_score / rotten_score to every one of their rows
# (default inner join on reviewer_url).
joined_df = relevant_df.merge(critic_score_df, on='reviewer_url')

In [99]:
def choose_score(x):
    """Pick the score to use for a single review row.

    Parameters
    ----------
    x : mapping or pandas.Series
        Must provide ``original_score``, ``fresh``, ``fresh_score`` and
        ``rotten_score``.

    Returns
    -------
    The row's ``original_score`` when it is present. Otherwise
    ``fresh_score`` if ``fresh`` is truthy, else ``rotten_score``.
    """
    original = x['original_score']
    # pd.isnull treats NaN and None alike; np.isnan raises TypeError on None
    # and on other non-float missing markers.
    if pd.isnull(original):
        return x['fresh_score'] if x['fresh'] else x['rotten_score']
    return original
   

In [100]:
# Vectorised form of choose_score: keep original_score where it is present,
# otherwise fall back to the critic's fresh_score or rotten_score depending on
# the row's fresh flag. np.where works positionally, so no index alignment is
# involved, and it avoids a Python-level call per row (apply with axis=1).
joined_df['score'] = np.where(
    joined_df['original_score'].isnull(),
    np.where(
        joined_df['fresh'].astype(bool),
        joined_df['fresh_score'],
        joined_df['rotten_score'],
    ),
    joined_df['original_score'],
)

In [108]:
# Show the frame without the two helper columns. The result is not assigned,
# so joined_df itself still contains them.
joined_df.drop(columns=['fresh_score', 'rotten_score'])

Unnamed: 0,winner,title,reviewer_url,fresh,original_score,year,score
0,True,The Shape of Water,/critic/ruben-peralta-rigaud/,True,,2018,1.000000
1,False,The Post,/critic/ruben-peralta-rigaud/,True,,2018,1.000000
2,False,Lady Bird,/critic/ruben-peralta-rigaud/,True,,2018,1.000000
3,False,"Three Billboards Outside Ebbing, Missouri",/critic/ruben-peralta-rigaud/,True,,2018,1.000000
4,False,Get Out,/critic/ruben-peralta-rigaud/,True,,2018,1.000000
5,True,The Shape of Water,/critic/sebastian-zavala-kahn/,True,1.000000,2018,1.000000
6,False,Darkest Hour,/critic/sebastian-zavala-kahn/,True,0.750000,2018,0.750000
7,False,The Post,/critic/sebastian-zavala-kahn/,True,0.875000,2018,0.875000
8,False,Dunkirk,/critic/sebastian-zavala-kahn/,True,1.000000,2018,1.000000
9,False,Lady Bird,/critic/sebastian-zavala-kahn/,True,0.875000,2018,0.875000


# Examining how many reviews have no score

In [10]:
# Mean original_score of Kenneth Turan's rotten reviews that carry a score.
# The three conditions are combined into one mask: chaining boolean indexers
# (df[a][b][c]) applies masks built on the full frame to already-filtered
# frames, which makes pandas reindex them and emit a UserWarning.
relevant_df.loc[
    (relevant_df.reviewer_url == '/critic/kenneth-turan/')
    & (relevant_df.fresh == False)
    & relevant_df.original_score.notnull(),
    'original_score'
].mean()

  """Entry point for launching an IPython kernel.


0.44000000000000006

In [45]:
def has_score(x):
    """Count rows with and without an ``original_score``.

    Parameters
    ----------
    x : pandas.DataFrame
        Must have an ``original_score`` column.

    Returns
    -------
    pandas.Series
        Two integers: ``[rows with a score, rows with a null score]``.
    """
    missing = int(x['original_score'].isnull().sum())
    present = len(x) - missing
    return pd.Series([present, missing])

In [46]:
# Per reviewer_url, count scored vs. unscored rows via has_score, then turn
# the group key back into a regular column.
ddf = (
    relevant_df
    .groupby('reviewer_url')
    .apply(has_score)
    .reset_index()
)

In [47]:
# Name the columns: the group key, then has_score's pair of counts
# (rows with a score, rows with a null score).
ddf.columns = ['critic', 'nonnull', 'null']

In [48]:
# Total unscored rows belonging to critics who have no scored row at all.
ddf.loc[ddf['nonnull'] == 0, 'null'].sum()

3343

In [49]:
# Number of rows whose original_score is null.
int(relevant_df['original_score'].isnull().sum())

7000

In [50]:
# Total number of rows in relevant_df.
relevant_df.shape[0]

27400

In [52]:
# Critics with no scored rows, largest unscored count first.
ddf.loc[ddf['nonnull'] == 0].sort_values('null', ascending=False)

Unnamed: 0,critic,nonnull,null
526,/critic/david-edelstein/,0,107
2089,/critic/todd-mccarthy/,0,86
1030,/critic/joe-morgenstern/,0,86
1335,/critic/louise-keller/,0,83
485,/critic/dana-stevens/,0,76
125,/critic/andrew-l-urban/,0,67
952,/critic/jeanne-kaplan/,0,66
542,/critic/david-kaplan/,0,66
1198,/critic/kelly-vance/,0,60
2150,/critic/victoria-alexander/,0,59


In [54]:
# Restrict to a single critic's rows.
cdf = relevant_df.loc[relevant_df['reviewer_url'] == '/critic/david-edelstein/']

In [55]:
# ROC AUC when this critic's fresh flag is used as the score for `winner`.
roc_auc_score(y_true=cdf['winner'], y_score=cdf['fresh'])

0.5202898550724637