In [1]:
import pandas as pd
import numpy as np

In [2]:
ratings = pd.read_csv("mc_ratings.csv")

In [3]:
ratings

Unnamed: 0,critic_url,movie_url,rating
0,/critic/joe-morgenstern,/movie/hard-powder,30
1,/critic/joe-morgenstern,/movie/the-lego-movie-2-the-second-part,80
2,/critic/joe-morgenstern,/movie/never-look-away,90
3,/critic/joe-morgenstern,/movie/daughter-of-mine,90
4,/critic/joe-morgenstern,/movie/arctic,70
5,/critic/joe-morgenstern,/movie/miss-bala-2019,20
6,/critic/joe-morgenstern,/movie/the-invisibles,70
7,/critic/joe-morgenstern,/movie/the-kid-who-would-be-king,50
8,/critic/joe-morgenstern,/movie/glass,40
9,/critic/joe-morgenstern,/movie/the-upside,40


In [30]:
def create_ratings_map(ratings, movie_key, rating_key):
    avg_ratings_map = dict.fromkeys(ratings[movie_key].values, (0, 0))
    for review in ratings.iterrows():
        movie_name = review[1][movie_key]
        movie_rating = review[1][rating_key]
        old_value = avg_ratings_map[movie_name]
        new_value = (old_value[0] + 1, old_value[1] + int(movie_rating))
        avg_ratings_map[movie_name] = new_value
    for movie in avg_ratings_map:
        avg_ratings_map[movie] = avg_ratings_map[movie][1] / avg_ratings_map[movie][0]
        
    return avg_ratings_map

In [39]:
def calculate_rmse(ratings, avg_ratings_map, movie_key, ratings_key, num_entries):
    error = 0
    for review in ratings.iterrows():
        movie_name = review[1][movie_key]
        movie_rating = review[1][ratings_key]
        diff = float(movie_rating) - avg_ratings_map[movie_name]
        error += diff ** 2
    mean = error / num_entries
    rmse = mean ** 0.5
    return rmse

In [25]:
from fastai.collab import *

In [28]:
path = untar_data(URLs.ML_SAMPLE)
movielens_ratings = pd.read_csv(path/'ratings.csv')

In [32]:
avg_ratings_map = create_ratings_map(movielens_ratings, 'movieId', 'rating')

In [41]:
rmse = calculate_rmse(movielens_ratings, avg_ratings_map, 'movieId', 'rating', 6031)
print(rmse)

0.8812529892360426


In [44]:
binary_ratings = [1 if rating >= 50 else 0 for rating in ratings['rating'].values]
ratings['binary_ratings'] = pd.DataFrame(binary_ratings)

ratings

Unnamed: 0,critic_url,movie_url,rating,binary_ratings
0,/critic/joe-morgenstern,/movie/hard-powder,30,0
1,/critic/joe-morgenstern,/movie/the-lego-movie-2-the-second-part,80,1
2,/critic/joe-morgenstern,/movie/never-look-away,90,1
3,/critic/joe-morgenstern,/movie/daughter-of-mine,90,1
4,/critic/joe-morgenstern,/movie/arctic,70,1
5,/critic/joe-morgenstern,/movie/miss-bala-2019,20,0
6,/critic/joe-morgenstern,/movie/the-invisibles,70,1
7,/critic/joe-morgenstern,/movie/the-kid-who-would-be-king,50,1
8,/critic/joe-morgenstern,/movie/glass,40,0
9,/critic/joe-morgenstern,/movie/the-upside,40,0


In [45]:
avg_ratings_map = create_ratings_map(ratings, 'movie_url', 'binary_ratings')
rmse = calculate_rmse(ratings, avg_ratings_map, 'movie_url', 'binary_ratings', 207533)
print(rmse)

0.3222504971139414


In [48]:
!ls

Untitled.ipynb mc_critics.csv mc_ratings.csv


In [49]:
rt_reviews = pd.read_csv("../Rotten Tomatoes/reviews.csv")

In [51]:
rt_binary_ratings = [1 if rating == "fresh" else 0 for rating in rt_reviews['rt_score'].values]
rt_reviews['binary_ratings'] = pd.DataFrame(rt_binary_ratings)

In [53]:
avg_ratings_map = create_ratings_map(rt_reviews, 'movie_id', 'binary_ratings')
rmse = calculate_rmse(rt_reviews, avg_ratings_map, 'movie_id', 'binary_ratings', 658482)
print(rmse)

0.3870559372913993


In [54]:
from sklearn.metrics import *

In [56]:
preds = [avg_ratings_map[movie_name] for movie_name in rt_reviews['movie_id'].values]

In [59]:
preds = [1 if val >= 0.5 else 0 for val in preds]

In [61]:
print(f1_score(rt_reviews['binary_ratings'].values, preds))

0.8377027781710198


In [63]:
print (accuracy_score(rt_reviews['binary_ratings'].values, preds))

0.778889931691375
