## The Movie Recommendation System Based on Sentiment Analysis

### 1. Preprocessing

In [18]:
import pandas as pd
import numpy as np
from tqdm import tqdm;

In [19]:
# Load data
df = pd.read_csv("rotten_tomatoes/rotten_tomatoes_critic_reviews.csv")

In [20]:
# Remove rows with na values
df_no_blanks = df.dropna()
# Remove duplicated rows
df_no_blanks = df_no_blanks[~df_no_blanks.duplicated()]

In [21]:
# Change ratings to floats for later evaluation
def computeScore(score_str):
    items = score_str.split('/')
    return float(items[0])/float(items[1])

df_no_blanks = df_no_blanks[df_no_blanks['review_score'].str.contains('/')]
df_no_blanks = df_no_blanks[~df_no_blanks['review_score'].str.contains('/0')] # The division should not be 0
df_no_blanks['review_score'] = df_no_blanks['review_score'].apply(computeScore)

In [22]:
# Filter out popular movies: films more than 100 reviews
popular_movies = df_no_blanks.groupby('rotten_tomatoes_link').count()['critic_name']>=100
df_popular_movies = df_no_blanks[df_no_blanks['rotten_tomatoes_link'].isin(popular_movies[popular_movies].index)];

In [23]:
print("How many popular movies?", df_popular_movies['rotten_tomatoes_link'].unique().size)

How many popular movies? 1567


In [24]:
df_group = df_popular_movies.groupby("critic_name")
# Count the number of reviews for each reviewer (Series)
s = df_group.aggregate("critic_name").count()
# Change the Series into DataFrame
df_total_reviews = pd.DataFrame({"critic_name":s.index, "total_reviews":s.values})
df_total_reviews

Unnamed: 0,critic_name,total_reviews
0,A.A. Dowd,21
1,A.O. Scott,351
2,A.S. Hamrah,3
3,AJ Caulfield,12
4,AP Kryza,4
...,...,...
4001,Zena Dixon,4
4002,Zoe Margolis,12
4003,Zoe Rose Smith,6
4004,Zosia Bielski,1


In [25]:
# Filter out top criticsz: remove critics with fewer than 300 reviews
df_over_300 = df_total_reviews[df_total_reviews['total_reviews']>300]
df_over_300

Unnamed: 0,critic_name,total_reviews
1,A.O. Scott,351
142,Alex Zane,307
177,Alistair Harkness,578
179,Allan Hunter,502
181,Allen Adams,315
...,...,...
3949,Wendy Ide,313
3952,Wesley Lovell,524
3953,Wesley Morris,329
3971,Willie Waffle,852


In [26]:
critics = pd.Series(df_over_300['critic_name'])

In [27]:
critics = critics.sample(100, random_state=100) # mark
sample_critics = df_popular_movies[df_popular_movies['critic_name'].isin(critics)]
sample_critics['review_type'] = sample_critics['review_type'].map({'Fresh': 1, 'Rotten': -1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_critics['review_type'] = sample_critics['review_type'].map({'Fresh': 1, 'Rotten': -1})


In [28]:
# Define the test set
test_set = {}

for critic in critics:
    critic_df = sample_critics.groupby('critic_name').get_group(critic)
    test_samples = critic_df[critic_df['review_type']==1].sample(n=int(critic_df.shape[0]*0.2), random_state=100)
    test_set[critic] = test_samples['rotten_tomatoes_link'].unique()

In [29]:
# Remove test samples from the original data frame
remove_ids = []

for critic in tqdm(test_set):
    remove_movie_ids = test_set[critic]
    for movie_id in remove_movie_ids:
        remove_ids += list(sample_critics[(sample_critics['critic_name']==critic)&(sample_critics['rotten_tomatoes_link']==movie_id)].index)

sample_critics.drop(remove_ids, inplace=True)

100%|██████████| 100/100 [01:02<00:00,  1.59it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [30]:
# Import VADER sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [31]:
# Uniform sentiment scores: [-1, 1] -> +1 -> [0, 2] -> /2 -> [0, 1]
def normalize(score):
    return (score+1)/2

sample_critics['sentiment_score'] = sample_critics['review_content'].apply(lambda x: normalize(sid.polarity_scores(x)['compound']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_critics['sentiment_score'] = sample_critics['review_content'].apply(lambda x: normalize(sid.polarity_scores(x)['compound']))


In [32]:
critics = critics.values

### 2. Evaluation

### 2.1 Baseline: Content-based RS

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
df_movies = pd.read_csv("rotten_tomatoes/rotten_tomatoes_movies.csv")

In [68]:
# Select df for popular movies
df_movies = df_movies[df_movies["rotten_tomatoes_link"].isin(df_popular_movies["rotten_tomatoes_link"].unique())]

In [69]:
# TF-IDF vectoriser
tfidf = TfidfVectorizer(stop_words="english")
df_movies["movie_info"] = df_movies["movie_info"].fillna("")
info_matrix = tfidf.fit_transform(df_movies["movie_info"])
info_matrix.shape

(1567, 14008)

In [70]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
similarity_matrix_CB = linear_kernel(info_matrix, info_matrix)
movies_link = df_movies["rotten_tomatoes_link"]

In [71]:
columns = movies_link.unique()

def generateFilmVector(user):
    user_df = sample_critics.groupby('critic_name').get_group(user)
    films = user_df['rotten_tomatoes_link'].unique()
    user_vec = None
    for film in films:
        # Find the index of the film in the vector
        index = np.where(columns == film)[0][0]
        if type(user_vec) == type(None):
            user_vec = similarity_matrix_CB[index]
        else:
            user_vec += similarity_matrix_CB[index]
    user_vec[np.isin(columns, sample_critics[sample_critics['critic_name'] == critics[0]]['rotten_tomatoes_link'].unique())] = 0
    return user_vec

user_film_matrix_cb = []
for critic in tqdm(critics):
    user_film_matrix_cb.append(generateFilmVector(critic))

100%|██████████| 100/100 [00:04<00:00, 21.02it/s]


In [74]:
recommend_list_cb = {}

for critic in tqdm(critics):
    index = list(critics).index(critic)
    recommend_list_cb[critic] = sorted(zip(columns, user_film_matrix_cb[0]), key=lambda x: x[1], reverse=True)[:test_set[critic].size]

100%|██████████| 100/100 [00:00<00:00, 1505.09it/s]


In [73]:
# Calculate the accuracy of prediction
count = 0
for critic in critics:
    for movie_id, score in recommend_list_cb[critic]:
        if movie_id in test_set[critic]:
            count += 1

test_set_size = 0
for critic in test_set:
    test_set_size += test_set[critic].size

count/test_set_size

0.08406005574035781

### 2.2 Baseline: Traditional collaborative filtering RS

In [41]:
# Initialize an empty data frame for user film matrix
# columns = np.insert(sample_critics['rotten_tomatoes_link'].unique(), 0, 'user');
# user_film_matrix = pd.DataFrame(columns=columns);
columns = sample_critics['rotten_tomatoes_link'].unique()

def generateFilmVector(user):
    user_df = sample_critics.groupby('critic_name').get_group(user)
    user_vec = np.zeros(columns.size, dtype='float')
    films = user_df['rotten_tomatoes_link'].unique()
    for film in films:
        # Find the index of the film in the vector
        index = np.where(columns == film)[0][0]
        # Get the mode as review type
        review_score = np.average(user_df.groupby('rotten_tomatoes_link').get_group(film)['review_score'])
        # Assign value
        user_vec[index] = review_score
    return user_vec


user_film_matrix_cf = []
for critic in tqdm(critics):
    user_film_matrix_cf.append(generateFilmVector(critic))

100%|██████████| 100/100 [00:42<00:00,  2.38it/s]


In [42]:
user_film_matrix_cf = np.array(user_film_matrix_cf)

In [47]:
from numpy.linalg import norm;
user_similarity_matrix_cf = np.array([np.dot(user_film_matrix_cf, user_film_matrix_cf[i])/(norm(user_film_matrix_cf, axis=1)*norm(user_film_matrix_cf[i])) for i in range(100)])

In [48]:
movie_ids = df_popular_movies['rotten_tomatoes_link'].unique()

In [49]:
# Get watched movie list of a critic
def getWatchedSetOf(critic):
    critic_df = sample_critics.groupby('critic_name').get_group(critic)
    return set(critic_df['rotten_tomatoes_link'].values)

def getAverageScoreOf(ciritc):
    return np.average(sample_critics.groupby('critic_name').get_group(ciritc)['review_score'].values)

def getTop10NeighborsWithScore(vector):
    simialr_users = dict(zip(critics, vector))
    simialr_users = sorted(simialr_users.items(), key=lambda x: x[1], reverse=True)
    top_10_neighbors = list(simialr_users)[1:11]
    return top_10_neighbors

def recommendScoreOf(uid, vector):
    top_10_neighbors = getTop10NeighborsWithScore(vector)
    avg_score_u = getAverageScoreOf(uid)
    u_watched = getWatchedSetOf(uid)
    mids = set(movie_ids).difference(u_watched) # Get the movies that haven't been watched by user u
    scores = {}
    
    for mid in mids:
        r_index = list(movie_ids).index(mid)
        numerator, denominator = 0, 0
        for vid, similarity in top_10_neighbors:
            v_watched = getWatchedSetOf(vid)
            v_index = list(critics).index(vid)
            if mid in v_watched:
                avg_score_v = getAverageScoreOf(vid)
                v_m_score = user_film_matrix_cf[v_index][r_index]
                numerator += similarity*(v_m_score-avg_score_v)
                denominator += abs(similarity)
        scores[mid] = avg_score_u+(numerator/denominator) if denominator else 0 # 0 represents i doesn't appear in any of the neighbours's history
    
    return scores

In [50]:
recommend_list_cf = {}

for critic in tqdm(critics):
    index = list(critics).index(critic)
    recommend_list_cf[critic] = sorted(recommendScoreOf(critic, user_film_matrix_cf[index]).items(), key=lambda x: x[1], reverse=True)[:test_set[critic].size]

100%|██████████| 100/100 [1:14:39<00:00, 44.80s/it]


In [51]:
# Calculate the accuracy of prediction
count = 0
for critic in critics:
    for movie_id, score in recommend_list_cf[critic]:
        if movie_id in test_set[critic]:
            count += 1

In [52]:
test_set_size = 0
for critic in test_set:
    test_set_size += test_set[critic].size

In [53]:
count/test_set_size

0.23213161916749078

### 2.3 SCF RS (Note: use sentiemnt scores to replace ratings)

In [54]:
# Initialize an empty data frame for user film matrix
# columns = np.insert(sample_critics['rotten_tomatoes_link'].unique(), 0, 'user');
# user_film_matrix = pd.DataFrame(columns=columns);
columns = sample_critics['rotten_tomatoes_link'].unique()

def generateFilmVector(user):
    user_df = sample_critics.groupby('critic_name').get_group(user)
    user_vec = np.zeros(columns.size, dtype='float')
    films = user_df['rotten_tomatoes_link'].unique()
    for film in films:
        # Find the index of the film in the vector
        index = np.where(columns == film)[0][0]
        # Get the mode as review type
        review_score = np.average(user_df.groupby('rotten_tomatoes_link').get_group(film)['sentiment_score'])
        # Assign value
        user_vec[index] = review_score
    return user_vec


user_film_matrix_scf = []
for critic in tqdm(critics):
    user_film_matrix_scf.append(generateFilmVector(critic))

100%|██████████| 100/100 [00:41<00:00,  2.42it/s]


In [55]:
user_film_matrix_scf = np.array(user_film_matrix_scf)

In [56]:
user_film_matrix_scf

array([[0.     , 0.     , 0.     , ..., 0.     , 0.3363 , 0.     ],
       [0.     , 0.     , 0.     , ..., 0.     , 0.     , 0.     ],
       [0.     , 0.     , 0.29905, ..., 0.     , 0.     , 0.     ],
       ...,
       [0.     , 0.     , 0.     , ..., 0.     , 0.     , 0.88915],
       [0.     , 0.     , 0.     , ..., 0.     , 0.     , 0.     ],
       [0.     , 0.     , 0.6423 , ..., 0.     , 0.     , 0.     ]])

In [57]:
from numpy.linalg import norm
user_similarity_matrix = np.array([np.dot(user_film_matrix_scf, user_film_matrix_scf[i])/(norm(user_film_matrix_scf, axis=1)*norm(user_film_matrix_scf[i])) for i in range(100)])

In [58]:
movie_ids = df_popular_movies['rotten_tomatoes_link'].unique()

In [59]:
# Get watch list of a critic
def getWatchedSetOf(critic):
    critic_df = sample_critics.groupby('critic_name').get_group(critic)
    return set(critic_df['rotten_tomatoes_link'].values)

def getAverageScoreOf(ciritc):
    return np.average(sample_critics.groupby('critic_name').get_group(ciritc)['sentiment_score'].values)

def getTop10NeighborsWithScore(vector):
    simialr_users = dict(zip(critics, vector))
    simialr_users = sorted(simialr_users.items(), key=lambda x: x[1], reverse=True)
    top_10_neighbors = list(simialr_users)[1:11]
    return top_10_neighbors

def recommendScoreOf(uid, vector):
    top_10_neighbors = getTop10NeighborsWithScore(vector)
    avg_score_u = getAverageScoreOf(uid)
    u_watched = getWatchedSetOf(uid)
    mids = set(movie_ids).difference(u_watched) # Get the movies that haven't been watched by user u
    scores = {}
    
    for mid in mids:
        r_index = list(movie_ids).index(mid)
        numerator, denominator = 0, 0
        for vid, similarity in top_10_neighbors:
            v_watched = getWatchedSetOf(vid)
            v_index = list(critics).index(vid)
            if mid in v_watched:
                avg_score_v = getAverageScoreOf(vid)
                v_m_score = user_film_matrix_scf[v_index][r_index]
                numerator += similarity*(v_m_score-avg_score_v)
                denominator += abs(similarity)
        scores[mid] = avg_score_u+(numerator/denominator) if denominator else 0 # 0 represents i doesn't appear in any of the neighbours's history
    
    return scores;

In [60]:
recommend_list_scf = {}

for critic in tqdm(critics):
    index = list(critics).index(critic)
    recommend_list_scf[critic] = sorted(recommendScoreOf(critic, user_film_matrix_scf[index]).items(), key=lambda x: x[1], reverse=True)[:test_set[critic].size]

100%|██████████| 100/100 [1:14:24<00:00, 44.65s/it]


In [61]:
# Calculate the accuracy of prediction
count_scf = 0
for critic in critics:
    for movie_id, score in recommend_list_scf[critic]:
        if movie_id in test_set[critic]:
            count_scf += 1

In [62]:
test_set_size_scf = 0
for critic in test_set:
    test_set_size_scf += test_set[critic].size

In [63]:
count_scf/test_set_size_scf

0.19383259911894274