In [22]:
import spacy
import numpy as np

def retrieve_movie_list():
    movies = []
    with open("movie_titles.txt") as f:
        lines = f.readlines()
        for line in lines:
            title = line.strip()
            movies.append(title)
    return movies

def get_movie_plot(movie_name: str) -> str:
    plotPath = fr"Wikipedia Plots/{movie_name}.txt"
    plot = ""
    with open(plotPath, encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            if line.isspace():
                continue
            plot += line.strip()
    return plot

def get_movie_reviews(movie_name: str) -> str:
    reviewsPath = fr"IMDB Reviews/{movie_name}.txt"
    reviews = ""
    with open(reviewsPath, encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            if line.isspace():
                continue
            reviews += line.strip()
    return reviews

def get_significant_word_counts(words: str, desired_pos: list[str]) -> dict[str, int]:
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(words)

    token_counts = {}

    for token in doc:
        
        if token.pos_ not in desired_pos:
            continue

        t = token.lemma_
        if t not in token_counts:
            token_counts[t] = 0
        token_counts[t] += 1

    return token_counts

def get_plot_vector(movie, desired_pos, wv):
    total = np.zeros(300, dtype=float)

    plot = get_movie_plot(movie)
    word_counts = get_significant_word_counts(plot, desired_pos)

    for word, count in word_counts.items():
        try:
            this_embedding = wv[word]
        except:
            continue
        weighted = count * this_embedding
        total += weighted

    magnitude = np.linalg.norm(total)
    normalized = total / magnitude

    return normalized

def get_reviews_vector(movie, desired_pos, wv):
    total = np.zeros(300, dtype=float)

    reviews = get_movie_reviews(movie)
    word_counts = get_significant_word_counts(reviews, desired_pos)

    for word, count in word_counts.items():
        try:
            this_embedding = wv[word]
        except:
            continue
        weighted = count * this_embedding
        total += weighted

    magnitude = np.linalg.norm(total)
    normalized = total / magnitude

    return normalized

def cosine_similarity(A, B):
    # Calculate dot product
    dot_product = sum(a*b for a, b in zip(A, B))

    # Calculate the magnitude of each vector
    magnitude_A = sum(a*a for a in A)**0.5
    magnitude_B = sum(b*b for b in B)**0.5

    # Compute cosine similarity
    cosine_similarity = dot_product / (magnitude_A * magnitude_B)
    return cosine_similarity

def normalized_cosine_similarity(A, B):
    # Calculate dot product
    dot_product = sum(a*b for a, b in zip(A, B))

    # Calculate the magnitude of each vector
    #magnitude_A = sum(a*a for a in A)**0.5
    #magnitude_B = sum(b*b for b in B)**0.5

    # Compute cosine similarity
    cosine_similarity = dot_product# / (magnitude_A * magnitude_B)
    return cosine_similarity

In [None]:
import gensim.downloader as api

try:
    wv
except:
    print("loading word embeddings")
    wv = api.load('word2vec-google-news-300')
finally:
    print("word embeddings loaded")

plot_pos = ["NOUN", "VERB"]     # capture the action of the plot
reviews_pos = ["ADJ", "ADV"]    # capture the description of the movie

movie_list = retrieve_movie_list()
movie_plot_vectors = {}
movie_reviews_vectors = {}
for movie in movie_list:
    print(movie)
    movie_plot_vectors[movie] = get_plot_vector(movie, plot_pos, wv)
    movie_reviews_vectors[movie] = get_reviews_vector(movie, reviews_pos, wv)

In [32]:
plot_similarity_min = 0.7
plot_similarity_scaling = 1 / (1 - plot_similarity_min)
reviews_similarity_min = 0.85
reviews_similarity_scaling = 1 / (1 - reviews_similarity_min)

In [None]:
plot_similarities = {}

for a_movie in movie_list:
    if a_movie not in plot_similarities:
        plot_similarities[a_movie] = {}

    for b_movie in movie_list:
        A = movie_plot_vectors[a_movie]
        B = movie_plot_vectors[b_movie]
        s = normalized_cosine_similarity(A, B)
        s -= plot_similarity_min
        s *= plot_similarity_scaling
        print(f"P: {a_movie} vs. {b_movie} -> {s}")
        plot_similarities[a_movie][b_movie] = s

In [None]:
reviews_similarities = {}

for a_movie in movie_list:
    if a_movie not in reviews_similarities:
        reviews_similarities[a_movie] = {}

    for b_movie in movie_list:
        A = movie_reviews_vectors[a_movie]
        B = movie_reviews_vectors[b_movie]
        s = normalized_cosine_similarity(A, B)
        s -= reviews_similarity_min
        s *= reviews_similarity_scaling
        print(f"R: {a_movie} vs. {b_movie} -> {s}")
        reviews_similarities[a_movie][b_movie] = s

In [39]:
# rotten tomatoes critic, rotten tomatoes user, imdb user
movie_ratings = {"Avatar": [81, 82, 7.9],
    "BlinkTwice": [74, 70, 6.5],
    "ChildrenOfMen": [92, 85, 7.9],
    "CitizenKane": [99, 90, 8.3],
    "DjangoUnchained": [87, 92, 8.5],
    "GetOut": [98, 86, 7.8],
    "Inception": [87, 91, 8.8],
    "KnivesOut": [97, 92, 7.9],
    "Mandy": [90, 67, 6.5],
    "Parasite": [99, 90, 8.5],
    "SnowPiercer": [94, 72, 7.1],
    "TheGreatestShowman": [57, 86, 7.5],
    "TheMartian": [91, 91, 8.0],
    "TheMummy": [62, 75, 7.1],
    "TheRevenant": [78, 84, 8.0],
    "TheRitual": [74, 63, 6.3],
    "Lord of the Rings\\FellowshipOfTheRing": [92, 95, 8.9],
    "Lord of the Rings\\ReturnOfTheKing": [94, 86, 9.0],
    "Lord of the Rings\\TheTwoTowers": [95, 95, 8.8],
    "Pirates of the Caribbean\\AtWorldsEnd": [44, 72, 7.1],
    "Pirates of the Caribbean\\CurseOfTheBlackPearl": [80, 86, 8.1],
    "Pirates of the Caribbean\\DeadMansChest": [53, 72, 7.4],
    "Pirates of the Caribbean\\DeadMenTellNoTales": [30, 60, 6.5],
    "Pirates of the Caribbean\\OnStrangerTides": [33, 54, 6.6],
    "Star Wars\\StarWarsANewHope": [93, 96, 8.6],
    "Star Wars\\StarWarsEmpireStrikesBack": [95, 97, 8.7],
    "Star Wars\\StarWarsReturnOfTheJedi": [82, 94, 8.3],
    "The Godfather\\GodfatherPart1": [97, 98, 9.2],
    "The Godfather\\GodfatherPart2": [96, 97, 9.0],
    "The Godfather\\GodfatherPart3": [66, 68, 7.6],
}

In [None]:
ratings = {}

user_weight = 0.9
critic_weight = 1 - user_weight

for movie in movie_list:
    scores = movie_ratings[movie]
    rot_tom_critics = float(scores[0])/100
    rot_tom_audience = float(scores[1])/100
    imdb_users = float(scores[2]) / 10
    
    user_rating = (rot_tom_audience + imdb_users) / 2
    overall = user_rating * user_weight + rot_tom_critics * critic_weight
    
    ratings[movie] = overall
    print(f"{movie} -> {overall}")


In [None]:
sentiment_scores = {}

# in this case, think of a as the unknown movie, and b as the known movie
for a_movie in movie_list:
    if a_movie not in sentiment_scores:
        sentiment_scores[a_movie] = {}

    for b_movie in movie_list:
        a_rating = ratings[a_movie]
        b_rating = ratings[b_movie]
        delta = b_rating - a_rating
        
        r_similarity = reviews_similarities[a_movie][b_movie]
        score = (1.1 + delta) * r_similarity # increases the similarity if reviews for a are better than b, with a slight upward bias, otherwise reduces it
        
        print(f"{a_movie} vs. {b_movie} -> {score}")
        sentiment_scores[a_movie][b_movie] = score

In [51]:
x_points = []
y_points = []

for a_movie in movie_list:
    for b_movie in movie_list:
        if a_movie is b_movie:
            continue
        p = plot_similarities[a_movie][b_movie]
        #r = reviews_similarities[a_movie][b_movie]
        r = sentiment_scores[a_movie][b_movie]
        x_points.append(p)
        y_points.append(r)

In [None]:
import matplotlib.pyplot as plt

plt.scatter(x_points, y_points)
plt.xlabel("Plot similarity")
plt.ylabel("Sentiment score")
plt.show()

In [74]:
# now I need to make some function Z to take plot similarity and sentiment score and compute a probability that you will like a movie
# for an unknown movie a, and a known liked movie b, with p and s as the scores, what is the probability that you will like a, given that you like b?
# then, what is the probability that you will like a, maybe I can use the sum of the conditional probabilities?
import math

p_weight = 1
r_weight = 0.5
neg_bias = 0

# sigmoid function parameters
a = 6
b = 0.5
sigmoid = lambda x: 1 / (1 + math.exp(-a*(x-b)))

z = lambda p, r: p_weight * p + r_weight * r - neg_bias
Z = lambda p, r: sigmoid(z(p, r)) 

In [75]:
probabilities = {}

for a_movie in movie_list:
    if a_movie not in probabilities:
        probabilities[a_movie] = {}
    for b_movie in movie_list:
        if a_movie is b_movie:
            probabilities[a_movie][b_movie] = 1.0
        p = plot_similarities[a_movie][b_movie]
        #r = reviews_similarities[a_movie][b_movie]
        r = sentiment_scores[a_movie][b_movie]
        final = Z(p, r)
        probabilities[a_movie][b_movie] = final

In [76]:
movie_count = len(movie_list)
matrix = np.zeros((movie_count, movie_count))

i = 0
for a_movie in movie_list:
    j = 0
    for b_movie in movie_list:
        prob = probabilities[a_movie][b_movie]
        matrix[i, j] = prob
        j += 1
    i += 1


In [80]:
def get_matrix(a, b, pw, rw, neg):
    # sigmoid function parameters
    #a = 6
    #b = 0.5
    sigmoid = lambda x: 1 / (1 + math.exp(-a*(x-b)))

    z = lambda p, r: p_weight * p + r_weight * r - neg_bias
    Z = lambda p, r: sigmoid(z(p, r)) 

    probabilities = {}

    for a_movie in movie_list:
        if a_movie not in probabilities:
            probabilities[a_movie] = {}
        for b_movie in movie_list:
            if a_movie is b_movie:
                probabilities[a_movie][b_movie] = 1.0
            p = plot_similarities[a_movie][b_movie]
            #r = reviews_similarities[a_movie][b_movie]
            r = sentiment_scores[a_movie][b_movie]
            final = Z(p, r)
            probabilities[a_movie][b_movie] = final

    movie_count = len(movie_list)
    matrix = np.zeros((movie_count, movie_count))

    i = 0
    for a_movie in movie_list:
        j = 0
        for b_movie in movie_list:
            prob = probabilities[a_movie][b_movie]
            matrix[i, j] = prob
            j += 1
        i += 1

    return matrix

In [None]:
# these parameters, except maybe a and b, need to be trained for accuracy based on real data
matrix = get_matrix(a=6, b=0.5, pw=0.7, rw=0.3, neg=0.5)
plt.imshow(matrix) 
  
plt.title("Heat Map") 
plt.colorbar()
plt.show()

In [91]:
a = "Avatar"
b = "BlinkTwice"
a_i = movie_list.index(a)
b_i = movie_list.index(b)

print(f"P(I like {a} | I like {b}) = {matrix[a_i, b_i]}")

P(I like Avatar | I like BlinkTwice) = 0.48575887530781126


In [93]:
a = "Avatar"
b = "BlinkTwice"
c = "TheMartian"
a_i = movie_list.index(a)
b_i = movie_list.index(b)
c_i = movie_list.index(c)

print(f"P(I like {a} | I like {b} and I like {c}) = {0.66 * matrix[a_i, b_i] + 0.33 * matrix[a_i, c_i]}")

P(I like Avatar | I like BlinkTwice and I like TheMartian) = 0.6084645983603445


In [94]:
n = 3

initial = [1 / x for x in range(1, 1 + n)]
total = sum(initial)
weights = [x / total for x in initial]

print(weights)

[0.5454545454545455, 0.27272727272727276, 0.18181818181818182]
