In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [24]:
import spacy
import numpy as np

def retrieve_movie_list():
    """Return the movie titles listed in ``movie_titles.txt``, one per line.

    The file is opened relative to the current working directory. Every line
    is stripped of surrounding whitespace, and lines that are empty after
    stripping are skipped so that spacer lines or a trailing blank line never
    produce an empty title.

    Returns:
        list[str]: The titles, in file order.

    Raises:
        OSError: If ``movie_titles.txt`` cannot be opened.
    """
    # Explicit UTF-8 keeps title decoding platform-independent and consistent
    # with how the per-movie plot and review files are read in this notebook.
    with open("movie_titles.txt", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [title for title in stripped_lines if title]

def _read_joined_text(path: str) -> str:
    """Read a UTF-8 text file and return its non-blank lines joined by spaces.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped. The remaining lines are joined with a single
    space so that the last word of one line and the first word of the next
    stay separate tokens.
    """
    with open(path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return " ".join(line for line in stripped_lines if line)


def get_movie_plot(movie_name: str) -> str:
    """Return the plot text stored in ``Wikipedia Plots/<movie_name>.txt``.

    Args:
        movie_name: Title used as the file stem inside ``Wikipedia Plots/``.

    Returns:
        The file's non-blank lines, stripped and joined by single spaces
        (an empty string if the file has no non-blank lines).

    Raises:
        OSError: If the plot file cannot be opened.
    """
    return _read_joined_text(f"Wikipedia Plots/{movie_name}.txt")


def get_movie_reviews(movie_name: str) -> str:
    """Return the review text stored in ``IMDB Reviews/<movie_name>.txt``.

    Args:
        movie_name: Title used as the file stem inside ``IMDB Reviews/``.

    Returns:
        The file's non-blank lines, stripped and joined by single spaces
        (an empty string if the file has no non-blank lines).

    Raises:
        OSError: If the reviews file cannot be opened.
    """
    return _read_joined_text(f"IMDB Reviews/{movie_name}.txt")

# spaCy pipelines that have already been loaded, keyed by model name. Loading a
# model is expensive, so it is done at most once instead of on every call.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name: str = "en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def get_significant_word_counts(words: str, desired_pos: list[str]) -> dict[str, int]:
    """Count lemmas in ``words`` for tokens whose part of speech is wanted.

    The text is processed with the ``en_core_web_sm`` spaCy pipeline. A token
    is counted only if its ``pos_`` tag appears in ``desired_pos``; counts are
    keyed by the token's ``lemma_`` so inflected forms of a word share a key.

    Args:
        words: The text to analyse.
        desired_pos: POS tags to keep, e.g. ``["NOUN", "VERB", "ADJ"]``.

    Returns:
        Mapping from lemma to the number of matching tokens with that lemma.
    """
    doc = _get_spacy_pipeline()(words)

    token_counts: dict[str, int] = {}
    for token in doc:
        if token.pos_ not in desired_pos:
            continue
        lemma = token.lemma_
        token_counts[lemma] = token_counts.get(lemma, 0) + 1

    return token_counts

def _weighted_unit_vector(word_counts, wv):
    """Sum count-weighted word embeddings and scale the result to unit length.

    Args:
        word_counts: Mapping from word to its occurrence count.
        wv: Word-embedding lookup supporting ``wv[word]`` and raising
            ``KeyError`` for unknown words. If it exposes ``vector_size``
            that dimension is used; otherwise 300 is assumed.

    Returns:
        numpy.ndarray: The L2-normalised weighted sum. If no word has an
        embedding (the sum is the zero vector) the zero vector is returned
        instead of dividing by zero and producing NaNs.
    """
    dim = getattr(wv, "vector_size", 300)
    total = np.zeros(dim, dtype=float)

    for word, count in word_counts.items():
        try:
            embedding = wv[word]
        except KeyError:
            # Out-of-vocabulary words simply contribute nothing.
            continue
        total += count * embedding

    magnitude = np.linalg.norm(total)
    if magnitude == 0:
        return total
    return total / magnitude


def get_plot_vector(movie, desired_pos, wv):
    """Build a unit-length embedding vector representing a movie's plot.

    The plot text is loaded with ``get_movie_plot``, reduced to lemma counts
    for the requested parts of speech with ``get_significant_word_counts``,
    and those counts weight the corresponding embeddings from ``wv``.

    Args:
        movie: Movie title identifying the plot file.
        desired_pos: POS tags of the words to include.
        wv: Word-embedding lookup (``wv[word]`` -> vector).

    Returns:
        numpy.ndarray: Normalised plot vector (all zeros if no counted word
        has an embedding).
    """
    word_counts = get_significant_word_counts(get_movie_plot(movie), desired_pos)
    return _weighted_unit_vector(word_counts, wv)


def get_reviews_vector(movie, desired_pos, wv):
    """Build a unit-length embedding vector representing a movie's reviews.

    The review text is loaded with ``get_movie_reviews``, reduced to lemma
    counts for the requested parts of speech with
    ``get_significant_word_counts``, and those counts weight the
    corresponding embeddings from ``wv``.

    Args:
        movie: Movie title identifying the reviews file.
        desired_pos: POS tags of the words to include.
        wv: Word-embedding lookup (``wv[word]`` -> vector).

    Returns:
        numpy.ndarray: Normalised reviews vector (all zeros if no counted
        word has an embedding).
    """
    word_counts = get_significant_word_counts(get_movie_reviews(movie), desired_pos)
    return _weighted_unit_vector(word_counts, wv)

def cosine_similarity(A, B):
    """Return the cosine similarity of two equal-length 1-D vectors.

    Args:
        A: First vector (sequence of numbers or 1-D numpy array).
        B: Second vector, same length as ``A``.

    Returns:
        float: ``dot(A, B) / (|A| * |B|)``. If either vector has zero
        magnitude the similarity is undefined and ``0.0`` is returned
        instead of dividing by zero.

    Raises:
        ValueError: If ``A`` and ``B`` do not have the same shape.
    """
    a = np.asarray(A, dtype=float)
    b = np.asarray(B, dtype=float)
    if a.shape != b.shape:
        # Pairing elements with zip() would silently drop the extra entries.
        raise ValueError(
            f"vectors must have the same shape, got {a.shape} and {b.shape}"
        )

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)

In [None]:
import gensim.downloader as api

# The embeddings are expensive to load, so reuse `wv` if an earlier run of this
# cell already put it in the kernel namespace. Only NameError means "not loaded
# yet"; any other error should surface.
try:
    wv
except NameError:
    print("loading word embeddings")
    wv = api.load('word2vec-google-news-300')
# Printed only after `wv` is known to exist, so a failed load is not reported
# as a success.
print("word embeddings loaded")

# Parts of speech that contribute to each kind of vector: plots use nouns,
# verbs and adjectives, reviews use adjectives only.
plot_pos = ["NOUN", "VERB", "ADJ"]
reviews_pos = ["ADJ"]

# One normalised embedding vector per movie and per text source, keyed by title.
movie_list = retrieve_movie_list()
movie_plot_vectors, movie_reviews_vectors = {}, {}

for movie in movie_list:
    print(movie)  # progress indicator
    movie_plot_vectors[movie] = get_plot_vector(movie=movie, desired_pos=plot_pos, wv=wv)
    movie_reviews_vectors[movie] = get_reviews_vector(movie=movie, desired_pos=reviews_pos, wv=wv)

In [None]:
# plot_similarities[a][b] is the cosine similarity between the plot vectors of
# movies a and b, computed for every ordered pair (including a == b).
plot_similarities = {}

for a_movie in movie_list:
    # Create this movie's row unless one already exists.
    plot_similarities.setdefault(a_movie, {})
    A = movie_plot_vectors[a_movie]

    for b_movie in movie_list:
        B = movie_plot_vectors[b_movie]
        s = cosine_similarity(A, B)
        print(f"P: {a_movie} vs. {b_movie} -> {s}")
        plot_similarities[a_movie][b_movie] = s

In [None]:
# reviews_similarities[a][b] is the cosine similarity between the review
# vectors of movies a and b, computed for every ordered pair (including a == b).
reviews_similarities = {}

for a_movie in movie_list:
    # Create this movie's row unless one already exists.
    reviews_similarities.setdefault(a_movie, {})
    A = movie_reviews_vectors[a_movie]

    for b_movie in movie_list:
        B = movie_reviews_vectors[b_movie]
        s = cosine_similarity(A, B)
        print(f"R: {a_movie} vs. {b_movie} -> {s}")
        reviews_similarities[a_movie][b_movie] = s