In [17]:
import pandas as pd

# Load the IMDb top-1000 dataset. A forward slash is used as the separator
# because it resolves on Windows, macOS and Linux alike, whereas a backslash is
# only treated as a path separator on Windows.
top = pd.read_csv("Data/imdb_top_1000.csv")

# Keep only the columns this notebook selects for further processing.
movie_data = top[["Series_Title", "Released_Year", "Genre", "IMDB_Rating", "Meta_score"]]

In [None]:
from imdb import IMDb

ia = IMDb()
movie_ids = {}

# Number of times a title search is attempted before giving up on that title.
MAX_SEARCH_ATTEMPTS = 10


def _parse_year(value):
    """Return `value` converted to an int, or None when it is not an integer."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def _pick_movie_id(search_results, release_year):
    """Choose a movie ID from `search_results`.

    Preference order: a result whose 'year' equals `release_year`, then the
    result whose 'year' is numerically closest, then simply the first result
    (used when `release_year` is None or no result carries an integer year).

    Returns None when `search_results` is empty.
    """
    if not search_results:
        return None

    # Results without an integer 'year' cannot be compared; skip them instead
    # of letting a KeyError abort the whole lookup.
    dated = [r for r in search_results if isinstance(r.get('year'), int)]

    if release_year is not None and dated:
        for result in dated:
            if result.get('year') == release_year:
                return result.movieID
        # min() returns the first of equally-close results, like sorted(...)[0].
        return min(dated, key=lambda r: abs(r.get('year') - release_year)).movieID

    return search_results[0].movieID


# Map each DataFrame index label to the IMDb ID found for that row (or None).
for index, row in movie_data.iterrows():

    title = row["Series_Title"]
    # Parsed defensively so a non-numeric "Released_Year" does not stop the loop.
    release_year = _parse_year(row["Released_Year"])

    print(f"{title} ({release_year})")

    movieID = None
    for _ in range(MAX_SEARCH_ATTEMPTS):
        try:
            search_results = ia.search_movie(title)
        except Exception:
            # Assumed to be a transient failure, so try again. `Exception`
            # rather than a bare `except` lets KeyboardInterrupt stop the cell.
            continue

        movieID = _pick_movie_id(search_results, release_year)
        if movieID is not None:
            break
        # An empty result list is retried as well, as in the original loop.

    movie_ids[index] = movieID
    

In [27]:
def get_movie_reviews(movie):
    """Concatenate the titles and contents of a movie's reviews.

    Args:
        movie: Mapping-like object providing `.get`. Its optional 'reviews'
            entry is an iterable of mappings with 'title' and 'content' keys.

    Returns:
        A single string with "<title> <content> " appended for every review,
        in order. An empty string when 'reviews' is missing, None or empty.

    Raises:
        KeyError: if a review lacks 'title' or 'content'.
    """
    # A missing 'reviews' entry is treated as "no reviews" rather than an error.
    reviews = movie.get('reviews') or []
    return "".join(f"{review['title']} {review['content']} " for review in reviews)

In [None]:
movie_plots = {}
movie_reviews = {}

# Number of times a download is attempted before giving up on a movie.
MAX_FETCH_ATTEMPTS = 10

# Download the synopsis and reviews for every movie that has an IMDb ID.
for index, movie_id in movie_ids.items():

    print(movie_id)
    if not movie_id:
        # No ID was stored for this row, so there is nothing to download.
        continue

    for _ in range(MAX_FETCH_ATTEMPTS):
        try:
            movie = ia.get_movie(movie_id, info=['synopsis', 'reviews'])
            reviews = get_movie_reviews(movie)
            plot = movie['synopsis'][0]
        except Exception:
            # Any failure (request error, missing synopsis, ...) triggers a
            # retry. `Exception` rather than a bare `except` lets
            # KeyboardInterrupt stop the cell.
            continue
        break
    else:
        # Every attempt failed: report it and leave this movie out.
        print(f"Could not retrieve data for {movie_id}")
        continue

    movie_plots[movie_id] = plot
    movie_reviews[movie_id] = reviews


In [78]:
import spacy 

nlp = spacy.load("en_core_web_sm")

# Part-of-speech tags that qualify a token as a keyword.
open_word_classes = ["NOUN", "VERB", "ADV", "ADJ"]

keyword_list = set()   # every keyword lemma seen in any plot
keywords = {}          # movie id -> set of keyword lemmas in its plot

for movie_id, plot_text in movie_plots.items():
    # Lemmas of all tokens whose POS tag is one of the open word classes.
    lemmas = {
        token.lemma_
        for token in nlp(plot_text)
        if token.pos_ in open_word_classes
    }
    keywords[movie_id] = lemmas
    keyword_list |= lemmas



In [79]:
movie_genres = movie_data["Genre"]

all_genres = set()   # every genre name seen
id_genres = {}       # movie id -> list of its genre names

# Iterate by index label, not by position: `movie_ids` is keyed by the labels
# that `iterrows()` yields, and those equal positions only for a default index.
for index, genres in movie_genres.items():

    movie_id = movie_ids.get(index)
    # Skip rows with no stored ID or whose plot was never retrieved.
    if movie_id not in movie_plots:
        continue

    # The "Genre" cell is a comma-separated list, e.g. "Crime, Drama".
    genre_names = [name.strip() for name in genres.split(",")]
    id_genres[movie_id] = genre_names
    all_genres.update(genre_names)



In [80]:
# Sort before assigning indices: the iteration order of a set of strings is not
# stable across interpreter runs, so sorting keeps the genre/keyword layout of
# every derived vector and matrix reproducible.
master_genres = sorted(all_genres)

# genre name -> its position in `master_genres`
idx_genres = {genre: idx for idx, genre in enumerate(master_genres)}

master_keywords = sorted(keyword_list)

# keyword -> its position in `master_keywords`
idx_keywords = {keyword: idx for idx, keyword in enumerate(master_keywords)}


In [81]:
import numpy as np

# NOTE: `k` is the number of genres and `g` the number of keywords.
k = len(master_genres)
g = len(master_keywords)


def _multi_hot(labels, index_of, size):
    """Return a length-`size` vector holding 1 at `index_of[label]` for each label."""
    vec = np.zeros(size)
    for label in labels:
        vec[index_of[label]] = 1
    return vec


# movie id -> multi-hot vector over `master_genres`
movie_genre_vectors = {
    movie_id: _multi_hot(genres, idx_genres, k)
    for movie_id, genres in id_genres.items()
}

# movie id -> multi-hot vector over `master_keywords`.
# The loop variable is deliberately not called `keywords`: rebinding that name
# would replace the dict being iterated and make this cell fail when re-run.
movie_keyword_vectors = {
    movie_id: _multi_hot(movie_keywords, idx_keywords, g)
    for movie_id, movie_keywords in keywords.items()
}

In [82]:
# Movie ids for which a plot was retrieved, in the dict's insertion order.
master_ids = [*movie_plots]

In [83]:
# One row per genre, one column per keyword. Sized from the master lists
# directly so the shape does not depend on the helper globals `k` and `g`.
matrix = np.zeros((len(master_genres), len(master_keywords)))

# genre name -> number of movies counted for that genre
genre_counts = {}

for movie_id in master_ids:
    kvec = movie_keyword_vectors[movie_id]
    gvec = movie_genre_vectors[movie_id]
    # The loop variable is `flag`, not `g`: reusing `g` would overwrite the
    # global keyword count and break re-running this and the previous cell.
    for idx, flag in enumerate(gvec):
        if flag == 1:
            genre = master_genres[idx]
            genre_counts[genre] = genre_counts.get(genre, 0) + 1

            # Add this movie's keyword vector to the genre's row.
            matrix[idx] += kvec


In [None]:
# Display the number of movies counted for each genre.
genre_counts

In [85]:
# Bundle every artefact built so far into one dict, keyed by its variable name.
e = {
    "genre_counts": genre_counts,
    'matrix': matrix,
    'master_genres': master_genres,
    'master_ids': master_ids,
    'master_keywords': master_keywords,
    'movie_keyword_vectors': movie_keyword_vectors,
    'movie_genre_vectors': movie_genre_vectors,
}

In [91]:
import pickle

# Serialise the bundle `e` to disk with pickle.
with open("Data\\save.bin", "wb+") as out_file:
    pickle.dump(e, out_file)

In [107]:
# Read the pickled bundle back into `e`.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open("Data\\save.bin", "rb") as in_file:
    e = pickle.load(in_file)

In [129]:
# Everything is read from the reloaded bundle `e`, so the genre order and the
# counts always match the loaded matrix rather than whatever globals happen to
# be in the kernel.
matrix = e['matrix'].copy()
transpose = matrix.T  # view of `matrix`: one row per keyword, one column per genre

# Keywords whose summed count over all genres is below this value are flagged.
threshold = 15

# Row sums are taken before normalising, i.e. on the raw counts.
occurrences = transpose.sum(axis=1)
too_few = [ridx for ridx, total in enumerate(occurrences) if total < threshold]

# Divide every genre column by the number of movies counted for that genre.
# The division is done in place on the view, so `matrix` is normalised too.
genre_totals = [float(e['genre_counts'][genre]) for genre in e['master_genres']]
transpose /= genre_totals

transpose

array([[0.        , 0.00481541, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.00160514, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.00160514, 0.        , ..., 0.02380952, 0.        ,
        0.        ],
       ...,
       [0.        , 0.00321027, 0.01020408, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.03333333,
        0.        ],
       [0.        , 0.00160514, 0.01020408, ..., 0.        , 0.        ,
        0.        ]])

In [130]:
# Number of keywords that are not flagged in `too_few`.
len(master_keywords) - len(too_few)

5945

In [131]:
# `too_few` is a list; a set gives constant-time membership tests, so the
# filter below is no longer quadratic in the number of keywords.
skipped_rows = set(too_few)

# keyword -> its row of `transpose` (one value per genre), leaving out the
# flagged keywords. The keyword order comes from the reloaded bundle `e`, the
# same source as the matrix `transpose` is derived from.
embeddings = {
    keyword: transpose[idx]
    for idx, keyword in enumerate(e['master_keywords'])
    if idx not in skipped_rows
}

In [132]:
# Serialise the keyword embeddings; the file name records the threshold used.
with open(f"Data\\keyword_embeddings_{threshold}.bin", "wb+") as out_file:
    pickle.dump(embeddings, out_file)

In [None]:
# TODO: remove keyword vectors that are nearly uniform across genres (common words such as 'do' occur at a similar rate in every genre); they carry little genre information and might skew the results later.