# Movie Recommender Models

In [2]:
pip install contractions

Collecting contractions
  Downloading contractions-0.0.55-py2.py3-none-any.whl (7.9 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting anyascii
  Downloading anyascii-0.3.0-py3-none-any.whl (284 kB)
[K     |████████████████████████████████| 284 kB 4.1 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.2.tar.gz (321 kB)
[K     |████████████████████████████████| 321 kB 33.9 MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.2-cp37-cp37m-linux_x86_64.whl size=85445 sha256=084abdd77f45bb32c2f89ec1b2249677f8b9376178ba40dc644b4a0935792f4a
  Stored in directory: /root/.cache/pip/wheels/25/19/a6/8f363d9939162782bb8439d886469756271abc01f76fbd790f
Successfully built pyahocorasick
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully install

In [2]:
import pandas as pd
import json
import nltk
import re
import numpy as np
import contractions
from datetime import date
from gensim.models import FastText
import pickle
from sklearn.metrics.pairwise import cosine_similarity

## Preparing the Corpus

In [4]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Helper Functions

def get_movie_genres(movies_df):
    movie_genres = []
    for i in range(len(movies_df)):
        genre_list = json.loads(movies_df.genres[i])
        genres = []
        for genre in genre_list:
            genres.append(genre["name"])

        genre_string = " ".join(genres)
        movie_genres.append(genre_string)
    return movie_genres

def prepare_movie_descriptions(df):
    df_2 = df.copy()
    df_2 = df_2.loc[df_2["overview"].notna()]
    df_2["tagline"] = df_2["tagline"].fillna("")
    df_2["genres_string"] = df_2["genres_string"].fillna("")
    df_2["description"] = df_2["tagline"] + " " + df_2["overview"] + " " + df_2["genres_string"]
    df_2 = df_2[["original_title", "description"]]

    return df_2

In [4]:
# Reading in the data and basic cleaning
colab_old_movies = "/content/drive/MyDrive/Colab_Notebooks/cineman_streamlit_app/tmdb_5000_movies.csv"
colab_recent_movies = "/content/drive/MyDrive/Colab_Notebooks/cineman_streamlit_app/2021-10-29_zurich_movie_overviews.csv"

local_old_movies = "../data/external/tmdb_5000_movies.csv"
#local_recent_movies = f"../data/raw/{date.today()}_zurich_movie_overviews.csv"
local_recent_movies = f"../data/raw/2021-10-29_zurich_movie_overviews.csv"

movie_reviews_old = pd.read_csv(local_old_movies)
movie_reviews_old["genres_string"] = get_movie_genres(movie_reviews_old)
movie_reviews_old = movie_reviews_old.sort_values("popularity", ascending=False).drop_duplicates("original_title").reset_index()

movie_reviews_recent = pd.read_csv(local_recent_movies)

In [5]:
movie_reviews_old_2 = prepare_movie_descriptions(movie_reviews_old)
print(movie_reviews_old_2.shape)
movie_reviews_old_2.head(3)

(4798, 2)


Unnamed: 0,original_title,description
0,Minions,"Before Gru, they had a history of bad bosses M..."
1,Interstellar,Mankind was born on Earth. It was never meant ...
2,Deadpool,Witness the beginning of a happy ending Deadpo...


In [6]:
movie_reviews_recent_2 = prepare_movie_descriptions(movie_reviews_recent)
print(movie_reviews_recent_2.shape)
movie_reviews_recent_2.head(3)

(38, 2)


Unnamed: 0,original_title,description
0,Wild,A woman with a tragic past decides to start h...
1,Azor,"Yvan De Wiel, a private banker from Geneva, i..."
2,"Quo Vadis, Aida?","Bosnia, July 1995. Aida is a translator for t..."


In [7]:
movie_reviews_df = pd.concat([movie_reviews_old_2, movie_reviews_recent_2]
                            ).drop_duplicates("original_title", keep="last").reset_index(drop=True)

movie_reviews_df.shape

(4833, 2)

In [9]:
# Clean and normalize the text

#nltk.download('stopwords')
#nltk.download('punkt')
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # remove special characters
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I|re.A)
    # lower case
    doc = doc.lower()
    # strip whitespaces
    doc = doc.strip()
    # fix contractions
    doc = contractions.fix(doc)
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    #filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

norm_corpus = normalize_corpus(list(movie_reviews_df['description']))
print(len(norm_corpus))
norm_corpus[4]

4833


'lovely day apocalyptic story set furthest reaches planet stark desert landscape humanity broken everyone crazed fighting necessities life within world exist two rebels run might able restore order max man action man words seeks peace mind following loss wife child aftermath chaos furiosa woman action woman believes path survival may achieved make across desert back childhood homeland action adventure science fiction thriller'

In [12]:
tokenized_docs = [doc.split() for doc in norm_corpus]

## Fast Text Model

In [217]:
#ft_model = FastText(tokenized_docs, vector_size=300, window=30, min_count=2, workers=4, sg=1, epochs=50)

In [3]:
#colab_modelpath = '/content/drive/MyDrive/Colab_Notebooks/cineman_streamlit_app/fast_text_model.sav'
#pickle.dump(ft_model, open(local_modelpath, 'wb'))

local_modelpath = "../models/fast_text_model.sav"

ft_model = pickle.load(open(local_modelpath, 'rb'))

In [13]:
import os, time
time.time() - os.path.getmtime(local_modelpath) > 600000

False

In [36]:
# Turning word level embeddings into document embeddings

def averaged_word2vec_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    
    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        
        for word in words:
            if word in vocabulary: 
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)

        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [220]:
doc_vecs_ft = averaged_word2vec_vectorizer(tokenized_docs, ft_model, 300)
doc_vecs_ft.shape

(4833, 300)

In [221]:
doc_sim = cosine_similarity(doc_vecs_ft)
doc_sim_df = pd.DataFrame(doc_sim)
doc_sim_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4823,4824,4825,4826,4827,4828,4829,4830,4831,4832
0,1.0,0.517415,0.510955,0.551631,0.561793,0.557224,0.525423,0.478245,0.460589,0.577626,...,0.590079,0.53504,0.565696,0.535177,0.534683,0.516525,0.491735,0.563811,0.52908,0.535516
1,0.517415,1.0,0.568766,0.623255,0.638681,0.601999,0.54206,0.642528,0.589002,0.563567,...,0.531744,0.550824,0.567199,0.539954,0.514471,0.556247,0.541075,0.55987,0.506336,0.556237
2,0.510955,0.568766,1.0,0.605084,0.610586,0.501114,0.528258,0.551192,0.55724,0.609948,...,0.564113,0.54267,0.54787,0.531231,0.541058,0.554508,0.5593,0.543766,0.53589,0.504513
3,0.551631,0.623255,0.605084,1.0,0.650314,0.585077,0.527648,0.606546,0.613233,0.607943,...,0.59094,0.531167,0.567877,0.568459,0.545989,0.585187,0.579291,0.548378,0.50585,0.500467
4,0.561793,0.638681,0.610586,0.650314,1.0,0.619522,0.585781,0.725705,0.634724,0.596203,...,0.647371,0.616543,0.679341,0.60222,0.594075,0.641582,0.678406,0.50006,0.60505,0.625272


In [263]:
all_movies = movie_reviews_df['original_title'].values
recent_movies = movie_reviews_recent_2["original_title"].values
recent_movie_idx = movie_reviews_df[movie_reviews_df["original_title"].isin(recent_movies)].index

def recent_movie_recommender(fav_movie, all_movies=all_movies, recent_indices = recent_movie_idx, 
                             similarities=doc_sim_df):
    movie_idx = np.where(all_movies == fav_movie)[0][0]
    movie_similarities = similarities.iloc[movie_idx].values
    similar_movies = np.argsort(-movie_similarities)
    similar_recent_movies = [index for index in similar_movies if index in recent_indices]
    movie_rec = all_movies[similar_recent_movies][0]
    if movie_rec == fav_movie:
        movie_rec = all_movies[similar_recent_movies][1]
    
    return movie_rec

In [274]:
fav_movie = "Despicable Me"
rec_movie = recent_movie_recommender(fav_movie)
overview = movie_reviews_recent["overview"][movie_reviews_recent["original_title"]==rec_movie].values[0]

print("Recommended movie:", rec_movie)
print("Overview:", overview)

Recommended movie: Contra
Overview: With his uncouth manner, Professor Pohl doesn’t enjoy the best reputation at the university where he works. To make matters worse, he makes a racist remark when student Naima arrives late for his lecture. Unfortunately for him, a video of his verbal outburst spreads like wildfire on the Internet, and the university’s president, Lambrecht, is called to deal with the incident. But Pohl is not dismissed. Instead, he is put to the task of helping Naima prepare for an important debating competition. The lecturer and the student could not be more different, yet they are thrown together to work as a team.


In [275]:
movie_reviews_df[movie_reviews_df["original_title"]==fav_movie].values

array([['Despicable Me',
        'Superbad. Superdad. Villainous Gru lives up to his reputation as a despicable, deplorable and downright unlikable guy when he hatches a plan to steal the moon from the sky. But he has a tough time staying on task after three orphans land in his care. Animation Family']],
      dtype=object)

In [276]:
movie_reviews_df[movie_reviews_df["original_title"]==rec_movie].values

array([['Contra',
        ' With his uncouth manner, Professor Pohl doesn’t enjoy the best reputation at the university where he works. To make matters worse, he makes a racist remark when student Naima arrives late for his lecture. Unfortunately for him, a video of his verbal outburst spreads like wildfire on the Internet, and the university’s president, Lambrecht, is called to deal with the incident. But Pohl is not dismissed. Instead, he is put to the task of helping Naima prepare for an important debating competition. The lecturer and the student could not be more different, yet they are thrown together to work as a team. ']],
      dtype=object)

## Pretrained FastText Model

In [62]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
     |████████████████████████████████| 68 kB 2.1 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.8.1-py2.py3-none-any.whl (208 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25ldone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.2-cp38-cp38-linux_x86_64.whl size=4651742 sha256=e0f80e92a5388515ff8f1a1eafdde56874bff876f74876888dc2a16654b614d2
  Stored in directory: /home/angela/.cache/pip/wheels/93/61/2a/c54711a91c418ba06ba195b1d78ff24fcaad8592f2a694ac94
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.8.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

In [64]:
from gensim.test.utils import datapath
from gensim.models.fasttext import load_facebook_model 

#cap_path = datapath("cc.en.300.bin")
ft_model = load_facebook_model('cc.en.300.bin.gz')

KeyboardInterrupt: 

In [34]:
ft_model.build_vocab(tokenized_docs, update=True)
ft_model.train(tokenized_docs, total_examples=len(tokenized_docs), epochs=20)

(2602016, 3471300)

In [None]:
def averaged_word2vec_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    
    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        
        for word in words:
            if word in vocabulary: 
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)

        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [57]:
ft_model.wv["gru"]

array([-0.15499775, -0.11057983,  0.4979684 ,  1.5159875 ,  0.5288776 ],
      dtype=float32)

In [56]:
vocabulary = set(ft_model.wv.index_to_key)
feature_vector = np.zeros((300,), dtype="float64")
nwords = 0.

for word in tokenized_docs[0]:
    if word in vocabulary:
        nwords = nwords + 1.
        feature_vector = np.add(feature_vector, ft_model.wv[word])

ValueError: operands could not be broadcast together with shapes (300,) (5,) 

In [37]:
doc_vecs_ft = averaged_word2vec_vectorizer(tokenized_docs, ft_model, 300)
doc_vecs_ft.shape

ValueError: operands could not be broadcast together with shapes (300,) (5,) 

In [None]:
doc_sim = cosine_similarity(doc_vecs_ft)
doc_sim_df = pd.DataFrame(doc_sim)
doc_sim_df.head()