# Word2Vec v2.5: "Mistake Not"

### Connect to Database

In [328]:
! pip3 install psycopg2-binary --user
import pandas as pd
import psycopg2
import numpy as np
from getpass import getpass

[33mYou are using pip version 19.0.2, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [329]:
# connect to database
connection = psycopg2.connect(
    database  = "postgres",
    user      = "postgres",
    password  = getpass(),
    host      = "movie-rec-scrape.cvslmiksgnix.us-east-1.rds.amazonaws.com",
    port      = '5432'
)
# Enter database password below and press Enter.

 ················


In [330]:
# create cursor that is used throughout
try:
    c = connection.cursor()
    print("Connected!")
except:
    print("Connection problem chief!")

Connected!


### Prepare data and train.
1. Get the list of reviewers whose reviews we want (about 17k)
2. Get the dataframe of reviewers, movie IDs with positive reviews
3. Inner join the above two dataframes.
4. Run the list constructor on the join table to construct the training data.
    - Training data is of this format: [['movieid1', 'movieid2', ...], ...]
5. Train Word2Vec on the list of watch histories (which are themselves lists of movie IDs).
6. Save the model.

In [72]:
# Get reviewers with at least 10 positive reviews (rating 7-10 inclusive)
c.execute("""
select username
from reviews
where user_rating between 7 and 10
group by username
having count(username) >= 10
order by count(username) desc
""")

reviewers = c.fetchall()

In [74]:
# Get positive reviews from database
c.execute("SELECT movie_id, username FROM reviews WHERE user_rating > 6")
result = c.fetchall()

# create reviews dataframe
df = pd.DataFrame(result, columns = ['movieid', 'userid'])
df.head()

Unnamed: 0,movieid,userid
0,5493944,dmldc
1,95016,immortal_saint1
2,5493944,vampyr_vashti
3,5493944,julieclowes
4,5493944,stephgonser


In [76]:
# create reviewers dataframe
df_reviewers = pd.DataFrame(reviewers, columns = ['userid'])

In [77]:
# merge to get only the IDs relevant to training
df = df.merge(df_reviewers, how='inner', on='userid')
df.shape

(904140, 2)

In [22]:
# ! sudo su
# ! yum update -y
# ! yum -y install python-pip
# ! python -V

Loaded plugins: dkms-build-requires, priorities, update-motd, upgrade-helper,
              : versionlock
You need to be root to perform this command.
Loaded plugins: dkms-build-requires, priorities, update-motd, upgrade-helper,
              : versionlock
You need to be root to perform this command.
Python 3.6.5 :: Anaconda, Inc.


In [23]:
# ! which pip

/home/ec2-user/anaconda3/envs/python3/bin/pip


# Install gensim

In [4]:
! python -m pip install tqdm
# ! python -c 'import tqdm'
! python -m pip install gensim

[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [82]:
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline

import warnings;
warnings.filterwarnings('ignore')

In [94]:
# list to capture watch history of the users
watched_train = []

# populate the list with the movie codes
for i in tqdm(reviewers):
    temp = df[df["userid"] == i[0]]["movieid"].tolist()
    watched_train.append(temp)
    
len(watched_train)

100%|██████████| 17812/17812 [44:07<00:00,  6.73it/s]


17812

In [95]:
# save the model for later
import pickle
pickle.dump(watched_train, open('watched_train.sav', 'wb'))

In [166]:
# #save the model in protocol 2 so it can be opened in python 2.7
# import pickle
# temp = pickle.load(open('watched_train.sav', 'rb'))
# pickle.dump(temp, open('watched_train.sav', 'wb'), protocol=2)

### Train the Model

**Important:** The previous model was trained on movie IDs that were inside lists of length 1, with watch histories being lists of lists.

This model eschews the inner lists. Each watch history is simply a list of strings.

In [96]:
# train word2vec model
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(watched_train, progress_per=200)

model.train(watched_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

(8222420, 9041400)

In [97]:
# save word2vec model
model.save("w2v_mistakenot.model")

### Test the model

In [331]:
# load model
import gensim
model = gensim.models.Word2Vec.load("w2v_mistakenot.model")

In [332]:
# prunes the model, making it faster but unable to train any more.
model.init_sims(replace=True)

In [333]:
print(model)

Word2Vec(vocab=24784, size=100, alpha=0.03)


In [334]:
# extract all vectors
X = model[model.wv.vocab]

X.shape

(24784, 100)

In [335]:
# IDs are words in the model, and callable as such.
# model['0110912']

In [336]:
# def get_title(id):
#     """Takes an id string and returns the movie title."""
    
#     try:
#         c.execute(f"""
#         select primary_title, start_year
#         from movies
#         where movie_id = '{id}'""")
#     except:
#         return f"Movie title unknown. ID:{id}"
    
#     t = c.fetchone()
#     title = tuple([t[0], t[1], f"https://www.imdb.com/title/tt{id}/"])
#     return title
    
# def predict(model, input, num_recs=6):
#         """For the input, do the predictions and return them.

#         Args:
#             model: the word2vec model object.
#             input: a list of movie IDs.
#             num_recs: the number of recommendations to return.
#         """

#         def _aggregate_vectors(movies):
#             # get the vector average of the movies in the input.
#             # discard unrecognized IDs.
#             movie_vec = []
#             for i in movies:
#                 try:
#                     movie_vec.append(model[i])
#                 except KeyError:
#                     continue
#             return np.mean(movie_vec, axis=0)

#         def _similar_movies(v, n):
#             # extract most similar movies for the input vector
#             return model.similar_by_vector(v, topn= n+1)[1:]
        
#         def _remove_dupes(recs):
#             # remove any recommendations that were in the input
#             return [x for x in recs if x not in input]
        
        
        
#         # aggregate input and find similar vectors.
#         recs = _similar_movies(_aggregate_vectors(input), num_recs)
#         # get titles
#         recs = [get_title(y[0]) for y in recs] 
#         return recs

In [337]:
import pandas as pd
ratings = pd.read_csv('ratings.csv') # import my Letterboxd ratings
ratings.head()
ratings = ratings.dropna(axis=0, subset=['Rating', 'Name', 'Year'])

# set threshold for minimum "good" rating
threshold = 3.5
good = ratings[ratings['Rating'] >= threshold]
bad = ratings[ratings['Rating'] < threshold]

def df_to_id_list(df):
    """
    Input: dataframe of movies from my ratings.csv Letterboxd export
    
    Output: List of the IDs for those movies, ready for inferencing.
    """
    ids = []
    names = df.Name.tolist()
    names = [x.replace("'", "") for x in names]
    years = df.Year.tolist()
    years = [int(year) for year in years]
    info = list(zip(names, years))
    for i, j in info:
        try:
            c.execute(f"""
                SELECT movie_id
                FROM movies
                WHERE primary_title LIKE '{i}' AND start_year = {j}
                ORDER BY runtime_minutes DESC
                LIMIT 1""")
            id = c.fetchone()[0]
            ids.append(id)
        except:
            continue
    return ids

In [338]:
good_list = df_to_id_list(good)
bad_list = df_to_id_list(bad)
print(len(good_list), len(bad_list))

512 175


In [339]:
import re
test_me = 'https://www.imdb.com/title/tt0084345/'
test_me.split("/tt")[1][:-1]

'0084345'

In [369]:
class ScoringService(object):
    model = None                # Where we keep the model when it's loaded

    @classmethod
    def get_model(cls):
        """Get the model object for this instance, loading it if it's not already loaded."""
        if cls.model == None:
            # load the gensim model
            w2v_model = gensim.models.Word2Vec.load("w2v_mistakenot.model")
            # keep only the normalized vectors.
            # This saves memory but makes the model untrainable (read-only).
            w2v_model.init_sims(replace=True)
            # with open(os.path.join(model_path, 'decision-tree-model.pkl'), 'r') as inp:
            #     cls.model = pickle.load(inp)
            cls.model = w2v_model
        return cls.model

    @classmethod
    def predict(cls, input, bad_movies=[], n=20, harshness=1):
        """Returns a list of recommendations and useful metadata, given a pretrained 
        word2vec model and a list of movies.

        Args:
            cls (.model object): The pretrained word2vec model.
            
            input (list of strings): The list of movies that the user likes.
            
            bad_movies (list of strings): The list of movies that the user dislikes.
            
            n (int): The number of recommendations to return.
            
        Output: A list of tuples: Title, Year, IMDb URL, Similarity score.
        """
        
        # get pretrained model
        clf = cls.get_model()

        def _aggregate_vectors(movies):
            """Gets the vector average of a list of movies."""
            movie_vec = []
            for i in movies:
                try:
                    movie_vec.append(clf[i]) # get the vector for each movie
                except KeyError:
                    continue
            return np.mean(movie_vec, axis=0)

        def _similar_movies(v, bad_movies=[], n = 10):
            """Takes aggregated vector of good movies, 
            and optionally, a list of disliked movies.
            Subtracts disliked movies.
            Returns most similar movies for the input vector
            
            n: number of recommendations to return."""
            if bad_movies:
                v = _remove_dislikes(bad_movies, v, input=input, harshness=harshness)
            return clf.similar_by_vector(v, topn= n+1)[1:]
            
        def _remove_dupes(recs, input, bad_movies):
            """remove any recommended IDs that were in the input list"""
            all_seen = input + bad_movies
            return [x for x in recs if x[0] not in all_seen]

        def _get_info(id):
            """Takes an id string and returns the movie info with a url."""
            try:
                c.execute(f"""
                select primary_title, start_year
                from movies
                where movie_id = '{id[0]}'""")
            except:
                return f"Movie title unknown. ID:{id}"

            t = c.fetchone()
            title = tuple([t[0], t[1], f"https://www.imdb.com/title/tt{id[0]}/", id[1]])
            return title
                
        def _remove_dislikes(bad_movies, good_movies_vec, input=1, harshness=1):
            """Takes a list of movies that the user dislikes. 
            Their embeddings are averaged,
            and subtracted from the input."""
            bad_vec = _aggregate_vectors(bad_movies)
            bad_vec = bad_vec / harshness
            return good_movies_vec - bad_vec

        recs = _aggregate_vectors(input)
        recs = _similar_movies(recs, bad_movies, n=n)
        recs = _remove_dupes(recs, input, bad_movies)
        recs = [_get_info(x) for x in recs]
        return recs

In [370]:
# test cases

# A list of some Coen Bros movies.
coen_bros = ['116282', '2042568', '1019452', 
             '1403865', '190590', '138524', 
             '335245', '477348', '887883', '101410']

# Data scientist's recent watches.
cooper_recent = ['0053285', '0038650', '0046022', 
                 '4520988', '1605783', '6751668', 
                 '0083791', '0115685', '0051459', 
                 '8772262', '0061184', '0041959',
                 '7775622']

# dirkh public letterboxd recent watches.
dirkh = ['7975244', '8106534', '1489887', 
         '1302006', '7286456', '6751668', 
         '8364368', '2283362', '6146586', 
         '2194499', '7131622', '6857112']

# Marvin watches
marvin = ['7286456', '0816692', '2543164', '2935510', 
          '2798920', '0468569', '5013056', '1375666', 
          '3659388', '0470752', '0266915', '0092675', 
          '0137523', '0133093', '1285016']  

# Gabe watches
gabe = ['6292852','0816692','2737304','3748528',
        '3065204','4154796','1536537','1825683',
        '1375666','8236336','2488496','1772341',
        '0317705','6857112','5052448']

# Eric watches
eric = ['2974050','1595842','0118539','0093405',
        '3216920','1256535','5612742','3120314',
        '1893371','0046248','0058548','0199481',
        '2296777','0071198','0077834']

chuckie = ['4263482',
'0084787',
'3286052',
'5715874',
'1172994',
'4805316',
'3139756',
'8772262',
'7784604',
'1034415',]

harlan = ['1065073','5052448','0470752','5688932','1853728','1596363','0432283','6412452','4633694','9495224','0443453','0063823',
          '0066921','0405296','1130884','1179933','0120630','0268126','0137523','0374900','8772262','0116996','0107290','7339248']

ryan = ['0166924','2866360','0050825','2798920','3416742','0060827','1817273','0338013','0482571','5715874','2316411','4550098']

karyn = ['4425200','0464141','1465522','0093779','0099810','0076759','3748528','6763664','0317740','2798920','0096283','0258463','0118799','0058092','0107290','0045152','0106364']

richard = ['0074119','0064115','0070735','0080474','0061512','0067774','0057115','0070511','0081283',
           '0065126','0068421','0078227','0079100','0078966','0081696','0082085','0072431','0075784',
           '0093640','0098051','0094226','0097576','0099810','0081633','0080761','0077975','0085244','0095159','0101969']

joe = ['6335734','0291350','0113568','0208502','0169858','0095327','0097814','0983213','0094625','7089878']

lena = ['1990314','3236120','1816518','0241527','0097757','0268978','0467406','2543164','2245084','3741834']

wade = ['0118665','0270846','0288441','2287250','2287238','8668804','9448868','1702443','1608290','5519340']

In [371]:
s = ScoringService()
good_list_p1 = good_list[:int(len(good_list)/2)]
good_list_p2 = good_list[int(len(good_list)/2):]

In [380]:
prediction = s.predict(input=good_list_p1, bad_movies=bad_list, n=50, harshness=1)
for i in prediction:
    print(f"{i[0]}\t{i[1]}\n\t{i[2]}\n\tSimilarity: {i[3]}\n\n")

Floating Weeds	1959
	https://www.imdb.com/title/tt0053390/
	Similarity: 0.4749234914779663


A Moment of Innocence	1996
	https://www.imdb.com/title/tt0117214/
	Similarity: 0.4689294099807739


Andrei Rublev	1966
	https://www.imdb.com/title/tt0060107/
	Similarity: 0.46675533056259155


Weekend	1967
	https://www.imdb.com/title/tt0062480/
	Similarity: 0.46316659450531006


Rampo	1994
	https://www.imdb.com/title/tt0110943/
	Similarity: 0.46307265758514404


American Dream	1990
	https://www.imdb.com/title/tt0099028/
	Similarity: 0.4624772071838379


Nayak: The Hero	1966
	https://www.imdb.com/title/tt0060742/
	Similarity: 0.4624177813529968


La Chinoise	1967
	https://www.imdb.com/title/tt0061473/
	Similarity: 0.46003857254981995


Not One Less	1999
	https://www.imdb.com/title/tt0209189/
	Similarity: 0.4561845362186432


A Short Film About Killing	1988
	https://www.imdb.com/title/tt0095468/
	Similarity: 0.45157304406166077


Sweetie	1989
	https://www.imdb.com/title/tt0098725/
	Similarity: 0.