# Word2Vec v2: "Mistake Not"

### Connect to Database

In [1]:
! pip3 install psycopg2-binary --user
import pandas as pd
import psycopg2
import numpy as np
from getpass import getpass

[33mYou are using pip version 19.0.2, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


  """)


In [2]:
# connect to database
connection = psycopg2.connect(
    database  = "postgres",
    user      = "postgres",
    password  = getpass(),
    host      = "movie-rec-scrape.cvslmiksgnix.us-east-1.rds.amazonaws.com",
    port      = '5432'
)
# Enter database password below and press Enter.

 ················


In [3]:
# create cursor that is used throughout
try:
    c = connection.cursor()
    print("Connected!")
except:
    print("Connection problem chief!")

Connected!


### Prepare data and train.
1. Get the list of reviewers whose reviews we want (about 17k)
2. Get the dataframe of reviewers, movie IDs with positive reviews
3. Inner join the above two dataframes.
4. Run the list constructor on the join table to construct the training data.
    - Training data is of this format: [['movieid1', 'movieid2', ...], ...]
5. Train Word2Vec on the list of watch histories (which are themselves lists of movie IDs).
6. Save the model.

In [72]:
# Get reviewers with at least 10 positive reviews (rating 7-10 inclusive)
c.execute("""
select username
from reviews
where user_rating between 7 and 10
group by username
having count(username) >= 10
order by count(username) desc
""")

reviewers = c.fetchall()

In [74]:
# Get positive reviews from database
c.execute("SELECT movie_id, username FROM reviews WHERE user_rating > 6")
result = c.fetchall()

# create reviews dataframe
df = pd.DataFrame(result, columns = ['movieid', 'userid'])
df.head()

Unnamed: 0,movieid,userid
0,5493944,dmldc
1,95016,immortal_saint1
2,5493944,vampyr_vashti
3,5493944,julieclowes
4,5493944,stephgonser


In [76]:
# create reviewers dataframe
df_reviewers = pd.DataFrame(reviewers, columns = ['userid'])

In [77]:
# merge to get only the IDs relevant to training
df = df.merge(df_reviewers, how='inner', on='userid')
df.shape

(904140, 2)

In [22]:
# ! sudo su
# ! yum update -y
# ! yum -y install python-pip
# ! python -V

Loaded plugins: dkms-build-requires, priorities, update-motd, upgrade-helper,
              : versionlock
You need to be root to perform this command.
Loaded plugins: dkms-build-requires, priorities, update-motd, upgrade-helper,
              : versionlock
You need to be root to perform this command.
Python 3.6.5 :: Anaconda, Inc.


In [23]:
# ! which pip

/home/ec2-user/anaconda3/envs/python3/bin/pip


In [12]:
! python -m pip install tqdm
# ! python -c 'import tqdm'
! python -m pip install gensim

[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.[0m
Collecting tqdm
  Using cached https://files.pythonhosted.org/packages/72/c9/7fc20feac72e79032a7c8138fd0d395dc6d8812b5b9edf53c3afd0b31017/tqdm-4.41.1-py2.py3-none-any.whl
Installing collected packages: tqdm
Successfully installed tqdm-4.41.1
[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.[0m
Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/ec/db/d0c6edd6e7211e7c47404034ed9dd71032a0a77c6ae8835505f1bd176a55/gensim-3.8.1-cp27-cp27mu-manylinux1_x86_64.whl (24.2MB)
[K    100% |████████████████████████████████| 24.2MB 1.2MB/s eta 0:00:01
Collecting smart-open>=1.8.1 (from gens

In [13]:
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline

import warnings;
warnings.filterwarnings('ignore')

In [94]:
# list to capture watch history of the users
watched_train = []

# populate the list with the movie codes
for i in tqdm(reviewers):
    temp = df[df["userid"] == i[0]]["movieid"].tolist()
    watched_train.append(temp)
    
len(watched_train)

100%|██████████| 17812/17812 [44:07<00:00,  6.73it/s]


17812

In [95]:
# save the model for later
import pickle
pickle.dump(watched_train, open('watched_train.sav', 'wb'))

### Train the Model

**Important:** The previous model was trained on movie IDs that were inside lists of length 1, with watch histories being lists of lists.

This model eschews the inner lists. Each watch history is simply a list of strings.

In [9]:
# Run this to load the prepared data.
import pickle
watched_train = pickle.load(open('watched_train.sav', 'rb'))

In [14]:
# train word2vec model
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(watched_train, progress_per=200)

model.train(watched_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

(8222420, 9041400)

In [15]:
# save word2vec model
model.save("w2v_mistakenot_py2.model")

### Test the model

In [98]:
# load model
import gensim
model = gensim.models.Word2Vec.load("w2v_mistakenot.model")

In [99]:
# prunes the model, making it faster but unable to train any more.
model.init_sims(replace=True)

In [16]:
print(model)

Word2Vec(vocab=24784, size=100, alpha=0.03)


In [101]:
# extract all vectors
X = model[model.wv.vocab]

X.shape

(24784, 100)

In [104]:
# IDs are words in the model, and callable as such.
model['0110912']

array([ 0.0424683 ,  0.08561097,  0.1003112 , -0.11106913, -0.06282448,
       -0.04704784,  0.01381051,  0.1464173 ,  0.02432096,  0.02893066,
        0.03427277,  0.19002798, -0.02404157, -0.01862836, -0.07317816,
       -0.15934035, -0.11867093,  0.04821193, -0.04854235, -0.00596277,
        0.10694741,  0.07178298, -0.12340666,  0.1441446 ,  0.10767256,
        0.00755085,  0.08707841,  0.1074426 , -0.01187609, -0.19023177,
        0.0950939 , -0.04520461, -0.13489398, -0.14325547,  0.05217481,
       -0.02633997, -0.04796528,  0.17709821, -0.1103778 ,  0.10097972,
       -0.0479805 ,  0.05608399,  0.00130781,  0.01643821,  0.1520071 ,
       -0.04711404,  0.1539092 , -0.02752153, -0.0437512 ,  0.02798683,
        0.06498379,  0.00445012, -0.0276325 ,  0.00323415, -0.00958083,
        0.05997658,  0.13095316, -0.12573294, -0.19757922, -0.07882729,
       -0.17327957, -0.09754732,  0.23031367, -0.07810653, -0.01805863,
        0.1136304 , -0.01263769,  0.0552126 , -0.02308759,  0.02

In [160]:
def get_title(id):
    """Takes an id string and returns the movie title."""
    
    try:
        c.execute(f"""
        select primary_title, start_year
        from movies
        where movie_id = '{id}'""")
    except:
        return f"Movie title unknown. ID:{id}"
    
    title = c.fetchone()
    return title
    
def predict(model, input, num_recs=6):
        """For the input, do the predictions and return them.

        Args:
            model: the word2vec model object.
            input: a list of movie IDs.
            num_recs: the number of recommendations to return.
        """

        def _aggregate_vectors(movies):
            # get the vector average of the movies in the input.
            # discard unrecognized IDs.
            movie_vec = []
            for i in movies:
                try:
                    movie_vec.append(model[i])
                except KeyError:
                    continue
            return np.mean(movie_vec, axis=0)

        def _similar_movies(v, n):
            # extract most similar movies for the input vector
            return model.similar_by_vector(v, topn= n+1)[1:]
        
        # aggregate input and find similar vectors.
        recs = _similar_movies(_aggregate_vectors(input), num_recs)
        # get titles
        recs = [get_title(y[0]) for y in recs] 
        return recs

In [164]:
# test cases

# A list of some Coen Bros movies.
coen_bros = ['116282', '2042568', '1019452', 
             '1403865', '190590', '138524', 
             '335245', '477348', '887883', '101410']

# Data scientist's recent watches.
cooper_recent = ['0053285', '0038650', '0046022', 
                 '4520988', '1605783', '6751668', 
                 '0083791', '0115685', '0051459', 
                 '8772262', '0061184', '0041959',
                 '7775622']

# dirkh public letterboxd recent watches.
dirkh = ['7975244', '8106534', '1489887', 
         '1302006', '7286456', '6751668', 
         '8364368', '2283362', '6146586', 
         '2194499', '7131622', '6857112']


In [165]:
predict(model=model, input=dirkh, num_recs=20)

[('The Day Shall Come', 2019),
 ('MFKZ', 2017),
 ('Us', 2019),
 ('The Death and Life of John F. Donovan', 2018),
 ('Jumanji: The Next Level', 2019),
 ('Booksmart', 2019),
 ('After', 2019),
 ('The Game Changers', 2018),
 ('Drunk Parents', 2019),
 ('Cake', 2018),
 ('Zombieland: Double Tap', 2019),
 ('Shazam!', 2019),
 ('Once Upon a Time... in Hollywood', 2019),
 ('Pavarotti', 2019),
 ('The King', 2019),
 ('Horrible Histories: The Movie - Rotten Romans', 2019),
 ('Green Book', 2018),
 ('The Last Boy', 2019),
 ('Extremely Wicked, Shockingly Evil and Vile', 2019),
 ('Glass', 2019)]