# Word2Vec v2.5: "Mistake Not"

## Connect to Database

In [46]:
! pip3 install psycopg2-binary --user
import pandas as pd
import psycopg2
import numpy as np
from getpass import getpass

[33mYou are using pip version 19.0.2, however version 20.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [47]:
# connect to database
connection = psycopg2.connect(
    database  = "postgres",
    user      = "postgres",
    password  = getpass(),
    host      = "movie-rec-scrape.cvslmiksgnix.us-east-1.rds.amazonaws.com",
    port      = '5432'
)
# Enter database password below and press Enter.

 ················


In [48]:
# create cursor that is used throughout
try:
    c = connection.cursor()
    print("Connected!")
except:
    print("Connection problem chief!")

Connected!


## Prepare data and train.
1. Get the list of reviewers whose reviews we want (about 17k)
2. Get the dataframe of reviewers, movie IDs with positive reviews
3. Inner join the above two dataframes.
4. Run the list constructor on the join table to construct the training data.
    - Training data is of this format: [['movieid1', 'movieid2', ...], ...]
5. Train Word2Vec on the list of watch histories (which are themselves lists of movie IDs).
6. Save the model.

In [59]:
# Get reviewers with at least 10 positive reviews (rating 8-10 inclusive)
c.execute("""
select username
from reviews
where user_rating between 8 and 10
group by username
having count(username) >= 10
order by count(username) desc
""")

'''
Minimum rating for training data has been increased to 8 stars in v3_LimitingFactor.

Explanation: v2_MistakeNot is returning movies with an average rating of 7.66, 
which is towards the low end of the distribution in the training data. It might be 
near the mean, but we want our model to give the user an above-average movie experience.
'''
reviewers = c.fetchall()

In [60]:
len(reviewers)

13641

In [61]:
# Get positive reviews from database
c.execute("SELECT movie_id, username FROM reviews WHERE user_rating BETWEEN 7 and 10")
result = c.fetchall()

'''
Training note: This query currently returns reviews in no discernible order.
This is because the reviews were inserted into the database by multiple scrapers
running in parallel.
Future users of this notebook should take care to note whether their database gives
the same result.
'''

# create reviews dataframe
df = pd.DataFrame(result, columns = ['movieid', 'userid'])
df.head()

Unnamed: 0,movieid,userid
0,60188,alexanderwilson45
1,60188,JohnHowardReid
2,60189,thedevilprobably
3,60196,manicmeuk
4,1727824,jacobtaylort


In [62]:
# create reviewers dataframe
df_reviewers = pd.DataFrame(reviewers, columns = ['userid'])

In [63]:
df_reviewers.shape

(13641, 1)

In [64]:
# merge to get only the IDs relevant to training
df = df.merge(df_reviewers, how='inner', on='userid')
df.shape

(766970, 2)

In [65]:
# ! sudo su
# ! yum update -y
# ! yum -y install python-pip
# ! python -V

In [66]:
# ! which pip

### Install gensim

In [55]:
! python -m pip install tqdm
# ! python -c 'import tqdm'
! python -m pip install gensim

[33mYou are using pip version 10.0.1, however version 20.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 10.0.1, however version 20.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [67]:
from tqdm import tqdm
# list to capture watch history of the users
watched_train = []

# populate the list with the movie codes
for i in tqdm(reviewers):
    temp = df[df["userid"] == i[0]]["movieid"].tolist()
    watched_train.append(temp)
    
len(watched_train)

100%|██████████| 13641/13641 [29:33<00:00,  7.69it/s]


13641

In [68]:
# save the model for later
import pickle
pickle.dump(watched_train, open('watched_train_3.sav', 'wb'))

In [166]:
# #save the model in protocol 2 so it can be opened in python 2.7
# import pickle
# temp = pickle.load(open('watched_train.sav', 'rb'))
# pickle.dump(temp, open('watched_train.sav', 'wb'), protocol=2)

## Train the Model

The first model was trained on movie IDs that were inside lists of length 1, with watch histories being lists of lists.

This model eschews the inner lists. Each watch history is simply a list of strings.

In [69]:
import pickle
watched_train = pickle.load(open('watched_train_3.sav', 'rb'))
len(watched_train)

13641

In [70]:
import random
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline
import warnings;
warnings.filterwarnings('ignore')

# train word2vec model
model = Word2Vec(window = 10, # perhaps increase this
                 sg = 1, # sets to skip-gram
                 hs = 0, # must be set to 0 for negative sampling
                 negative = 10, # for negative sampling
                 ns_exponent = 0.5, # 0.5 in best results
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14,
                 sample = 0.0001 # 10^-4 in best results
                )

model.build_vocab(watched_train, progress_per=200)

model.train(watched_train, total_examples = model.corpus_count, 
            epochs=90, # best results set this 90-150
            report_delay=60, compute_loss=True)

(58221426, 69027300)

In [71]:
# save word2vec model
model.save("w2v_LimitingFactor_v2.model")

## Test the model

In [72]:
!pip install gensim

[33mYou are using pip version 10.0.1, however version 20.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [73]:
# load model

# model = gensim.models.Word2Vec.load("w2v_mistakenot.model") #make sure you specify the right model version.

### Split data

In [79]:
import gensim
import pandas as pd
model.init_sims(replace=True) # prunes the model, making it faster but unable to train any more.
ratings = pd.read_csv('ratings.csv') # import my Letterboxd ratings
ratings.head()
ratings = ratings.dropna(axis=0, subset=['Rating', 'Name', 'Year'])

# set thresholds for minimum "good" rating and maximum "bad rating"
good_threshold = 4
bad_threshold = 3
good = ratings[ratings['Rating'] >= good_threshold]
bad = ratings[ratings['Rating'] <= bad_threshold]

# good_from_4 = ratings[ratings['Rating'] >= good_threshold]
good_from_3p5 = ratings[ratings['Rating'] >= 3.5]

# get watchlist for scoring the model
watchlist = pd.read_csv('watchlist.csv').dropna(axis=0, subset=['Name', 'Year'])

# get watch history sans ratings
full_history = pd.read_csv('watched.csv').dropna(axis=0, subset=['Name', 'Year'])

def df_to_id_list(df):
    """
    Input: dataframe of movies from my ratings.csv Letterboxd export
    
    Output: List of the IDs for those movies, ready for inferencing.
    """
    ids = []
    names = df.Name.tolist()
    names = [x.replace("'", "") for x in names]
    years = df.Year.tolist()
    years = [int(year) for year in years]
    info = list(zip(names, years))
    for i, j in info:
        try:
            c.execute(f"""
                SELECT movie_id
                FROM movies
                WHERE primary_title ILIKE '{i}' AND start_year = {j}
                ORDER BY runtime_minutes DESC
                LIMIT 1""")
            id = c.fetchone()[0]
            ids.append(id)
        except:
            continue
    return ids



# get ids from all lists, Dr. Seuss style
good_list = df_to_id_list(good)
bad_list = df_to_id_list(bad)
print("good and bad done")
val_list = df_to_id_list(watchlist)
hist_list = df_to_id_list(full_history)
print("histlist done")
good_3p5_list = df_to_id_list(good_from_3p5)
good_list_long = good_list[:int(len(good_list)/2)]
good_list_short = good_list[int(len(good_list)/2):]

print(len(good_list), len(bad_list), len(val_list), len(hist_list), len(good_3p5_list))

377 179 1007 1095 519


### Define inferencing functions

In [118]:
class ScoringService(object):
    model = None                # Where we keep the model when it's loaded
    
    @classmethod
    def get_model(cls):
        """Get the model object for this instance, loading it if it's not already loaded."""
        if cls.model == None:
            # load the gensim model
            w2v_model = gensim.models.Word2Vec.load("w2v_limitingfactor_v2.model")
            # keep only the normalized vectors.
            # This saves memory but makes the model untrainable (read-only).
            w2v_model.init_sims(replace=True)
            # with open(os.path.join(model_path, 'decision-tree-model.pkl'), 'r') as inp:
            #     cls.model = pickle.load(inp)
            cls.model = w2v_model
        return cls.model

    @classmethod
    def predict(cls, input, bad_movies=[], n=20, harshness=1, rec_movies=True, show_vibes=False, scoring=False):
        """Returns a list of recommendations and useful metadata, given a pretrained 
        word2vec model and a list of movies.

        Args:
            cls (.model object): The pretrained word2vec model.
            
            input (list of strings): The list of movies that the user likes.
            
            bad_movies (list of strings): The list of movies that the user dislikes.
            
            n (int): The number of recommendations to return.
            
        Output: A list of tuples: Title, Year, IMDb URL, Similarity score.
        """
        
        # get pretrained model
        clf = cls.get_model()
        
        # list for storing duplicates
        dupes = []

        def _aggregate_vectors(movies):
            """Gets the vector average of a list of movies."""
            movie_vec = []
            for i in movies:
                try:
                    movie_vec.append(clf[i]) # get the vector for each movie
                except KeyError:
                    continue
            return np.mean(movie_vec, axis=0)

        def _similar_movies(v, bad_movies=[], n = 10):
            """Takes aggregated vector of good movies, 
            and optionally, a list of disliked movies.
            Subtracts disliked movies.
            Returns most similar movies for the input vector
            
            n: number of recommendations to return."""
            if bad_movies:
                v = _remove_dislikes(bad_movies, v, input=input, harshness=harshness)
            return clf.similar_by_vector(v, topn= n+1)[1:]
            
        def _remove_dupes(recs, input, bad_movies):
            """remove any recommended IDs that were in the input list"""
            all_seen = input + bad_movies
            # if there is a full watch history list, use it to remove dupes
            if hist_list:
                all_seen = list(set(all_seen+hist_list))
            nonlocal dupes
            dupes = [x for x in recs if x[0] in all_seen]
            return [x for x in recs if x[0] not in all_seen]

        def _get_info(id):
            """Takes an id string and returns the movie info with a url."""
            try:
                c.execute(f"""
                select m.primary_title, m.start_year, r.average_rating, r.num_votes
                from movies m
                join ratings r on m.movie_id = r.movie_id
                where m.movie_id = '{id[0]}'""")
            except:
                return tuple([f"Movie title unknown. ID:{id[0]}", None, None, None, None, None])

            t = c.fetchone()
            if t:
                title = tuple([t[0], t[1], f"https://www.imdb.com/title/tt{id[0]}/", t[2], t[3], id[1]])
                return title
            else:
                return tuple([f"Movie title unknown. ID:{id[0]}", None, None, None, None, None])
            
                
        def _remove_dislikes(bad_movies, good_movies_vec, input=1, harshness=1):
            """Takes a list of movies that the user dislikes. 
            Their embeddings are averaged,
            and subtracted from the input."""
            bad_vec = _aggregate_vectors(bad_movies)
            bad_vec = bad_vec / harshness
            return good_movies_vec - bad_vec
        
        def _score_model(recs, val_list):
            ids = [x[0] for x in recs]
            return len(list(set(ids) & set(val_list)))

        aggregated = _aggregate_vectors(input)
        formatted_recs = []
        testable_number = 0
        while len(formatted_recs) < n:
            testable_number += n
            recs = _similar_movies(aggregated, bad_movies, n=testable_number)
            recs = _remove_dupes(recs, input, bad_movies)
            formatted_recs = [_get_info(x) for x in recs]
            print(len(formatted_recs))
        
        if scoring and val_list:
            print(f"The model recommended {_score_model(recs, val_list)} movies that were on the watchlist!\n")
            print(f"\t\t Average Rating: {sum([i[3] for i in formatted_recs if i[3] is not None])/len(formatted_recs)}\n")
        if show_vibes:
            print("Movies you're likely to vibe with people on: \n")
            for x in dupes:
                print(_get_info(x))
            print('\n')
        if rec_movies:
            return formatted_recs

### Add test cases

In [119]:
# test cases

# A list of some Coen Bros movies.
coen_bros = ['116282', '2042568', '1019452', 
             '1403865', '190590', '138524', 
             '335245', '477348', '887883', '101410']

# Data scientist's recent watches.
cooper_recent = ['0053285', '0038650', '0046022', 
                 '4520988', '1605783', '6751668', 
                 '0083791', '0115685', '0051459', 
                 '8772262', '0061184', '0041959',
                 '7775622']

# dirkh public letterboxd recent watches.
dirkh = ['7975244', '8106534', '1489887', 
         '1302006', '7286456', '6751668', 
         '8364368', '2283362', '6146586', 
         '2194499', '7131622', '6857112']

# Marvin watches
marvin = ['7286456', '0816692', '2543164', '2935510', 
          '2798920', '0468569', '5013056', '1375666', 
          '3659388', '0470752', '0266915', '0092675', 
          '0137523', '0133093', '1285016']  

# Gabe watches
gabe = ['6292852','0816692','2737304','3748528',
        '3065204','4154796','1536537','1825683',
        '1375666','8236336','2488496','1772341',
        '0317705','6857112','5052448']

# Eric watches
eric = ['2974050','1595842','0118539','0093405',
        '3216920','1256535','5612742','3120314',
        '1893371','0046248','0058548','0199481',
        '2296777','0071198','0077834']

chuckie = ['4263482',
'0084787',
'3286052',
'5715874',
'1172994',
'4805316',
'3139756',
'8772262',
'7784604',
'1034415',]

harlan = ['1065073','5052448','0470752','5688932','1853728','1596363','0432283','6412452','4633694','9495224','0443453','0063823',
          '0066921','0405296','1130884','1179933','0120630','0268126','0137523','0374900','8772262','0116996','0107290','7339248']

ryan = ['0166924','2866360','0050825','2798920','3416742','0060827','1817273','0338013','0482571','5715874','2316411','4550098']

karyn = ['4425200','0464141','1465522','0093779','0099810','0076759','3748528','6763664','0317740','2798920','0096283','0258463','0118799','0058092','0107290','0045152','0106364']

richard = ['0074119','0064115','0070735','0080474','0061512','0067774','0057115','0070511','0081283',
           '0065126','0068421','0078227','0079100','0078966','0081696','0082085','0072431','0075784',
           '0093640','0098051','0094226','0097576','0099810','0081633','0080761','0077975','0085244','0095159','0101969']

joe = ['6335734','0291350','0113568','0208502','0169858','0095327','0097814','0983213','0094625','7089878']

lena = ['1990314','3236120','1816518','0241527','0097757','0268978','0467406','2543164','2245084','3741834']

wade = ['0118665','0270846','0288441','2287250','2287238','8668804','9448868','1702443','1608290','5519340']

### Try test cases

In [120]:
s = ScoringService()

# define many parameter sets

# possible good inputs: cooper_recent (13 most recent), good_list(my 4-5 star ratings), good_3p5_list (my 3.5-5 star ratings)
# possible bad_movies: [] (not removing dislikes), bad_list(my ratings 3 and below), karyn (individual whose taste markedly differs from mine)
# possible harshness: 1, 2, 3 (harshness 1 strongly removes disliked movies, while harshness 3 removes with a third of the strength)

params = [
    (cooper_recent, [], 1), #1
    (cooper_recent, bad_list, 1),
    (cooper_recent, bad_list, 2),
    (cooper_recent, bad_list, 3),
    (good_list, [], 1), #5
    (good_list, bad_list, 1),
    (good_list, bad_list, 2),
    (good_list, bad_list, 3),
    (good_3p5_list, [], 1),
    (good_3p5_list, bad_list, 1), #10
    (good_3p5_list, bad_list, 2),
    (good_3p5_list, bad_list, 3),
    (cooper_recent, karyn, 1),
    (good_list, karyn, 1),
    (good_3p5_list, karyn, 1), #15
]

count = 0
# Test all parameter sets
for i, j, k in params:
    count+=1
    print(count, "\t")
    s.predict(input=i, bad_movies=j, n=20, harshness=k, rec_movies=True, show_vibes=False, scoring=True)
    
# Good performance for v2MistakeNot: 6, 7, 11
# Good performance for v3LimitingFactor: 3, 4, 6, 10, 14, 15

# Setting 6 seems robust to significant hyperparameter changes.

1 	
13
24
The model recommended 9 movies that were on the watchlist!

		 Average Rating: 7.6125

2 	
14
31
The model recommended 8 movies that were on the watchlist!

		 Average Rating: 7.4483870967741925

3 	
13
28
The model recommended 8 movies that were on the watchlist!

		 Average Rating: 7.535714285714286

4 	
12
26
The model recommended 8 movies that were on the watchlist!

		 Average Rating: 7.630769230769231

5 	
1
4
6
9
15
22
The model recommended 10 movies that were on the watchlist!

		 Average Rating: 7.968181818181818

6 	
10
25
The model recommended 7 movies that were on the watchlist!

		 Average Rating: 7.587999999999999

7 	
0
5
10
19
27
The model recommended 10 movies that were on the watchlist!

		 Average Rating: 7.862962962962962

8 	
0
3
9
12
20
The model recommended 7 movies that were on the watchlist!

		 Average Rating: 7.970000000000001

9 	
2
3
5
8
13
17
24
The model recommended 7 movies that were on the watchlist!

		 Average Rating: 7.866666666666668

10 	

## Best settings for LimitingFactor_v1

In [38]:
# focus on the best settings for LimitingFactor_v1
params = [
    (cooper_recent, bad_list, 2), #3
    (cooper_recent, bad_list, 3), #4
    (good_list, bad_list, 1), #6
    (good_3p5_list, bad_list, 1), #10
    (good_list, karyn, 1), #14
    (good_3p5_list, karyn, 1), #15
]

count = 0
settings = [3, 4, 6, 10, 14, 15]
# Test all the promising parameter sets
for i, j, k in params:
    print(settings[count], "\t")
    prediction = s.predict(input=i, bad_movies=j, n=100, harshness=k, rec_movies=True, show_vibes=False, scoring=True)
    for i in prediction:
        print(f"{i[0]}\t{i[1]}\n\t{i[2]}\n\tAvg. Rating: {i[3]}\n\t# Votes: {i[4]}\n\tSimilarity: {i[5]}\n\n")
    count+=1

"""
Settings graded out of 100:
3... 91
4... 86
6... 89 (lots of old stuff)
10.. 90
14.. 91
15.. 90

I'm inclined to pronounce model LimitingFactor_v1 a success, because it gives 
diverse and high-quality results in more cases than its predecessor.
In 6 cases it gives 20+ watchlisted movies, where before we only got 3.

In other words, it gives good results in cases where input data may be rich or sparse.

"""


3 	
The model recommended 23 movies that were on the watchlist!

		 Average Rating: 7.444999999999996

Marriage Story	2019
	https://www.imdb.com/title/tt7653254/
	Avg. Rating: 8.1
	# Votes: 130590
	Similarity: 0.6401818990707397


Joker	2019
	https://www.imdb.com/title/tt7286456/
	Avg. Rating: 8.6
	# Votes: 620299
	Similarity: 0.6330293416976929


The Lighthouse	2019
	https://www.imdb.com/title/tt7984734/
	Avg. Rating: 7.8
	# Votes: 52091
	Similarity: 0.6307841539382935


Apollo 11	2019
	https://www.imdb.com/title/tt8760684/
	Avg. Rating: 8.2
	# Votes: 15085
	Similarity: 0.6201057434082031


Once Upon a Time... in Hollywood	2019
	https://www.imdb.com/title/tt7131622/
	Avg. Rating: 7.8
	# Votes: 342653
	Similarity: 0.6006509065628052


Knives Out	2019
	https://www.imdb.com/title/tt8946378/
	Avg. Rating: 8.0
	# Votes: 124017
	Similarity: 0.6003093719482422


A Star Is Born	2018
	https://www.imdb.com/title/tt1517451/
	Avg. Rating: 7.7
	# Votes: 293127
	Similarity: 0.5982445478439331


The

### Compare two test cases for v1

In [32]:
#Compare the unique recs of settings 7 and 11 from above.
#The param difference is that setting 7 uses movies rated 4 and above,
#while setting 11 uses movies rated 3.5 and above.

p4 = s.predict(input=good_list, bad_movies=bad_list, n=200, harshness=2, scoring=True)
p35 = s.predict(input=good_3p5_list, bad_movies=bad_list, n=200, harshness=2, scoring=True)

print(len(p4), len(p35))

p4_unique = [x for x in p4 if x[0] not in [x[0] for x in p35]]
p35_unique = [x for x in p35 if x[0] not in [x[0] for x in p4]]

print(len(p4_unique), len(p35_unique))


The model recommended 29 movies that were on the watchlist!

		 Average Rating: 7.786516853932587

The model recommended 27 movies that were on the watchlist!

		 Average Rating: 7.732142857142859

89 84
27 22


In [33]:
for i in p4_unique:
    print(f"{i[0]}\t{i[1]}\n\t{i[2]}")

Le parfum d'Yvonne	1994
	https://www.imdb.com/title/tt0110776/
Beautiful	1951
	https://www.imdb.com/title/tt0043332/
Journey to Italy	1954
	https://www.imdb.com/title/tt0046511/
A Story from Chikamatsu	1954
	https://www.imdb.com/title/tt0046851/
Manhattan	1979
	https://www.imdb.com/title/tt0079522/
The Circus	1928
	https://www.imdb.com/title/tt0018773/
Cold War	2018
	https://www.imdb.com/title/tt6543652/
Aguirre, the Wrath of God	1972
	https://www.imdb.com/title/tt0068182/
La Dolce Vita	1960
	https://www.imdb.com/title/tt0053779/
The Thin Blue Line	1988
	https://www.imdb.com/title/tt0096257/
Lancelot of the Lake	1974
	https://www.imdb.com/title/tt0071737/
Conversation Piece	1974
	https://www.imdb.com/title/tt0071585/
4 Months, 3 Weeks and 2 Days	2007
	https://www.imdb.com/title/tt1032846/
Romero	1989
	https://www.imdb.com/title/tt0098219/
Stagecoach	1939
	https://www.imdb.com/title/tt0031971/
Miller's Crossing	1990
	https://www.imdb.com/title/tt0100150/
Ankur: The Seedling	1974
	https:

In [212]:
for i in p35_unique:
    print(f"{i[0]}\t{i[1]}\n\t{i[2]}")

Once Upon a Time in High School: The Spirit of Jeet Kune Do	2004
	https://www.imdb.com/title/tt0390205/
Chillar Party	2011
	https://www.imdb.com/title/tt1841542/
Angel's Egg	1985
	https://www.imdb.com/title/tt0208502/
Unacknowledged	2017
	https://www.imdb.com/title/tt6400614/
Lootera	2013
	https://www.imdb.com/title/tt2224317/
Awe!	2018
	https://www.imdb.com/title/tt7797658/
Vada Chennai	2018
	https://www.imdb.com/title/tt5959980/
Aruvi	2016
	https://www.imdb.com/title/tt5867800/
Big Fish & Begonia	2016
	https://www.imdb.com/title/tt1920885/
Thithi	2015
	https://www.imdb.com/title/tt4881362/
Boy and the World	2013
	https://www.imdb.com/title/tt3183630/
Neon Genesis Evangelion: The End of Evangelion	1997
	https://www.imdb.com/title/tt0169858/
That Girl in Yellow Boots	2010
	https://www.imdb.com/title/tt1580704/
The Breath	2009
	https://www.imdb.com/title/tt1171701/
The Stupids	1996
	https://www.imdb.com/title/tt0117768/
To the Wonder	2012
	https://www.imdb.com/title/tt1595656/
The Meyer

### Examine ideal case

In [34]:
prediction = s.predict(input=good_list, bad_movies=bad_list, n=200, harshness=2)
for i in prediction:
    print(f"{i[0]}\t{i[1]}\n\t{i[2]}\n\tAvg. Rating: {i[3]}\n\t# Votes: {i[4]}\n\tSimilarity: {i[5]}\n\n")

One Flew Over the Cuckoo's Nest	1975
	https://www.imdb.com/title/tt0073486/
	Avg. Rating: 8.7
	# Votes: 860775
	Similarity: 0.637021541595459


The Devil, Probably	1977
	https://www.imdb.com/title/tt0075938/
	Avg. Rating: 7.3
	# Votes: 3150
	Similarity: 0.6019104719161987


Do the Right Thing	1989
	https://www.imdb.com/title/tt0097216/
	Avg. Rating: 7.9
	# Votes: 78577
	Similarity: 0.5995447635650635


Chinatown	1974
	https://www.imdb.com/title/tt0071315/
	Avg. Rating: 8.2
	# Votes: 275306
	Similarity: 0.5974133014678955


Sunset Blvd.	1950
	https://www.imdb.com/title/tt0043014/
	Avg. Rating: 8.4
	# Votes: 186742
	Similarity: 0.5961176753044128


On the Waterfront	1954
	https://www.imdb.com/title/tt0047296/
	Avg. Rating: 8.1
	# Votes: 132973
	Similarity: 0.5904337763786316


La Collectionneuse	1967
	https://www.imdb.com/title/tt0061495/
	Avg. Rating: 7.5
	# Votes: 5523
	Similarity: 0.5855226516723633


The Elephant Man	1980
	https://www.imdb.com/title/tt0080678/
	Avg. Rating: 8.1
	# Vo

## Best settings for v2

In [121]:
"""LimitingFactor_v2 is trained on movies rated 8 stars and above. The first thing I notice 
about it is that, using my movie history, whenever its recommendations get a high
average score, it strays farther an farther from my watchlist. So this is where it may become 
challenging to evaluate the model."""

# focus on the best settings for LimitingFactor_v2
params = [
    (good_3p5_list, bad_list, 2), #11
    (good_3p5_list, bad_list, 3)  #12
]

count = 0
settings = [11, 12]
# Test all the promising parameter sets
for i, j, k in params:
    print(settings[count], "\t")
    prediction = s.predict(input=i, bad_movies=j, n=100, harshness=k, rec_movies=True, show_vibes=False, scoring=True)
    for i in prediction:
        print(f"{i[0]}\t{i[1]}\n\t{i[2]}\n\tAvg. Rating: {i[3]}\n\t# Votes: {i[4]}\n\tSimilarity: {i[5]}\n\n")
    count+=1

"""
Settings graded out of 100:
11... 
12... 




"""


11 	
23
78
154
The model recommended 49 movies that were on the watchlist!

		 Average Rating: 7.586363636363638

One Flew Over the Cuckoo's Nest	1975
	https://www.imdb.com/title/tt0073486/
	Avg. Rating: 8.7
	# Votes: 860775
	Similarity: 0.6070456504821777


Schindler's List	1993
	https://www.imdb.com/title/tt0108052/
	Avg. Rating: 8.9
	# Votes: 1138968
	Similarity: 0.6057159900665283


78/52: Hitchcock's Shower Scene	2017
	https://www.imdb.com/title/tt4372240/
	Avg. Rating: 7.3
	# Votes: 2652
	Similarity: 0.5958279967308044


The Last Supper	1976
	https://www.imdb.com/title/tt0075363/
	Avg. Rating: 7.5
	# Votes: 518
	Similarity: 0.5952874422073364


Casino	1995
	https://www.imdb.com/title/tt0112641/
	Avg. Rating: 8.2
	# Votes: 432378
	Similarity: 0.5854454040527344


American Psycho	2000
	https://www.imdb.com/title/tt0144084/
	Avg. Rating: 7.6
	# Votes: 450780
	Similarity: 0.5853142142295837


Chinatown	1974
	https://www.imdb.com/title/tt0071315/
	Avg. Rating: 8.2
	# Votes: 275306
	Si

'\nSettings graded out of 100:\n11... \n12... \n\n\n\n\n'