# Recommender system

In [None]:
import pandas as pd
import numpy as np

# Read both CSV files the same way: the first column of each file is used as the index.
movie_genres, user_reviews = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('movie_genres.csv', 'user_reviews.csv')
)

# Preview the genre table (one row per movie, one 0/1 column per genre).
movie_genres.head()

Unnamed: 0,movie_title,genre_action,genre_adventure,genre_animation,genre_biography,genre_comedy,genre_crime,genre_documentary,genre_drama,genre_family,...,genre_mystery,genre_news,genre_reality-tv,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_thriller,genre_war,genre_western
0,The Net,1,0,0,0,0,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
1,Happily N'Ever After,0,1,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Tomorrowland,1,1,0,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0
3,American Hero,1,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,Das Boot,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,1,0


In [None]:
# Preview the rating table: one row per user, one column per movie title.
user_reviews.head(5)

Unnamed: 0,User,The Net,Happily N'Ever After,Tomorrowland,American Hero,Das Boot,Final Destination 3,Licence to Kill,The Hundred-Foot Journey,The Matrix,...,The Martian,Micmacs,Solomon and Sheba,In the Company of Men,Silent House,Big Fish,Get Real,Trading Places,DOA: Dead or Alive,Hey Arnold! The Movie
0,Vincent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Edgar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Addilyn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Marlee,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Javier,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Baseline

10 highest rated movies

In [None]:
# A rating of 0 is replaced by NaN so that it is left out of the averages below.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.x.
reviews = user_reviews.replace(0, np.nan)

# numeric_only=True skips the text 'User' column; without it pandas >= 2.0
# raises a TypeError when it tries to average the user names.
reviews.mean(numeric_only=True).sort_values(ascending=False).head(10)


The Tempest                        5.000000
United 93                          5.000000
Edtv                               5.000000
Chill Factor                       4.909091
The Hunting Party                  4.900000
Blue Like Jazz                     4.800000
Perrier's Bounty                   4.789474
Never Back Down 2: The Beatdown    4.750000
The Death and Life of Bobby Z      4.750000
Highlander: Endgame                4.750000
dtype: float64

10 most watched movies

In [None]:
# Count the non-missing ratings in every numeric column, then list the largest counts first.
ratings_per_movie = reviews.count(numeric_only=True)
ratings_per_movie.sort_values(ascending=False).head(10)

ATL                          20
Rang De Basanti              20
Observe and Report           20
Creepshow 2                  19
Perrier's Bounty             19
Furious 7                    19
Dysfunctional Friends        19
The Other End of the Line    19
Now You See Me 2             18
Killer Joe                   18
dtype: int64

## Content based (item-item) recommendation system

Good: 
- Learns user preference
- Simple model/ Quick to compute 

Bad: 
- Doesn't take other users (and therefore more data) into consideration, so it may give worse recommendations than other models.
- Finding extensive features like the ones given in our data sets may be hard in real life.
- Even if a movie is bad and has a low rating, it may get recommended frequently just because it has many features (genres).

In [None]:
# Let's start with one user: Vincent.
# Keep only Vincent's row and remove the name column, so that only his
# per-movie ratings remain (the transpose happens in the next cell).
is_vincent = user_reviews['User'] == 'Vincent'
reviews = user_reviews.loc[is_vincent].drop(columns='User')

In [None]:
# Turn the single-row frame into one row per movie with a title and a rating column.
v_reviews = reviews.T.reset_index()
v_reviews.columns = ['movie_title', 'rating']

# Rows with rating 0 are dropped; only the rated movies are kept.
has_rating = v_reviews['rating'] != 0
v_reviews = v_reviews.loc[has_rating]
v_reviews.head()

Unnamed: 0,movie_title,rating
127,About Last Night,2.0
141,Shattered,3.0
151,Passchendaele,3.0
223,Broken Arrow,3.0
264,Songcatcher,4.0


In [None]:
# Genre rows of only those movies that appear in Vincent's rated list (matched by title).
watched_titles = v_reviews['movie_title'].tolist()
title_is_watched = movie_genres['movie_title'].isin(watched_titles)
v_genres = movie_genres[title_is_watched]
v_genres.head()

Unnamed: 0,movie_title,genre_action,genre_adventure,genre_animation,genre_biography,genre_comedy,genre_crime,genre_documentary,genre_drama,genre_family,...,genre_mystery,genre_news,genre_reality-tv,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_thriller,genre_war,genre_western
127,About Last Night,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
141,Shattered,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0
151,Passchendaele,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
223,Broken Arrow,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
264,Songcatcher,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


Calculate a score for which genres Vincent likes the most by taking the dot product of the two matrices. By multiplying each genre of a movie by the rating of that movie and then adding up the results for each genre, we get a comparative score that can be used to determine which genres Vincent prefers. From our results we can see that Vincent seems to love drama but also likes romance, action and thrillers.

In [None]:
# Genre indicator columns only. drop() without inplace returns a new frame, so
# the filtered slice `v_genres` is not mutated: this avoids the
# SettingWithCopyWarning and lets the cell be re-run without a KeyError.
v_genre_flags = v_genres.drop(columns=['movie_title'])

# For every genre, add up the ratings of the rated movies that carry that genre.
# dot() aligns the movie row labels of both operands.
# NOTE(review): this assumes v_reviews and movie_genres use the same row label
# for the same movie (i.e. both files list the movies in the same order) - confirm.
v_pref = v_genre_flags.transpose().dot(v_reviews['rating'])
v_pref.sort_values(ascending=False).head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


genre_drama       120.0
genre_romance      58.0
genre_action       51.0
genre_thriller     45.0
genre_comedy       35.0
dtype: float64

Next we multiply all movies by our preference vector and normalize the value back to our 5-point rating scale, which gives our final predicted rating for each movie. By doing this, a movie that contains more of the genres Vincent prefers should get a higher rating.

In [None]:
# Genre indicator matrix for every movie (title column removed).
all_movies = movie_genres.drop(columns=['movie_title'])

# Weight each genre flag by Vincent's preference for that genre, total the
# weights per movie, divide by the total preference mass and scale by 5.
weighted_genres = all_movies * v_pref
rec = weighted_genres.sum(axis=1) / v_pref.sum() * 5

rec_dic = {
    "Vincents predicted ratings": rec,
    "Vincents recomended movies": movie_genres['movie_title'],
}

# NOTE(review): the drop below removes rows by label, so it relies on v_reviews
# and movie_genres sharing row labels for the same movie - confirm.
rec_df = (
    pd.concat(rec_dic, axis=1)
    .drop(index=v_reviews.index.tolist())  # remove movies Vincent has already seen
    .sort_values("Vincents predicted ratings", ascending=False)
)

# These would be the 5 recommendations for Vincent
rec_df.head(5)

Unnamed: 0,Vincents predicted ratings,Vincents recomended movies
1011,3.582418,Perrier's Bounty
1618,3.428571,Alpha and Omega 4: The Legend of the Saw Tooth...
1518,3.197802,Nowhere to Run
1152,3.197802,The Good Thief
1740,3.197802,Set It Off


The five movies we end up recommending to Vincent are:
1. Perrier's Bounty
2. Alpha and Omega 4: The Legend of the Saw Toothed Cave
3. Nowhere to Run
4. The Good Thief
5. Set It Off

Instead of calculating the recommendations for all users, with this model we can calculate only the ones we are actually interested in. Let's do that!

In [None]:
def content_based_recommendations(user, ratings, genres):
    """Compute genre preferences and ranked unseen movies for one user.

    Parameters
    ----------
    user : str
        Value to look up in the 'User' column of ``ratings``.
    ratings : pandas.DataFrame
        Frame with a 'User' column and one rating column per movie title;
        a rating of 0 is treated as "not rated".
    genres : pandas.DataFrame
        Frame with a 'movie_title' column and one indicator column per genre.

    Returns
    -------
    (pandas.Series, pandas.DataFrame)
        The per-genre preference scores, and a frame of predicted ratings and
        titles for the movies the user has not rated, best prediction first.

    Notes
    -----
    Rated movies are matched to genre rows by title, but the dot product and
    the final drop work on row labels.
    NOTE(review): this assumes both frames label the same movie identically
    (same movie order in both files) - confirm.
    No guard is added for a user without any rating; the division by the
    preference total then yields NaN predictions.
    """
    # One row per movie with the user's rating; keep only rated movies.
    user_row = ratings.loc[ratings['User'] == user].drop(columns='User')
    u_reviews = user_row.transpose().reset_index()
    u_reviews.columns = ['movie_title', 'rating']
    u_reviews = u_reviews[u_reviews['rating'] != 0]

    # Non-mutating drops: nothing is modified in place, so no
    # SettingWithCopyWarning is raised and the inputs stay intact.
    genre_flags = genres.drop(columns=['movie_title'])
    u_genres = genre_flags[genres['movie_title'].isin(u_reviews['movie_title'].tolist())]

    # Per genre: sum of the ratings of the rated movies carrying that genre.
    u_pref = u_genres.transpose().dot(u_reviews['rating'])

    # Preference-weighted genre total per movie, normalised and scaled by 5.
    rec = (((genre_flags * u_pref).sum(axis=1)) / u_pref.sum()) * 5

    rating_col = user + "s pred rating"
    rec_dic = {
        rating_col: rec,
        (user + "s recommended movies"): genres['movie_title'],
    }
    u_rec_df = pd.concat(rec_dic, axis=1)
    u_rec_df = u_rec_df.drop(index=u_reviews.index.tolist())  # remove already rated movies
    u_rec_df = u_rec_df.sort_values(rating_col, ascending=False)
    return u_pref, u_rec_df


for user in ["Edgar", "Addilyn", "Marlee", "Javier"]:
    u_pref, u_rec_df = content_based_recommendations(user, user_reviews, movie_genres)

    print(user + "s prefered genres:")
    print(u_pref.sort_values(ascending=False).head(5))
    print("\n")

    print(u_rec_df.head(5))
    print("-----------------------------------------------------------------------")


Edgars prefered genres:
genre_drama       87.0
genre_romance     34.0
genre_thriller    31.0
genre_comedy      22.0
genre_crime       18.0
dtype: float64


      Edgars pred rating                          Edgars recommended movies
1011            3.081571                                   Perrier's Bounty
1388            2.945619                                  The Hunting Party
757             2.900302                                        Harvard Man
1618            2.870091  Alpha and Omega 4: The Legend of the Saw Tooth...
1268            2.794562                                    West Side Story
-----------------------------------------------------------------------
Addilyns prefered genres:
genre_comedy      93.0
genre_drama       66.0
genre_thriller    31.0
genre_romance     29.0
genre_crime       26.0
dtype: float64


      Addilyns pred rating                        Addilyns recommended movies
1618              3.420428  Alpha and Omega 4: The Legend of the Saw Tooth...
10

### Collaborative filtering

Good: 
- Individualized recommendation
- Able to use data from several customers (unlike content based)
- Typically easier data to find 

Bad: 
- slower
- if we have few users model works poorly


#
This part will not be very generic, many hard coded parts as this is mostly a proof of concept.

Our idea for collaborative filtering is to find similar users using the metric "users who have voted similarly on as many movies as possible" compared to the user we are considering. It's possible to build more on this idea and consider things like the number of movies both users have voted on, but for a more accurate model like that to be possible, more data is necessary. A good idea for this model would also be to add a more advanced similarity metric between movies and incorporate that into the model, but we chose to implement something simple because of the time constraint of the assignment.

In our case we keep the 15 users whose ratings are most similar to those of the user we are looking at.

In [None]:
import csv

FIRST_RATING_COL = 2   # column 0 is the row number, column 1 the user name
TOP_SIMILAR = 15       # how many of the most similar users are kept per target
MAX_RATING_GAP = 1     # two ratings count as "similar" when they differ by at most this

# Read the file once - it does not change between users. Empty lines are
# skipped so that every entry of `data` has a name column.
with open('user_reviews.csv', newline='') as f:
    data = [row for row in csv.reader(f) if row]

# data[0] is the header; every later row is one user. The ratings are parsed
# a single time here instead of calling float() again for every comparison.
userRows = data[1:]
parsedRatings = [[float(value) for value in row[FIRST_RATING_COL:]] for row in userRows]

sortedSimilarUsersAll = []
# Select first five users
for user in [1, 2, 3, 4, 5]:
    # Only the movies the target has rated can ever count as a match.
    targetRated = [(j, rating) for j, rating in enumerate(parsedRatings[user - 1]) if rating > 0]

    # Count, per user, the movies both have rated with a similar rating (+1/-1).
    similarUsers = []
    for row, otherRatings in zip(userRows, parsedRatings):
        isSimilar = sum(
            1
            for j, rating in targetRated
            if j < len(otherRatings)
            and otherRatings[j] > 0
            and abs(rating - otherRatings[j]) <= MAX_RATING_GAP
        )
        similarUsers.append([row[1], isSimilar])

    # Ascending stable sort followed by a reversal: highest count first, and
    # among equal counts the user that appears later in the file comes first.
    sortedSimilarUsers = sorted(similarUsers, key=lambda x: x[1])[::-1][:TOP_SIMILAR]
    sortedSimilarUsersAll.append(sortedSimilarUsers)

# Note that the user itself will always be the most similar user (which makes sense)
for similarList in sortedSimilarUsersAll:
    print(similarList)

[['Vincent', 39], ['Beckett', 3], ['Liam', 3], ['Skylar', 3], ['Milan', 2], ['Maverick', 2], ['Arlo', 2], ['Isaac', 2], ['Emily', 2], ['Shelby', 2], ['Memphis', 2], ['Willa', 2], ['Lia', 2], ['Malakai', 2], ['Carly', 2]]
[['Edgar', 30], ['Theo', 3], ['Gavin', 2], ['Justin', 2], ['Andy', 2], ['Karson', 2], ['Kameron', 2], ['Peyton', 2], ['Everly', 2], ['Mikayla', 2], ['Maci', 1], ['Jaylen', 1], ['Kingston', 1], ['Kian', 1], ['Bella', 1]]
[['Addilyn', 36], ['Cade', 3], ['Brayden', 3], ['Zayne', 3], ['Malik', 3], ['Journey', 2], ['Alyssa', 2], ['Zachary', 2], ['Rylan', 2], ['Kash', 2], ['Zoe', 2], ['Marco', 2], ['Maeve', 2], ['Lena', 2], ['Kaia', 2]]
[['Marlee', 32], ['Amari', 2], ['Magnolia', 2], ['Dallas', 2], ['Allie', 2], ['Graham', 2], ['Liam', 2], ['Serena', 2], ['Jaxton', 2], ['Tessa', 2], ['Cesar', 2], ['Gabriela', 2], ['Lola', 2], ['Rylee', 2], ['Malik', 2]]
[['Javier', 28], ['Dalton', 3], ['Lana', 3], ['Nora', 2], ['Fabian', 2], ['Josue', 2], ['Theodore', 2], ['Addison', 2], ['J

#
To work with less data we extract the list of movie ratings for the users most similar to the user we are creating a collaborative filtering for.

In [None]:
PREVIEW_FIELDS = 12  # number of leading fields shown per row in the preview below

similarUsersAllRatedMovies = []
for sortedSimilarUsers in sortedSimilarUsersAll:
    # Each entry is [name, similarity count]; only the names are needed here.
    similarNames = {entry[0] for entry in sortedSimilarUsers}

    # Walk every user row of `data` (row 0 is the header) in file order and
    # keep the rows whose name column is in the similar-user set. Each row is
    # kept once, however many list entries carry that name.
    similarUsersFullList = [
        row for row in data[1:]
        if len(row) > 1 and row[1] in similarNames
    ]
    similarUsersAllRatedMovies.append(similarUsersFullList)

# Preview for the first target user: only the leading fields of each row are
# printed, a full row has one field per movie.
for user in similarUsersAllRatedMovies[0]:
    print(user[:PREVIEW_FIELDS])

['0', 'Vincent', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '2.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.

#
Now we simply add together the movies that our "similar users" have rated well. With the data we have, we consider "well" to be: rated 4 or higher. We add all movie titles to a list, excluding movies that the user has already seen.

In [None]:
FIRST_RATING_COL = 2   # columns 0 and 1 hold the row number and the user name
GOOD_RATING = 4        # a similar user "rated well" from this value upwards

moviesToConsiderForEachUser = []
for position, similarUsersFullList in enumerate(similarUsersAllRatedMovies):
    # The lists were built for the first five users in file order, so list
    # number `position` belongs to data[position + 1] (data[0] is the header).
    # The rows inside each list are in file order, so the target user cannot
    # be located by a fixed position inside the list.
    target = data[position + 1]

    # Columns of the movies the target user has not rated yet.
    unseenColumns = [
        j for j in range(FIRST_RATING_COL, len(data[0]))
        if float(target[j]) == 0
    ]

    moviesToConsider = []
    for row in similarUsersFullList:
        # Skip the target user's own row.
        # NOTE(review): assumes the first CSV column is a unique row id - confirm.
        if row[0] == target[0]:
            continue
        for j in unseenColumns:
            if float(row[j]) >= GOOD_RATING:
                moviesToConsider.append(data[0][j])
    print(moviesToConsider)
    moviesToConsiderForEachUser.append(moviesToConsider)

['The Last Exorcism', 'Me You and Five Bucks', 'Thank You for Smoking', 'Labor Day', 'The Importance of Being Earnest', 'Brother', 'Bloodsport', 'Journey from the Fall', 'Random Hearts', 'Winnie the Pooh', 'Harry Potter and the Prisoner of Azkaban', 'The Hunting Party', 'Theresa Is a Mother', 'Sleepers', "Bridget Jones's Diary", 'The Dangerous Lives of Altar Boys', 'The Thirteenth Floor', 'Alex Rider: Operation Stormbreaker', 'A Sound of Thunder', 'Unbreakable', 'Armageddon', 'Wyatt Earp', 'After the Sunset', 'How to Deal', 'To Die For', 'Heaven Is for Real', 'Unbroken', 'Narc', 'Kung Fu Killer', 'The Game', 'Atlas Shrugged II: The Strike', 'Sex and the City 2', 'The Lady from Shanghai', 'Dolphin Tale 2', 'The Shining', 'Superman', 'Reign of Fire', 'Beyond the Valley of the Dolls', 'Contagion', 'Magic Mike XXL', 'Elysium', 'Mindhunters', 'Act of Valor', 'Deadline - U.S.A.', 'Lies in Plain Sight', 'Shopgirl', "It's Kind of a Funny Story", 'Redemption Road', 'Play It to the Bone', 'ATL',

Now all that is left to do is just count the movie titles and sort them by most common.

In [None]:
from collections import Counter

# For every target user: (title, count) pairs ordered from most to least
# frequently collected title.
moviesCountedForEachUser = [
    Counter(candidateTitles).most_common()
    for candidateTitles in moviesToConsiderForEachUser
]

# Print the name (second CSV column, header row skipped) and the five most
# frequent titles for each of the five users.
for i in range(5):
    userName = data[i + 1][1]
    print(userName + ":")
    print(moviesCountedForEachUser[i][:5])

Vincent:
[('The Dangerous Lives of Altar Boys', 3), ('The Importance of Being Earnest', 2), ('Bloodsport', 2), ("Bridget Jones's Diary", 2), ('Alex Rider: Operation Stormbreaker', 2)]
Edgar:
[('Next Stop Wonderland', 2), ('Adventureland', 2), ('Someone Like You...', 2), ('Out of Sight', 2), ('New York, New York', 2)]
Addilyn:
[('Maid in Manhattan', 5), ('ATL', 3), ('Dysfunctional Friends', 3), ('Doctor Zhivago', 2), ('Larry the Cable Guy: Health Inspector', 2)]
Marlee:
[('Max Payne', 3), ('Bats', 2), ('Funny People', 2), ('Eden', 2), ('Shall We Dance', 2)]
Javier:
[('George and the Dragon', 3), ('Welcome to the Dollhouse', 2), ('Bamboozled', 2), ('My Favorite Martian', 2), ('Cheap Thrills', 2)]


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2b45d5e8-0477-4f1d-9982-df41ed4c0283' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>