In [1]:
# Script dependencies
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


# Importing data
movies = pd.read_csv('resources/data/movies.csv', sep = ',')
ratings = pd.read_csv('resources/data/ratings.csv')
movies.dropna(inplace=True)

def data_preprocessing(subset_size):
    """Prepare data for use within Content filtering algorithm.

    Parameters
    ----------
    subset_size : int
        Number of movies to use within the algorithm.

    Returns
    -------
    Pandas Dataframe
        Subset of movies selected for content-based filtering.

    """
    # Split genre data into individual words.
    movies['keyWords'] = movies['genres'].str.replace('|', ' ')
    # Subset of the data
    movies_subset = movies[:subset_size]
    return movies_subset

# !! DO NOT CHANGE THIS FUNCTION SIGNATURE !!
# You are, however, encouraged to change its content.  
def content_model(movie_list,top_n=10):
    """Performs Content filtering based upon a list of movies supplied
       by the app user.

    Parameters
    ----------
    movie_list : list (str)
        Favorite movies chosen by the app user.
    top_n : type
        Number of top recommendations to return to the user.

    Returns
    -------
    list (str)
        Titles of the top-n movie recommendations to the user.

    """
    # Initializing the empty list of recommended movies
    recommended_movies = []
    data = data_preprocessing(5000)
    
    # Instantiating and generating the count matrix
    count_vec = CountVectorizer()
    count_matrix = count_vec.fit_transform(data['keyWords'])
    indices = pd.Series(data.index, index=data['title'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)

    # Getting the index of the movie that matches the title
    idx_1 = indices[movie_list[0]]
    idx_2 = indices[movie_list[1]]
    idx_3 = indices[movie_list[2]]
    # Creating a Series with the similarity scores in descending order
    rank_1 = cosine_sim[idx_1]
    rank_2 = cosine_sim[idx_2]
    rank_3 = cosine_sim[idx_3]
    # Calculating the scores
    score_series_1 = pd.Series(rank_1).sort_values(ascending = False)
    score_series_2 = pd.Series(rank_2).sort_values(ascending = False)
    score_series_3 = pd.Series(rank_3).sort_values(ascending = False)
    # Getting the indexes of the 10 most similar movies
    listings = pd.concat([score_series_1, score_series_2, score_series_3]).sort_values(ascending=False)

    # Store movie names
    recommended_movies = []
    # Appending the names of movies
    top_50_indexes = list(listings.iloc[1:50].index)
    # Removing chosen movies
    top_indexes = np.setdiff1d(top_50_indexes,[idx_1,idx_2,idx_3])
    for i in top_indexes[:top_n]:
        recommended_movies.append(list(movies['title'])[i])
    return recommended_movies



In [6]:
fav_movies = ["Jumanji (1995)", "Father of the Bride Part II (1995)", "Rookie of the Year (1993)"]
content_model(movie_list=fav_movies, top_n=10)

['Indian in the Cupboard, The (1995)',
 'NeverEnding Story III, The (1994)',
 'Angus (1995)',
 'Cosi (1996)',
 'Mouth to Mouth (Boca a boca) (1995)',
 'Multiplicity (1996)',
 'Great White Hype, The (1996)',
 'Cold Comfort Farm (1995)',
 'Getting Away With Murder (1996)',
 'Honey Moon (Honigmond) (1996)']

In [3]:
movies[:5000]

Unnamed: 0,movieId,title,genres,keyWords
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,Comedy
...,...,...,...,...
4995,5101,Richard Pryor Here and Now (1983),Comedy|Documentary,Comedy Documentary
4996,5102,Rookie of the Year (1993),Comedy|Fantasy,Comedy Fantasy
4997,5103,"Sandlot, The (1993)",Children|Comedy|Drama,Children Comedy Drama
4998,5104,Cows (Vacas) (1991),Drama,Drama


In [None]:
# Script dependencies
import pandas as pd
import numpy as np
import scipy as sp
import pickle
import copy
from surprise import Reader, Dataset
from surprise import SVD, NormalPredictor, BaselineOnly, KNNBasic, NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import operator # <-- Convienient item retrieval during iteration

# Importing data
movies_df = pd.read_csv('resources/data/movies.csv',sep = ',')
ratings_df = pd.read_csv('resources/data/ratings.csv')
ratings_df.drop(['timestamp'], axis=1,inplace=True)

# We make use of an SVD model trained on a subset of the MovieLens 10k dataset.
model = pickle.load(open('resources/models/SVD.pkl', 'rb'))

def preprocessing(movie, rating):
    # merge movie and rating to get the title column
    merged_rating = pd.merge(rating, movie[["movieId", "title"]], on='movieId')

    util_matrix = merged_rating.pivot_table(index=['userId'],
                                            columns=['title'],
                                            values='rating') 

    # Normalize each row (a given user's ratings) of the utility matrix
    util_matrix_norm = util_matrix.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)
    # Fill Nan values with 0's, transpose matrix, and drop users with no ratings
    util_matrix_norm.fillna(0, inplace=True)
    util_matrix_norm = util_matrix_norm.T
    util_matrix_norm = util_matrix_norm.loc[:, (util_matrix_norm != 0).any(axis=0)]
    # Save the utility matrix in scipy's sparse matrix format
    util_matrix_sparse = sp.sparse.csr_matrix(util_matrix_norm.values)

    # Compute the similarity matrix using the cosine similarity metric
    user_similarity = cosine_similarity(util_matrix_sparse.T)
    # Save the matrix as a dataframe to allow for easier indexing  
    user_sim_df = pd.DataFrame(user_similarity, 
                            index = util_matrix_norm.columns, 
                            columns = util_matrix_norm.columns)
    return (merged_rating, util_matrix_norm, user_sim_df)


def prediction_item(item_id):
    """Map a given favourite movie to users within the
       MovieLens dataset with the same preference.

    Parameters
    ----------
    item_id : int
        A MovieLens Movie ID.

    Returns
    -------
    list
        User IDs of users with similar high ratings for the given movie.

    """
    # Data preprosessing
    reader = Reader(rating_scale=(0, 5))
    load_df = Dataset.load_from_df(ratings_df,reader)
    a_train = load_df.build_full_trainset()

    predictions = []
    for ui in a_train.all_users():
        predictions.append(model.predict(iid=item_id,uid=ui, verbose = False))
    return predictions

def pred_movies(movie_list):
    """Maps the given favourite movies selected within the app to corresponding
    users within the MovieLens dataset.

    Parameters
    ----------
    movie_list : list
        Three favourite movies selected by the app user.

    Returns
    -------
    list
        User-ID's of users with similar high ratings for each movie.

    """
    # Store the id of users
    id_store=[]
    # For each movie selected by a user of the app,
    # predict a corresponding user within the dataset with the highest rating
    for i in movie_list:
        predictions = prediction_item(item_id = i)
        predictions.sort(key=lambda x: x.est, reverse=True)
        # Take the top 10 user id's from each movie with highest rankings
        for pred in predictions[:10]:
            id_store.append(pred.uid)
    # Return a list of user id's
    return id_store

# !! DO NOT CHANGE THIS FUNCTION SIGNATURE !!
# You are, however, encouraged to change its content.  
def collab_model(movie_list, top_n=10):
    """Performs Collaborative filtering based upon a list of movies supplied
       by the app user.

    Parameters
    ----------
    movie_list : list (str)
        Favorite movies chosen by the app user.
    top_n : type
        Number of top recommendations to return to the user.

    Returns
    -------
    list (str)
        Titles of the top-n movie recommendations to the user.

    """
    
    # Initialize an empty list to store the recommendations for each user

    merged_rating, util_matrix_norm, user_sim_df = preprocessing(movies_df, ratings_df)

    all_recommendations = []
    
    movie_ids = pred_movies(movie_list)
    k = 20

    for user in movie_ids:
        # Cold-start problem - no ratings given by the reference user. 
        # With no further user data, we solve this by simply recommending
        # the top-N most popular books in the item catalog. 
        if user not in user_sim_df.columns:
            recommendations = merged_rating.groupby('title').mean().sort_values(by='rating',
                                                ascending=False).index[:top_n].to_list()
        else:
            # Gather the k users which are most similar to the reference user 
            sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:k+1]
            favorite_user_items = [] # <-- List of highest rated items gathered from the k users  
            most_common_favorites = {} # <-- Dictionary of highest rated items in common for the k users

            for i in sim_users:
                # Maximum rating given by the current user to an item 
                max_score = util_matrix_norm.loc[:, i].max()
                # Save the names of items maximally rated by the current user   
                favorite_user_items.append(util_matrix_norm[util_matrix_norm.loc[:, i]==max_score].index.tolist())

            # Loop over each user's favorite items and tally which ones are 
            # most popular overall.
            for item_collection in range(len(favorite_user_items)):
                for item in favorite_user_items[item_collection]: 
                    if item in most_common_favorites:
                        most_common_favorites[item] += 1
                    else:
                        most_common_favorites[item] = 1
            # Sort the overall most popular items and return the top-N instances
            sorted_list = sorted(most_common_favorites.items(), key=operator.itemgetter(1), reverse=True)[:top_n]
            recommendations = [x[0] for x in sorted_list]
        all_recommendations.append(recommendations)
    
    # Flatten the list of recommendations for all users into a single list
    flattened_recommendations = [book for user_recommendations in all_recommendations for book in user_recommendations]
    
    # Tally the occurrences of each book in the list of recommendations
    book_counts = {}
    for book in flattened_recommendations:
        if book in book_counts:
            book_counts[book] += 1
        else:
            book_counts[book] = 1
    
    # Sort the dictionary of book counts by the occurrence counts and return the top-10 books
    sorted_books = sorted(book_counts.items(), key=operator.itemgetter(1), reverse=True)[:10]
    all_recommendations = [x[0] for x in sorted_books]
        
    return all_recommendations


In [None]:
fav_movies = ["Toy Story (1995)", "31 North 62 East (2009)", "American Psycho (2000)"]
collab_model(movie_list=fav_movies, top_n=10)

In [None]:
# ratings_df
merged_df = pd.merge(ratings_df, movies_df[["movieId", "title"]], on='movieId')
merged_df

In [None]:
# Streamlit dependencies
import streamlit as st

# Data handling dependencies
import pandas as pd
import numpy as np

# Custom Libraries
from utils.data_loader import load_movie_titles
from recommenders.collaborative_based import collab_model
from recommenders.content_based import content_model

# Data Loading
title_list = load_movie_titles('resources/data/movies.csv')

# App declaration
def main():

    # DO NOT REMOVE the 'Recommender System' option below, however,
    # you are welcome to add more options to enrich your app.
    page_options = ["Recommender System","Solution Overview"]

    # -------------------------------------------------------------------
    # ----------- !! THIS CODE MUST NOT BE ALTERED !! -------------------
    # -------------------------------------------------------------------
    page_selection = st.sidebar.selectbox("Choose Option", page_options)
    if page_selection == "Recommender System":
        # Header contents
        st.write('# Movie Recommender Engine')
        st.write('### EXPLORE Data Science Academy Unsupervised Predict')
        st.image('resources/imgs/Image_header.png',use_column_width=True)
        # Recommender System algorithm selection
        sys = st.radio("Select an algorithm",
                       ('Content Based Filtering',
                        'Collaborative Based Filtering'))

        # User-based preferences
        st.write('### Enter Your Three Favorite Movies')
        movie_1 = st.selectbox('Fisrt Option',title_list[14930:15200])
        movie_2 = st.selectbox('Second Option',title_list[25055:25255])
        movie_3 = st.selectbox('Third Option',title_list[21100:21200])
        fav_movies = [movie_1, movie_2, movie_3]

        # Perform top-10 movie recommendation generation
        if sys == 'Content Based Filtering':
            if st.button("Recommend"):
                try:
                    with st.spinner('Crunching the numbers...'):
                        top_recommendations = content_model(movie_list=fav_movies,
                                                            top_n=10)
                    st.title("We think you'll like:")
                    for i, j in enumerate(top_recommendations):
                        st.subheader(str(i+1)+'. '+j)
                except:
                    st.error("Oops! Looks like this algorithm does't work.\
                              We'll need to fix it!")


        if sys == 'Collaborative Based Filtering':
            if st.button("Recommend"):
                try:
                    with st.spinner('Crunching the numbers...'):
                        top_recommendations = collab_model(movie_list=fav_movies,
                                                           top_n=10)
                    st.title("We think you'll like:")
                    for i,j in enumerate(top_recommendations):
                        st.subheader(str(i+1)+'. '+j)
                except:
                    st.error("Oops! Looks like this algorithm does't work.\
                              We'll need to fix it!")


    # -------------------------------------------------------------------

    # ------------- SAFE FOR ALTERING/EXTENSION -------------------
    if page_selection == "Solution Overview":
        st.title("Solution Overview")
        st.write("Describe your winning approach on this page")

    # You may want to add more sections here for aspects such as an EDA,
    # or to provide your business pitch.


if __name__ == '__main__':
    main()
