In [66]:
# Script dependencies
import pandas as pd
import numpy as np
import pickle
import copy
from surprise import Reader, Dataset
from surprise import SVD, NormalPredictor, BaselineOnly, KNNBasic, NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import streamlit as st

# Importing data
movies_df = pd.read_csv('../resources/data/movies.csv',sep = ',')
ratings_df = pd.read_csv('../resources/data/train.csv')
ratings_df.drop(['timestamp'], axis=1,inplace=True)


# We make use of an SVD model trained on a subset of the MovieLens 10k dataset.
model=pickle.load(open('../resources/models/model.pkl', 'rb'))

In [23]:
def prediction_item(item_id):
    """Map a given favourite movie to users within the
       MovieLens dataset with the same preference.
    Parameters
    ----------
    item_id : int
        A MovieLens Movie ID.
    Returns
    -------
    list
        User IDs of users with similar high ratings for the given movie.
    """
    # Data preprosessing
    reader = Reader(rating_scale=(0, 5))
    load_df = Dataset.load_from_df(ratings_df,reader)
    a_train = load_df.build_full_trainset()

    predictions = []
    for ui in a_train.all_users():
        predictions.append(model.predict(iid=item_id, uid=ui, verbose = False))
    return predictions

In [24]:
prediction_item(movies_df.loc[950, 'movieId'])

[Prediction(uid=0, iid=971, r_ui=None, est=3.449619940002398, details={'was_impossible': False}),
 Prediction(uid=1, iid=971, r_ui=None, est=3.449619940002398, details={'was_impossible': False}),
 Prediction(uid=2, iid=971, r_ui=None, est=3.449619940002398, details={'was_impossible': False}),
 Prediction(uid=3, iid=971, r_ui=None, est=3.449619940002398, details={'was_impossible': False}),
 Prediction(uid=4, iid=971, r_ui=None, est=3.449619940002398, details={'was_impossible': False}),
 Prediction(uid=5, iid=971, r_ui=None, est=3.449619940002398, details={'was_impossible': False}),
 Prediction(uid=6, iid=971, r_ui=None, est=3.449619940002398, details={'was_impossible': False}),
 Prediction(uid=7, iid=971, r_ui=None, est=3.449619940002398, details={'was_impossible': False}),
 Prediction(uid=8, iid=971, r_ui=None, est=3.449619940002398, details={'was_impossible': False}),
 Prediction(uid=9, iid=971, r_ui=None, est=3.449619940002398, details={'was_impossible': False}),
 Prediction(uid=10, 

In [25]:
def pred_movies(movie_list):
    """Maps the given favourite movies selected within the app to corresponding
    users within the MovieLens dataset.
    Parameters
    ----------
    movie_list : list
        Three favourite movies selected by the app user.
    Returns
    -------
    list
        User-ID's of users with similar high ratings for each movie.
    """
    # Store the id of users
    id_store=[]
    # For each movie selected by a user of the app,
    # predict a corresponding user within the dataset with the highest rating
    for i in movie_list:
        predictions = prediction_item(item_id = i)
        predictions.sort(key=lambda x: x.est, reverse=True)
        # Take the top 10 user id's from each movie with highest rankings
        for pred in predictions[:10]:
            id_store.append(pred.uid)
    # Return a list of user id's
    return id_store


In [38]:
pred_movies(movies_df.loc[100:102, 'title'])

[18551,
 21365,
 6118,
 44782,
 50739,
 31547,
 53150,
 39251,
 11315,
 3403,
 18551,
 21365,
 6118,
 44782,
 50739,
 31547,
 53150,
 39251,
 11315,
 3403,
 18551,
 21365,
 6118,
 44782,
 50739,
 31547,
 53150,
 39251,
 11315,
 3403]

In [67]:
def collab_model(movie_list,top_n=10):
    """Performs Collaborative filtering based upon a list of movies supplied
       by the app user.
    Parameters
    ----------
    movie_list : list (str)
        Favorite movies chosen by the app user.
    top_n : type
        Number of top recommendations to return to the user.
    Returns
    -------
    list (str)
        Titles of the top-n movie recommendations to the user.
    """
    indices = pd.Series(movies_df['title'])
    movie_ids = pred_movies(movie_list)
    
    df_init_users = ratings_df[ratings_df['userId'] == movie_ids[0]]
    
    for i in movie_ids :
        df_init_users = df_init_users.append(ratings_df[ratings_df['userId']==i])
    
    # Getting the cosine similarity matrix
    cosine_sim = cosine_similarity(np.array(df_init_users), np.array(df_init_users))
    
    idx_1 = indices[indices == list(movie_list)[0]].index % len(cosine_sim)
    idx_2 = indices[indices == list(movie_list)[1]].index % len(cosine_sim)
    idx_3 = indices[indices == list(movie_list)[2]].index % len(cosine_sim)
    
    # Creating a Series with the similarity scores in descending order
    rank_1 = cosine_sim[idx_1][0]
    rank_2 = cosine_sim[idx_2][0]
    rank_3 = cosine_sim[idx_3][0]
   
    # Calculating the scores
    score_series_1 = pd.Series(rank_1).sort_values(ascending = False)
    score_series_2 = pd.Series(rank_2).sort_values(ascending = False)
    score_series_3 = pd.Series(rank_3).sort_values(ascending = False)
    
     # Appending the names of movies
    listings = score_series_1.append(score_series_1).append(score_series_3).sort_values(ascending = False)
    
    recommended_movies = []
    # Choose top 50
    top_50_indexes = list(listings.iloc[1:50].index)

    # Removing chosen movies
    top_indexes = np.setdiff1d(top_50_indexes,[idx_1,idx_2,idx_3])
    for i in top_indexes[:top_n]:
        recommended_movies.append(list(movies_df['title'])[i])
        
    return recommended_movies 

In [68]:
collab_model(movies_df.loc[14930:14933, 'title'])

2022-07-29 14:41:35.140 
  command:

    streamlit run c:\users\user\appdata\local\programs\python\python37\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


["Singin' in the Rain (1952)",
 'Funny Face (1957)',
 "Breakfast at Tiffany's (1961)",
 'Vertigo (1958)',
 'Rear Window (1954)',
 'Hamlet (1948)',
 'Greatest Show on Earth, The (1952)',
 'From Here to Eternity (1953)',
 'On the Waterfront (1954)',
 'Marty (1955)']

In [37]:
collab_model(movies_df.loc[14000:14003, 'title'])

[18551 21365  6118 31547 53150]


In [13]:
movies_df.loc[14930:14933, 'title']

14930       So Proudly We Hail! (1943)
14931          31 North 62 East (2009)
14932             Irene in Time (2009)
14933    Captains of the Clouds (1942)
Name: title, dtype: object

In [14]:
ratings_df.shape

(100004, 3)

In [15]:
ratings_df.tail()

Unnamed: 0,userId,movieId,rating
99999,671,6268,2.5
100000,671,6269,4.0
100001,671,6365,4.0
100002,671,6385,2.5
100003,671,6565,3.5
