In [None]:
# Import the libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import turicreate
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

In [None]:
# Load dataset
# Read the three ml-20m tables (movies, ratings, tags) into DataFrames,
# unpacking the results in the same order as the paths below.
movies, ratings, tags = map(pd.read_csv, (
    '../input/ml20mzip/ml-20m/movies.csv',
    '../input/ml20mzip/ml-20m/ratings.csv',
    '../input/ml20mzip/ml-20m/tags.csv',
))

In [None]:
# Merging movies and tags
# Collapse every movie's tags into one space-separated string (each tag is
# cast to str first), keyed by movieId.
merged_tags = (
    tags[['movieId', 'tag']]
    .groupby('movieId')
    .agg({'tag': lambda tag_values: ' '.join(map(str, tag_values))})
)

# Left-join so every movie is kept; movies without any tag get a single
# space instead of NaN.
movie_tag_new = (
    movies
    .merge(merged_tags, how="left", on="movieId")
    .assign(tag=lambda frame: frame['tag'].fillna(' '))
)

In [None]:
# Content filtering on Metadata
# Build the 'desc' text column from the genre list, turning the '|'
# separators into spaces so each genre word becomes its own token.
# (The earlier stand-alone .apply(...) call discarded its result and was removed.)
movie_tag_new['desc'] = movie_tag_new['genres'].str.replace('|', ' ', regex=False)

In [None]:
# Create User Movie Matrix
# Only the first tenth of the ratings rows is pivoted. The result has one row
# per movieId and one column per userId; missing ratings are filled with 0.
N_ratings = len(ratings)
user_movies_data = (
    ratings
    .head(N_ratings // 10)
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)

In [None]:
# Keep only the movies whose movieId appears as a row of 'user_movies_data'
movie_tag_new = movie_tag_new[movie_tag_new['movieId'].isin(user_movies_data.index)]

In [None]:
# TF-IDF Vectorizer on Metadata
# Vectorize the 'desc' text, then expose the dense TF-IDF matrix as a
# DataFrame that carries the same row labels as movie_tag_new.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(movie_tag_new['desc'])
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=movie_tag_new.index.to_list(),
)

In [None]:
# TruncatedSVD for Content filter
# Create latent matrix 1: project the TF-IDF features onto 18 components.
# random_state pins the randomized solver so re-running the cell gives the
# same latent vectors.
svd = TruncatedSVD(n_components=18, random_state=42)
latent_matrix_1 = svd.fit_transform(tfidf_df)

# Number of leading components kept. It is taken from the matrix itself; the
# previous hard-coded 19 exceeded the 18 fitted components, so the slice
# silently kept all of them anyway.
n = latent_matrix_1.shape[1]
latent_matrix_1_df = pd.DataFrame(latent_matrix_1[:, :n], index=movie_tag_new['title'].tolist())

In [None]:
# Collaborative filtering
# Create latent matrix 2: reduce each movie's rating row to 18 components.
# A dedicated estimator is used so this cell does not refit the content
# model and does not depend on whatever `svd` is bound to when it is re-run.
collab_svd = TruncatedSVD(n_components=18, random_state=42)
latent_matrix_2 = collab_svd.fit_transform(user_movies_data)

# Label each row with the title looked up by its movieId, so titles follow
# the row order of user_movies_data instead of the row order of `movies`.
# NOTE(review): assumes movieId is unique in `movies` -- confirm.
titles_by_movie_id = movies.set_index('movieId')['title']
latent_matrix_2_df = pd.DataFrame(
    latent_matrix_2,
    index=titles_by_movie_id.reindex(user_movies_data.index).tolist(),
)

In [None]:
# Hybrid Recommendation System
def _title_vector(latent_df, title):
    """Return the latent vector stored under `title` as a (1, n_features) array.

    When `title` labels more than one row, the first matching row is used so
    the result always has a single row. Raises KeyError if `title` is not in
    the index.
    """
    return latent_df.loc[[title]].iloc[[0]].to_numpy()


def recommend_similar_movies(title):
    """Score every movie against `title` by content, collaborative and hybrid similarity.

    Prints a table of all movies with their three scores, sorted by the
    content score in descending order.

    Parameters
    ----------
    title : label present in the index of both latent_matrix_1_df and
        latent_matrix_2_df.

    Returns
    -------
    dict with keys 'content', 'collab' and 'hybrid'; each value is a 1-D array
    of scores in the original (unsorted) row order of the latent matrices.

    Raises
    ------
    KeyError if `title` is missing from either latent matrix.
    """
    a_1 = _title_vector(latent_matrix_1_df, title)
    a_2 = _title_vector(latent_matrix_2_df, title)

    # calculate the similarity of this movie with the others in the list
    score_content = cosine_similarity(latent_matrix_1_df, a_1).reshape(-1)
    score_collab = cosine_similarity(latent_matrix_2_df, a_2).reshape(-1)

    # hybrid score: the plain average of the content and collaborative scores
    # NOTE(review): assumes both latent matrices list the same movies in the
    # same row order -- confirm.
    hybrid_score = (score_content + score_collab) / 2

    # form a data frame of similar movies, labelled with the collaborative index
    dictDF = {'content': score_content, 'collab': score_collab, 'hybrid': hybrid_score}
    similar_movies = pd.DataFrame(dictDF, index=latent_matrix_2_df.index)

    # sort it on the basis of either: content, collab or hybrid; here: content
    similar_movies = similar_movies.sort_values('content', ascending=False)
    print(similar_movies)

    return dictDF

recommend_similar_movies(latent_matrix_1_df.index[0])

In [None]:
# Turicreate
# Popularity Recommender - Cold Start problem
# every new user will get the same recommendations
# Turi Create recommenders are documented to take an SFrame, so the pandas
# ratings table is converted before training.
model = turicreate.popularity_recommender.create(
    turicreate.SFrame(ratings),
    user_id='userId',
    item_id='movieId',
    target='rating',
)

# Recommend the top 5 movies to each of users 1, 2, 3, 4, 5 (5 x 5 = 25 rows)
pop_rec = model.recommend(users=[1,2,3,4,5], k=5)
pop_rec.print_rows(num_rows=25, num_columns=4)

In [None]:
# Item Similarity Recommender
# Making recommendations for users 1, 2, 3, 4, 5
# Turi Create recommenders are documented to take an SFrame, so the pandas
# ratings table is converted before training.
sim_model = turicreate.item_similarity_recommender.create(
    turicreate.SFrame(ratings),
    user_id='userId',
    item_id='movieId',
    target='rating',
    similarity_type='cosine',
)
sim = sim_model.recommend(users=[1,2,3,4,5], k=5)
sim.print_rows(num_rows=25, num_columns=4)

In [None]:
# Matrix Factorization Recommender
class MF():
    """Biased matrix factorization trained with stochastic gradient descent.

    A rating is predicted as ``b + b_u[i] + b_i[j] + P[i] . Q[j]``. Entries of
    the rating matrix equal to 0 are treated as "not rated": they are excluded
    from the global mean, from the training samples and from the error.
    """

    def __init__(self, R, K, alpha, beta, iterations):
        """Store the rating matrix and the hyper-parameters.

        R          -- 2-D numpy array of ratings; its shape is read as
                      (num_users, num_items).
        K          -- number of latent features per user / item.
        alpha      -- step size applied to every SGD update.
        beta       -- weight of the shrinkage term in every SGD update.
        iterations -- number of passes over the training samples.
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        """Initialise the parameters and run SGD for `iterations` passes.

        Returns a list of ``(iteration_index, error)`` tuples, one per pass,
        where `error` is the value returned by `mse()` after that pass. The
        error is also printed every 20th pass.
        """
        # Latent factor matrices, drawn from a normal distribution
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Bias terms: per-row, per-column, and the mean of the non-zero ratings
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # Training samples (row, column, rating) for every positive rating,
        # collected in row-major order without a Python double loop.
        rows, cols = np.where(self.R > 0)
        self.samples = list(zip(rows, cols, self.R[rows, cols]))

        # Stochastic gradient descent for the given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            mse = self.mse()
            training_process.append((i, mse))
            if (i+1) % 20 == 0:
                print("Iteration: %d ; error = %.4f" % (i+1, mse))

        return training_process

    def mse(self):
        """Return the training error over the non-zero entries of R.

        Despite the name, the value is the square root of the *sum* of squared
        differences between R and the current prediction (no mean is taken).
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        residuals = self.R[xs, ys] - predicted[xs, ys]
        return np.sqrt(np.sum(residuals ** 2))

    def sgd(self):
        """Run one pass of SGD over `self.samples`, updating biases, P and Q in place."""
        for i, j, r in self.samples:
            e = r - self.get_rating(i, j)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Snapshot the row of P first: the Q update must use the value of
            # P[i] from before this step, not the freshly updated one.
            p_i = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * p_i - self.beta * self.Q[j, :])

    def get_rating(self, i, j):
        """Return the predicted rating for row i (user) and column j (movie)."""
        return self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :])

    def full_matrix(self):
        """Return the full (num_users, num_items) matrix of predicted ratings."""
        return (
            self.b
            + self.b_u[:, np.newaxis]
            + self.b_i[np.newaxis, :]
            + self.P.dot(self.Q.T)
        )

In [None]:
# MF reads R.shape as (num_users, num_items), while user_movies_data has one
# row per movieId and one column per userId, so the matrix is transposed to
# put users on the rows.
R = user_movies_data.T.to_numpy()

# MF.train draws its initial factors from NumPy's global random state; seed it
# so re-running the cell reproduces the same run.
np.random.seed(42)
mf = MF(R, K=20, alpha=0.001, beta=0.01, iterations=100)
training_process = mf.train()
print()
print("P x Q:")
print(mf.full_matrix())
print()

In [None]:
# Surprise
# title -> movieId lookup for the movies kept in movie_tag_new
# (a title that occurs more than once keeps only its last movieId)
Mapping_file = dict(zip(movie_tag_new['title'].tolist(), movie_tag_new['movieId'].tolist()))

# ml-20m ratings use half-star steps from 0.5 to 5.0; the scale must start at
# 0.5, otherwise Surprise clips predictions to a lower bound of 1.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Fixed seeds make the split and the fitted model reproducible across runs
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)
svd = SVD(random_state=42)
svd.fit(trainset)

# Evaluate on the held-out quarter; accuracy.rmse prints the score
predictions = svd.test(testset)
rmse = accuracy.rmse(predictions)

In [None]:
def pred_user_rating(ui, algorithm=svd):
    """Return the 10 movies with the highest predicted rating for user `ui`.

    Only movies listed in Mapping_file that the user has not rated yet are
    scored.

    Parameters
    ----------
    ui : user id, matched against ratings.userId.
    algorithm : fitted model whose ``predict(user_id, movie_id)`` result
        exposes the estimate as ``.est``; defaults to the module-level `svd`.

    Returns
    -------
    DataFrame indexed by 'movies' (title) with one 'ratings' column, sorted by
    predicted rating in descending order, at most 10 rows. If `ui` has no
    ratings a message is printed and an empty frame of the same layout is
    returned.
    """
    rated_movie_ids = ratings.loc[ratings.userId == ui, 'movieId']
    if rated_movie_ids.empty:
        print("User Id does not exist in the list!")
        return pd.DataFrame(columns=['ratings'], index=pd.Index([], name='movies'))

    # A set makes the "already rated" check constant-time per movie
    seen = set(rated_movie_ids)
    predictedL = [
        (title, algorithm.predict(ui, movie_id).est)
        for title, movie_id in Mapping_file.items()
        if movie_id not in seen
    ]

    pdf = (
        pd.DataFrame(predictedL, columns=['movies', 'ratings'])
        .sort_values('ratings', ascending=False)
        .set_index('movies')
    )
    return pdf.head(10)

userId = 1
pred_user_rating(userId)