# NMF and Movie Ratings

In [3]:
import os
import json
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import NMF
from helpers.metrics import *

from scipy.sparse import coo_matrix, csr_matrix
from scipy.spatial.distance import jaccard, cosine

In [4]:
# Load the movie-ratings dataset from local CSV files (paths are relative to
# the notebook's working directory).
# users.csv / movies.csv are lookup tables; the code below reads their
# `uID` and `mID` columns (and drops `mID`, `title`, `year` from movies to
# obtain the genre columns).
MV_users = pd.read_csv('./data/movies/users.csv')
MV_movies = pd.read_csv('./data/movies/movies.csv')
# train.csv / test.csv hold one rating per row; the code below reads their
# `uID`, `mID` and `rating` columns.
train = pd.read_csv('./data/movies/train.csv')
test = pd.read_csv('./data/movies/test.csv')

## Section 1

In [5]:
# Display the training ratings table as the notebook cell output.
train

Unnamed: 0,uID,mID,rating
0,744,1210,5
1,3040,1584,4
2,1451,1293,5
3,5455,3176,2
4,2507,3074,5
...,...,...,...
700141,1184,2916,3
700142,137,1372,5
700143,195,2514,3
700144,1676,2566,3


In [8]:
# From CSCA 5632 Week 3 Lab 
def jaccard_sparse(mat):
    """
    Compute the pairwise Jaccard similarity between the rows of a sparse matrix.

    Each row is treated as a set: the row sum is used as the set size and the
    row-by-row dot product as the intersection size, so the result is a true
    Jaccard index only when ``mat`` holds 0/1 indicators.

    Parameters:
        mat: scipy sparse matrix of shape (n_rows, n_features).

    Returns:
        Dense numpy array of shape (n_rows, n_rows) holding
        |A ∩ B| / |A ∪ B| for every pair of rows. Pairs whose union is empty
        (both rows all zero) get similarity 0.0.
    """
    # Densify the Gram matrix once; it feeds both the numerator and the union.
    intersection = mat.dot(mat.T).toarray()

    # asarray(...).ravel() flattens the (n, 1) np.matrix that sparse matrices
    # return from sum(axis=1) and is a no-op on an already 1-D ndarray.
    row_sums = np.asarray(mat.sum(axis=1)).ravel()

    union = row_sums[:, None] + row_sums[None, :] - intersection

    # 0/0 for pairs of empty rows is expected and overwritten right below, so
    # keep NumPy from emitting RuntimeWarnings for it.
    with np.errstate(divide="ignore", invalid="ignore"):
        jaccard_sim = intersection / union
    jaccard_sim[union == 0] = 0.0

    return jaccard_sim

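# Jaccard similarity where the intersection is accumulated over several
# indicator matrices (e.g. one per rating level) before normalizing.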
def jaccard_sparse_multi(original, multi_input):
    """
    Compute a pairwise Jaccard similarity whose intersection is summed over
    several sparse matrices.

    For every matrix in ``multi_input`` the row-by-row dot product is taken and
    the results are added up to form the intersection counts. The set size of
    each row is the number of stored non-zeros in the matching row of
    ``original``.

    Parameters:
        original: scipy sparse matrix of shape (n_rows, n_features); only its
            per-row non-zero counts are used.
        multi_input: non-empty sequence of scipy sparse matrices, each with
            n_rows rows.

    Returns:
        Dense numpy array of shape (n_rows, n_rows). Pairs whose union is
        empty get similarity 0.0, matching ``jaccard_sparse``.
    """
    intersection = multi_input[0].dot(multi_input[0].T).toarray()
    for mat in multi_input[1:]:
        intersection += mat.dot(mat.T).toarray()

    # Broadcasting gives row_sums[i] + row_sums[j] without materializing two
    # tiled (n_rows, n_rows) matrices.
    row_sums = np.asarray(original.getnnz(axis=1)).ravel()
    union = row_sums[:, None] + row_sums[None, :] - intersection

    # 0/0 occurs for pairs of rows with no entries; it is replaced by 0 below,
    # so silence the warning instead of leaking NaN into the similarity matrix.
    with np.errstate(divide="ignore", invalid="ignore"):
        jaccard_sim = intersection / union
    jaccard_sim[union == 0] = 0.0

    return jaccard_sim

def cos_sparse(mat):
    """
    Compute the pairwise cosine similarity between the rows of a sparse
    matrix, rescaled from [-1, 1] to [0, 1].

    Parameters:
        mat: scipy sparse matrix of shape (n_rows, n_features).

    Returns:
        Dense numpy array of shape (n_rows, n_rows) holding
        0.5 * cos(a, b) + 0.5. A row with zero norm has an undefined cosine,
        which is treated as 0 and therefore maps to 0.5 against every other
        row. The diagonal is forced to 1.0.
    """
    dot_product = mat.dot(mat.T).toarray()

    # Flatten the per-row norms to 1-D so the denominator does not depend on
    # sum(axis=1) returning a column np.matrix.
    norms = np.sqrt(np.asarray(mat.multiply(mat).sum(axis=1)).ravel())
    denom = np.outer(norms, norms)

    # 0/0 for zero-norm rows is expected; nan_to_num handles it just below.
    with np.errstate(divide="ignore", invalid="ignore"):
        cos_sim = dot_product / denom
    cos_sim = np.nan_to_num(cos_sim, nan=0.0)

    # Shift and scale so the similarity lies between 0 and 1.
    cos_sim = 0.5 * cos_sim + 0.5
    np.fill_diagonal(cos_sim, 1.0)
    return cos_sim

class RecSys():
    """
    Base recommender that predicts ratings from an item-item similarity matrix.

    ``data`` must expose four DataFrames as attributes: ``users`` (column
    ``uID``), ``movies`` (columns ``mID``, ``title``, ``year`` plus one column
    per genre), and ``train`` / ``test`` (columns ``uID``, ``mID``, ``rating``).

    Attributes set by ``__init__``:
        allusers / allmovies: lists of user and movie ids.
        genres: movie columns other than ``mID``, ``title``, ``year``.
        uid2idx / mid2idx: id -> row/column index in the rating matrix.
        Mr: dense rating matrix of shape (#allusers, #allmovies), 0 = unrated.
        Mm: movie feature matrix, left as None here.
        sim: item-item similarity of shape (#allmovies, #allmovies), all zeros
            until something assigns it.
    """
    def __init__(self,data):
        self.data=data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        self.mid2idx = {mid: idx for idx, mid in enumerate(self.data.movies.mID)}
        self.uid2idx = {uid: idx for idx, uid in enumerate(self.data.users.uID)}
        self.Mr=self.rating_matrix()
        self.Mm=None
        self.sim=np.zeros((len(self.allmovies),len(self.allmovies)))

    def rating_matrix(self):
        """
        Convert the training ratings to a numpy array of shape (#allusers,#allmovies)

        Cells without a training rating are 0. Raises KeyError if the training
        data references a user or movie id missing from the lookup tables.
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID]
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(self.data.train.rating)

        shape = (len(self.allusers), len(self.allmovies))
        # toarray() already returns a fresh ndarray; duplicate (user, movie)
        # entries are summed by the COO -> dense conversion.
        return coo_matrix((rating_train, (ind_user, ind_movie)), shape=shape).toarray()

    def predict_everything_to_3(self):
        """
        Predict everything to 3 for the test data

        Returns an integer numpy array with one 3 per test row, independent of
        the values stored in the test ratings.
        """
        return np.full(len(self.data.test), 3)

    def predict_to_user_average(self):
        """
        Predict to average rating for the user.

        The average is taken over the user's non-zero training ratings. Users
        with no such rating are predicted as 3.
        Returns a float numpy array with one entry per test row.
        """
        train = self.data.train
        rated = train[train.rating != 0]
        user_means = rated.groupby('uID')['rating'].mean()

        # Test users absent from the training data map to NaN -> default 3.
        averages = self.data.test.uID.map(user_means).fillna(3)
        return averages.to_numpy(dtype=float)

    def predict_from_sim(self,uids,mids):
        """
        Predict a user rating on a movie given userID and movieID

        The prediction is the similarity-weighted average of the ratings the
        user gave in the training data, using the target movie's row of
        ``self.sim`` as weights.

        ``uids`` / ``mids`` may be two scalars (returns one float32 value) or
        two equally long iterables (returns a float32 array). A prediction is
        NaN when the weights of the user's rated movies sum to 0, e.g. for a
        user without training ratings; ``rmse`` imputes such values.
        Raises KeyError for ids missing from the lookup tables.
        """
        is_single = np.isscalar(uids) and np.isscalar(mids)

        if is_single:
            uids = np.array([uids])
            mids = np.array([mids])

        predicted_ratings = np.zeros(len(uids), dtype=np.float32)

        for idx, (uid, mid) in enumerate(zip(uids, mids)):
            user_ratings = self.Mr[self.uid2idx[uid]]
            movie_sims = self.sim[self.mid2idx[mid]]

            # Only movies the user actually rated contribute to the estimate.
            rated_mask = user_ratings > 0
            weights = movie_sims[rated_mask]
            total_weight = np.sum(weights)

            if total_weight == 0:
                # No usable evidence: mark as missing instead of dividing by 0.
                predicted_ratings[idx] = np.nan
            else:
                predicted_ratings[idx] = np.dot(weights, user_ratings[rated_mask]) / total_weight

        return predicted_ratings[0] if is_single else predicted_ratings

    def predict(self):
        """
        Predict ratings in the test data. Returns predicted rating in a numpy array of size (# of rows in testdata,)
        """
        return self.predict_from_sim(self.data.test.uID, self.data.test.mID)

    def rmse(self,yp):
        """
        Root-mean-square error of predictions ``yp`` against the test ratings.

        NaN predictions are imputed to 3. ``yp`` itself is left untouched; the
        imputation happens on a float copy.
        """
        yp = np.array(yp, dtype=float)
        yp[np.isnan(yp)]=3 #In case there is nan values in prediction, it will impute to 3.
        yt=np.array(self.data.test.rating)
        return np.sqrt(((yt-yp)**2).mean())

    
class ContentBased(RecSys):
    """
    Recommender whose item-item similarity is derived from the movies' genre
    indicator columns.
    """
    def __init__(self, data):
        # The base initializer already stores `data` and builds the id maps
        # and rating matrix; only the feature matrix is added here.
        super().__init__(data)
        self.Mm = self.calc_movie_feature_matrix()

    def calc_movie_feature_matrix(self):
        """
        Build the movie feature matrix as a scipy CSR matrix of shape (#allmovies, #genres)
        """
        feature_columns = self.data.movies.columns.drop(['mID', 'title', 'year'])
        feature_values = np.array(self.data.movies[feature_columns])
        return csr_matrix(feature_values)

    def calc_item_item_similarity(self):
        """
        Fill self.sim with the pairwise Jaccard similarity of the movie feature rows
        """
        # Jaccard Similarity: J(A, B) = |A∩B| / |A∪B|
        movie_features = self.Mm
        self.sim = jaccard_sparse(movie_features)
                
class Collaborative(RecSys):
    """
    Recommender whose item-item similarity is derived from the rating matrix
    (the users' ratings of each movie) rather than from movie features.
    """
    def __init__(self,data):
        super().__init__(data)

    def calc_item_item_similarity(self, simfunction, *X):
        """
        Create item-item similarity using similarity function.
        X is an optional transformed matrix of Mr

        ``simfunction`` is called without arguments when X is omitted and with
        the first extra argument otherwise; its result is stored in self.sim.
        """
        if not X:
            self.sim = simfunction()
        else:
            # *X arrives as a tuple (X,), so X[0] is the actual transformed matrix.
            self.sim = simfunction(X[0])

    def cossim(self):
        """
        Calculates item-item similarity for all pairs of items using cosine similarity (values from 0 to 1) on utility matrix
        Returns a cosine similarity matrix of size (#all movies, #all movies)

        Each user's ratings are mean-centred first: the user's average over
        rated movies (rating > 0) is subtracted from every rating, while
        unrated cells stay at 0.
        """
        # Cosine Similarity: C(A, B) = (A.B) / (||A||.||B||)
        ratings = self.Mr
        rated = ratings > 0

        counts = rated.sum(axis=1)
        totals = ratings.sum(axis=1, where=rated)
        # Users without any rating get average 0 instead of a 0/0 division.
        averages = np.divide(
            totals, counts,
            out=np.zeros(ratings.shape[0], dtype=float),
            where=counts > 0,
        )

        centered = ratings.astype(float) - averages[:, None]
        # Unrated cells carry no information and must not contribute.
        centered[ratings == 0] = 0.0
        entries = centered.astype(np.float32)

        # Transpose so that rows are movies; cos_sparse compares rows.
        item_item_mat = csr_matrix(entries.T)
        return cos_sparse(item_item_mat)

    def jacsim(self,Xr):
        """
        Calculates item-item similarity for all pairs of items using jaccard similarity (values from 0 to 1)
        Xr is the transformed rating matrix.

        Two movies "intersect" on a user only when that user gave both the
        same rating level (1 to 5); the set size of a movie is the number of
        users who rated it at all.
        """
        # Jaccard Similarity: J(A, B) = |A∩B| / |A∪B|
        # Transpose and cast once instead of once per rating level.
        item_ratings = Xr.T.astype(int)

        # One 0/1 indicator matrix (movies x users) per rating level.
        multi_input = [
            csr_matrix((item_ratings == level).astype(int))
            for level in range(1, 6)
        ]

        original = csr_matrix((item_ratings > 0).astype(int))

        return jaccard_sparse_multi(original, multi_input)
    
    

In [10]:
from collections import namedtuple

# Lightweight immutable container bundling the four DataFrames so they can be
# handed to the recommender classes as a single `data` argument.
Data = namedtuple(
    'Data',
    ['users', 'movies', 'train', 'test'],
)
data = Data(users=MV_users, movies=MV_movies, train=train, test=test)



## Section 2