In [18]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
from scipy import linalg as la
from scipy import sparse
from pandas.api.types import CategoricalDtype
from scipy.sparse import linalg as spla

In [19]:
# Directory holding ratings.csv. Each collaborator points `path` at their own copy:
#   Benjamin: path = "/Users/Armen/Desktop/SpringDataProject/"
#   Ben C:    path = "/Users/benchristensen/Desktop/Recommneder_System_Project/ml-20m/"
path = "/Users/Armen/Desktop/SpringDataProject/"

# Only the first 10000 rows of the ratings file are loaded.
r = pd.read_csv("{}ratings.csv".format(path), nrows=10000)

In [20]:
# Optional pre-filter (currently disabled): keep only those movies with more than
# 17 ratings (cutting out ~50% of movies).
#merged = r.merge(r.groupby("movieId").size().reset_index(name='count'), how='right', on='movieId')
#df = merged[merged["count"]>17].sort_values(['userId', 'movieId'])

# Build a matrix from the ratings list.
# Each raw id is mapped to its position in the sorted list of unique ids, so the
# resulting integer codes can be used directly as matrix coordinates.
user_c = CategoricalDtype(sorted(r.userId.unique()), ordered=True)
movie_c = CategoricalDtype(sorted(r.movieId.unique()), ordered=True)
row = r.userId.astype(user_c).cat.codes
col = r.movieId.astype(movie_c).cat.codes

# This matrix is users x movies: rows are indexed by user code, columns by movie code.
sparse_matrix = sparse.csr_matrix(
    (r['rating'], (row, col)),
    shape=(len(user_c.categories), len(movie_c.categories)),
)

# Wrap the matrix in a sparse frame labelled with the original user / movie ids.
# No fill value is supplied (default_fill_value=0 is left disabled).
# NOTE(review): an earlier comment here said missing ratings get replaced with 3's;
# no such replacement happens in this cell.
dfs = pd.SparseDataFrame(
    sparse_matrix,
    index=user_c.categories,
    columns=movie_c.categories,
)

# Workflow

In [21]:
# First we do an overall train/test split along user lines using sklearn.
# This assumes dfs is user x movie, so every row handed to the splitter is one user.
# To split along movie lines instead, transpose the matrix before passing it in.
ratio = .2       # fraction of rows held out as the test set
split_seed = 0   # fixed seed so a fresh Restart & Run All reproduces the same split
X_train, X_test = train_test_split(dfs, test_size=ratio, random_state=split_seed)

In [22]:
def set_up_scoring(X):
    """
    Hide one existing rating per user so we have a label y to test accuracy on.

    Parameters
    ----------
    X : user x movie ratings frame (sparse or dense). Missing ratings must be NaN
        and the column labels must be numeric movie ids.

    Returns
    -------
    (X_hidden, y)
        X_hidden : sparse copy of X in which, for every row, exactly one rating
            that was present has been replaced by NaN. X itself is not modified.
        y : pd.Series whose i-th value is the rating hidden from the i-th row and
            whose i-th index label is the movie id of that rating (stored as float).

    Raises
    ------
    ValueError
        If a row contains no rating at all, so there is nothing to hide for it.
    """
    # Older pandas sparse frames must be densified explicitly; a regular
    # DataFrame (no to_dense method) is used as-is.
    dense = X.to_dense() if hasattr(X, "to_dense") else X
    # Private float copy: the caller's data is never touched, and writes go to a
    # plain array instead of through chained .loc[...].iloc[...] indexing, which
    # is not guaranteed to modify the underlying frame.
    ratings = np.array(dense.values, dtype=float)
    n_users = ratings.shape[0]

    y = np.zeros(n_users)
    indices = np.zeros(n_users)
    rated = ~np.isnan(ratings)
    for i in range(n_users):
        # Only movies this user actually rated are candidates; drawing from all
        # columns would mostly pick unrated movies and yield NaN labels.
        candidates = np.flatnonzero(rated[i])
        if candidates.size == 0:
            raise ValueError(
                "row {} has no ratings, so no rating can be hidden".format(i)
            )
        j = np.random.choice(candidates)
        indices[i] = dense.columns[j]
        y[i] = ratings[i, j]
        ratings[i, j] = np.nan  # remove that rating from the returned matrix

    hidden = pd.DataFrame(ratings, index=dense.index, columns=dense.columns)
    if hasattr(hidden, "to_sparse"):
        hidden = hidden.to_sparse()
    else:
        # pandas versions without DataFrame.to_sparse: use sparse columns instead.
        hidden = hidden.astype(pd.SparseDtype("float", np.nan))
    # Store the labels as a series, with movieId as index and rating as value.
    return hidden, pd.Series(data=y, index=indices)

def custom_scoring(X,y):
    """
    Score a completed ratings matrix against the ratings that were hidden from it.

    Parameters
    ----------
    X : completed ratings matrix; its columns are looked up by movie id and its
        rows by position.
    y : series of hidden ratings; the i-th index label is the movie id (converted
        with int()) whose rating was hidden from the i-th row of X.

    Returns
    -------
    The mean squared error between y and the values read out of X.
    """
    # Collect, row by row, the prediction sitting where each rating was hidden.
    predictions = np.zeros_like(y)
    for position, label in enumerate(y.index):
        movie_id = int(label)
        predictions[position] = X.loc[:, movie_id].iloc[position]

    return mean_squared_error(y, predictions)
        