In [None]:
import numpy as np
def movie_dict(arr):
    """
    Map every distinct movie id in `arr` to a dense column index.

    Keys are ``str(movie_id)``; values are consecutive integers starting at 0,
    assigned in order of first appearance in `arr`.

    Parameters
    ----------
    arr : iterable
        Movie ids (one per rating row; duplicates are expected).

    Returns
    -------
    dict
        ``{str(movie_id): column_index}``.
    """
    movie_id_dict = {}
    for movie_id in arr:
        # len(...) is the next free index; setdefault leaves ids that were
        # already seen untouched, so each id keeps its first-appearance index.
        movie_id_dict.setdefault(str(movie_id), len(movie_id_dict))
    return movie_id_dict
def get_movies_for_user(user,data,movie_mapping):
    """
    Return the mapped column index of every movie rated by `user`.

    Rows of `data` whose first column equals `user` are selected, and the
    value in their second column is looked up (as a string) in
    `movie_mapping`. The result keeps the row order of `data`.
    """
    user_rows = data[data[:, 0] == user]
    return [movie_mapping[str(movie_id)] for movie_id in user_rows[:, 1]]


def one_hot_user(data):
    """
    One-hot encode the user id found in the first column of `data`.

    The result is a dense array with one row per row of `data` and as many
    columns as the largest user id; the user with id ``u`` is placed in
    column ``u - 1``.
    """
    n_rows = data.shape[0]
    n_users = int(np.max(data[:, 0]))
    encoding = np.zeros((n_rows, n_users))

    # Set one cell per row in a single indexed assignment.
    user_columns = data[:, 0].astype(int) - 1
    encoding[np.arange(n_rows), user_columns] = 1

    return encoding

def one_hot_movies(data):
    """
    One-hot encode the movie id found in the second column of `data`.

    Column indices come from `movie_dict`, i.e. movies are numbered in order
    of first appearance. The matrix width is taken from that same mapping so
    the column count and the indices written into it always agree.

    Returns
    -------
    numpy.ndarray
        Dense array of shape ``(rows of data, distinct movies)`` with a
        single 1 per row.
    """
    movie_ids = data[:, 1]
    movie_mapping = movie_dict(movie_ids)

    n_rows = data.shape[0]
    matrix_encoding_movie = np.zeros((n_rows, len(movie_mapping)))

    movie_columns = [movie_mapping[str(movie)] for movie in movie_ids]
    matrix_encoding_movie[np.arange(n_rows), movie_columns] = 1

    return matrix_encoding_movie

def one_hot_other_movies(data):
    """
    Encode, for every rating row, the set of movies rated by that row's user.

    For row ``i`` the columns of all movies rated by the row's user
    (including the row's own movie) are set to ``1 / n``, where ``n`` is the
    number of rating rows belonging to that user. Column indices come from
    `movie_dict`.

    The per-user column lists are collected in one pass over `data` instead
    of re-filtering the whole array for every row, which made the original
    quadratic in the number of ratings.

    Returns
    -------
    numpy.ndarray
        Dense array of shape ``(rows of data, distinct movies)``.
    """
    users = data[:, 0]
    movies = data[:, 1]

    movie_mapping = movie_dict(movies)
    matrix_encoding_other_movies_rated = np.zeros((data.shape[0], len(movie_mapping)))

    # user id -> list of movie columns, in the row order of `data`
    columns_by_user = {}
    for user, movie in zip(users, movies):
        columns_by_user.setdefault(user, []).append(movie_mapping[str(movie)])

    for i, user in enumerate(users):
        user_movies = columns_by_user[user]
        matrix_encoding_other_movies_rated[i, user_movies] = 1 / len(user_movies)
    return matrix_encoding_other_movies_rated

def one_hot_last_movie(data,matrix_encoding_movie):
    """
    Encode the movie rated on the previous row by the same user.

    Row ``i`` of the result is a copy of row ``i - 1`` of
    `matrix_encoding_movie` when rows ``i - 1`` and ``i`` of `data` share the
    same user id (first column); otherwise it stays all zeros. The first row
    is always zero because it has no predecessor.

    Parameters
    ----------
    data : numpy.ndarray
        Rating rows; column 0 is the user id, column 1 the movie id.
    matrix_encoding_movie : numpy.ndarray
        Per-row movie encoding, indexable as ``[rows, :]``.

    Returns
    -------
    numpy.ndarray
        Dense array of shape ``(rows of data, distinct movie ids in data)``.
    """
    n_rows = data.shape[0]
    n_movies = np.unique(data[:, 1]).size
    matrix_encoding_last_movie_rated = np.zeros((n_rows, n_movies))

    # Compare each row with its predecessor directly. This avoids a sentinel
    # "previous user" value that could collide with a real user id and make
    # row 0 copy from index -1 (the last row).
    users = data[:, 0]
    repeat_rows = np.flatnonzero(users[1:] == users[:-1]) + 1
    if repeat_rows.size:
        matrix_encoding_last_movie_rated[repeat_rows, :] = matrix_encoding_movie[repeat_rows - 1, :]
    return matrix_encoding_last_movie_rated

def get_normal_time(data):
    """
    Min-max scale the fourth column of `data` (the timestamp) to [0, 1].

    Each value is mapped to ``(t - min) / (max - min)``. The original divided
    by ``max`` alone, which never reaches 1 and divides by zero when the
    maximum is 0.

    Parameters
    ----------
    data : numpy.ndarray
        Rating rows; column 3 holds the time value.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(rows of data, 1)``. All zeros when every
        time value is identical; empty when `data` has no rows.
    """
    matrix_encoding_time = np.asarray(data[:, 3], dtype=float)
    if matrix_encoding_time.size == 0:
        return matrix_encoding_time[:, None]

    min_time = np.min(matrix_encoding_time)
    time_span = np.max(matrix_encoding_time) - min_time
    if time_span == 0:
        # A constant column carries no information; avoid 0/0.
        return np.zeros_like(matrix_encoding_time)[:, None]

    return ((matrix_encoding_time - min_time) / time_span)[:, None]
    
def get_movie_genre_dict(genres_filename="ml-latest-small/movies.csv"):
    """
    Generate a dictionary with movie id as key and a one hot encoded
    genre array as the value.
    This is used to generate a matrix of one hot encoded genres
    to append to the already created sparse matrix

    Parameters
    ----------
    genres_filename : str, optional
        Path of the movies CSV. The first line is treated as a header; on
        every other line the first comma-separated field is the movie id and
        the last one is a ``|``-separated list of genre names.

    Returns
    -------
    movie_genre_dict : dict
        ``{movie id (str): array of shape (1, number of genres)}``. A movie
        only gets an entry if at least one of its genres is a known genre.
    genre_dict : dict
        ``{genre name: column index}`` used for the encoding.
    """
    genre_dict={"unknown":0,"Action":1,"Adventure":2,"Animation":3,"Children":4,"Comedy":5,
    "Crime":6,"Documentary":7,"Drama":8,"Fantasy":9,"Film-Noir":10,"Horror":11,"Musical":12,
    "Mystery":13,"Romance":14,"Sci-Fi":15,"Thriller":16,"War":17,"Western":18,"IMAX":19}
    movie_genre_dict={}
    nrexcluded=0

    # Read the file once and make sure the handle is closed. The encoding is
    # pinned so parsing does not depend on the platform default.
    with open(genres_filename, "r", encoding="utf-8") as f:
        next(f, None)  # skip the header line
        for line in f:
            # Strip only the line terminator; slicing off the last character
            # would damage the final genre of a file without trailing newline.
            fields = line.rstrip("\n").split(",")
            movie_ind = fields[0]
            movie_encoded = np.zeros((1, len(genre_dict)))
            for genre in fields[-1].split("|"):
                try:
                    movie_encoded[0, genre_dict[genre]] = 1
                except KeyError:
                    # Genre label not in genre_dict: count it and move on.
                    nrexcluded += 1
                else:
                    movie_genre_dict[movie_ind] = movie_encoded
    print("Nr. exluding",nrexcluded)
    return movie_genre_dict, genre_dict

def generate_genre_matrix(data,movie_genre_dict,genre_dict):
    """
    Build the per-rating genre matrix as a sparse CSR matrix.

    For every row of `data`, the movie id in the second column is cast to
    int, converted to a string and looked up in `movie_genre_dict`; the
    stored genre encoding becomes that row of the result. Movies without an
    entry keep an all-zero row.

    Parameters
    ----------
    data : numpy.ndarray
        Rating rows; column 1 is the movie id.
    movie_genre_dict : dict
        ``{movie id (str): genre encoding}`` with encodings that broadcast
        to a row of length ``len(genre_dict)``.
    genre_dict : dict
        Genre name to column index; only its length is used here.

    Returns
    -------
    scipy.sparse.csr_matrix
        Shape ``(rows of data, len(genre_dict))``, dtype float.
    """
    movie_ids = data[:,1].astype(int)
    genre_matrix = np.zeros((len(movie_ids),len(genre_dict)))
    for row, movie_id in enumerate(movie_ids):
        # Only a missing movie is tolerated; any other problem (for example
        # an encoding of the wrong width) should surface instead of being
        # silently swallowed.
        encoded = movie_genre_dict.get(str(movie_id))
        if encoded is not None:
            genre_matrix[row,:] = encoded
    return sparse.csr_matrix(genre_matrix,dtype=float)

In [None]:
from scipy import sparse
"""
WARNING very memory intensive do not run unless needed!!!

Generate the sparse one hot encoded matrix that includes:
User | movie | other movies rated | Time | Last movie rated
"""
# Load the ratings CSV as a float array, skipping the header row. The helper
# functions read column 0 as the user id, column 1 as the movie id and
# column 3 as the time value.
filename="ml-latest-small/ratings.csv"
data=np.genfromtxt(filename,skip_header=1,delimiter=",")
# Each helper returns a dense array; it is converted to CSR right away to
# keep memory in check.
user_encoded = sparse.csr_matrix(one_hot_user(data),dtype=float)
print("user done")
# Kept dense for now: one_hot_last_movie below indexes into it row by row.
movies_encoded = one_hot_movies(data)
print("movies done")
other_movies = sparse.csr_matrix(one_hot_other_movies(data),dtype=float)
print("other movies done")
last_movies = sparse.csr_matrix(one_hot_last_movie(data,movies_encoded),dtype=float)
print("last movies done")
# The dense movie encoding is no longer needed; rebind the name to its
# sparse form.
movies_encoded = sparse.csr_matrix(movies_encoded,dtype=float)
# NOTE(review): `time` would shadow the stdlib module of the same name if it
# is ever imported in this notebook.
time = sparse.csr_matrix(get_normal_time(data),dtype=float)
print("time done")

# Genre features: parse the movies file once, then expand to one row per rating.
movie_genre_dict, genre_dict = get_movie_genre_dict()
genre_matrix=generate_genre_matrix(data,movie_genre_dict,genre_dict)
print("all done :)")

#full_matrix_sparse = sparse.csr_matrix(full_matrix_test,dtype=float)

In [None]:
import seaborn

In [None]:
from scipy import sparse

# Concatenate the feature blocks column-wise and persist the result.
# Only user, movie, last-movie and genre features are stacked here; the
# `other_movies` and `time` matrices built earlier are not part of this file.
full_matrix = sparse.hstack([user_encoded,movies_encoded,last_movies,genre_matrix])
sparse.save_npz('user_encoded-movies_encoded-last_movies-genre_matrix.npz', full_matrix)
#full_matrix_sparse = sparse.hstack((user_encoded,movies_encoded,time))