In [1]:
import numpy as np

In [10]:
def movie_dict(arr):
    unique, counts = np.unique(arr, return_counts=True)
    k=len(counts)
    L=len(arr)
    movie_id_dict = {}

    for movie_id in arr:
        movie_id_dict[str(movie_id)] = -1
    for i,key in enumerate(movie_id_dict.keys()):
        movie_id_dict[key] = i
    return movie_id_dict
def get_movies_for_user(user,data,movie_mapping):
    data = data[data[:,0]==user]
    movies_user_seen=[]
    for movie in data[:,1]:
        movies_user_seen.append(movie_mapping[str(movie)])
    return movies_user_seen


def one_hot_encoding(data):
    #data = data.astype(int)
    L = data.shape[0]
    N = int(np.max(data[:,0]))
    unique, counts = np.unique(data[:,1], return_counts=True)
    k=len(counts)
    matrix_encoding_user = np.zeros((L,N))
    matrix_encoding_movie = np.zeros((L,k))
    matrix_encoding_other_movies_rated = np.zeros((L,k))
    matrix_encoding_last_movie_rated = np.zeros((L,k))
    movie_mapping = movie_dict(data[:,1])
    matrix_encoding_time = np.array(data[:,3])
    max_time=np.max(matrix_encoding_time)
    min_time=np.min(matrix_encoding_time)
    matrix_encoding_time = ((matrix_encoding_time-min_time)/max_time)

    for i,user in enumerate(data[:,0]):
        matrix_encoding_user[i,int(user)-1] = 1
    for i,movie in enumerate(data[:,1]):
        matrix_encoding_movie[i,movie_mapping[str(movie)]] = 1
    old_user=-1
    for i,user in enumerate(data[:,0]):
        if old_user!=user:
            user_movies = get_movies_for_user(user,data,movie_mapping)
        matrix_encoding_other_movies_rated[i,user_movies] = 1/len(user_movies)
    old_user=-1
    for i,user in enumerate(data[:,0]):
        if old_user==user:
            matrix_encoding_last_movie_rated[i,:] = matrix_encoding_movie[i-1,:]
        old_user=user
    
    
    full_matrix = np.concatenate((matrix_encoding_user,matrix_encoding_movie,matrix_encoding_other_movies_rated,matrix_encoding_time[:, None],matrix_encoding_last_movie_rated),axis=1)
    return full_matrix
def one_hot_encoding_simple(data):
    #data = data.astype(int)
    L = data.shape[0]
    N = int(np.max(data[:,0]))
    unique, counts = np.unique(data[:,1], return_counts=True)
    k=len(counts)
    matrix_encoding_user = np.zeros((L,N))
    matrix_encoding_movie = np.zeros((L,k))
    
    movie_mapping = movie_dict(data[:,1])
    

    for i,user in enumerate(data[:,0]):
        matrix_encoding_user[i,int(user)-1] = 1
    for i,movie in enumerate(data[:,1]):
        matrix_encoding_movie[i,movie_mapping[str(movie)]] = 1
    
    
    
    full_matrix = np.concatenate((matrix_encoding_user,matrix_encoding_movie),axis=1)
    return full_matrix

def one_hot_encoding_simple_user_movies(data):
    #data = data.astype(int)
    L = data.shape[0]
    N = int(np.max(data[:,0]))
    unique, counts = np.unique(data[:,1], return_counts=True)
    k=len(counts)
    matrix_encoding_user = np.zeros((L,N))
    matrix_encoding_movie = np.zeros((L,k))
    matrix_encoding_other_movies_rated = np.zeros((L,k))
    matrix_encoding_last_movie_rated = np.zeros((L,k))
    movie_mapping = movie_dict(data[:,1])
    matrix_encoding_time = np.array(data[:,3])
    max_time=np.max(matrix_encoding_time)
    min_time=np.min(matrix_encoding_time)
    matrix_encoding_time = ((matrix_encoding_time-min_time)/max_time)

    for i,user in enumerate(data[:,0]):
        matrix_encoding_user[i,int(user)-1] = 1
    for i,movie in enumerate(data[:,1]):
        matrix_encoding_movie[i,movie_mapping[str(movie)]] = 1
    old_user=-1
    for i,user in enumerate(data[:,0]):
        if old_user!=user:
            user_movies = get_movies_for_user(user,data,movie_mapping)
        matrix_encoding_other_movies_rated[i,user_movies] = 1/len(user_movies)
    full_matrix = np.concatenate((matrix_encoding_user,matrix_encoding_movie,matrix_encoding_other_movies_rated),axis=1)
    return full_matrix

In [3]:
from scipy import sparse
"""
WARNING very memory intensive do not run unless needed!!!

Generate the sparse one hot encoded matrix that includes:
User | movie | other movies rated | Time | Last movie rated
"""
filename="ml-latest-small/ratings.csv"
data=np.genfromtxt(filename,skip_header=1,delimiter=",")
unique, counts = np.unique(data[:,1], return_counts=True)
full_matrix_test = one_hot_encoding(data)
full_matrix_sparse = sparse.csr_matrix(full_matrix_test,dtype=float)



In [4]:
"""
Save the matrix as .npz
"""
full_matrix_test.shape
sparse.save_npz('ml_latest_small_formated.npz', full_matrix_sparse)
#print(len(counts))

In [7]:
import numpy as np
from scipy import sparse
filename="ml-latest-small/ratings.csv"
data=np.genfromtxt(filename,skip_header=1,delimiter=",")
full_matrix_simple = one_hot_encoding_simple(data)
full_matrix_simple_sparse = sparse.csr_matrix(full_matrix_simple,dtype=float)
sparse.save_npz('ml_latest_small_simple_formated.npz', full_matrix_simple_sparse)


In [11]:
import numpy as np
from scipy import sparse
filename="ml-latest-small/ratings.csv"
data=np.genfromtxt(filename,skip_header=1,delimiter=",")
full_matrix_simple_user_movies = one_hot_encoding_simple_user_movies(data)
full_matrix_simple_user_movies_sparse = sparse.csr_matrix(full_matrix_simple_user_movies,dtype=float)
sparse.save_npz('ml_latest_small_simple_user_movies_formated.npz', full_matrix_simple_user_movies_sparse)

In [8]:
"""
Generate a dictionary with movie id as key and a one hot encoded
genre array as the value.
This is used to generate a matrix of one hot encoded genres
to append to the already created sparse matrix
"""
genres_filename="ml-latest-small/movies.csv"
f = open(genres_filename, "r")
f.readline()
nrMovies=x = len(f.readlines())
f = open(genres_filename, "r")
f.readline()
genre_dict={"unknown":0,"Action":1,"Adventure":2,"Animation":3,"Children":4,"Comedy":5,
"Crime":6,"Documentary":7,"Drama":8,"Fantasy":9,"Film-Noir":10,"Horror":11,"Musical":12,
"Mystery":13,"Romance":14,"Sci-Fi":15,"Thriller":16,"War":17,"Western":18,"IMAX":19}
movie_genre_dict={}
nrexcluded=0
#genre_matrix=np.zeros((nrMovies,len(genre_dict)))
for line in f.readlines():
    movie_ind = line.split(",")[0]
    genres_string = line.split(",")[-1]
    genres_string=genres_string[:-1]
    movie_encoded=np.zeros((1,len(genre_dict)))
    print(line)
    for genre in genres_string.split("|"):
        try:
            movie_encoded[0,genre_dict[genre]] = 1
            movie_genre_dict[movie_ind]=movie_encoded
            
        except:
            nrexcluded=nrexcluded+1
print(movie_genre_dict)
print("Nr. exluding",nrexcluded)


1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy

2,Jumanji (1995),Adventure|Children|Fantasy

3,Grumpier Old Men (1995),Comedy|Romance

4,Waiting to Exhale (1995),Comedy|Drama|Romance

5,Father of the Bride Part II (1995),Comedy

6,Heat (1995),Action|Crime|Thriller

7,Sabrina (1995),Comedy|Romance

8,Tom and Huck (1995),Adventure|Children

9,Sudden Death (1995),Action

10,GoldenEye (1995),Action|Adventure|Thriller

11,"American President, The (1995)",Comedy|Drama|Romance

12,Dracula: Dead and Loving It (1995),Comedy|Horror

13,Balto (1995),Adventure|Animation|Children

14,Nixon (1995),Drama

15,Cutthroat Island (1995),Action|Adventure|Romance

16,Casino (1995),Crime|Drama

17,Sense and Sensibility (1995),Drama|Romance

18,Four Rooms (1995),Comedy

19,Ace Ventura: When Nature Calls (1995),Comedy

20,Money Train (1995),Action|Comedy|Crime|Drama|Thriller

21,Get Shorty (1995),Comedy|Crime|Thriller

22,Copycat (1995),Crime|Drama|Horror|Mystery|Thriller

23,Assassins (1995),A

In [12]:
"""
Generate the genre matrix from the dictionary created above
append it to the already created sparse matrix
save it as a seperate file.
"""
from scipy import sparse
def generate_genre_matrix(data,movie_genre_dict):
    #filename="ml-latest-small/ratings.csv"
    #data=np.genfromtxt(filename,skip_header=1,delimiter=",")
    movieID=data[:,1].astype(int)
    genre_matrix = np.zeros((len(movieID),len(genre_dict)))
    for i,id in enumerate(movieID):
        try:
            genre_matrix[i,:] = movie_genre_dict[str(id)]
        except:
            None
    genre_matrix = sparse.csr_matrix(genre_matrix,dtype=float)
    return genre_matrix

genre_matrix=generate_genre_matrix(data,movie_genre_dict)
#full_matrix_sparse = sparse.load_npz("ml_latest_small_formated.npz")
#full_matrix_sparse = sparse.load_npz("ml_latest_small_simple_formated.npz")
full_matrix_sparse = sparse.load_npz("ml_latest_small_simple_user_movies_formated.npz")
full_matrix_sparse_with_genres = sparse.hstack([full_matrix_sparse,genre_matrix])
sparse.save_npz('ml_latest_small_simple_user_movies_formated_with_genres.npz', full_matrix_sparse_with_genres)
print(full_matrix_sparse.shape)
print(full_matrix_sparse_with_genres.shape)




(100836, 20058)
(100836, 20078)


: 