In [7]:
#Importando Librerias
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
import pickle
import time


In [8]:
#Constants
PATH_RATINGS_DF = "static/data/ratings.csv"
PATH_MOVIES_DF = "static/data/movies.csv"

In [9]:


print('Creando Dataframe Ratings....')
start_ratings_ds = time.time()
chunk_rating = pd.read_csv(PATH_RATINGS_DF,chunksize=1000000 ,
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})
end_ratings_ds = time.time()
df_ratings = pd.concat(chunk_rating)
print("Dataframe Ratings creado en: ",(end_ratings_ds-start_ratings_ds),"sec")

print('Creando Dataframe Movies....')
start_movies_ds = time.time()
chunk_movie = pd.read_csv(PATH_MOVIES_DF,chunksize=1000000,
    usecols=['movieId', 'title',"genres"],
    dtype={'movieId': 'int32', 'title': 'str',"genres":"str"})
end_movies_ds = time.time()
df_movies = pd.concat(chunk_movie)
print("Dataframe Movies creado en: ",(end_ratings_ds-start_ratings_ds),"sec")


Creando Dataframe Ratings....
Dataframe Ratings creado en:  0.014435768127441406 sec
Creando Dataframe Movies....
Dataframe Movies creado en:  0.014435768127441406 sec


In [10]:
#Información del dataframe Movies
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  58098 non-null  int32 
 1   title    58098 non-null  object
 2   genres   58098 non-null  object
dtypes: int32(1), object(2)
memory usage: 1.1+ MB


In [11]:
#Información del dataframe de Ratings
df_ratings.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
dtypes: float32(1), int32(2)
memory usage: 317.6 MB


In [None]:
#Visualizando datos del dataframe Ratings
df_ratings.head()

In [None]:
sorted(df_ratings['rating'].unique()) #Listado de ratings en orden ascendente

In [None]:
df_ratings.groupby(['rating'])['userId'].count() # Agrupación de rating por usuario

In [None]:
plt.hist(df_ratings['rating'],bins=10)

In [None]:
#Inspeccionar Data
n_users = df_ratings['userId'].unique().shape[0]
n_movies = df_ratings['movieId'].unique().shape[0]
n_ratings = df_ratings.shape[0]
ratings_per_user = n_ratings/n_users


print('Usuario: {}'.format(n_users))
print('Peliculas: {}'.format(n_movies))
print('Ratings: {}'.format(n_ratings))
print('Ratings por usuario: {}'.format(ratings_per_user))


In [12]:
print('Creando matrix movie/user...')


movie_categ = CategoricalDtype(sorted(df_ratings['movieId'].unique()),ordered=True)
user_categ = CategoricalDtype(sorted(df_ratings['userId'].unique()),ordered=True)

#CSR MATRIX
row = df_ratings['movieId'].astype(movie_categ).cat.codes
col = df_ratings['userId'].astype(user_categ).cat.codes

data = df_ratings['rating'].values

df_csr = csr_matrix((data,(row,col)),shape=(movie_categ.categories.size,
                                            user_categ.categories.size))

print('Matrix creada')


Creando matrix movie/user...
Matrix creada


In [15]:
print('Creando Modelo...')
#Coseno , buscar los elementos mas cercanos, teniendo en cuenta el valor del coseno
# En caso los elementos sean similares los va agrupar haciendo clusters diferentes
model_knn = NearestNeighbors(metric='cosine',algorithm='brute')
model_knn.fit(df_csr)
print('Modelo creado')

print('Serializando modelo....')
pickle.dump(model_knn, open('static/model/knn_model.pickle', 'wb')) #Guardar modelo serializado
print('Modelo serializado guardado')


Creando Modelo...
Modelo creado


In [16]:

#Crear sistema de recomendación , Collaborative Filtering Item

def recommender_movie(movie_id,n_results = 5):
    
    # Ubicar indice de la fila

    query_index = movie_categ.categories.get_loc(movie_id)
    
    print('Movie Id: {0} - Matrix index: {1}'.format(movie_id,query_index))

    query_vector = df_csr[query_index]
    
    n_results += 1
    distances, indices = model_knn.kneighbors(query_vector,n_neighbors=n_results)


    for i in range(0,len(distances.flatten())):
        if i == 0:
        
            movie = df_movies[df_movies['movieId']==movie_id]
            print('Recommendations for {0}:\n'.format(movie['title'].values[0]))


        else:
    
            idx = movie_categ.categories[indices.flatten()[i]]
         
            movie = df_movies[df_movies['movieId']==idx]
            movie_title = movie['title'].values[0]
            movie_genres = movie['genres'].values[0]
            
         
            movie_dist = distances.flatten()[i] *100
    
            print('{0}: {1:4.1f}% :: {2} - {3}'.format(i,movie_dist,movie_title,movie_genres))
             
      
recommender_movie(4369,5)

Movie Id: 4369 - Matrix index: 4275
Recommendations for Fast and the Furious, The (2001):

1: 53.6% :: 2 Fast 2 Furious (Fast and the Furious 2, The) (2003) - Action|Crime|Thriller
2: 56.5% :: xXx (2002) - Action|Crime|Thriller
3: 57.3% :: Gone in 60 Seconds (2000) - Action|Crime
4: 60.1% :: Lara Croft: Tomb Raider (2001) - Action|Adventure
5: 60.1% :: Fast and the Furious: Tokyo Drift, The (Fast and the Furious 3, The) (2006) - Action|Crime|Drama|Thriller
