In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sys
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
import pandas as pd
from surprise import SVD
import streamlit as st
import gc #garbage collector

In [2]:
df = pd.read_parquet('archivoparquetsinnulos.parquet')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11024289 entries, 0 to 11024288
Data columns (total 18 columns):
 #   Column         Dtype         
---  ------         -----         
 0   show_id        object        
 1   type           object        
 2   title          object        
 3   director       object        
 4   cast           object        
 5   country        object        
 6   date_added     object        
 7   release_year   object        
 8   rating         object        
 9   listed_in      object        
 10  id             object        
 11  platform       object        
 12  duration_int   int32         
 13  duration_type  object        
 14  userId         int64         
 15  score          float64       
 16  date           datetime64[ns]
 17  score_mean     float64       
dtypes: datetime64[ns](1), float64(2), int32(1), int64(1), object(13)
memory usage: 1.5+ GB


In [3]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,listed_in,id,platform,duration_int,duration_type,userId,score,date,score_mean
0,s1,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30 00:00:00,2014,g,"comedy, drama",as1,amazon,113,min,543,5.0,2003-07-30,3.47
1,s1,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30 00:00:00,2014,g,"comedy, drama",as1,amazon,113,min,595,3.0,1996-08-13,3.47
2,s1,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30 00:00:00,2014,g,"comedy, drama",as1,amazon,113,min,611,3.0,2001-01-03,3.47
3,s1,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30 00:00:00,2014,g,"comedy, drama",as1,amazon,113,min,2523,3.5,2012-06-25,3.47
4,s1,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30 00:00:00,2014,g,"comedy, drama",as1,amazon,113,min,3082,4.0,2000-03-30,3.47


In [4]:
df2 = df.loc[:, ["id", "userId", "score","title"]]
df2.head()

Unnamed: 0,id,userId,score,title
0,as1,543,5.0,the grand seduction
1,as1,595,3.0,the grand seduction
2,as1,611,3.0,the grand seduction
3,as1,2523,3.5,the grand seduction
4,as1,3082,4.0,the grand seduction


In [5]:
# Utilizar factorize para asignar valores numéricos a cada valor único de ID
df2['id_factorized'] = pd.factorize(df['id'])[0]
df2.head()

Unnamed: 0,id,userId,score,title,id_factorized
0,as1,543,5.0,the grand seduction,0
1,as1,595,3.0,the grand seduction,0
2,as1,611,3.0,the grand seduction,0
3,as1,2523,3.5,the grand seduction,0
4,as1,3082,4.0,the grand seduction,0


In [6]:
df1 = df2.loc[:,['userId','score','id_factorized']]
df_title = df2.loc[:,['id_factorized','title']]

In [7]:
df_title.drop_duplicates(inplace=True)
df_title.head(10)

Unnamed: 0,id_factorized,title
0,0,the grand seduction
502,1,take care good night
995,2,secrets of deception
1417,3,pink: staying true
1890,4,monster maker
2366,5,living with dinosaurs
2828,6,hired gun
3354,7,grease live!
3875,8,global meltdown
4369,9,david's mother


In [8]:
movie_id = 10
titulo = df_title.loc[df_title['id_factorized'] == movie_id, 'title'].iloc[0]
print(titulo)

forest fairies


In [9]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11024289 entries, 0 to 11024288
Data columns (total 3 columns):
 #   Column         Dtype  
---  ------         -----  
 0   userId         int64  
 1   score          float64
 2   id_factorized  int64  
dtypes: float64(1), int64(2)
memory usage: 336.4 MB


In [10]:
df1

Unnamed: 0,userId,score,id_factorized
0,543,5.0,0
1,595,3.0,0
2,611,3.0,0
3,2523,3.5,0
4,3082,4.0,0
...,...,...,...
11024284,122699,4.0,22997
11024285,122869,4.0,22997
11024286,123708,3.0,22997
11024287,123841,4.0,22997


In [11]:
df_title.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22998 entries, 0 to 11023810
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id_factorized  22998 non-null  int64 
 1   title          22998 non-null  object
dtypes: int64(1), object(1)
memory usage: 539.0+ KB


In [12]:
# Usuarios
len(df1['userId'].unique())

115078

In [13]:
# Calificaciones de películas por usuario
df1_by_users = df1.groupby(['userId']).count()
df1_by_users.head()

Unnamed: 0_level_0,score,id_factorized
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,27,27
2,22,22
3,10,10
4,62,62
5,26,26


In [14]:
#conteo de calificaciones por pelicula/serie
df1_by_movies = df1.groupby(['id_factorized']).count()
df1_by_movies.head()

Unnamed: 0_level_0,userId,score
id_factorized,Unnamed: 1_level_1,Unnamed: 2_level_1
0,502,502
1,493,493
2,422,422
3,473,473
4,476,476


In [15]:
df_title = df_title.set_index('id_factorized')
df_title.head()

Unnamed: 0_level_0,title
id_factorized,Unnamed: 1_level_1
0,the grand seduction
1,take care good night
2,secrets of deception
3,pink: staying true
4,monster maker


In [16]:
#titulo mas calificado
idx_max = df1_by_movies['userId'].idxmax()
print(df_title.loc[idx_max].title)

from other worlds


In [17]:
#Cantidad de vistas de cada serie/pelicula
df1_by_movies = df1.groupby(['id_factorized']).count()
df1_by_movies.sort_values('userId', ascending = False, inplace = True)
df1_by_movies['Vistos'] = df1_by_movies['userId']
df1_by_movies.drop(columns = ['userId','score'], inplace = True)
df1_by_movies.head(10)

Unnamed: 0_level_0,Vistos
id_factorized,Unnamed: 1_level_1
4196,576
9064,560
21469,558
847,558
6997,556
9468,556
13477,556
4888,554
11699,554
16037,553


In [18]:
#agrego el titulo a cada fila segun su id
df1_by_movies['Titulo'] = df_title.loc[df1_by_movies.index].title
df1_by_movies.head()

Unnamed: 0_level_0,Vistos,Titulo
id_factorized,Unnamed: 1_level_1,Unnamed: 2_level_1
4196,576,from other worlds
9064,560,the organization
21469,558,"leapfrog: sing-along, read-along"
847,558,superbook
6997,556,sideways


In [22]:
#filtro peliculas que consifero que se vieron poco en funcion de los datos
umbral = 420
mascara_pocos_vistos = df1_by_movies.Vistos<umbral
peliculas_pocos_vistos = mascara_pocos_vistos[mascara_pocos_vistos].index.values
print(len(peliculas_pocos_vistos), peliculas_pocos_vistos)
mascara_descartables = df1.id_factorized.isin(peliculas_pocos_vistos)


60 [16489 18413  7667  1623  1461 14376 20332 17476 11901  9872  4551 22282
 15775 20761  3788  9651  8278  6179 10294  1378  3368 19127  2227 22197
  7585  4564  9146   212 13389  9144 17249  2860  9050  9753  6317  6018
 13336 12083  7571 20273 15350  9594  5304  6123 17808 13316 21920 19415
 13782 20476  3716 10373 15443 19949 10449 11716 20725 12828 16334  4939]


In [23]:
# Obsevamos cómo cambia la cantidad de registros a partir del filtrado
print(df1.shape)
df1 = df1[~mascara_descartables]
print(df1.shape)

(11024289, 3)
(10999535, 3)


# Empiezamos con el modelo de recomendacion

In [25]:

reader = Reader()
N_filas = 200000 # Limitamos el dataset a N_filas
data = Dataset.load_from_df(df1[['userId', 'id_factorized', 'score']][:N_filas], reader)


In [26]:
# Separamos nuestros datos
trainset, testset = train_test_split(data, test_size=.25)


In [27]:
# Usaremos un modelo de Singular Value Decomposition
model = SVD()

In [29]:
# Entrenamos el modelo
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0xa77e7b2f80>

In [30]:
# Predecimos
predictions = model.test(testset)
predictions[1]

Prediction(uid=267143, iid=318, r_ui=5.0, est=3.9096877346449994, details={'was_impossible': False})

In [31]:
# Hacemos una predicción al azar para usuario y película
model.predict(1328945,28)

Prediction(uid=1328945, iid=28, r_ui=None, est=3.516103595569879, details={'was_impossible': False})

In [32]:
#obtengo el promedio total de los scores 
promedio_score = df1['score'].mean()
print(promedio_score)

3.533443004636105


In [33]:
#creo una funcion que recomiende o no ver una pelicula en funcion del score promedio y del score predicho por el modelo para ese usuario y esa pelicula

def recomendar_pelicula(id_usuario, id_pelicula, model):
    # Realizar la predicción para el usuario y la película dados
    prediction = model.predict(id_usuario, id_pelicula)
        # Obtener la valoración media del usuario para todas las películas que ha visto
    media_usuario = np.array(trainset.ur[trainset.to_inner_uid(id_usuario)])
        # Obtener el título de la película
    titulo_pelicula = df_title.loc[id_pelicula].title
        # Verificar si la valoración de la película es mayor que la media del usuario
    if prediction.est >3.54:
        return f"Se recomienda ver la película {titulo_pelicula}."
    else:
        return f"No se recomienda ver la película {titulo_pelicula}."
    

In [34]:
#realizo una recomendacion al azar
recomendar_pelicula(46453,66,model)

'Se recomienda ver la película world’s toughest race: eco-challenge fiji.'

In [43]:
# Tomaremos un usuario para hacerle una recomendación
usuario = 46453
rating = 4   # Tomamos películas a las que haya calificado con 4 o 5 estrellas
df_user = df1[(df1['userId'] == usuario) & (df1['score'] >= rating)]
df_user = df_user.reset_index(drop=True)
df_user['Name'] = df_title['title'].loc[df_user.id_factorized].values
df_user
recomendaciones_usuario = df_title.iloc[:22860].copy()
print(recomendaciones_usuario.shape)
recomendaciones_usuario.head()

(22860, 1)


Unnamed: 0_level_0,title
id_factorized,Unnamed: 1_level_1
0,the grand seduction
1,take care good night
2,secrets of deception
3,pink: staying true
4,monster maker


In [44]:
# Debemos extraer las películas que ya ha visto
usuario_vistas = df1[df1['userId'] == usuario]
print(usuario_vistas.shape)
usuario_vistas.head()
if True: # Sacamos las que filtramos
    recomendaciones_usuario.drop(peliculas_pocos_vistos, inplace = True)
recomendaciones_usuario.drop(usuario_vistas.id_factorized, inplace = True)
recomendaciones_usuario = recomendaciones_usuario.reset_index()


(43, 3)


In [45]:
recomendaciones_usuario.head()

Unnamed: 0,id_factorized,title
0,0,the grand seduction
1,1,take care good night
2,2,secrets of deception
3,3,pink: staying true
4,4,monster maker


In [47]:
# Recomendamos
recomendaciones_usuario['Estimate_Score'] = recomendaciones_usuario['id_factorized'].apply(lambda x: model.predict(usuario, x).est)


In [48]:
#Obtengo como resultado una lista de titulos con los score estimados para este usuarios ordenados de mayor a menor
recomendaciones_usuario = recomendaciones_usuario.sort_values('Estimate_Score', ascending=False)
print(recomendaciones_usuario.head(10))

     id_factorized                        title  Estimate_Score
215            217                   undocument        4.299041
137            138                welliewishers        4.211663
306            308             tom gleeson: joy        4.182833
359            361                   this is me        4.144778
187            188               victorian farm        4.133598
337            339           ticket to paradise        4.122331
143            144  we need to talk about kevin        4.114029
245            247                    tutu town        4.102747
78              79               women of valor        4.100936
13              13                   resilencia        4.096175
