# SSII - Práctica Final: Recomendador de películas con MovieLens

## Grupo 1: Ignacio Gil, Álvaro Farreny y Carlos González

In [39]:
""" 
pip install pandas
pip install numpy
python -m pip install -U matplotlib
pip install seaborn
python -m pip install requests
pip install -U scikit-learn   
pip install beautifulsoup4
"""

import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline
import seaborn as sns
import requests as rq
import time
import math
import datetime

from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.metrics import confusion_matrix

from bs4 import BeautifulSoup as bs

In [40]:
# Leemos nuestros dataframes
movies = pd.read_csv("./ml-latest-small/movies.csv", sep=",")
ratings = pd.read_csv("./ml-latest-small/ratings.csv", sep=",")
tags = pd.read_csv("./ml-latest-small/tags.csv", sep=",")
links = pd.read_csv("./ml-latest-small/links.csv", sep=",")

# Eliminamos nuelos
movies.dropna(inplace=True)
ratings.dropna(inplace=True)
tags.dropna(inplace=True)
links.dropna(inplace=True)

In [41]:
# Extraemos el año del título
movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)
movies.year = pd.to_datetime(movies.year, format='%Y')
movies.year = movies.year.dt.year
movies.title = movies.title.str[:-7]

In [42]:
'''# Separamos los géneros
movies['genres'] = movies['genres'].str.split('|')
dfx = pd.get_dummies(pd.DataFrame(movies['genres'].tolist()).stack()).sum(level=0)
movies = pd.concat([movies, dfx], axis=1).drop(columns=['genres'])

generos = movies[['movieId', '(no genres listed)', 'Action', 'Adventure',
       'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']].copy()'''

"# Separamos los géneros\nmovies['genres'] = movies['genres'].str.split('|')\ndfx = pd.get_dummies(pd.DataFrame(movies['genres'].tolist()).stack()).sum(level=0)\nmovies = pd.concat([movies, dfx], axis=1).drop(columns=['genres'])\n\ngeneros = movies[['movieId', '(no genres listed)', 'Action', 'Adventure',\n       'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',\n       'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery',\n       'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']].copy()"

In [43]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji,Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men,Comedy|Romance,1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II,Comedy,1995.0


In [44]:
# Cambiamos los timestamps de ratings y tags de segundos a años
ratings.timestamp = pd.to_datetime(ratings.timestamp, infer_datetime_format=True)
ratings.timestamp = ratings.timestamp.dt.year

tags.timestamp = pd.to_datetime(tags.timestamp, infer_datetime_format=True)
tags.timestamp = tags.timestamp.dt.year

In [45]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1970
1,1,3,4.0,1970
2,1,6,4.0,1970
3,1,47,5.0,1970
4,1,50,5.0,1970


In [46]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [48]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1970
1,2,60756,Highly quotable,1970
2,2,60756,will ferrell,1970
3,2,89774,Boxing story,1970
4,2,89774,MMA,1970


In [49]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations

# TF-IDF para las diferentes combinaciones de géneros
tf = TfidfVectorizer(analyzer=lambda s: (c for i in range(1,4)
                     for c in combinations(s.split('|'), r=i)))

# Formamos nuestra matriz con los géneros
matriz = tf.fit_transform(movies.genres)
# Aplicamos a esta matriz la similitud del coseno
similitud = cosine_similarity(matriz)
# Creamos un DF con esta similitud
similitud_df = pd.DataFrame(similitud, index=movies['title'], columns=movies['title'])

In [60]:
similitud_df

title,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,...,Gintama: The Movie,anohana: The Flower We Saw That Day - The Movie,Silver Spoon,Love Live! The School Idol Movie,Jon Stewart Has Left the Building,Black Butler: Book of the Atlantic,No Game No Life: Zero,Flint,Bungo Stray Dogs: Dead Apple,Andrew Dice Clay: Dice Rules
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story,1.000000,0.474735,0.033432,0.019663,0.082550,0.000000,0.033432,0.275655,0.000000,0.038862,...,0.090020,0.084617,0.038306,0.159254,0.0,0.306924,0.487104,0.00000,0.086065,0.082550
Jumanji,0.474735,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.580651,0.000000,0.081861,...,0.000000,0.000000,0.000000,0.000000,0.0,0.060495,0.096008,0.00000,0.000000,0.000000
Grumpier Old Men,0.033432,0.000000,1.000000,0.588129,0.404997,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.043147,0.000000,0.187935,0.000000,0.0,0.043247,0.068635,0.00000,0.000000,0.404997
Waiting to Exhale,0.019663,0.000000,0.588129,1.000000,0.238191,0.000000,0.588129,0.000000,0.000000,0.000000,...,0.025376,0.055954,0.513299,0.000000,0.0,0.025435,0.040366,0.21998,0.000000,0.238191
Father of the Bride Part II,0.082550,0.000000,0.404997,0.238191,1.000000,0.000000,0.404997,0.000000,0.000000,0.000000,...,0.106537,0.000000,0.464039,0.000000,0.0,0.106784,0.169471,0.00000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Black Butler: Book of the Atlantic,0.306924,0.060495,0.043247,0.025435,0.106784,0.040749,0.043247,0.000000,0.146144,0.038802,...,0.393689,0.109457,0.049552,0.206006,0.0,1.000000,0.630100,0.00000,0.381193,0.106784
No Game No Life: Zero,0.487104,0.096008,0.068635,0.040366,0.169471,0.000000,0.068635,0.000000,0.000000,0.000000,...,0.184807,0.173714,0.078641,0.326942,0.0,0.630100,1.000000,0.00000,0.176687,0.169471
Flint,0.000000,0.000000,0.000000,0.219980,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.254359,0.428561,0.000000,0.0,0.000000,0.000000,1.00000,0.000000,0.000000
Bungo Stray Dogs: Dead Apple,0.086065,0.000000,0.000000,0.000000,0.000000,0.106898,0.000000,0.000000,0.383386,0.101790,...,0.380312,0.287143,0.000000,0.540424,0.0,0.381193,0.176687,0.00000,1.000000,0.000000


In [12]:
# Nuestro valor k indicará la cantidad de recomendaciones que damos
def genre_recommendations(i, M, items, k=10):
    """
    i : str
        Movie (index of the similarity dataframe)
    M : pd.DataFrame
        Similarity dataframe, symmetric, with movies as indices and columns
    items : pd.DataFrame
        Contains both the title and some other features used to define similarity
    k : int
        Amount of recommendations to return
    """
    index = M.loc[:,i].to_numpy().argpartition(range(-1,-k,-1))
    closest = M.columns[index[-1:-(k+2):-1]]
    closest = closest.drop(i, errors='ignore')
    return pd.DataFrame(closest).merge(items).head(k)

In [13]:
genre_recommendations('Toy Story', similitud_df, movies[['title', 'genres']])

Unnamed: 0,title,genres
0,"Adventures of Rocky and Bullwinkle, The",Adventure|Animation|Children|Comedy|Fantasy
1,"Emperor's New Groove, The",Adventure|Animation|Children|Comedy|Fantasy
2,Toy Story 2,Adventure|Animation|Children|Comedy|Fantasy
3,Antz,Adventure|Animation|Children|Comedy|Fantasy
4,Turbo,Adventure|Animation|Children|Comedy|Fantasy
5,"Tale of Despereaux, The",Adventure|Animation|Children|Comedy|Fantasy
6,"Monsters, Inc.",Adventure|Animation|Children|Comedy|Fantasy
7,Shrek the Third,Adventure|Animation|Children|Comedy|Fantasy
8,The Good Dinosaur,Adventure|Animation|Children|Comedy|Fantasy
9,Asterix & Obelix vs. Caesar (Astérix et Obélix...,Adventure|Children|Comedy|Fantasy


-------

In [6]:
import pandas as pd
from pandastable import Table

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations

from tkinter import *
from tkinter import ttk

# Leemos nuestros dataframes
movies = pd.read_csv("./ml-latest-small/movies.csv", sep=",")
ratings = pd.read_csv("./ml-latest-small/ratings.csv", sep=",")
tags = pd.read_csv("./ml-latest-small/tags.csv", sep=",")
links = pd.read_csv("./ml-latest-small/links.csv", sep=",")
sinopsis = pd.read_csv("./ml-latest-small/sinopsisDB.csv", sep=",")

#sinopsis = sinopsis.sort_values('moveId', ascending=True)
sinopsis = sinopsis.loc[~sinopsis.index.duplicated()]
''' Eliminamos nulos
movies.dropna(inplace=True)
ratings.dropna(inplace=True)
tags.dropna(inplace=True)
links.dropna(inplace=True)'''

# Extraemos el año del título
movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)
movies.year = pd.to_datetime(movies.year, format='%Y')
movies.year = movies.year.dt.year
movies.title = movies.title.str[:-7]

# TF-IDF para las diferentes combinaciones de géneros
tfG = TfidfVectorizer(analyzer=lambda s: (c for i in range(1,4)
                     for c in combinations(s.split('|'), r=i)))
tfS = TfidfVectorizer(analyzer=lambda s: (c for i in range(1,4)
                     for c in combinations(s.split(' '), r=i)))

# Formamos nuestra matriz con los géneros
matrizG = tfG.fit_transform(movies.genres)
matrizS = tfS.fit_transform(sinopsis.sinopsis)
# Aplicamos a esta matriz la similitud del coseno
similitudG = cosine_similarity(matrizG)
similitudS = cosine_similarity(matrizS)

KeyboardInterrupt: 

In [4]:
print(matrizG)

  (0, 361)	0.23844431625303342
  (0, 285)	0.25339260812741005
  (0, 273)	0.238896403927517
  (0, 270)	0.21596189876114483
  (0, 175)	0.23417100415489422
  (0, 162)	0.22639662355789053
  (0, 159)	0.2207100303410543
  (0, 148)	0.23935337913101462
  (0, 145)	0.22919233942668918
  (0, 144)	0.21596189876114483
  (0, 450)	0.1902447041134256
  (0, 386)	0.20446731645442034
  (0, 358)	0.18133325637098957
  (0, 317)	0.21193315703457719
  (0, 282)	0.19384430718319204
  (0, 269)	0.18897014138245147
  (0, 214)	0.1847263501096727
  (0, 171)	0.17723047770258227
  (0, 158)	0.18759762487561235
  (0, 143)	0.19743695283468413
  (0, 638)	0.14900146727529373
  (0, 419)	0.08254987867092359
  (0, 357)	0.15574379029402177
  (0, 268)	0.15925449786387186
  (0, 142)	0.12859603438786546
  :	:
  (9737, 24)	0.3640478046022315
  (9737, 19)	0.2855027014686476
  (9737, 50)	0.349950040553601
  (9737, 88)	0.26475269709489324
  (9737, 46)	0.22530468679044202
  (9737, 1)	0.14614415646832615
  (9737, 285)	0.327779729995828

In [5]:
print(matrizS)

  (0, 2685)	1.0
  (1, 6400)	1.0
  (2, 4785)	1.0
  (3, 4480)	1.0
  (4, 5123)	1.0
  (5, 2429)	1.0
  (6, 509)	1.0
  (7, 7928)	1.0
  (8, 7979)	1.0
  (9, 5045)	1.0
  (10, 3041)	1.0
  (11, 8294)	1.0
  (12, 5610)	1.0
  (13, 7939)	1.0
  (14, 2426)	1.0
  (15, 899)	1.0
  (16, 7952)	1.0
  (17, 3055)	1.0
  (18, 2245)	1.0
  (19, 4048)	1.0
  (20, 2588)	1.0
  (21, 7704)	1.0
  (22, 6718)	1.0
  (23, 2486)	1.0
  (24, 1175)	1.0
  :	:
  (8658, 8312)	1.0
  (8659, 7304)	1.0
  (8660, 3786)	1.0
  (8661, 4466)	1.0
  (8662, 5046)	1.0
  (8663, 5268)	1.0
  (8664, 3719)	1.0
  (8665, 3789)	1.0
  (8666, 3125)	1.0
  (8667, 6108)	1.0
  (8668, 6422)	1.0
  (8669, 2986)	1.0
  (8670, 6017)	1.0
  (8671, 6011)	1.0
  (8672, 2939)	1.0
  (8673, 5720)	1.0
  (8674, 1547)	1.0
  (8675, 2115)	1.0
  (8676, 4474)	1.0
  (8677, 5612)	1.0
  (8678, 3012)	1.0
  (8679, 755)	1.0
  (8680, 3084)	1.0
  (8681, 1507)	1.0
  (8682, 5740)	1.0


-----

In [25]:
import pandas as pd
from surprise import Reader, Dataset
from surprise import KNNWithMeans

# Carga el dataframe de ratings
ratings_df = pd.read_csv("./ml-latest-small/ratings.csv")

# Crea un objeto Reader especificando el rango del rating
reader = Reader(rating_scale=(1, 5))

# Crea el dataset de surprise utilizando el dataframe de ratings y el objeto reader
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Utiliza el algoritmo KNN con el promedio de los ratings de los vecinos más cercanos
algo = KNNWithMeans(k=20, sim_options={'name': 'pearson_baseline', 'user_based': True})

# Entrena el modelo con los datos
algo.fit(data.build_full_trainset())

# Hacer recomendaciones a un usuario específico
user_id = 1
user_items = ratings_df.loc[ratings_df['userId'] == user_id]
user_items = user_items

# Seleccionar las peliculas no vistas por el usuario
user_unrated = ratings_df.loc[(ratings_df['userId'] == user_id) & (ratings_df['rating'] == 0)]

# Hacer predicciones de rating para las peliculas no vistas
predictions = algo.test([(user_id, movie_id, 0) for (movie_id) in user_unrated['movieId']])

# Ordenar las peliculas por puntuacion
predictions.sort(key=lambda x: x.est, reverse=True)

# Imprimir las recomendaciones
print("Recomendaciones para el usuario {}:".format(user_id))
for prediction in predictions:
    print(" - Película: {}, Puntuación: {:.2f}".format(prediction.iid, prediction.est))



Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Recomendaciones para el usuario 1:


In [26]:
import pandas as pd
from surprise import Reader, Dataset
from surprise import SVD

# Carga el dataframe de ratings
ratings_df = pd.read_csv("./ml-latest-small/ratings.csv")

# Crea un objeto Reader especificando el rango del rating
reader = Reader(rating_scale=(1, 5))

# Crea el dataset de surprise utilizando el dataframe de ratings y el objeto reader
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Utiliza el algoritmo SVD
algo = SVD()

# Entrena el modelo con los datos
algo.fit(data.build_full_trainset())

# Hacer recomendaciones a un usuario específico
user_id = 1
user_items = ratings_df.loc[ratings_df['userId'] == user_id]

# Seleccionar las peliculas no vistas por el usuario
user_unrated = ratings_df.loc[(ratings_df['userId'] == user_id) & (ratings_df['rating'] == 0)]

# Hacer predicciones de rating para las peliculas no vistas
predictions = algo.test([(user_id, movie_id, 0) for (movie_id) in user_unrated['movieId']])

# Ordenar las peliculas por puntuacion
predictions.sort(key=lambda x: x.est, reverse=True)

# Imprimir las recomendaciones
print("Recomendaciones para el usuario {}:".format(user_id))
for prediction in predictions:
    print(" - Película: {}, Puntuación: {:.2f}".format(prediction.iid, prediction.est))


Recomendaciones para el usuario 1:
