# SSII - Práctica Final: Recomendador de películas con MovieLens

## Grupo 1: Ignacio Gil, Álvaro Farreny y Carlos González

In [150]:
""" pip install -U scikit-learn       
    python -m pip install -U matplotlib
    pip install seaborn """

import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline
import seaborn as sns
import requests as rq
import time
import math
import datetime

from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.metrics import confusion_matrix

from bs4 import BeautifulSoup as bs

In [151]:
# Leemos nuestros dataframes
movies = pd.read_csv("./ml-latest-small/movies.csv", sep=",")
ratings = pd.read_csv("./ml-latest-small/ratings.csv", sep=",")
tags = pd.read_csv("./ml-latest-small/tags.csv", sep=",")
links = pd.read_csv("./ml-latest-small/links.csv", sep=",")

# Eliminamos nuelos
movies.dropna(inplace=True)
ratings.dropna(inplace=True)
tags.dropna(inplace=True)
links.dropna(inplace=True)

In [152]:
# Extraemos el año del título
movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)
movies.year = pd.to_datetime(movies.year, format='%Y')
movies.year = movies.year.dt.year
movies.title = movies.title.str[:-7]

In [153]:
'''# Separamos los géneros
movies['genres'] = movies['genres'].str.split('|')
dfx = pd.get_dummies(pd.DataFrame(movies['genres'].tolist()).stack()).sum(level=0)
movies = pd.concat([movies, dfx], axis=1).drop(columns=['genres'])

generos = movies[['movieId', '(no genres listed)', 'Action', 'Adventure',
       'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']].copy()'''

"# Separamos los géneros\nmovies['genres'] = movies['genres'].str.split('|')\ndfx = pd.get_dummies(pd.DataFrame(movies['genres'].tolist()).stack()).sum(level=0)\nmovies = pd.concat([movies, dfx], axis=1).drop(columns=['genres'])\n\ngeneros = movies[['movieId', '(no genres listed)', 'Action', 'Adventure',\n       'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',\n       'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery',\n       'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']].copy()"

In [154]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji,Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men,Comedy|Romance,1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II,Comedy,1995.0


In [155]:
# Cambiamos los timestamps de ratings y tags de segundos a años
ratings.timestamp = pd.to_datetime(ratings.timestamp, infer_datetime_format=True)
ratings.timestamp = ratings.timestamp.dt.year

tags.timestamp = pd.to_datetime(tags.timestamp, infer_datetime_format=True)
tags.timestamp = tags.timestamp.dt.year

In [156]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1970
1,1,3,4.0,1970
2,1,6,4.0,1970
3,1,47,5.0,1970
4,1,50,5.0,1970


In [157]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1970
1,2,60756,Highly quotable,1970
2,2,60756,will ferrell,1970
3,2,89774,Boxing story,1970
4,2,89774,MMA,1970


In [158]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [159]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations

tf = TfidfVectorizer(analyzer=lambda s: (c for i in range(1,4)
                     for c in combinations(s.split('|'), r=i)))
matriz = tf.fit_transform(movies.genres)
cosine_sim = cosine_similarity(matriz)
cosine_sim_df = pd.DataFrame(cosine_sim, index=movies['title'], columns=movies['title'])

'''tfidf = TfidfVectorizer()
matriz = tfidf.fit_transform(generos)
cosine_sim = cosine_similarity(matriz)
indices = pd.Series(movies.index, index = movies['title']).drop_duplicates()

tfidf = TfidfVectorizer(stop_words = 'english')
matriz = tfidf.fit_transform(generos)
similitud_coseno = linear_kernel(matriz, matriz)
indices = pd.Series(movies.index, index = movies['title']).drop_duplicates()

tfidf = TfidfVectorizer(stop_words = 'english')
tags['tag'] = tags['tag'].fillna('')
matriz = tfidf.fit_transform(tags['tag'])
similitud_coseno = linear_kernel(matriz, matriz)
indices = pd.Series(movies.index, index = movies['title']).drop_duplicates()'''

"tfidf = TfidfVectorizer()\nmatriz = tfidf.fit_transform(generos)\ncosine_sim = cosine_similarity(matriz)\nindices = pd.Series(movies.index, index = movies['title']).drop_duplicates()\n\ntfidf = TfidfVectorizer(stop_words = 'english')\nmatriz = tfidf.fit_transform(generos)\nsimilitud_coseno = linear_kernel(matriz, matriz)\nindices = pd.Series(movies.index, index = movies['title']).drop_duplicates()\n\ntfidf = TfidfVectorizer(stop_words = 'english')\ntags['tag'] = tags['tag'].fillna('')\nmatriz = tfidf.fit_transform(tags['tag'])\nsimilitud_coseno = linear_kernel(matriz, matriz)\nindices = pd.Series(movies.index, index = movies['title']).drop_duplicates()"

In [160]:
def genre_recommendations(i, M, items, k=10):
    """
    Recommends movies based on a similarity dataframe

    Parameters
    ----------
    i : str
        Movie (index of the similarity dataframe)
    M : pd.DataFrame
        Similarity dataframe, symmetric, with movies as indices and columns
    items : pd.DataFrame
        Contains both the title and some other features used to define similarity
    k : int
        Amount of recommendations to return

    """
    ix = M.loc[:,i].to_numpy().argpartition(range(-1,-k,-1))
    closest = M.columns[ix[-1:-(k+2):-1]]
    closest = closest.drop(i, errors='ignore')
    return pd.DataFrame(closest).merge(items).head(k)

"def recomendar(title, similitud_coseno = cosine_sim):\n    idx = indices[title]\n    sim_scores = list(enumerate(similitud_coseno[idx]))\n    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n    sim_scores = sim_scores[0:6]\n    movie_inx = [i[0] for i in sim_scores]\n    #if indices[title] != movies['title']:\n    return movies['title'].iloc[movie_inx]"

In [162]:
genre_recommendations('2001: A Space Odyssey', cosine_sim_df, movies[['title', 'genres']])

Unnamed: 0,title,genres
0,A.I. Artificial Intelligence,Adventure|Drama|Sci-Fi
1,The Martian,Adventure|Drama|Sci-Fi
2,Star Trek: Generations,Adventure|Drama|Sci-Fi
3,"20,000 Leagues Under the Sea",Adventure|Drama|Sci-Fi
4,"20,000 Leagues Under the Sea",Action|Adventure|Sci-Fi
5,Close Encounters of the Third Kind,Adventure|Drama|Sci-Fi
6,"Philadelphia Experiment, The",Adventure|Drama|Sci-Fi
7,"Day of the Doctor, The",Adventure|Drama|Sci-Fi
8,Enemy Mine,Adventure|Drama|Sci-Fi
9,Until the End of the World (Bis ans Ende der W...,Adventure|Drama|Sci-Fi
