# 1. Importar librerías

In [77]:
import numpy as np
import pandas as pd
import sqlite3 as sql
from sklearn.preprocessing import MinMaxScaler
from ipywidgets import interact ## para análisis interactivo
from sklearn import neighbors ### basado en contenido un solo producto consumido
import joblib
from sklearn.preprocessing import MinMaxScaler

from surprise import Reader, Dataset
from surprise.model_selection import cross_validate, GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise.model_selection import train_test_split

# 2. Importar datos

In [2]:
# Open the SQLite database file used by every query in this notebook.
# `conn` is passed to pandas.read_sql below; `cur` is a plain cursor for
# ad-hoc statements.
conn = sql.connect('data/db_movies')
cur = conn.cursor()

In [3]:
# List the tables available in the database.
# The value 'table' is written as a single-quoted SQL string literal: double
# quotes denote identifiers in SQL and only work as strings through SQLite's
# legacy fallback, which can be disabled at build/run time.
cur.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
cur.fetchall()

[('ratings',),
 ('movies',),
 ('movies2',),
 ('ratings2',),
 ('movies_rating',),
 ('genres',)]

# 3. Sistemas basados en popularidad

### 3.1. Top 10 películas más vistas

In [17]:
# Per-movie average rating and number of rows in movies_rating, keeping the
# 10 movies with the most rows (the row count is used as the "views" figure).
query = '''
SELECT title,
        avg(rating) AS rating_prom,
        count(*) AS view_num
FROM movies_rating
GROUP BY movieId
ORDER BY view_num DESC
LIMIT 10;
'''
pd.read_sql(query, conn)

Unnamed: 0,title,rating_prom,view_num
0,Forrest Gump (1994),4.164134,329
1,"Shawshank Redemption, The (1994)",4.429022,317
2,Pulp Fiction (1994),4.197068,307
3,"Silence of the Lambs, The (1991)",4.16129,279
4,"Matrix, The (1999)",4.192446,278
5,Star Wars: Episode IV - A New Hope (1977),4.231076,251
6,Jurassic Park (1993),3.75,238
7,Braveheart (1995),4.031646,237
8,Terminator 2: Judgment Day (1991),3.970982,224
9,Schindler's List (1993),4.225,220


### 3.2. 10 películas mejor calificadas (calificadas al menos 30 veces).

In [19]:
# Ten best-rated movies by average rating, restricted to movies with at least
# 30 counted rows. Rows with rating < 1.0 are excluded first (elsewhere in this
# notebook they are described as unrated placeholders stored as 0.5), so they
# affect neither the average nor the count.
query = '''
SELECT title,
        avg(rating) AS rating_prom,
        count(*) AS view_num
FROM movies_rating
WHERE rating >= 1.0
GROUP BY movieId
HAVING view_num >= 30
ORDER BY rating_prom DESC
LIMIT 10;
'''
pd.read_sql(query, conn)

Unnamed: 0,title,rating_prom,view_num
0,"Shawshank Redemption, The (1994)",4.429022,317
1,Patton (1970),4.354839,31
2,Fight Club (1999),4.325581,215
3,Lawrence of Arabia (1962),4.3,45
4,"Bridge on the River Kwai, The (1957)",4.290698,43
5,"Godfather, The (1972)",4.289062,192
6,12 Angry Men (1957),4.281818,55
7,Casablanca (1942),4.277778,99
8,Cool Hand Luke (1967),4.27193,57
9,Dr. Strangelove or: How I Learned to Stop Worr...,4.268041,97


### 3.3. Top 10 películas más vistas por década de estreno

In [25]:
# Distinct values of the `year` column in movies_rating, newest first.
query = '''
SELECT DISTINCT year as year
FROM movies_rating
ORDER BY year DESC
'''
years = pd.read_sql(query, conn)

In [26]:
# Decade labels ('1900-1910', ..., '2010-2020') offered in the dropdowns below.
decadas = [str(d) + '-' + str(d + 10) for d in range(1900, 2020, 10)]
# Bucket each year with explicit, left-closed decade edges [1900, 1910), ...,
# [2010, 2020). Passing only a bin *count* to pd.cut splits the observed
# min-max range into equal-width bins, which do not line up with the labels.
# Years outside 1900-2019 become NaN.
years['decada'] = pd.cut(
    years.year.astype(int),
    bins=list(range(1900, 2021, 10)),
    labels=decadas,
    right=False,
)

In [38]:
def top10_dec_est(Decada):
    """Return the 10 most-rated movies released within a decade.

    Parameters
    ----------
    Decada : str
        Decade label of the form 'START-END' (e.g. '1990-2000'). The range is
        half-open: START <= year < END.

    Returns
    -------
    pandas.DataFrame
        Columns `title`, `rating_prom` (average rating) and `view_num`
        (number of rows for the movie), sorted by `view_num` descending.

    Raises
    ------
    ValueError
        If `Decada` does not split into exactly two parts on '-'.
    """
    inicio, fin = Decada.split('-')
    # Bound parameters instead of string formatting: nothing is spliced into
    # the SQL text and no double-quoted "string literals" are needed.
    query = '''
    SELECT title,
        avg(rating) AS rating_prom,
        count(*) AS view_num
    FROM movies_rating
    WHERE year >= ? and year < ?
    GROUP BY movieId
    ORDER BY view_num DESC
    LIMIT 10;
    '''
    return pd.read_sql(query, conn, params=(inicio, fin))

interact(top10_dec_est, Decada = decadas)

interactive(children=(Dropdown(description='Decada', options=('1900-1910', '1910-1920', '1920-1930', '1930-194…

<function __main__.top10_dec_est(Decada)>

### 3.4. Top 10 películas mejor calificadas por década de estreno (calificadas al menos 30 veces).

In [39]:
def top10_rating_dec_est(Decada):
    """Return the 10 best-rated movies released within a decade.

    Only rows with rating >= 1.0 are considered, and a movie needs at least
    30 such rows to qualify.

    Parameters
    ----------
    Decada : str
        Decade label of the form 'START-END' (e.g. '1990-2000'). The range is
        half-open: START <= year < END.

    Returns
    -------
    pandas.DataFrame
        Columns `title`, `rating_prom` (average rating) and `view_num`
        (number of qualifying rows), sorted by `rating_prom` descending.

    Raises
    ------
    ValueError
        If `Decada` does not split into exactly two parts on '-'.
    """
    inicio, fin = Decada.split('-')
    # Decade bounds are bound as parameters rather than formatted into the SQL.
    query = '''
    SELECT title,
        avg(rating) AS rating_prom,
        count(*) AS view_num
    FROM movies_rating
    WHERE year >= ? and year < ? and rating >= 1.0
    GROUP BY movieId
    HAVING view_num >= 30
    ORDER BY rating_prom DESC
    LIMIT 10;
    '''
    return pd.read_sql(query, conn, params=(inicio, fin))

interact(top10_rating_dec_est, Decada = decadas)

interactive(children=(Dropdown(description='Decada', options=('1900-1910', '1910-1920', '1920-1930', '1930-194…

<function __main__.top10_rating_dec_est(Decada)>

### 3.5. Top 10 películas más vistas el último mes

In [67]:
# Most recent value of `date` in movies_rating.
query = '''
SELECT date
FROM movies_rating
ORDER BY date DESC
LIMIT 1;
'''
# Run the query once and take year and month from the same value
# (the first two '-'-separated fields of the date string).
ultima_fecha = pd.read_sql(query, conn)['date'].values[0]
ultimo_anio, ultimo_mes = ultima_fecha.split('-')[:2]
ultimo_anio, ultimo_mes

('2018', '09')

In [75]:
# Ten movies with the most rows during the latest month found in the data.
# Month and year are bound as parameters instead of being formatted into
# double-quoted SQL literals.
query = '''
SELECT movieId, title,
        avg(rating) as rating_prom,
        count(movieId) as views_num
FROM movies_rating
WHERE strftime('%m', date) = ? and strftime('%Y', date) = ?
GROUP BY movieId
ORDER BY views_num DESC
LIMIT 10;
'''
pd.read_sql(query, conn, params=(ultimo_mes, ultimo_anio))

Unnamed: 0,movieId,title,rating_prom,views_num
0,187593,Deadpool 2 (2018),3.333333,3
1,122906,Black Panther (2017),4.0,3
2,68358,Star Trek (2009),4.0,3
3,187595,Solo: A Star Wars Story (2018),3.5,2
4,183897,Isle of Dogs (2018),3.75,2
5,179401,Jumanji: Welcome to the Jungle (2017),3.25,2
6,177765,Coco (2017),4.5,2
7,168250,Get Out (2017),3.5,2
8,164179,Arrival (2016),3.75,2
9,148626,"Big Short, The (2015)",4.75,2


### 3.6. Top 10 películas más vistas el último año

In [76]:
# Ten movies with the most rows during the latest year found in the data.
# The year is bound as a parameter instead of being formatted into a
# double-quoted SQL literal.
query = '''
SELECT movieId, title,
        avg(rating) as rating_prom,
        count(movieId) as views_num
FROM movies_rating
WHERE strftime('%Y', date) = ?
GROUP BY movieId
ORDER BY views_num DESC
LIMIT 10;
'''
pd.read_sql(query, conn, params=(ultimo_anio,))

Unnamed: 0,movieId,title,rating_prom,views_num
0,2571,"Matrix, The (1999)",4.184211,19
1,122916,Thor: Ragnarok (2017),3.916667,18
2,79132,Inception (2010),4.25,18
3,7153,"Lord of the Rings: The Return of the King, The...",4.083333,18
4,356,Forrest Gump (1994),4.147059,17
5,8961,"Incredibles, The (2004)",3.78125,16
6,5952,"Lord of the Rings: The Two Towers, The (2002)",4.25,16
7,122904,Deadpool (2016),3.866667,15
8,58559,"Dark Knight, The (2008)",4.233333,15
9,4993,"Lord of the Rings: The Fellowship of the Ring,...",4.266667,15


### 3.7. Top 10 películas más vistas por género

In [34]:
# Load the `Género` column of the `genres` table; its values are the options
# of the genre dropdowns below.
query = '''
SELECT `Género`
FROM genres
'''
genres = pd.read_sql(query, conn)

In [36]:
def top10_views_genre(Genre):
    """Return the 10 movies with the highest summed value of a genre column.

    Parameters
    ----------
    Genre : str
        Name of a column of `movies_rating`; the function sums that column
        per movie and reports the sum as `views_num`.
        NOTE(review): presumably a 0/1 genre indicator, so the sum counts the
        rows of movies that belong to the genre. Confirm against the table.

    Returns
    -------
    pandas.DataFrame
        Columns `title`, `rating_prom` (average rating) and `views_num`,
        sorted by `views_num` descending.
    """
    # Column names cannot be bound as SQL parameters, so the identifier is
    # quoted (backticks, as in the `Género` query of this notebook) with any
    # embedded backtick doubled. Unquoted, a name such as Sci-Fi would be
    # parsed as the expression `Sci - Fi`.
    columna = '`{}`'.format(str(Genre).replace('`', '``'))
    query = '''
    SELECT title,
            avg(rating) as rating_prom,
            sum({}) as views_num
    FROM movies_rating
    GROUP BY movieId
    ORDER BY views_num DESC
    LIMIT 10;
    '''.format(columna)
    return pd.read_sql(query, conn)

interact(top10_views_genre, Genre = list(genres['Género']))

interactive(children=(Dropdown(description='Genre', options=('Drama', 'Comedy', 'Action', 'Thriller', 'Adventu…

<function __main__.top10_views_genre(Genre)>

### 3.8. Top 10 películas mejor calificadas por género (calificadas al menos 30 veces).

In [37]:
def top10_rating_genre(Genre):
    """Return the 10 best-rated movies for a genre column.

    Only rows with rating >= 1.0 are considered, and a movie qualifies when
    the per-movie sum of the genre column is at least 30.

    Parameters
    ----------
    Genre : str
        Name of a column of `movies_rating` that is summed per movie.
        NOTE(review): presumably a 0/1 genre indicator. Confirm against the
        table.

    Returns
    -------
    pandas.DataFrame
        Columns `title`, `rating_prom` (average rating) and `views_num`,
        sorted by `rating_prom` descending.
    """
    # Column names cannot be bound as SQL parameters, so the identifier is
    # quoted (backticks, as in the `Género` query of this notebook) with any
    # embedded backtick doubled. Unquoted, a name such as Sci-Fi would be
    # parsed as the expression `Sci - Fi`.
    columna = '`{}`'.format(str(Genre).replace('`', '``'))
    query = '''
    SELECT title,
            avg(rating) as rating_prom,
            sum({}) as views_num
    FROM movies_rating
    WHERE rating >= 1.0
    GROUP BY movieId
    HAVING views_num >= 30
    ORDER BY rating_prom DESC
    LIMIT 10;
    '''.format(columna)
    return pd.read_sql(query, conn)

interact(top10_rating_genre, Genre = list(genres['Género']))

interactive(children=(Dropdown(description='Genre', options=('Drama', 'Comedy', 'Action', 'Thriller', 'Adventu…

<function __main__.top10_rating_genre(Genre)>

# 4. Sistema de recomendación basado en contenido

### 4.1. KNN una sola película vista.

Importar base de datos solo de películas

In [13]:
# Load the whole `movies2` table; the content-based recommenders below use it
# both for features and for the movieId/title lookup.
movies = pd.read_sql('SELECT * FROM movies2;', conn)

Escalar la variable año

In [14]:
# Feature matrix for the content-based models: every column of `movies` except
# the identifier columns, with `year` min-max scaled.
sc = MinMaxScaler()
movies_std = movies.drop(columns=['movieId', 'title'])
movies_std[['year']] = sc.fit_transform(movies_std[['year']])

Modelo con 11 vecinos más cercanos

In [15]:
# Cosine-distance nearest-neighbour index over the feature matrix. 11
# neighbours are requested per movie; the recommender below removes the
# queried movie itself from the list.
model = neighbors.NearestNeighbors(metric='cosine', n_neighbors=11).fit(movies_std)
dist, idlist = model.kneighbors(movies_std)

# Same results as DataFrames, for inspection.
distancias, id_list = pd.DataFrame(dist), pd.DataFrame(idlist)

Sistema de recomendación

In [16]:
def MoviesRecommender(movies_name = np.sort(list(movies['title'].value_counts().index))):
    """Recommend the nearest neighbours of one movie, excluding the movie itself.

    Parameters
    ----------
    movies_name : str
        Exact title of the movie. The default (the sorted array of all titles)
        exists only so that `interact` renders a dropdown.

    Returns
    -------
    pandas.DataFrame
        One column, `Movie`, with the neighbour titles in neighbour order.

    Raises
    ------
    IndexError
        If no movie has the given title.
    """
    # Row label of the first movie with that title. It is also used as a
    # position into `idlist`, which assumes `movies` keeps its default 0..n-1
    # index.
    movies_id = movies[movies['title'] == movies_name].index[0]
    vecinos = idlist[movies_id]

    df = pd.DataFrame({'Movie': [movies.loc[newid].title for newid in vecinos]})

    # Exclude the queried movie by id, not by title. Movies with identical
    # feature vectors tie at distance 0, so the query is not guaranteed to be
    # in its own neighbour list, and a title lookup would then raise or (for
    # duplicated titles) drop the wrong row. Cap the result at
    # len(vecinos) - 1 so the list length stays constant either way.
    df2 = df[vecinos != movies_id].head(len(vecinos) - 1)
    return df2


# `interact` already renders the widget; printing its return value only
# showed the function repr.
interact(MoviesRecommender);

interactive(children=(Dropdown(description='movies_name', options=("'71 (2014)", "'Hellboy': The Seeds of Crea…

<function MoviesRecommender at 0x0000029F306CCEA0>


### 4.2. KNN todas las películas vistas por el usuario

In [79]:
# Distinct user ids present in movies_rating; they become the options of the
# user dropdowns below.
query = '''
select distinct (userId) as user_id
from movies_rating
'''
usuarios = pd.read_sql(query,conn)

In [93]:
def recomendar(user_id = np.sort(list(usuarios['user_id'].value_counts().index))):
    """Recommend unrated movies closest to a user's average feature profile.

    The user profile is the mean feature vector (centroid) of the movies the
    user rated with rating >= 1.0. The nearest movies, by cosine distance,
    among those the user has not rated are returned.

    Parameters
    ----------
    user_id : int
        Id of the user. The default (the sorted array of all user ids) exists
        only so that `interact` renders a dropdown.

    Returns
    -------
    pandas.DataFrame
        Columns `title` and `movieId` of up to 11 recommended movies; empty
        when the user has no qualifying ratings or nothing is left to
        recommend.
    """
    # Movies rated by the selected user. The id is bound as a parameter and
    # cast to a plain int because sqlite3 cannot bind numpy integer types.
    query = '''
    SELECT movieId
    FROM movies_rating
    WHERE userId = ? and rating >= 1.0;
    '''
    ratings = pd.read_sql(query, conn, params=(int(user_id),))
    l_movies_r = ratings['movieId'].to_numpy()

    # Local feature matrix: the global `movies_std` is left untouched so the
    # KNN cell that fits on it can be re-run. errors='ignore' also tolerates a
    # `movies_std` that already carries the id columns.
    features = movies_std.drop(columns=['movieId', 'title'], errors='ignore')
    visto = movies['movieId'].isin(l_movies_r)

    movies_r = features[visto]    # rated by the user
    movies_nr = features[~visto]  # candidates: not rated by the user
    if movies_r.empty or movies_nr.empty:
        return movies.iloc[0:0][['title', 'movieId']]

    # Centroid (user profile) as a one-row frame with the feature columns.
    centroide = movies_r.mean(axis=0).to_frame().T

    # Fit on the candidates only and look up the neighbours of the centroid.
    model = neighbors.NearestNeighbors(n_neighbors=min(11, len(movies_nr)), metric='cosine')
    model.fit(movies_nr)
    idlist = model.kneighbors(centroide, return_distance=False)

    # kneighbors returns positions within `movies_nr`; translate them to the
    # row labels of `movies` before the lookup. Using the positions directly
    # as labels would select unrelated rows.
    etiquetas = movies_nr.index[idlist[0]]
    recomend_b = movies.loc[etiquetas][['title', 'movieId']]

    return recomend_b

In [92]:
# `interact` renders the widget itself; printing its return value only showed
# the function repr. The trailing ';' suppresses that repr.
interact(recomendar);

interactive(children=(Dropdown(description='user_id', options=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, …

<function recomendar at 0x0000029F37934900>


# 5. Filtros colaborativos

## Obtener datos

In [None]:
# Keep only ratings >= 1: the scale runs from 1 to 5, and movies that were not rated come with a default value of 0.5
df=pd.read_sql('select * from movies_rating where rating>=1', conn)

### Definir escala

In [None]:
# Rating scale handed to surprise: 1 to 5, matching the ratings kept by the
# `rating>=1` filter used when loading `df`.
reader = Reader(rating_scale=(1,5))

### Seleccionar columnas

In [None]:
# Display the loaded ratings table for inspection.
df

In [None]:
# Keep only the three columns the collaborative-filtering models use
# (user, item, rating), then display the result.
ratings=df[['userId','movieId','rating']]
ratings

### Leer datos con surprise

In [None]:
# Build the surprise Dataset from the (userId, movieId, rating) columns, in
# that order, using the rating scale defined in `reader`.
data   = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader)

### Definir modelos a probar

In [None]:
# Candidate KNN algorithms, each with default settings, and the dict that
# will collect their cross-validation summaries.
models=[KNNBasic(),KNNWithMeans(),KNNWithZScore(),KNNBaseline()] 
results = {}

In [None]:
# 5-fold cross-validation of every candidate algorithm. For each one, store
# the mean over folds of every value cross_validate returns (test metrics and
# timings), with the test metrics renamed to MAE / RMSE.
# The loop variable is `algo` so the global `model` is not overwritten, and
# the result key is the class name (e.g. 'KNNBasic') instead of a fragment
# parsed out of the object's repr.
for algo in models:
    CV_scores = cross_validate(algo, data, measures=["MAE","RMSE"], cv=5, n_jobs=-1)

    result = (
        pd.DataFrame.from_dict(CV_scores)
        .mean(axis=0)
        .rename({'test_mae': 'MAE', 'test_rmse': 'RMSE'})
    )
    results[type(algo).__name__] = result

In [None]:
# One row per algorithm, one column per averaged cross-validation value;
# displayed sorted by RMSE ascending.
performance_df = pd.DataFrame.from_dict(results).T
performance_df.sort_values(by='RMSE')

Se elige el KNN Basic, ya que es el que tiene mejores métricas en MAE y RMSE.

### Definir grilla de búsqueda

In [None]:
# Search space for the KNN similarity options: similarity measure, minimum
# support, and whether similarities are user-based or item-based.
param_grid = {
    'sim_options': {
        'name': ['msd', 'cosine'],
        'min_support': [40, 30, 20, 10, 5, 2],
        'user_based': [False, True],
    },
}

In [None]:
# Exhaustive search over `param_grid` for KNNBasic, scored by RMSE with
# 2-fold cross-validation on all available cores.
gridsearchKNNBasic = GridSearchCV(
    KNNBasic,
    param_grid,
    measures=['rmse'],
    cv=2,
    n_jobs=-1,
)
gridsearchKNNBasic.fit(data)

In [None]:
# Parameter combination the grid search selected for the RMSE measure.
gridsearchKNNBasic.best_params["rmse"]

In [None]:
# Best RMSE score found by the grid search.
gridsearchKNNBasic.best_score["rmse"]

In [None]:
# Estimator configured with the best RMSE parameters; it is fitted on the
# full trainset in the next step.
gs_model=gridsearchKNNBasic.best_estimator['rmse'] 

### Ajustar predicciones

In [None]:
# Fit the selected estimator on every available rating, then predict over the
# anti-testset built from that same trainset.
# NOTE(review): presumably the anti-testset holds every (user, movie) pair
# without a known rating, so it can be large. Confirm in the surprise docs.
trainset = data.build_full_trainset()
model=gs_model.fit(trainset)
predset = trainset.build_anti_testset()
predictions = gs_model.test(predset)

### Crear Dataset con las películas que no han sido vistas por cada usuario y su calificación predicha

In [None]:
# Predictions as a table, one row per predicted (user, movie) pair.
# Per the original author's note, `r_ui` holds the ratings average here rather
# than an observed rating.
predictions_df = pd.DataFrame(predictions)
# Only the last expression of a cell is displayed, so the intermediate
# .shape / .head() / .unique() expressions, whose values were discarded, are
# removed. The cell shows the predictions sorted by estimated rating, highest
# first.
predictions_df.sort_values(by='est',ascending=False)

### Definir función para mostrar las recomendaciones de cada usuario

In [None]:
def recomendaciones(user_id = np.sort(list(usuarios['user_id'].value_counts().index)),n_recomend=10):
    """Return the top predicted movies for a user from `predictions_df`.

    Parameters
    ----------
    user_id : int
        Id of the user. The default (the sorted array of all user ids) exists
        only so that `interact` renders a dropdown.
    n_recomend : int, default 10
        Number of recommendations to return. Negative values are treated as 0.

    Returns
    -------
    pandas.DataFrame
        Columns `title` and `est` (estimated rating), highest `est` first.
    """
    # `interact` turns the integer default into a slider that can go negative;
    # head(-k) would then return almost every row, so clamp at 0.
    n_recomend = max(int(n_recomend), 0)

    del_usuario = predictions_df[predictions_df['uid'] == int(user_id)]
    rec = del_usuario.sort_values(by="est", ascending=False).head(n_recomend)[['iid', 'est']]

    # Right merge keeps the rows, and therefore the ranking order, of `rec`
    # while attaching the movie titles.
    recomendados = pd.merge(movies[['movieId','title']], rec, left_on='movieId', right_on='iid', how='right')

    return recomendados[['title','est']]

In [None]:
# `interact` renders the widget itself; printing its return value only showed
# the function repr. The trailing ';' suppresses that repr.
interact(recomendaciones);