# Exercícios

## Sistemas de Recomendação

Para esse conjunto de exercícios vamos utilizar novamente o dataset MovieLens 100k que utilizamos em nosso Warmup. Utilizando esse Dataset, implemente as funcionalidades a seguir, podendo agora utilizar as bibliotecas padrões do python (numpy, math, scipy ) para implementar suas soluções. Faça a organização do seu código agora utilizando funções buscando reutilizar seu código.

## Funcionalidade:

1. Implementar o algoritmo de Similaridade do Cosseno;
2. Implementar um Sistema de Recomendações, usando a técnica de Filtros Colaborativos baseados no usuário;
3. Implementar um Sistema de Recomendações, usando a técnica de Filtros Colaborativos baseados na similaridade entre os itens (filmes);
4. Implementar a Medida de Correlação de Pearson como medida de similaridade;


In [1]:
# imports necessários

import pandas as pd
import numpy as np
from scipy import spatial
from numpy import dot
from numpy.linalg import norm
import seaborn as sns
from scipy import stats

### 1. Implementar o algoritmo de Similaridade do Cosseno;

$$ cos\_sim(x, y) = \frac{x \cdot y}{\| x \| \cdot \| y \|} $$

In [2]:
# utilizando scipy 

def similaridade_cosseno_scipy(list1, list2):
    result = 1 - spatial.distance.cosine(list1, list2)
    return result

In [3]:
def similaridade_cosseno_numpy(list1, list2):
    result = dot(list1, list2) / (norm(list1) * norm(list2))
    return result

### 2. Implementar um Sistema de Recomendações, usando a técnica de Filtros Colaborativos baseados no usuário;

In [4]:
# Carregando o dados do arquivo u.data

data = pd.read_csv(r'/home/hub/Downloads/ml-100k/u.data', sep="\t", names=['user_id', 'movie_id', 'rating', 'timestamp'])
data

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [5]:
# Informações sobre os filmes

# duvida sobre o 'encoding', se não utilizar, da error

info_data = pd.read_csv(r'/home/hub/Downloads/ml-100k/u.item', sep="|", encoding='latin-1', names = ['movie_id', 'movie_title' ,'release_date','video_release_date', 'IMDb_URL', 'unknown', 'Action', 
                'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 
                'Horror', 'Musical', 'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western'])
info_data.drop(columns=['video_release_date', 'IMDb_URL'], inplace=True)
info_data

# Em relação aos generos dos filmes, 1 significa True, 0 False.

Unnamed: 0,movie_id,movie_title,release_date,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# função para ver as notas de um usuario

def notas_do_usuario(usuario):
  notas_do_usuario = data.query("user_id==%d" % usuario)
  notas_do_usuario = notas_do_usuario[["movie_id", "rating"]].set_index("movie_id")
  return notas_do_usuario

In [7]:
notas_do_usuario(1)

Unnamed: 0_level_0,rating
movie_id,Unnamed: 1_level_1
61,4
189,3
33,4
160,4
20,4
...,...
28,4
172,5
122,3
152,5


In [8]:
# Calcula a similaridade dos cosseno entre um usuario e outro (inserir user_id)
# minimo é a quantidade minima de filmes em comum

def similaridade_de_usuarios(usuario_id1, usuario_id2, minimo = 5):
  notas1 = notas_do_usuario(usuario_id1)
  notas2 = notas_do_usuario(usuario_id2)
  join_notas1_2 = notas1.join(notas2, lsuffix="_esquerda", rsuffix="_direita").dropna()
  
  if(len(join_notas1_2) < minimo):
    return None
  
  similaridade =  similaridade_cosseno_scipy(join_notas1_2['rating_esquerda'], join_notas1_2['rating_direita'])
  return [usuario_id1, usuario_id2, similaridade]

In [9]:
similaridade_de_usuarios(1,5)

[1, 5, 0.9326135855945642]

In [10]:
# Calcula a similaridade entre um user_id e todos os restantes do dataset

def similardade_de_todos(input_id, numero_de_usuarios_a_analisar = None):
  todos_os_usuarios = data['user_id'].unique()
  if numero_de_usuarios_a_analisar:
    todos_os_usuarios = todos_os_usuarios[:numero_de_usuarios_a_analisar]
  similaridades = [similaridade_de_usuarios(input_id, usuario_id) for usuario_id in todos_os_usuarios]
  similaridades = list(filter(None, similaridades))
  similaridades = pd.DataFrame(similaridades, columns = ["input_id", "usuario_id", "similaridades"])
  return similaridades

similardade_de_todos(1).head()

Unnamed: 0,input_id,usuario_id,similaridades
0,1,196,0.887837
1,1,186,0.867442
2,1,22,0.967375
3,1,244,0.955774
4,1,298,0.943073


In [11]:
# Apresenta os ids dos usuarios relacionados com o input_id e o valor da similaridades
# Sugerindo baseado em vários usuários
# parametro k_mais_proximos, pega os 10 usuarios mais similares ao input_id
# numero_de_usuarios_a_analisar é a quantidade de usuarios que será analisado (caso dataset seja muito grande)

def knn(input_id, k_mais_proximos=10, numero_de_usuarios_a_analisar = None):
  similaridades = similardade_de_todos(input_id, numero_de_usuarios_a_analisar = numero_de_usuarios_a_analisar)
  similaridades = similaridades.sort_values("similaridades")
  similaridades = similaridades.set_index("usuario_id").drop(input_id)
  return similaridades.head(k_mais_proximos)

knn(1).head()

Unnamed: 0_level_0,input_id,similaridades
usuario_id,Unnamed: 1_level_1,Unnamed: 2_level_1
589,1,0.638755
35,1,0.668503
502,1,0.703
451,1,0.716028
335,1,0.752164


In [12]:
# apaga a coluna do genero dos quais os filmes não fazem parte

def apaga_genero(dataframe):
    drops = []
    for item in dataframe.columns:
        if dataframe[item].sum() == 0:
            drops.append(item)
    dataframe.drop(columns = drops, inplace=True)
    return dataframe

In [13]:
# Sugere filmes com base no id dos usuarios

def sugere_para(input_id, k_mais_proximos = 10, numero_de_usuarios_a_analisar = None):
  notas_input_id = notas_do_usuario(input_id)
  filmes_inpud_id_ja_viu = notas_input_id.index

  similares = knn(input_id, k_mais_proximos = k_mais_proximos, numero_de_usuarios_a_analisar = numero_de_usuarios_a_analisar)
  usuarios_similares = similares.index
  notas_dos_similares = data.set_index("user_id").loc[usuarios_similares]
  recomendacoes = notas_dos_similares.groupby("movie_id").mean()[["rating"]]
  recomendacoes = recomendacoes.sort_values("rating", ascending=False)
  return apaga_genero(recomendacoes.join(info_data.set_index('movie_id')))


In [14]:
sugere_para(1)

Unnamed: 0_level_0,rating,movie_title,release_date,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1617,5.0,Hugo Pool (1997),01-Jan-1997,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
778,5.0,Don Juan DeMarco (1995),01-Jan-1995,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0
331,5.0,"Edge, The (1997)",26-Sep-1997,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
309,5.0,Deceiver (1997),01-Jan-1997,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
873,5.0,Picture Perfect (1997),01-Aug-1997,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,1.0,Wishmaster (1997),01-Jan-1997,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1038,1.0,Switchback (1997),01-Jan-1997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1026,1.0,"Lay of the Land, The (1997)",01-Jan-1997,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
988,1.0,"Beautician and the Beast, The (1997)",07-Feb-1997,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0


In [15]:
# Fazendo tudo dentro de uma classe

class recomend_based_user():
    """
    Sistema de Recomendações, usando a técnica de Filtros Colaborativos baseados no usuário
    """
    def __init__(self, dataframe_1, dataframe_2):
        self.dataset_1 = dataframe_1
        self.dataset_2 = dataframe_2
    
    def similaridade_cosseno_scipy(self, list1, list2):
        result = 1 - spatial.distance.cosine(list1, list2)
        return result

    def notas_do_usuario(self, usuario):
        notas_do_usuario = self.dataset_1.query("user_id==%d" % usuario)
        notas_do_usuario = notas_do_usuario[["movie_id", "rating"]].set_index("movie_id")
        return notas_do_usuario
    
    def similaridade_de_usuarios(self, usuario_id1, usuario_id2, minimo = 5):
        notas1 = notas_do_usuario(usuario_id1)
        notas2 = notas_do_usuario(usuario_id2)
        join_notas1_2 = notas1.join(notas2, lsuffix="_esquerda", rsuffix="_direita").dropna()
  
        if(len(join_notas1_2) < minimo):
            return None
  
        similaridade =  similaridade_cosseno_scipy(join_notas1_2['rating_esquerda'], join_notas1_2['rating_direita'])
        return [usuario_id1, usuario_id2, similaridade]
    
    def similardade_de_todos(self, input_id, numero_de_usuarios_a_analisar = None):
        todos_os_usuarios = self.dataset_1['user_id'].unique()
        if numero_de_usuarios_a_analisar:
            todos_os_usuarios = todos_os_usuarios[:numero_de_usuarios_a_analisar]
        similaridades = [similaridade_de_usuarios(input_id, usuario_id) for usuario_id in todos_os_usuarios]
        similaridades = list(filter(None, similaridades))
        similaridades = pd.DataFrame(similaridades, columns = ["input_id", "usuario_id", "similaridades"])
        return similaridades
    
    def knn(self, input_id, k_mais_proximos=10, numero_de_usuarios_a_analisar = None):
        similaridades = similardade_de_todos(input_id, numero_de_usuarios_a_analisar = numero_de_usuarios_a_analisar)
        similaridades = similaridades.sort_values("similaridades")
        similaridades = similaridades.set_index("usuario_id").drop(input_id)
        return similaridades.head(k_mais_proximos)
    
    def apaga_genero(self, dataframe):
        drops = []
        for item in dataframe.columns:
            if dataframe[item].sum() == 0:
                drops.append(item)
        dataframe.drop(columns = drops, inplace=True)
        return dataframe
    
    def sugere_para(self, input_id, k_mais_proximos = 10, numero_de_usuarios_a_analisar = None):
        notas_input_id = notas_do_usuario(input_id)
        filmes_inpud_id_ja_viu = notas_input_id.index
        similares = knn(input_id, k_mais_proximos = k_mais_proximos, numero_de_usuarios_a_analisar = numero_de_usuarios_a_analisar)
        usuarios_similares = similares.index
        notas_dos_similares = self.dataset_1.set_index("user_id").loc[usuarios_similares]
        recomendacoes = notas_dos_similares.groupby("movie_id").mean()[["rating"]]
        recomendacoes = recomendacoes.sort_values("rating", ascending=False)
        return apaga_genero(recomendacoes.join(self.dataset_2.set_index('movie_id')))
    

In [16]:
rec = recomend_based_user(data, info_data)

In [17]:
rec.sugere_para(1)

Unnamed: 0_level_0,rating,movie_title,release_date,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1617,5.0,Hugo Pool (1997),01-Jan-1997,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
778,5.0,Don Juan DeMarco (1995),01-Jan-1995,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0
331,5.0,"Edge, The (1997)",26-Sep-1997,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
309,5.0,Deceiver (1997),01-Jan-1997,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
873,5.0,Picture Perfect (1997),01-Aug-1997,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,1.0,Wishmaster (1997),01-Jan-1997,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1038,1.0,Switchback (1997),01-Jan-1997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1026,1.0,"Lay of the Land, The (1997)",01-Jan-1997,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
988,1.0,"Beautician and the Beast, The (1997)",07-Feb-1997,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0


### 3. Implementar um Sistema de Recomendações, usando a técnica de Filtros Colaborativos baseados na similaridade entre os itens (filmes);

In [22]:
# Para tal tecnica assumimos que rating = 0 significa que o usario não avaliou aquele filme (NaN = 0)
# Sistema de recomendação utilizando a decomposição de valores singulares (SVD)

from numpy.linalg import svd

matrix_1 = data.pivot(index = "user_id",
                   columns = "movie_id",
                   values = "rating").fillna(0)
matrix = matrix_1.values
u, s, vh = svd(matrix, full_matrices=False)


In [23]:
u

array([[-6.58043056e-02,  5.97506439e-03, -6.13255533e-03, ...,
         1.02599859e-03, -3.98164955e-03,  9.01590568e-04],
       [-1.40210432e-02, -4.66260235e-02,  5.25785570e-02, ...,
         1.08597866e-02, -2.04456480e-03, -1.27743484e-03],
       [-5.65798108e-03, -2.56184475e-02,  2.33618292e-02, ...,
         5.16234564e-03,  1.34125699e-02,  1.37991290e-03],
       ...,
       [-7.44452122e-03, -2.50212869e-02,  6.16532475e-03, ...,
         2.63658734e-02,  3.18613301e-03,  6.76975924e-03],
       [-2.40311860e-02,  8.09611360e-03,  2.28873569e-02, ...,
        -4.44753529e-03, -2.58792477e-03, -5.73382835e-03],
       [-4.22420915e-02, -1.09271486e-02, -5.85460445e-02, ...,
         6.50642742e-04, -4.15913263e-03, -6.67710004e-05]])

In [24]:
s

array([640.63362257, 244.83634567, 217.84622472, 159.15359872,
       158.21191449, 145.87261327, 126.57977314, 121.90769976,
       106.8291837 ,  99.74793974,  93.79885965,  93.25844284,
        89.91150168,  84.34178722,  83.81220836,  81.81204105,
        79.07796788,  77.88652669,  76.387996  ,  75.3415951 ,
        73.68235502,  72.80837191,  72.51350545,  71.52749477,
        69.77179735,  69.10881715,  68.8735702 ,  67.94277928,
        67.40829434,  67.06352378,  66.85757418,  65.59270059,
        65.27526042,  64.79965625,  64.44727664,  64.09819141,
        63.91638042,  63.08261122,  62.67586971,  62.23742793,
        62.03574728,  61.77291401,  61.33544177,  61.0632462 ,
        60.56817026,  60.30813928,  59.77166759,  59.51420996,
        59.40675   ,  59.10683763,  58.83667955,  58.53445585,
        58.33802154,  58.1323194 ,  57.41759146,  57.36384311,
        57.30977341,  56.99448748,  56.72636608,  56.239748  ,
        56.17894513,  55.8734678 ,  55.65459359,  55.52

In [25]:
vh

array([[-9.59509371e-02, -3.51795155e-02, -1.99288117e-02, ...,
        -3.03747116e-05, -3.31055915e-04, -3.16852950e-04],
       [-8.72397853e-02, -7.02505798e-03, -2.86181725e-02, ...,
        -4.48134760e-04,  1.05231342e-04,  2.03151884e-04],
       [-1.69737618e-02, -6.25039193e-02, -1.16405039e-02, ...,
         5.33024145e-04, -4.54336533e-04, -2.61400068e-04],
       ...,
       [-1.26681796e-02, -3.34473768e-02,  7.28086206e-03, ...,
        -1.35217677e-02,  1.84916008e-03,  8.40358953e-04],
       [ 1.77118452e-02,  3.40698283e-02,  2.78896152e-02, ...,
         2.32835943e-02, -6.62201648e-03, -1.91406048e-03],
       [-1.05201630e-02,  1.58290193e-02,  2.53698839e-02, ...,
        -4.41196791e-02, -1.05796040e-02, -1.52428879e-02]])

In [40]:
def similaridade_item(movie_id):
    lista_similarity = []
    sim_col = []
    for col in range(1,vh.shape[1]):
        similarity = similaridade_cosseno_scipy(vh[:,(movie_id-1)], vh[:,col])
        lista_similarity.append(similarity)
        sim_col.append(col)
    d = {'movie_id': sim_col, 'similarity': lista_similarity}
    df = pd.DataFrame(data=d)
    result = df.set_index('movie_id').sort_values("similarity", ascending=False)
    return result.join(info_data.set_index('movie_id')).drop(['similarity'], axis = 1)


In [41]:
similaridade_item(1).head()

Unnamed: 0_level_0,movie_title,release_date,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1615,Warriors of Virtue (1997),02-May-1997,0,1,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1241,"Van, The (1996)",27-Jun-1997,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
972,Passion Fish (1992),01-Jan-1992,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1653,Entertaining Angels: The Dorothy Day Story (1996),27-Sep-1996,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1310,"Walk in the Sun, A (1945)",01-Jan-1945,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 4. Implementar a Medida de Correlação de Pearson como medida de similaridade;

$$ corr = \frac{1}{(n-1)} \cdot \sum \left( \frac{x_i - \bar{x}}{s_x} \right) \left(\frac{y_i - \bar{y}}{s_y} \right) $$

In [42]:
def corr_p(x, y):
    n = len(x)
    x_m = x - np.mean(x)
    x_m = x_m / np.std(x, ddof=1)
    y_m = y - np.mean(y)
    y_m = y_m / np.std(y, ddof=1)
    return (x_m * y_m).sum() / (n - 1)