<h1><font color='blue'>Sistemas de Recomendação</font></h1>
<h2><b>Trabalho Final - Estudo de caso em recomendação</b></h2>


---

## **Carregamento e tratamento inicial dos dados**

Chamadas das bibliotecas que serão usadas

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from urllib.request import urlretrieve
import zipfile

import re

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

from surprise import KNNWithMeans
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import KNNBaseline
from surprise import SlopeOne
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV

Baixar o dataset movielens

In [None]:
# Download the MovieLens 100k archive into the working directory.
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")

# Extract through a context manager so the archive handle is always closed,
# even if extraction raises part-way through.
with zipfile.ZipFile('movielens.zip', "r") as zip_ref:
    zip_ref.extractall()

In [None]:
# def amostra(file):
#     i = 0
#     with open(file) as f:
#         for linha in f:
#             linha = linha.strip() 
#             print(linha)
#             i += 1
#             if i >= 5: return
            
# amostra('ml-100k/u.info')

Recuperar o dataset de usuários e transformá-lo em um dataframe

In [None]:
# Column names are passed explicitly, so the first line of u.user is read as data.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
df_users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
    low_memory=False,
)
df_users.head()

Análise da distribuição dos dados do dataset de usuários

In [None]:
# sns.countplot(x='sex', data=df_users); 
# plt.title('Distribuição dos usuários por sexo');  
# plt.xlabel('Sexo')
# plt.ylabel('Total')

In [None]:
# sns.histplot(data=df_users['age'], bins=7)
# plt.title('Distribuição dos usuários por idade');  
# plt.xlabel('Idade')
# plt.ylabel('Total')
# plt.show()

Recuperar o dataset de classificação de filmes (ratings) e transformá-lo em um dataframe

In [None]:
# u.data is tab-separated; column names are supplied here rather than read from the file.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
df_ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
    low_memory=False,
)
df_ratings.head()

Análise da distribuição dos dados do dataset de classificação de filmes

In [None]:
# sns.histplot(data=df_ratings['rating'], bins=5)
# plt.title('Distribuição das classificações dos usuários');  
# plt.xlabel('Classificação')
# plt.ylabel('Total')
# plt.show()

Recuperar o dataset de filmes e transformar em um dataframe

In [None]:
# Column layout of u.item: five descriptive fields followed by one flag column per genre.
m_cols = [
    'movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url',
    'unknown', 'action', 'adventure', 'animation', 'childrens', 'comedy',
    'crime', 'documentary', 'drama', 'fantasy', 'filmnoir', 'horror', 'musical',
    'mystery', 'romance', 'scifi', 'thriller', 'war', 'western',
]

df_movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, encoding='latin-1', low_memory=False)

# These two columns are not used by the rest of the notebook.
df_movies = df_movies.drop(columns=['video_release_date', 'imdb_url'])

df_movies.head()

Tratamento do dataset de filmes

Concatenar todos os gêneros de cada filme em uma string e adicioná-la em uma nova coluna

In [None]:
# Every column from position 3 onward is treated as a genre flag
# (a truthy value means the movie belongs to that genre).
generos = df_movies.columns[3:]

# For each movie, collect the lower-cased names of the genres whose flag is set.
valores = [
    [nome.lower() for flag, nome in zip(flags, generos) if flag]
    for flags in df_movies[generos].values
]

Análise da distribuição dos dados do dataset de gêneros de filmes

In [None]:
# all_genres = [s.split("|") for s in df_movies[df_movies.genres.notnull()].genres]
# genres = [item for l in all_genres for item in l ]
# unique_genres = set(genres)

# print (f"Total de {len(unique_genres)} gêneros e {len(genres)} ocorrências.")
# pd.Series(genres).value_counts().plot(kind='bar', figsize=(10, 3))
# plt.title("Total de filmes por gênero")
# plt.ylabel("Total de filmes")
# plt.xlabel("Gênero")
# plt.show()

In [None]:
# Store each movie's genres as a single '|'-separated string;
# movies with no genre flag set fall back to 'unknown'.
df_movies['genres'] = [
    '|'.join(nomes) if len(nomes) > 0 else 'unknown'
    for nomes in valores
]

Remover o ano do título

In [None]:
def update_title(title):
    """Return `title` without a trailing " (YYYY)" year suffix.

    Only a four-digit year in parentheses, preceded by a single space and
    located at the very end of the string, is removed; anything else is
    returned unchanged.
    """
    year_suffix = re.compile(r"(?: \(\d{4}\))$")
    return year_suffix.sub('', title)

# Strip the year suffix from every title, element by element.
df_movies['title'] = df_movies['title'].map(update_title)

Tratar o ano de lançamento dos filmes

In [None]:
# Parse the release date; values that cannot be parsed (or are missing) become NaT.
df_movies['release_date'] = pd.to_datetime(df_movies['release_date'], errors='coerce')

# Keep the year as a string because later cells concatenate it into text documents.
# `dt.strftime` yields NaN for NaT, so a missing date becomes '' instead of the
# literal string 'NaT' that `str(x).split('-')[0]` used to produce.
df_movies['year'] = df_movies['release_date'].dt.strftime('%Y').fillna('')

Análise da distribuição dos anos de lançamento dos filmes do dataset

In [None]:
# years = df_movies[df_movies.year.notnull()].year 
# print (f"Filmes por ano de {min(years)} até {max(years)}")
# pd.Series(years).value_counts().sort_index().plot(kind='bar', figsize=(30, 5))
# plt.title("Distribuição de filmes por ano")
# plt.ylabel("Total de filmes")
# plt.xlabel("Ano")
# plt.show()

Remover colunas desnecessárias para a análise

In [None]:
# The per-genre flag columns and the raw release date are no longer needed:
# their information now lives in the `genres` and `year` columns.
columns_to_drop = ['release_date', *generos]
df_movies = df_movies.drop(columns=columns_to_drop)

Criação das colunas de contagem (`vote_count`) e média (`vote_average`) das votações

In [None]:
# Per-movie rating statistics: number of ratings and their mean.
df_movrat = (
    df_ratings
    .groupby('movie_id')['rating']
    .agg(vote_count='count', vote_average='mean')
    .reset_index()
)

# Inner join: only movies that have at least one rating are kept.
df_movie_ratings = df_movies.merge(df_movrat, on='movie_id')

df_movie_ratings.head()

Top 10 filmes mais votados

In [None]:
# Most-rated movies first; ties are broken by the higher average rating.
(
    df_movie_ratings
    .sort_values(by=['vote_count', 'vote_average'], ascending=False)
    .head(10)
)

Top 10 filmes melhores classificados

In [None]:
# Highest average rating first; ties are broken by the larger number of ratings.
(
    df_movie_ratings
    .sort_values(by=['vote_average', 'vote_count'], ascending=False)
    .head(10)
)

## **Recomendação simples**

In [None]:
# One text document per movie: lower-cased title, year, and space-separated genres.
documentos = [
    ' '.join([title.lower(), year, genres.replace('|', ' ').lower()])
    for title, year, genres in df_movie_ratings[['title', 'year', 'genres']].values
]

documentos[:5]

In [None]:
# Fit a TF-IDF model over the per-movie documents, dropping NLTK's English
# stop words and enabling sublinear term-frequency scaling.
tfidf = TfidfVectorizer(sublinear_tf=True, stop_words=stopwords.words('english'))
tfidf_matrix = tfidf.fit_transform(documentos)

from sklearn.metrics.pairwise import cosine_similarity

# Full movie-by-movie cosine similarity matrix.
# NOTE(review): get_tfidf_scores below computes its similarities on demand and
# does not read this module-level matrix; confirm whether it is still needed.
sims = cosine_similarity(tfidf_matrix, tfidf_matrix)

def get_tfidf_scores(movieId=None):
    """Return the TF-IDF cosine similarity of every movie to a reference movie.

    Parameters
    ----------
    movieId : optional
        Row label of `df_movie_ratings` (it is resolved with `index.get_loc`,
        so it is NOT looked up in the `movie_id` column). When None, every
        score is 0.

    Returns
    -------
    pandas.Series
        Series named 'tfidf_score', indexed like `df_movie_ratings`.
    """
    if movieId is None:
        row_sims = np.zeros(df_movie_ratings.shape[0])
    else:
        row = df_movie_ratings.index.get_loc(movieId)
        # Similarity of the reference row against all rows; [0] unwraps the 1xN result.
        row_sims = cosine_similarity(tfidf_matrix[row], tfidf_matrix)[0]
    return pd.Series(row_sims, index=df_movie_ratings.index, name='tfidf_score')

# get_tfidf_scores(1)

In [None]:
def get_recommendations(movieId=None, topN=10):
    """Recommend the movies most similar (by TF-IDF) to a reference movie.

    Parameters
    ----------
    movieId : optional
        Value from the `movie_id` column identifying the reference movie.
        When None, all similarity scores are 0 and the ranking falls back to
        `vote_average`.
    topN : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by `movie_id`, with columns title, genres, vote_count,
        vote_average and score, sorted by score then vote_average (both
        descending). The reference movie itself is excluded.

    Raises
    ------
    KeyError
        If `movieId` is given but not present in `df_movie_ratings`.
    """
    # get_tfidf_scores expects a row label of df_movie_ratings, so translate the
    # movie_id explicitly instead of assuming "row label == movie_id - 1"
    # (which also crashed with a TypeError for the default movieId=None).
    if movieId is None:
        row_label = None
    else:
        matches = df_movie_ratings.index[df_movie_ratings['movie_id'] == movieId]
        if len(matches) == 0:
            raise KeyError(movieId)
        row_label = matches[0]

    content_scores = get_tfidf_scores(movieId=row_label)

    columns = ['movie_id', 'title', 'genres', 'vote_count', 'vote_average']
    aux = df_movie_ratings.loc[content_scores.index, columns].copy()
    aux['score'] = content_scores
    aux = aux.set_index('movie_id')

    # Never recommend the reference movie itself.
    aux = aux[~aux.index.isin([movieId])]

    return aux.sort_values(by=['score', 'vote_average'], ascending=False).head(topN)

In [None]:
# Example: movies most similar to movie_id 1.
get_recommendations(movieId=1)

Cálculo do IMDB Score

In [None]:
# Pegar a quantidade mínima de votos dos filmes com mais votos que 75% dos filmes 
m = df_movie_ratings['vote_count'].quantile(0.75)

# Computar o C - classificação média geral de todos os filmes 
C = df_movie_ratings['vote_average'].mean()

# Calcular o 'IMDB weighted rating' para cada filme
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']

    # Compute the weighted score
    return (v/(v+m) * R) + (m/(m+v) * C)

df_movie_ratings['imdb_score'] = df_movie_ratings.apply(weighted_rating, axis=1)

df_movie_ratings.head(10)

In [None]:
import math 

# Bounds used to normalise scores into [0, 1]; the upper bound is the largest
# observed rating rounded up to a whole number.
MIN_RATING = 0.0
MAX_RATING = float(math.ceil(df_ratings['rating'].max()))

print(MIN_RATING, MAX_RATING)

In [None]:
def get_popularity_scores():
    """Return each movie's `imdb_score` divided by MAX_RATING.

    The result is a Series indexed like `df_movie_ratings`.
    """
    normalized = df_movie_ratings['imdb_score'].div(MAX_RATING)
    return normalized

get_popularity_scores()

# **Implementação da recomendação por conteúdo - Content Based**

## **Cria user_profile**

In [None]:
# All genre names that occur in the '|'-separated `genres` strings, sorted.
all_genres = [s.split("|") for s in df_movie_ratings[df_movie_ratings.genres.notnull()].genres]
unique_genres = sorted(set([item for l in all_genres for item in l ]))

# One-hot encode the genres in a single vectorised step instead of writing
# cell by cell inside an `iterrows` loop. The result is indexed by movie_id,
# has one float column (0.0 / 1.0) per genre, and its columns follow
# `unique_genres`.
df_movie_genres = (
    df_movie_ratings
    .set_index('movie_id')['genres']
    .str.get_dummies(sep='|')
    .reindex(columns=unique_genres, fill_value=0)
    .astype(float)
)
df_movie_genres.head(3)

In [None]:
# Sanity check of the one-hot genre matrix: (rows, columns).
df_movie_genres.shape

In [None]:
# Ratings of user 196, keyed by movie_id, with `rating` as the only column.
# Building a new frame (instead of dropping columns in place on a filtered
# slice) avoids pandas' SettingWithCopyWarning.
df_user = (
    df_ratings
    .loc[df_ratings['user_id'] == 196, ['movie_id', 'rating']]
    .set_index('movie_id')
)

# Genre flags of the movies this user rated (rows ordered as in df_movie_genres).
df2_user = df_movie_genres[ df_movie_genres.index.isin(df_user.index) ].copy()

# Weight each movie's genre flags by the user's rating for THAT movie.
# The multiplication is aligned on movie_id: df_user is in rating-file order
# while df2_user is in df_movie_genres order, so a positional
# `df_user.values * df2_user` pairs ratings with the wrong movies.
wgm = df2_user.mul(df_user['rating'].reindex(df2_user.index), axis=0)

# Normalise so that the genre weights sum to 1.
pd.DataFrame(wgm.sum() / wgm.values.sum()).T

In [None]:
def create_user_profile(user=None, user_ratings=None):
    """Build a normalised genre-preference vector from a set of ratings.

    Exactly one of the two arguments must be given.

    Parameters
    ----------
    user : optional
        A `user_id`; the user's ratings are read from `df_ratings`.
    user_ratings : pandas.DataFrame, optional
        Ratings with a `rating` column, keyed by movie_id either through the
        index or through a `movie_id` column.

    Returns
    -------
    pandas.Series
        One weight per column of `df_movie_genres`; the weights sum to 1.
        If no rated movie is found in `df_movie_genres` the weights are all 0.

    Raises
    ------
    Exception
        If neither or both of `user` and `user_ratings` are provided.
    """
    # Exactly one of the two inputs must be provided.
    if (user is None) == (user_ratings is None):
        raise Exception('Necessário informar um usuário ou classificações')

    if user is not None:
        user_ratings = df_ratings.loc[df_ratings['user_id'] == user, ['movie_id', 'rating']]

    # Key the ratings by movie_id. Without this, ratings taken from df_ratings
    # keep their original row numbers as index, and the isin()/dot() steps
    # below would match row numbers against movie ids.
    if 'movie_id' in user_ratings.columns:
        user_ratings = user_ratings.set_index('movie_id')

    # Keep only ratings of movies that exist in the genre matrix.
    user_ratings = user_ratings[user_ratings.index.isin(df_movie_genres.index)]

    # Genre flags of the rated movies, weighted by rating and summed per genre.
    userdf = df_movie_genres[df_movie_genres.index.isin(user_ratings.index)]
    genre_weights = userdf.T.dot(user_ratings['rating'])

    total = genre_weights.sum()
    if total == 0:
        # Nothing to normalise by (e.g. no ratings); avoid a 0/0 -> NaN profile.
        return genre_weights
    return genre_weights / total

In [None]:
# Show the genre profile of user 196 as a single-row table.
create_user_profile(user=196).to_frame().T

In [None]:
# Same profile, this time built from an explicit ratings frame keyed by movie_id.
df_user_profile = create_user_profile(user_ratings=df_user)
df_user_profile.to_frame().T

In [None]:
import seaborn as sns

def plot_user_profile(user_profile):
    """Draw a bar chart of a user's genre weights.

    Parameters
    ----------
    user_profile : pandas.Series
        Genre weights indexed by genre name.

    Bars are ordered alphabetically by genre. Each weight is kept paired with
    its own label; zipping the values with an independently sorted list of
    keys would mislabel the bars whenever the profile index is not already
    sorted.
    """
    ordered = user_profile.sort_index()
    genres = pd.DataFrame({'pesos': ordered.values, 'generos': ordered.index})
    # Wide figure so that every genre label fits on the x axis.
    sns.set(rc={'figure.figsize':(20,5)})
    sns.barplot(x="generos", y="pesos", data=genres)


<h3>Apresentação dos resultados</h3>

In [None]:
# Plot the genre profile built from the ratings of user 196 in df_ratings.
plot_user_profile(create_user_profile(user=196))

In [None]:
# Plot the profile stored in df_user_profile (built from df_user in an earlier cell).
plot_user_profile(df_user_profile)

In [None]:
def get_movie_scores(user_profile):
    """Attach a user-profile score to every movie.

    The score of a movie is the dot product between its genre flags in
    `df_movie_genres` and `user_profile`. Returns a copy of
    `df_movie_ratings` indexed by movie_id with an extra
    'user_profile_score' column.
    """
    profile_scores = pd.DataFrame(
        df_movie_genres.dot(user_profile),
        columns=['user_profile_score'],
    )
    # set_index returns a new frame, so df_movie_ratings itself is not modified.
    return df_movie_ratings.set_index('movie_id', drop=True).join(profile_scores)

get_movie_scores(df_user_profile).head()

In [None]:
def get_user_profile_scores(user, best=False):
    """Score every movie against a user's genre profile.

    Parameters
    ----------
    user :
        `user_id` whose profile is built with `create_user_profile`.
    best : bool
        When falsy, the score is the plain dot product between each movie's
        genre flags and the profile. When truthy, that product is divided by
        the movie's number of genres and a bonus is added for the user's
        favourite genres (those whose weight is above the profile's 75th
        percentile).

    Returns
    -------
    pandas.Series
        Scores indexed like `df_movie_genres`.
    """
    user_profile = create_user_profile(user=user)
    scores = df_movie_genres.dot(user_profile)

    if not best:
        return scores

    # Favourite genres: weight strictly above the profile's 75th percentile.
    cutoff = user_profile.quantile(0.75)
    favourite = user_profile[user_profile > cutoff].index
    is_favourite = df_movie_genres.columns.isin(favourite)

    # Number of favourite genres each movie belongs to.
    favourite_hits = df_movie_genres.loc[:, is_favourite].sum(axis=1)

    # Average the profile score over the movie's genres (at least 1 to avoid /0).
    genre_count = df_movie_genres.sum(axis=1)
    genre_count = genre_count.where(genre_count > 0, 1)
    scores = scores / genre_count

    # Bonus: fraction of all genre columns that are favourites present in the movie.
    scores = scores + favourite_hits / df_movie_genres.shape[1]

    return scores

In [None]:
# get_user_profile_scores(user=1) 

In [None]:
# get_user_profile_scores(user=1, best=True)

In [None]:
aux = df_movie_ratings.copy()

# get_user_profile_scores returns a Series keyed by movie_id, whereas `aux`
# keeps df_movie_ratings' own row labels. Look the scores up through the
# movie_id column; a direct assignment would align on row labels and attach
# each score to the wrong movie.
aux['up_scores'] = aux['movie_id'].map(get_user_profile_scores(user=1))
aux['up_scores_best'] = aux['movie_id'].map(get_user_profile_scores(user=1, best=True))

In [None]:
# aux.sort_values(by='up_scores', ascending=False).head(10)

In [None]:
# aux.sort_values(by='up_scores_best', ascending=False).head(10)

## **Cria índice TF-IDF**

In [None]:
# One text document per movie: lower-cased title, year, and space-separated genres.
documentos = [
    ' '.join((titulo.lower(), ano, generos_txt.replace('|', ' ').lower()))
    for titulo, ano, generos_txt in df_movie_ratings[['title', 'year', 'genres']].values
]

# documentos[:5]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Fit the TF-IDF model over the per-movie documents (English stop words removed,
# sublinear term-frequency scaling enabled).
# NOTE(review): this repeats the fit done in the "Recomendação simples" section.
tfidf = TfidfVectorizer(sublinear_tf=True, stop_words=stopwords.words('english'))
tfidf_matrix = tfidf.fit_transform(documentos)

# print(pd.DataFrame(tfidf_matrix.todense(), columns=tfidf.get_feature_names_out()).iloc[:10])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Full movie-by-movie cosine similarity matrix over the TF-IDF vectors.
# NOTE(review): get_tfidf_scores computes similarities on demand and does not
# read this matrix; confirm whether it is still needed.
sims = cosine_similarity(tfidf_matrix, tfidf_matrix)
# pd.DataFrame(sims).iloc[:,:5].head()

In [None]:
def get_tfidf_scores(movieId=None):
    """Return the TF-IDF cosine similarity of every movie to a reference movie.

    `movieId` is resolved with `df_movie_ratings.index.get_loc`, i.e. it is a
    row label of `df_movie_ratings`, not a value of the `movie_id` column.
    When it is None, every score is 0.

    The result is a Series named 'tfidf_score', indexed like
    `df_movie_ratings`.
    """
    if movieId is not None:
        position = df_movie_ratings.index.get_loc(movieId)
        # 1xN similarity row of the reference movie against all movies.
        values = cosine_similarity(tfidf_matrix[position], tfidf_matrix)[0]
    else:
        values = np.zeros(df_movie_ratings.shape[0])
    return pd.Series(values, index=df_movie_ratings.index, name='tfidf_score')

In [None]:
# get_tfidf_scores(1)

In [None]:
def get_content_scores(user=None, movieId=None, topN=99999999, weights=[0.5,0.5]):
    """Content-based scores for the movies a user has not rated yet.

    Parameters
    ----------
    user :
        `user_id` whose genre profile drives the scores.
    movieId : optional
        Value from the `movie_id` column. When given, the score is the
        weighted sum of the user-profile score and the TF-IDF similarity to
        this movie, and the movie itself is excluded from the result.
    topN : int
        Maximum number of scores to return (the default is effectively "all").
    weights : sequence of two floats
        Weights of the user-profile score and of the TF-IDF score, in that
        order. Only used when `movieId` is given.

    Returns
    -------
    pandas.Series
        Scores indexed by movie_id, sorted in descending order, without the
        movies the user already rated.

    Raises
    ------
    KeyError
        If `movieId` is given but not present in `df_movie_ratings`.
    """
    watched_movies = list(df_ratings.query('user_id == ' + str(user))['movie_id'].values)
    # The reference movie must not be recommended back to the user.
    if movieId is not None and movieId not in watched_movies:
        watched_movies.append(movieId)

    up_scores = get_user_profile_scores(user=user, best=True)

    if movieId is not None:
        # get_tfidf_scores expects a row label of df_movie_ratings, not a
        # movie_id; passing the movie_id directly scored against the wrong
        # movie and raised KeyError for the last id.
        matches = df_movie_ratings.index[df_movie_ratings['movie_id'] == movieId]
        if len(matches) == 0:
            raise KeyError(movieId)
        tfidf_by_row = get_tfidf_scores(matches[0])

        # Re-key the similarities by movie_id so both score vectors align by
        # label instead of by position.
        tfidf_scores = pd.Series(
            tfidf_by_row.values,
            index=df_movie_ratings['movie_id'].values,
        ).reindex(up_scores.index)

        avg_scores = weights[0] * up_scores + weights[1] * tfidf_scores
    else:
        # No reference movie: rank by the user profile alone.
        avg_scores = up_scores

    scores = pd.DataFrame({'user_profile_score': up_scores, 'score': avg_scores})

    unseen = scores[~scores.index.isin(watched_movies)]
    return unseen.sort_values(by=['score'], ascending=False)['score'].head(topN)

In [None]:
# get_content_scores(user=1)

In [None]:
# get_content_scores(user=1, movieId=1)

# **Implementação da recomendação colaborativa com a biblioteca surprise - Collaborative Based**

In [None]:
# Wrap the ratings in a surprise Dataset.
# NOTE(review): the declared scale starts at 0, matching MIN_RATING above;
# confirm this is intended if the observed ratings start at 1.
reader = Reader(rating_scale=(0, 5))
df_data = Dataset.load_from_df(df_ratings[['user_id', 'movie_id', 'rating']], reader)

# Train on every available rating; `testset` is built from that same trainset,
# so scoring on it measures fit to the training data rather than generalisation.
trainset = df_data.build_full_trainset()
testset = trainset.build_testset()

# Anti-testset built from the same trainset.
# NOTE(review): not referenced by the other cells visible here; confirm it is needed.
ausentes = trainset.build_anti_testset()

In [None]:
# KNNBaseline with 'pearson_baseline' similarity; user_based=False selects
# item-item (rather than user-user) similarities.
knn = KNNBaseline( sim_options = {'name': 'pearson_baseline', 'user_based': False} )
knn.fit(trainset)

In [None]:
def get_itens_vizinhos(movieId, k=10):
    """Return the raw ids of the `k` nearest neighbours of a movie.

    `movieId` is a raw item id as used in the ratings; it is translated to
    surprise's inner id for the neighbour lookup in the fitted `knn` model and
    the neighbours are translated back to raw ids.
    """
    inner_id = trainset.to_inner_iid(movieId)
    neighbours = knn.get_neighbors(inner_id, k)
    return list(map(knn.trainset.to_raw_iid, neighbours))

get_itens_vizinhos(1)

In [None]:
# df_movies.loc[ [1] + get_itens_vizinhos(1) ]

In [None]:
# knn.sim

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale the item-item similarity matrix with MinMaxScaler (column-wise).
scaler = MinMaxScaler()
sims_itens = scaler.fit_transform(knn.sim)

def get_similaridades(movieId):
    """Return the rescaled similarity of every item to `movieId`.

    `movieId` is a raw item id. The similarities are ordered by ascending raw
    item id and returned as a Series that reuses `df_movies.index` as its
    index (so the labels are df_movies row labels, not raw item ids).
    Negative values are replaced by 0.0.
    """
    row = sims_itens[trainset.to_inner_iid(movieId)]

    # Map raw item id -> clipped similarity, then read it back in raw-id order.
    score_by_raw_id = {
        trainset.to_raw_iid(inner): (0.0 if value < 0 else value)
        for inner, value in enumerate(row)
    }
    ordered_scores = [score_by_raw_id[raw] for raw in sorted(score_by_raw_id)]
    return pd.Series(ordered_scores, index=df_movies.index)

get_similaridades(1)[:5]

In [None]:
# Compare several surprise algorithms by 3-fold cross-validated RMSE.
benchmark = []

metodos = [SVD(), SlopeOne(), NMF(), KNNBaseline(), KNNWithMeans()]

for algoritmo in metodos:
    results = cross_validate(algoritmo, df_data, measures=['RMSE'], cv=3, verbose=True)

    # Average every reported measure over the folds and record the algorithm
    # name. A plain dict is used because `Series.append` was removed in
    # pandas 2.0, and the class name is read from the type instead of being
    # parsed out of the object's repr.
    resumo = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()
    resumo['algoritmo'] = type(algoritmo).__name__
    benchmark.append(resumo)

pd.DataFrame(benchmark).set_index('algoritmo').sort_values('test_rmse')

In [None]:
# Grid-search NMF over the number of factors and epochs (3-fold CV, RMSE).
param_grid = {
    'n_factors': [5, 15, 50],
    'n_epochs': [5, 15, 50],
}

gs = GridSearchCV(NMF, param_grid, measures=['rmse'], cv=3)
gs.fit(df_data)

# Estimator configured with the best RMSE parameters; it still has to be fitted.
best_model = gs.best_estimator['rmse']
print('====>', best_model)

# Fit the selected configuration on the full training set.
best_model.fit(trainset)

# **Implementação de recomendação híbrida com abordagem mixed**

In [None]:
# Quick look at the movie/rating frame used by the hybrid recommender below.
df_movie_ratings.head(3)

In [None]:
def get_mixed_hybrid_recommendation(user=None, topN=10):
    """Mixed hybrid recommendation: pool candidates from three recommenders.

    Every movie is scored by popularity (normalised IMDB score), by content
    (`get_content_scores`) and by the collaborative model (`best_model`).
    The three candidate lists are concatenated and the `topN` highest scores
    are returned, so a movie may appear once per source.

    Parameters
    ----------
    user :
        `user_id` to recommend for.
    topN : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by movie_id, with columns title, genres, score and source,
        sorted by score in descending order. Movies the user already rated
        are excluded from every source.
    """
    # Movies the user has already rated must not be recommended.
    watched_movies = list(df_ratings.loc[df_ratings['user_id'] == user, 'movie_id'].values)

    # Key everything by movie_id so that scores from the three sources line up
    # by label rather than by row position.
    catalog = df_movie_ratings.set_index('movie_id')
    base = catalog[['title', 'genres', 'year']]

    # Popularity: IMDB weighted rating normalised by the maximum rating.
    aux_pop = base.copy()
    aux_pop['score'] = catalog['imdb_score'] / MAX_RATING
    aux_pop['source'] = 'imdb'

    # Content: scores come back keyed by movie_id and already exclude watched
    # movies; those have no score here and are dropped.
    aux_cont = base.copy()
    aux_cont['score'] = get_content_scores(user=user)
    aux_cont = aux_cont.dropna(subset=['score'])
    aux_cont['source'] = 'content'

    # Collaborative: predicted rating normalised by the maximum rating.
    # NOTE(review): one random offset (drawn once per call, unseeded) is
    # subtracted from every collaborative score, so results vary between
    # calls. Kept as in the original; confirm the intent.
    aux_colab = base.copy()
    offset = np.random.rand() / 4.2
    aux_colab['score'] = [
        best_model.predict(user, movie_id).est / MAX_RATING
        for movie_id in aux_colab.index
    ]
    aux_colab['score'] = aux_colab['score'] - offset
    aux_colab['source'] = 'collaborative'

    aux = pd.concat([aux_pop, aux_cont, aux_colab])
    aux = aux[~aux.index.isin(watched_movies)]

    return aux[['title','genres','score','source']]\
            .sort_values(by=['score'], ascending=False)\
            .head(topN)

get_mixed_hybrid_recommendation(user=1)