# Movie Recommendation

In [1]:
# Importando bibliotecas que serão utilizadas

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [2]:
# Load the dataframes used throughout the notebook.
# NOTE(review): DATASET_DIR is an absolute, machine-specific path; point it at
# your local copy of the "ml-latest-small" folder before running.

DATASET_DIR = r"G:\Meu Drive\My Repositories\Datasets_for_data_science\df's-db's-ds's\databases\datasets\ml-latest-small\\"

df_movies = pd.read_csv(DATASET_DIR + "movies.csv")
df_ratings = pd.read_csv(DATASET_DIR + "ratings.csv")
#df_tags = pd.read_csv(DATASET_DIR + "tags.csv")

In [3]:
# Reshape the dataframes for easier use.
# NOTE: this cell rebinds df_movies / df_ratings, so it can only run once per
# fresh load (a second run fails because 'movieId' / 'userId' are no longer columns).

# Index the movies by movieId; set_index removes the column from the frame.
df_movies = df_movies.set_index('movieId')

# Per-movie aggregates. Column assignment aligns on the movieId index, so a
# movie with no rating at all ends up with NaN in both columns.
df_movies['mean_ratings'] = df_ratings.groupby('movieId')['rating'].mean()
df_movies['total_rates'] = df_ratings['movieId'].value_counts()

# Index the ratings by userId so that per-user rows can be selected by label.
df_ratings = df_ratings.set_index('userId')

In [4]:
# Distance between two points

def vetor_dist(a,b):
  """Return the norm of ``a - b``.

  For 1-D inputs this is the Euclidean (L2) distance between the two points.
  ``a`` and ``b`` must support element-wise subtraction (NumPy arrays,
  pandas Series, ...).
  """
  difference = a - b
  return np.linalg.norm(difference)

In [5]:
# Ratings given by one user

def user_grade(user_id):
    """Return the ratings of ``user_id`` as a frame indexed by ``movieId``.

    Reads the module-level ``df_ratings`` (indexed by ``userId``). The result
    has a single ``rating`` column; it is empty when the user has no rows.
    """
    rows = df_ratings.query(f'userId == {user_id}')
    return rows.set_index('movieId')[['rating']]

In [6]:
# Distance between two users

def users_dist(user_id_01, user_id_02, min = 5):
  """Return ``[user_id_01, user_id_02, distance]`` for a pair of users.

  The distance is ``vetor_dist`` applied to the two users' ratings over the
  movies both of them rated. When they share fewer than ``min`` movies the
  distance slot is ``None``.

  Note: the ``min`` parameter shadows the builtin of the same name inside
  this function; it is kept because it is part of the signature.
  """
  left = user_grade(user_id_01)
  right = user_grade(user_id_02)

  # Left join on movieId, then drop the movies the second user did not rate.
  common = left.join(right, lsuffix = f'_{user_id_01}', rsuffix = f'_{user_id_02}').dropna()

  # Too little overlap: report the pair without a distance.
  if len(common) < min:
    return [user_id_01, user_id_02, None]

  ratings_01 = common[f'rating_{user_id_01}']
  ratings_02 = common[f'rating_{user_id_02}']
  return [user_id_01, user_id_02, vetor_dist(ratings_01, ratings_02)]

In [7]:
# Distance from one user to every other user

def all_users_dist(user_id):
  """Return the distance from ``user_id`` to every other user in ``df_ratings``.

  The result is a DataFrame indexed by the other user's id (index name
  ``'Index'``) with columns ``'Principal Id'`` (always ``user_id``) and
  ``'Distance'`` (missing when ``users_dist`` reports too few movies in common).
  """
  # Skip the user itself: it is not its own neighbour. This replaces a
  # hard-coded ``drop(1)`` that was only correct when ``user_id`` was 1.
  other_users = [user for user in df_ratings.index.unique() if user != user_id]
  dists = [users_dist(user_id, user) for user in other_users]

  return pd.DataFrame(dists, columns = ['Principal Id', 'Index', 'Distance']).set_index('Index')

In [8]:
# Preview: distances from user 1 to the first few other users
all_users_dist(1).head()

Unnamed: 0_level_0,Principal Id,Distance
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
2,1,
3,1,8.20061
4,1,11.135529
5,1,3.741657
6,1,8.602325


In [9]:
# Users closest to a given user

def most_near(user_id):
  """Return the rows of ``all_users_dist(user_id)`` ordered by ascending distance.

  Rows with a missing value (users without a computed distance) are removed.
  """
  distances = all_users_dist(user_id)
  ranked = distances.sort_values('Distance')
  return ranked.dropna()

In [10]:
# Preview: the users closest to user 1 (smallest distance first)
most_near(1).head()

Unnamed: 0_level_0,Principal Id,Distance
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
77,1,0.0
511,1,0.5
366,1,0.707107
9,1,1.0
49,1,1.0


In [11]:
# Recommend movies based on the nearest users

def recommend_movies(user_id, k = 100, m = 10):
    """Recommend up to ``m`` movie titles for ``user_id``.

    Takes the ``k`` nearest users reported by ``most_near``, collects each
    one's ``m`` highest-rated movies and ranks the movies by the number of
    those top lists they appear in, breaking ties by mean rating. Movies the
    target user has already rated are never recommended.

    Parameters
    ----------
    user_id : user label present in the index of ``df_ratings``.
    k : maximum number of nearest users to consult.
    m : length of each user's top list, and maximum number of titles returned.

    Returns
    -------
    list of titles looked up in ``df_movies``, best candidate first.
    """
    # most_near() is ordered by ascending distance. The target user is
    # filtered out explicitly so it can never occupy one of the k slots.
    neighbours = most_near(user_id).index
    near_users = neighbours[neighbours != user_id][:k]

    # One small frame per user, concatenated once at the end. The target
    # user's own top list is tallied as well; every movie they rated is
    # removed from the candidates below, so it never reaches the result.
    top_lists = []
    for uid in [user_id, *near_users]:
        # .loc[[uid]] always yields a DataFrame, even for a single-rating user.
        top = df_ratings.loc[[uid]].sort_values('rating', ascending = False).head(m)
        top_lists.append(pd.DataFrame({
            'user': uid,
            'movies': top['movieId'].to_numpy(),
            'rates': top['rating'].to_numpy(),
        }))
    data = pd.concat(top_lists).set_index('user')

    # How many top lists each movie appears in.
    counts = data['movies'].value_counts().to_frame('rates_counts').sort_values('rates_counts', ascending = False)

    already_rated = user_grade(user_id).index
    ranked = (
        counts.loc[~counts.index.isin(already_rated)]
        # The assigned Series is aligned on the movie id index.
        .assign(mean_rates = data.groupby('movies')['rates'].mean())
        .sort_values(['rates_counts', 'mean_rates'], ascending = False)
        .head(m)
    )

    return df_movies.loc[ranked.index, 'title'].tolist()

In [12]:
# Simulate the recommendations for a brand-new user.
# Each entry is [movieId, rating].
# NOTE(review): 723998 looks like it may be a typo (e.g. for 72998) - confirm
# that this id exists in movies.csv.

data = (
    [102084, 5],
    [153, 4],
    [3793, 4],
    [5349, 5],
    [33794, 5],
    [60069, 4],
    [2706, 4],
    [723998, 4.5],
    [59315, 5],
)

def new_user(data):
    """Return a copy of ``df_ratings`` with one extra user appended.

    ``data`` is a sequence of ``[movieId, rating]`` pairs. The new user gets
    the id ``df_ratings.index.max() + 1``. The module-level ``df_ratings`` is
    not modified; columns missing from ``data`` (such as ``timestamp``) are
    left empty for the new rows.
    """
    next_user_id = df_ratings.index.max()+1

    added_rows = (
        pd.DataFrame(data, columns=['movieId', 'rating'])
        .assign(userId=next_user_id)
        .set_index('userId')
    )

    return pd.concat([df_ratings, added_rows])

In [13]:
# Last rows of the ratings table before the simulated user is appended
df_ratings.tail()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
610,166534,4.0,1493848402
610,168248,5.0,1493850091
610,168250,5.0,1494273047
610,168252,5.0,1493846352
610,170875,3.0,1493846415


In [14]:
# Append the simulated user to the ratings table and show the new last rows.
# NOTE: this rebinds df_ratings, so re-running the cell appends the same
# ratings again under the next free user id.
df_ratings = new_user(data)
df_ratings.tail()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
611,33794,5.0,
611,60069,4.0,
611,2706,4.0,
611,723998,4.5,
611,59315,5.0,


In [15]:
# Recommendations for the simulated user (id 611 in the tail() output above)
recommend_movies(611)

['Toy Story (1995)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Dark Knight, The (2008)',
 'Shawshank Redemption, The (1994)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'Lord of the Rings: The Fellowship of the Ring, The (2001)',
 'American History X (1998)',
 'Fight Club (1999)',
 'Back to the Future (1985)',
 'Monty Python and the Holy Grail (1975)']