# Sistema de Recomendação por Filtragem Colaborativa

In [1]:
import numpy as np
import pandas as pd

In [2]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

users = pd.read_csv('data/movielens/u.user', sep='|', names=u_cols, encoding='latin-1')

In [3]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
i_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Sci-Fi', 'Thriller', 'War', 'Western']

movies = pd.read_csv('data/movielens/u.item', sep='|', names=i_cols, encoding='latin-1')
movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Sci-Fi,Thriller,War,Western
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1682 entries, 1 to 1682
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie_id            1682 non-null   object 
 1   title               1681 non-null   object 
 2   release_date        0 non-null      float64
 3   video_release_date  1679 non-null   object 
 4   IMDb_URL            1682 non-null   int64  
 5   unknown             1682 non-null   int64  
 6   Action              1682 non-null   int64  
 7   Adventure           1682 non-null   int64  
 8   Animation           1682 non-null   int64  
 9   Children's          1682 non-null   int64  
 10  Comedy              1682 non-null   int64  
 11  Crime               1682 non-null   int64  
 12  Documentary         1682 non-null   int64  
 13  Drama               1682 non-null   int64  
 14  Fantasy             1682 non-null   int64  
 15  Film-Noir           1682 non-null   int64  
 16  Horror

In [6]:
movies = movies[['movie_id', 'title']]

In [7]:
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

ratings = pd.read_csv('data/movielens/u.data', sep='\t', names=r_cols, encoding='latin-1')
ratings.head(1)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949


In [8]:
ratings.drop(columns=['timestamp'], inplace=True)

In [9]:
# configuração de treino e teste
from sklearn.model_selection import train_test_split

# X ratings originais e y dataframe e y user_id
X = ratings.copy()
y = ratings['user_id']

# Quebra em treino e teste, estratificando em user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

In [10]:
from sklearn.metrics import mean_squared_error

# Computa o RMSE
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [11]:
def baseline(user_id, movie_id):
    return 3.0

In [12]:
def score(cf_model):

    id_pairs = zip(X_test['user_id'], X_test['movie_id'])

    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs])

    y_true = np.array(X_test['rating'])

    return rmse(y_true, y_pred)

In [13]:
score(baseline)

1.2470926188539486

In [14]:
r_matrix = X_train.pivot_table(values='rating', index='user_id', columns='movie_id')

r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1670,1671,1673,1674,1675,1676,1679,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,


## Avaliação **média** por filme

In [15]:
def cf_user_mean(user_id, movie_id):

    if movie_id in r_matrix:
        mean_rating = r_matrix[movie_id].mean()
    else:
        mean_rating = 3.0

    return mean_rating

In [16]:
score(cf_user_mean)

1.0234701463131335

## Média ponderada

$r_{u, m} = \frac{\sum_{u', u' \neq u} sim(u, u') . r_{u', m}}{\sum_{u', u' \neq u} |sim(u, u')|}$

In [17]:
r_matrix_dummy = r_matrix.copy().fillna(0)

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

In [19]:
consine_sim = pd.DataFrame(cosine_sim, index=r_matrix_dummy.index, columns=r_matrix_dummy.index)
consine_sim.head(10)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.118076,0.029097,0.011628,0.264677,0.312419,0.308729,0.224269,0.026017,0.286411,...,0.308475,0.055872,0.197862,0.131367,0.152449,0.084456,0.293293,0.056765,0.103536,0.326491
2,0.118076,1.0,0.099097,0.10768,0.034279,0.152789,0.086705,0.078864,0.06894,0.092399,...,0.086927,0.259636,0.289092,0.318824,0.149105,0.186347,0.168034,0.106748,0.136796,0.080358
3,0.029097,0.099097,1.0,0.252131,0.026893,0.062539,0.039767,0.089474,0.078162,0.03767,...,0.040918,0.019031,0.065417,0.055373,0.086503,0.018418,0.096993,0.109631,0.092574,0.018987
4,0.011628,0.10768,0.252131,1.0,0.0,0.045543,0.078812,0.095354,0.059498,0.053879,...,0.024226,0.050703,0.056561,0.107294,0.098892,0.0,0.1329,0.142798,0.097066,0.015176
5,0.264677,0.034279,0.026893,0.0,1.0,0.202843,0.299619,0.163724,0.038474,0.153021,...,0.262547,0.048524,0.048312,0.022202,0.09191,0.066,0.156172,0.115842,0.124297,0.267574
6,0.312419,0.152789,0.062539,0.045543,0.202843,1.0,0.375963,0.131795,0.110944,0.400758,...,0.287549,0.080312,0.162988,0.182856,0.114262,0.09209,0.261859,0.097606,0.206104,0.187637
7,0.308729,0.086705,0.039767,0.078812,0.299619,0.375963,1.0,0.211282,0.107795,0.328923,...,0.290002,0.07417,0.094619,0.084235,0.11562,0.100625,0.233843,0.039199,0.224227,0.296332
8,0.224269,0.078864,0.089474,0.095354,0.163724,0.131795,0.211282,1.0,0.03704,0.183375,...,0.165008,0.066843,0.058766,0.068759,0.087159,0.129381,0.188662,0.121223,0.08391,0.273238
9,0.026017,0.06894,0.078162,0.059498,0.038474,0.110944,0.107795,0.03704,1.0,0.155435,...,0.011708,0.0,0.10171,0.034568,0.045002,0.052699,0.107486,0.055766,0.070065,0.088281
10,0.286411,0.092399,0.03767,0.053879,0.153021,0.400758,0.328923,0.183375,0.155435,1.0,...,0.278558,0.04931,0.153506,0.065471,0.060088,0.033686,0.197107,0.085402,0.118945,0.162538


In [20]:
def cf_user_wmean(user_id, movie_id):
    if (movie_id in r_matrix) & (user_id in cosine_sim):
        # Obtem a similiariedade do usuário com os outros
        sim_scores = pd.Series(cosine_sim[user_id])

        # Obtem a avaliação do usuário para o filme
        m_ratings = r_matrix_dummy[movie_id]

        # Extrai os indices que contem nan
        idx = m_ratings[m_ratings.isnull()].index

        m_ratings = m_ratings.dropna()

        # # dropa os escores de cosseno correspondentes
        
        sim_scores = sim_scores.drop(idx)

        # # computa a média poderada final
        wmean_ratings = np.dot(sim_scores, m_ratings) / sim_scores.sum()
    else:
        wmean_ratings = 3.0

    return wmean_ratings

In [21]:
score(cf_user_wmean)

1.2583816989695245

## Demographics


In [22]:
merged_df = pd.merge(X_train, users)
merged_df.head(3)

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,889,684,2,24,M,technician,78704
1,889,279,2,24,M,technician,78704
2,889,29,3,24,M,technician,78704


In [23]:
gerder_mean = merged_df[['movie_id', 'sex', 'rating']].groupby(['movie_id', 'sex'])['rating'].mean()
gerder_mean

movie_id  sex
1         F      3.827586
          M      3.918919
2         F      3.230769
          M      3.228916
3         F      2.785714
                   ...   
1675      M      3.000000
1676      M      2.000000
1679      M      3.000000
1681      M      3.000000
1682      M      3.000000
Name: rating, Length: 3048, dtype: float64

In [24]:
users = users.set_index('user_id')
users

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
...,...,...,...,...
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209


In [25]:
def cf_gender(user_id, movie_id):

    if movie_id in r_matrix:

        gender = users.loc[user_id]['sex']

        if gender in gerder_mean[movie_id]:
            gender_rating = gerder_mean[movie_id][gender]
        else:
            gender_rating = 3.0
    else:
        gender_rating = 3.0

    return gender_rating

In [26]:
score(cf_gender)

1.0330308800874282

## Model Based

### Rede Neural

In [30]:
!pip install tensorflow --user



In [32]:
import tensorflow as tf
from zipfile import ZipFile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import get_file

In [33]:
URL = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
movielens_path = get_file('movielens.zip', URL, extract=True)

Downloading data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip


In [34]:
with ZipFile(movielens_path) as z:
    with z.open('ml-latest-small/ratings.csv') as f:
        df = pd.read_csv(f)
print(df.shape)
df.head(3)

(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [35]:
# Processando os ids dos usuários - padronização
user_ids = df['userId'].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
user_encoded2user = {i:x for i, x in enumerate(user_ids)}
df['user'] = df['userId'].map(user2user_encoded)
num_users = len(user_encoded2user)

In [37]:
# Processando os ids dos Filmes - padronização
movie_ids = df['movieId'].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
df['movie'] = df['movieId'].map(movie2movie_encoded)
num_movies = len(movie_encoded2movie)

In [38]:
print('Number of users: ', num_users, '\nNumber of Movies: ', num_movies)

Number of users:  610 
Number of Movies:  9724


In [40]:
# normalizar as avaliações com minmaz (eficiência)

min, max = df['rating'].min(), df['rating'].max()
df['rating'] = df['rating'].apply(lambda x: (x-min)/(max-min))
df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,user,movie
0,1,1,0.777778,964982703,0,0
1,1,3,0.777778,964981247,0,1
2,1,6,0.777778,964982224,0,2


In [41]:
# Definição do X e y para treino e teste
X = df[['user', 'movie']].values
y = df['rating'].values

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42)

In [43]:
print('Shape of X_train ', X_train.shape)
print('Shape of X_test ', X_test.shape)
print('Shape of y_train ', y_train.shape)
print('Shape of y_test ', y_test.shape)

Shape of X_train  (90752, 2)
Shape of X_test  (10084, 2)
Shape of y_train  (90752,)
Shape of y_test  (10084,)


In [53]:
class RecommenderNet(tf.keras.Model):
    # __init__ function is to initialize the values of instance members for the new object

    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super(RecommenderNet, self).__init__(**kwargs)

        # Variables for embedding size
        self.embedding_size = embedding_size
        self.user_embedding = Embedding(
            num_users,
            embedding_size, 
            embeddings_initializer='he_normal',embeddings_regularizer=tf.keras.regularizers.l2(1e-6))     
        self.user_bias = Embedding(num_users, 1)
        # Variables for movie count, and related weigths and biases

        self.movie_embedding = Embedding(
            num_movies,
            embedding_size,
            embeddings_initializer='he_normal',
            embeddings_regularizer=tf.keras.regularizers.l2(1e-6))
        self.movie_bias = Embedding(num_movies, 1)

    def call(self, inputs):
        # call functions is for the dot products
        # of user and movie vectors
        # it also accepts the inputs, feed the into the layers, 
        # and feed into the final sigmoid layer
        # User vector and bias values with input values
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:,0])

        # Movie vector and bias values with input values
        movie_vector = self.movie_embedding(inputs[:,1])
        movie_bias = self.movie_bias(inputs[:,1])
        # tf.tensordot calculates the dot products
        dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
        # add all the components (including bias)
        x = dot_user_movie + user_bias + movie_bias
        return tf.nn.sigmoid(x)



In [54]:
model = RecommenderNet(num_users, num_movies, embedding_size=50)

In [55]:
model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr=0.001))

In [56]:
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    verbose=1,
    validation_data=(X_test, y_test)
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [57]:
user_id = df['userId'].sample(1).iloc[0]
print('The selected user ID is: ', user_id)

The selected user ID is:  599


In [60]:
# filtrar filmes que o usuário já assistiu
movies_watched = df[df['userId'] == user_id]
not_watched = df[~df['movieId'].isin(movies_watched['movieId'].values)]['movieId'].unique()
not_watched = [[movie2movie_encoded.get(x)] for x in not_watched]
print('The number of movies the user has not see: ', len(not_watched))

The number of movies the user has not see:  7246


In [63]:
# Obtem id atualizado do usuario e prediz os filmes para ele
user_encoder = user2user_encoded.get(user_id)
user_movie_array = np.hstack(
    ([[user_encoder]] * len(not_watched), not_watched))
ratings = model.predict(user_movie_array).flatten()

In [65]:
# Ordena itense retorna indices dos filmes
top_10_indices = ratings.argsort()[-10:][::-1]

In [66]:
# pega o id origin do filme
recommended_movie_ids = [movie_encoded2movie.get(not_watched[x][0]) for x in top_10_indices]

In [67]:
with ZipFile(movielens_path) as z:
    with z.open('ml-latest-small/movies.csv') as f:
        movie_df = pd.read_csv(f)
movie_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [68]:
# resgatar os nomes dos filmes que o usuário assistiu com valores altos de avaliações
top_movies_user = (
    movies_watched.sort_values(by='rating', ascending=False).head(10)['movieId'].values)

movie_df_rows = movie_df[movie_df['movieId'].isin(top_movies_user)]

print('Movies with high ratings for user')
movie_df_rows[['title', 'genres']]

Movies with high ratings for user


Unnamed: 0,title,genres
596,Ghost in the Shell (Kôkaku kidôtai) (1995),Animation|Sci-Fi
602,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
706,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi
731,His Girl Friday (1940),Comedy|Romance
883,Paths of Glory (1957),Drama|War
982,High Noon (1952),Drama|Western
2382,Magnolia (1999),Drama
2568,Double Indemnity (1944),Crime|Drama|Film-Noir
4529,Lost in Translation (2003),Comedy|Drama|Romance
4615,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller


In [72]:
# top 10 que a filtragem colaborativa recomendaria
recommeded_movies = movie_df[movie_df['movieId'].isin(recommended_movie_ids)]
print('Top 10 movies recommendations')
recommeded_movies[['title', 'genres']]

Top 10 movies recommendations


Unnamed: 0,title,genres
413,In the Name of the Father (1993),Drama
461,Schindler's List (1993),Drama|War
792,"Sound of Music, The (1965)",Musical|Romance
796,Secrets & Lies (1996),Drama
896,One Flew Over the Cuckoo's Nest (1975),Drama
975,Cool Hand Luke (1967),Drama
1762,"Celebration, The (Festen) (1998)",Drama
6922,Gran Torino (2008),Crime|Drama
7515,Limitless (2011),Sci-Fi|Thriller
8274,Captain Phillips (2013),Adventure|Drama|Thriller|IMAX
