In [61]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM

                                                      seq  len_seq    next
0       [7261, 7261, 7261, 7261, 7261, 7261, 7261, 726...        1  4682.0
1       [4682.0, 7261, 7261, 7261, 7261, 7261, 7261, 7...        1  3638.0
2       [4682.0, 3638.0, 7261, 7261, 7261, 7261, 7261,...        2   107.0
3       [4682.0, 3638.0, 107.0, 7261, 7261, 7261, 7261...        3   102.0
4       [4682.0, 3638.0, 107.0, 102.0, 7261, 7261, 726...        4   125.0
...                                                   ...      ...     ...
604817  [7130.0, 3313.0, 692.0, 3051.0, 4047.0, 5180.0...        7  3376.0
604818  [7130.0, 3313.0, 692.0, 3051.0, 4047.0, 5180.0...        8  3383.0
604819  [7130.0, 3313.0, 692.0, 3051.0, 4047.0, 5180.0...        9  3145.0
604820  [7130.0, 3313.0, 692.0, 3051.0, 4047.0, 5180.0...       10  4390.0
604821  [3313.0, 692.0, 3051.0, 4047.0, 5180.0, 4255.0...       10  5208.0

[604822 rows x 3 columns]


In [2]:
from tqdm import tqdm
import pandas as pd

class MoviesDataBuilder:
    """Turn a ratings CSV and a movie-metadata CSV into next-movie samples.

    The two files are joined on ``movieId`` and only ratings of 4.0 or more
    are kept. ``build_user`` collects each user's most recent liked movies,
    and ``build_dataset`` turns those histories into fixed-length rows with
    the columns ``seq``, ``len_seq``, ``target`` and ``genres``.
    """

    def __init__(self, rating_path='Recommender/movie_dataset/rating.csv', meta_data_path='Recommender/movie_dataset/movie.csv', number_of_users=None):
        """Load both CSV files, merge them and keep the liked movies.

        Args:
            rating_path: CSV of ratings. The code reads its ``userId``,
                ``movieId``, ``rating`` and ``timestamp`` columns.
            meta_data_path: CSV of movie metadata. The code reads its
                ``movieId`` and ``genres`` columns.
            number_of_users: If not ``None``, ``build_user`` only processes
                the first ``number_of_users`` user ids.
        """
        self.rating_path = rating_path
        self.meta_data_path = meta_data_path
        self.number_of_users = number_of_users

        self.df_rating = pd.read_csv(self.rating_path)
        self.df_meta = pd.read_csv(self.meta_data_path)

        self.df_rating = pd.merge(self.df_rating, self.df_meta, on='movieId')

        # The user list is taken BEFORE the rating filter, so users without any
        # rating >= 4.0 stay in it and simply end up with an empty history.
        self.users = self.df_rating.userId.unique()
        self.df_rating = self.df_rating[self.df_rating.rating >= 4.0]

    def build_user(self, history_size=11):
        """Collect the most recent liked movies (and their genres) per user.

        Fills two parallel lists with one entry per user in ``self.users``:

        * ``self.users_pref``: the last ``history_size`` liked movie ids in
          ascending ``timestamp`` order (empty list when the user has none).
        * ``self.movies_genres``: the genres of every movie except the last
          one, encoded as ``"g1,g2|g3|..."`` (``|`` separates movies, ``,``
          separates the genres of one movie).

        Args:
            history_size: Number of most recent liked movies kept per user.
                The last one becomes the target in ``build_dataset``, so the
                default of 11 yields at most 10 features.

        Raises:
            ValueError: If ``history_size`` is smaller than 1.
        """
        if history_size < 1:
            raise ValueError("history_size must be at least 1")

        if self.number_of_users is not None:
            self.users = self.users[:self.number_of_users]

        # One stable sort plus one group-by replaces re-filtering the whole
        # ratings frame for every single user. The stable sort keeps ratings
        # that share a timestamp in their original file order.
        liked = self.df_rating[self.df_rating.userId.isin(self.users)]
        recent = (
            liked.sort_values('timestamp', kind='mergesort')
                 .groupby('userId', sort=False)
                 .tail(history_size)
        )
        by_user = recent.groupby('userId', sort=False)
        movies_by_user = by_user['movieId'].agg(list).to_dict()
        genres_by_user = by_user['genres'].agg(list).to_dict()

        self.users_pref = []
        self.movies_genres = []

        for user in tqdm(self.users, desc='Building user preferences'):
            self.users_pref.append(movies_by_user.get(user, []))

            # The last item is the target, so the genres must not include its
            # genre to prevent a data leak.
            seen_genres = genres_by_user.get(user, [])[:-1]
            self.movies_genres.append(
                "|".join(item.replace('|', ',') for item in seen_genres)
            )

    def build_dataset(self, max_length_size=10, output_path='Recommender/movie_dataset/processed_dataset.csv'):
        """Build ``self.dataset`` from the histories collected by ``build_user``.

        For every user with at least two liked movies, the last movie becomes
        ``target`` and the preceding ones become ``seq``. Sequences shorter
        than ``max_length_size`` are padded by repeating their last movie id;
        longer ones keep only their most recent ``max_length_size`` movies
        (and the matching genres). ``len_seq`` stores the unpadded length.

        Args:
            max_length_size: Fixed length of every ``seq`` list.
            output_path: Where the dataset is written as CSV. Pass ``None``
                to skip writing.

        Raises:
            ValueError: If ``max_length_size`` is smaller than 1.
        """
        if max_length_size < 1:
            raise ValueError("max_length_size must be at least 1")

        features = []
        target = []
        len_seq = []
        meta_data = []

        # Passing total= lets tqdm show real progress for the enumerate() iterator.
        histories = tqdm(enumerate(self.users_pref), total=len(self.users_pref), desc='Building dataset')
        for index, movies_set in histories:
            if len(movies_set) < 2:
                # Need at least one feature and one target.
                continue

            # Slicing copies the list, so self.users_pref is never mutated.
            feature_vector = movies_set[:-1][-max_length_size:]
            seq_length = len(feature_vector)

            # Keep the genre entries aligned with the (possibly truncated) features.
            genres_per_movie = self.movies_genres[index].split('|')[-max_length_size:]

            # Pad by repeating the most recent feature item.
            feature_vector.extend([feature_vector[-1]] * (max_length_size - seq_length))

            assert len(feature_vector) == max_length_size, f"All of the rows must include {max_length_size} items"
            features.append(feature_vector)
            len_seq.append(seq_length)
            target.append(movies_set[-1])
            meta_data.append("|".join(genres_per_movie))

        self.dataset = pd.DataFrame({
                                    'seq': features,
                                    'len_seq': len_seq,
                                    'target': target,
                                    'genres': meta_data
                                })

        if output_path is not None:
            self.dataset.to_csv(output_path, index=False)


In [3]:
# Instantiate the builder with its default CSV paths; the constructor reads and
# merges the rating and movie files and keeps only ratings >= 4.0.
data_builder = MoviesDataBuilder()

In [None]:
# Collect every user's most recent liked movies and the genres of all but the last one.
data_builder.build_user()

In [None]:
# Build the fixed-length (seq, len_seq, target, genres) rows and write them to
# Recommender/movie_dataset/processed_dataset.csv.
data_builder.build_dataset()

In [32]:
# Load a pickled training frame; presumably kept as a layout reference for the
# dataset built above (columns seq / len_seq / next) - TODO confirm.
# NOTE(review): unpickling can execute arbitrary code, so only load trusted files.
template_data = pd.read_pickle('Recommender/data/ks/train_data.df')
template_data

Unnamed: 0,seq,len_seq,next
0,"[7261, 7261, 7261, 7261, 7261, 7261, 7261, 726...",1,4682.0
1,"[4682.0, 7261, 7261, 7261, 7261, 7261, 7261, 7...",1,3638.0
2,"[4682.0, 3638.0, 7261, 7261, 7261, 7261, 7261,...",2,107.0
3,"[4682.0, 3638.0, 107.0, 7261, 7261, 7261, 7261...",3,102.0
4,"[4682.0, 3638.0, 107.0, 102.0, 7261, 7261, 726...",4,125.0
...,...,...,...
604817,"[7130.0, 3313.0, 692.0, 3051.0, 4047.0, 5180.0...",7,3376.0
604818,"[7130.0, 3313.0, 692.0, 3051.0, 4047.0, 5180.0...",8,3383.0
604819,"[7130.0, 3313.0, 692.0, 3051.0, 4047.0, 5180.0...",9,3145.0
604820,"[7130.0, 3313.0, 692.0, 3051.0, 4047.0, 5180.0...",10,4390.0


In [33]:
# Display the DataFrame produced by build_dataset().
data_builder.dataset

Unnamed: 0,seq,len_seq,target,genres
0,"[7046, 2143, 2100, 4911, 6754, 2628, 3489, 738...",10,3889,"Comedy,Fantasy,Horror,Thriller|Adventure,Fanta..."
1,"[1544, 3930, 3923, 260, 541, 1748, 1196, 1214,...",10,3703,"Action,Adventure,Sci-Fi,Thriller|Adventure,Hor..."
2,"[2532, 1544, 2615, 329, 1831, 2668, 2808, 1882...",10,2034,"Action,Sci-Fi|Action,Adventure,Sci-Fi,Thriller..."
3,"[377, 586, 350, 368, 370, 594, 520, 733, 431, ...",10,489,"Action,Romance,Thriller|Children,Comedy|Drama,..."
4,"[1079, 594, 1035, 316, 1028, 1080, 1196, 1198,...",10,1136,"Comedy,Crime|Animation,Children,Drama,Fantasy,..."
...,...,...,...,...
138049,"[1203, 4973, 1207, 745, 1148, 2571, 5291, 1252...",10,2858,"Drama|Comedy,Romance|Drama|Animation,Children,..."
138050,"[110, 1563, 3255, 1061, 1683, 1794, 531, 85, 2...",10,1897,"Action,Drama,War|Drama|Comedy,Drama|Thriller|D..."
138051,"[1093, 4128, 3186, 2857, 2857, 2857, 2857, 285...",4,6874,"Drama|Comedy,Horror,Thriller|Drama|Adventure,A..."
138052,"[1307, 4963, 441, 6863, 471, 1220, 3039, 1968,...",10,2791,"Comedy,Romance|Crime,Thriller|Comedy|Comedy,Mu..."


## Assign a numeric ID to each genre

In [35]:
import random
from functools import lru_cache

import pandas as pd

In [3]:
# Reload the dataset from the path that MoviesDataBuilder.build_dataset() writes to.
df = pd.read_csv('Recommender/movie_dataset/processed_dataset.csv')

In [23]:
# Accumulator for the distinct genre names found in the dataset.
genres_set = set()

In [24]:
# Collect every distinct genre name from the 'genres' column. In that column
# '|' separates movies and ',' separates the genres of a single movie.
# The column is addressed by name rather than by position, so the loop no
# longer depends on the column order of the CSV.
# NOTE(review): this column excludes the target movie's genres, so a genre that
# only ever occurs on target movies would be missing from genres_set.
for genres in df['genres']:
    for sub_genre in genres.split('|'):
        genres_set.update(sub_genre.split(','))

In [57]:
# Sort before numbering: the iteration order of a set of strings is not stable
# across interpreter sessions, so numbering an unsorted list would give the
# genres different ids on every run.
genres_list = sorted(genres_set)
# (genre_id, genre) pairs with ids starting at 1.
genres_id = list(enumerate(genres_list, start=1))

In [61]:
# Persist the (genre_id, genre) pairs so later steps can look ids up from genres.csv.
pd.DataFrame(genres_id, columns=['genre_id', 'genre']).to_csv('Recommender/movie_dataset/genres.csv', index=False)

In [88]:
# Lookup tables used by encode_genres(): movie metadata (movieId -> genres)
# and the genre id table (genre -> genre_id).
movie_meta_data = pd.read_csv('Recommender/movie_dataset/movie.csv')
genres_meta_data = pd.read_csv('Recommender/movie_dataset/genres.csv')

@lru_cache(maxsize=None)
def _genre_id_for(genre):
    """Return the integer id of ``genre`` from ``genres_meta_data`` (first match)."""
    matches = genres_meta_data[genres_meta_data.genre == genre].genre_id.values
    if len(matches) == 0:
        raise KeyError(f"genre {genre!r} has no entry in genres_meta_data")
    return int(matches[0])


@lru_cache(maxsize=None)
def _genres_of_movie(movie_id):
    """Return the genres of ``movie_id`` from ``movie_meta_data`` as a tuple (first match)."""
    matches = movie_meta_data[movie_meta_data.movieId == movie_id].genres.values
    if len(matches) == 0:
        raise KeyError(f"movie {movie_id!r} has no entry in movie_meta_data")
    # A tuple keeps the cached value immutable.
    return tuple(matches[0].split('|'))


def encode_genres(row):
    """Add randomly sampled genre ids for the sequence and the target to ``row``.

    For every movie in ``row['genres']`` (movies separated by ``|``, genres of
    one movie by ``,``) one genre is drawn with ``random.choice`` and mapped to
    its id; the resulting list is stored in ``row['seq_genres']``. One genre of
    the target movie (looked up via ``row.target`` in ``movie_meta_data``) is
    drawn the same way and its id stored in ``row['target_genre']``.

    The DataFrame scans are memoised per genre / movie id, so each distinct key
    is searched only once instead of once per row.

    Args:
        row: A row of the processed dataset with ``genres`` and ``target`` fields.

    Returns:
        The same ``row`` object with ``seq_genres`` and ``target_genre`` added.

    Raises:
        KeyError: If a genre or the target movie is missing from the lookup frames.
    """
    genres = row['genres']
    row['seq_genres'] = [
        _genre_id_for(random.choice(sub_genre.split(',')))
        for sub_genre in genres.split('|')
    ]

    target_genre = random.choice(_genres_of_movie(row.target))
    row['target_genre'] = _genre_id_for(target_genre)
    return row
    

In [90]:
# encode_genres() samples genres with random.choice, so seed the stdlib RNG
# first; otherwise every run produces different seq_genres / target_genre values.
random.seed(42)
df = df.apply(encode_genres, axis=1)