In [1]:
import pandas as pd
import json

In [2]:
# import numpy as np
# from joblib import dump

In [3]:
# How deep this file sits inside the project, expressed as a relative path.
# It is prepended to the config file path and to the data paths read from it.
file_depth = '../..'

In [4]:
# Load the dataset configuration. The JSON file stores the locations of the
# three source CSV files.
# The encoding is given explicitly so that parsing does not depend on the
# platform's locale default.
with open(file_depth + '/config/data_1m_config.json', encoding='utf-8') as config_file:
    config = json.load(config_file)

# CSV locations exactly as stored in the config; they are appended directly
# to file_depth when the files are read.
movies_data_path = config['original_csv_movies_data']
ratings_data_path = config['original_csv_ratings_data']
users_data_path = config['original_csv_users_data']

In [5]:
# Read the movies table from the CSV location named in the config, then
# display it as the cell result.
movies = pd.read_csv(filepath_or_buffer=file_depth + movies_data_path)
movies

Unnamed: 0,movie_id,title,genres,movie_year
0,1,Toy Story (1995),Animation|ForChildren|Comedy,1995
1,2,Jumanji (1995),Adventure|ForChildren|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995
...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,2000
3879,3949,Requiem for a Dream (2000),Drama,2000
3880,3950,Tigerland (2000),Drama,2000
3881,3951,Two Family House (2000),Drama,2000


In [6]:
# Read the ratings table from the CSV location named in the config, then
# display it as the cell result.
ratings = pd.read_csv(filepath_or_buffer=file_depth + ratings_data_path)
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [7]:
# Read the users table from the CSV location named in the config, then
# display it as the cell result.
users = pd.read_csv(filepath_or_buffer=file_depth + users_data_path)
users

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,1060


In [8]:
# Collect every distinct genre label, in order of first appearance.
# Each 'genres' value is a '|'-separated string: split it into a list,
# explode() to one row per label, and drop rows that carried no genre string.
# This avoids split(expand=True).stack(), which depends on stack() implicitly
# discarding the None padding cells (behaviour tied to its legacy dropna
# default).
unique_genres = movies['genres'].str.split('|').explode().dropna().unique()

In [9]:
# Show the genre labels, then how many distinct labels were found.
print(unique_genres)
print('Number of unique genres: ', len(unique_genres))

['Animation' 'ForChildren' 'Comedy' 'Adventure' 'Fantasy' 'Romance'
 'Drama' 'Action' 'Crime' 'Thriller' 'Horror' 'Sci-Fi' 'Documentary' 'War'
 'Musical' 'Mystery' 'Film-Noir' 'Western']
Number of unique genres:  18


In [10]:
# Build one DataFrame per genre, holding the movies tagged with that genre.
#
# Membership is tested against the exact '|'-separated tokens rather than
# with str.contains(), which interprets the genre as a regular expression
# and also matches substrings of longer labels.
movie_genre_lists = movies['genres'].str.split('|')

genre_dataframes = {}

# Iterate through unique genres and filter the original DataFrame
for genre in unique_genres:
    has_genre = movie_genre_lists.apply(lambda tokens: genre in tokens)
    genre_dataframes[genre] = movies[has_genre]

In [11]:
# For each genre, print a header with the number of movies in it, followed
# by the first rows of that genre's table (id, title and year only).
for genre in unique_genres:
    genre_df = genre_dataframes.get(genre)
    if genre_df is None:
        # No table was stored for this genre; nothing to preview.
        continue

    movie_count = genre_df.shape[0]
    preview = genre_df[['movie_id', 'title', 'movie_year']].head()

    print('----- ', genre, ' (' , movie_count, ') -----')
    print(preview)
    print('')

-----  Animation  ( 105 ) -----
     movie_id                    title  movie_year
0           1         Toy Story (1995)        1995
12         13             Balto (1995)        1995
47         48        Pocahontas (1995)        1995
236       239    Goofy Movie, A (1995)        1995
241       244  Gumby: The Movie (1995)        1995

-----  ForChildren  ( 251 ) -----
    movie_id                title  movie_year
0          1     Toy Story (1995)        1995
1          2       Jumanji (1995)        1995
7          8  Tom and Huck (1995)        1995
12        13         Balto (1995)        1995
33        34          Babe (1995)        1995

-----  Comedy  ( 1200 ) -----
   movie_id                               title  movie_year
0         1                    Toy Story (1995)        1995
2         3             Grumpier Old Men (1995)        1995
3         4            Waiting to Exhale (1995)        1995
4         5  Father of the Bride Part II (1995)        1995
6         7         

# ------

In [12]:
# ratings of 4 or 5
filtered_ratings = ratings[(ratings['rating'] == 4) | (ratings['rating'] == 5)]
filtered_ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
3,1,3408,4,978300275
4,1,2355,5,978824291
6,1,1287,5,978302039
7,1,2804,5,978300719
...,...,...,...,...
1000202,6040,1089,4,956704996
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [13]:
# Attach each movie's columns (title, genres, year) to the kept ratings.
# The inner join drops ratings whose movie_id has no row in `movies`.
ratings_with_genres = pd.merge(
    filtered_ratings,
    movies,
    how='inner',
    on='movie_id',
)
ratings_with_genres

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,movie_year
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,1975
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama,1975
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama,1975
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama,1975
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama,1975
...,...,...,...,...,...,...,...
575276,5851,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western,1973
575277,5854,3026,4,958346883,Slaughterhouse (1987),Horror,1987
575278,5938,2909,4,957273353,"Five Wives, Three Secretaries and Me (1998)",Documentary,1998
575279,5948,1360,5,1016563709,Identification of a Woman (Identificazione di ...,Drama,1982


In [14]:
# Group the joined ratings per user; iterating yields (user_id, rows) pairs.
grouped_data = ratings_with_genres.groupby(by='user_id')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000029D8356F910>

In [23]:
# For every user, map each genre to the ids of the movies in that genre that
# appear in the user's group of rows. Every genre is present as a key; its
# list is empty when the user has no rows for that genre.
user_vectors = {}

for user_id, group in grouped_data:
    # The split does not depend on the genre being tested, so it is done
    # once per user instead of once per (user, genre) pair.
    split_genres = group['genres'].str.split('|')

    user_genre_vectors = {}
    for genre in unique_genres:
        # Exact token match against the movie's list of genre labels.
        is_genre_in_movie = split_genres.apply(lambda x: genre in x)
        user_genre_vectors[genre] = group.loc[is_genre_in_movie, 'movie_id'].tolist()

    user_vectors[user_id] = user_genre_vectors

In [31]:
# Write the per-user genre vectors as JSON to a file in the current working
# directory. JSON object keys are always strings, so non-string dictionary
# keys are written in their string form.
with open('user_vectors2.json', mode='w') as output_file:
    json.dump(user_vectors, output_file)