In [1]:
import pandas as pd
import json

In [3]:
# Relative prefix that climbs from this notebook's folder to the level the
# config and data paths are resolved against (the notebook's depth in the project).
# It is prepended to the config file location and to every CSV path read from it.
file_depth = '../..'

In [4]:
# Read the dataset configuration (JSON) found under the file_depth prefix.
with open(f'{file_depth}/config/data_1m_config.json') as config_file:
    config = json.load(config_file)

# Pull out the locations of the three source CSVs listed in the config.
movies_data_path, ratings_data_path, users_data_path = (
    config[key]
    for key in (
        'original_csv_movies_data',
        'original_csv_ratings_data',
        'original_csv_users_data',
    )
)

In [5]:
# Movie table: the configured CSV path is appended to the file_depth prefix.
movies = pd.read_csv(f'{file_depth}{movies_data_path}')
movies

Unnamed: 0,movie_id,title,genres,movie_year
0,1,Toy Story (1995),Animation|ForChildren|Comedy,1995
1,2,Jumanji (1995),Adventure|ForChildren|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995
...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,2000
3879,3949,Requiem for a Dream (2000),Drama,2000
3880,3950,Tigerland (2000),Drama,2000
3881,3951,Two Family House (2000),Drama,2000


In [6]:
# Ratings table: the configured CSV path is appended to the file_depth prefix.
ratings = pd.read_csv(f'{file_depth}{ratings_data_path}')
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [7]:
# Users table: the configured CSV path is appended to the file_depth prefix.
users = pd.read_csv(f'{file_depth}{users_data_path}')
users

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,1060


In [8]:
# Collect every distinct genre label, in order of first appearance.
# Each movie stores its genres as one '|'-separated string, so split it into a
# list and explode to one label per row. The explicit dropna() keeps missing
# values out of the result; the previous split(expand=True).stack() form relied
# on stack() silently discarding the None padding of movies with fewer genres,
# which the newer stack implementation no longer does.
unique_genres = movies['genres'].str.split('|').explode().dropna().unique()

In [9]:
# Show the genre labels, then how many distinct ones were found.
genre_total = len(unique_genres)
print(unique_genres)
print('Number of unique genres: ', genre_total)

['Animation' 'ForChildren' 'Comedy' 'Adventure' 'Fantasy' 'Romance'
 'Drama' 'Action' 'Crime' 'Thriller' 'Horror' 'Sci-Fi' 'Documentary' 'War'
 'Musical' 'Mystery' 'Film-Noir' 'Western']
Number of unique genres:  18


------

In [10]:
# Keep only the rows whose rating is 4 or 5.
is_high_rating = ratings['rating'].isin([4, 5])
filtered_ratings = ratings[is_high_rating]
filtered_ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
3,1,3408,4,978300275
4,1,2355,5,978824291
6,1,1287,5,978302039
7,1,2804,5,978300719
...,...,...,...,...
1000202,6040,1089,4,956704996
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [11]:
# Attach each rated movie's genres to the high ratings, then discard the
# columns that are not needed for the per-user genre counts.
ratings_with_genres = (
    filtered_ratings
    .merge(movies, how='inner', on='movie_id')
    .drop(columns=['timestamp', 'title', 'movie_year'])
)
ratings_with_genres

Unnamed: 0,user_id,movie_id,rating,genres
0,1,1193,5,Drama
1,2,1193,5,Drama
2,12,1193,4,Drama
3,15,1193,4,Drama
4,17,1193,5,Drama
...,...,...,...,...
575276,5851,3607,5,Comedy|Drama|Western
575277,5854,3026,4,Horror
575278,5938,2909,4,Documentary
575279,5948,1360,5,Drama


In [15]:
# Group the rating/genre rows by user so each user's rows can be iterated together.
grouped_data = ratings_with_genres.groupby('user_id')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000018D12C0E3B0>

In [13]:
# Build, for every user, a mapping genre -> number of that user's rated movies
# (rows of their group) that carry the genre. Every genre in unique_genres gets
# an entry, including those with a count of zero.
user_vectors = {}

for user_id, group in grouped_data:
    # Split each movie's '|'-separated genre string once per user (not once per
    # genre); sets make the membership test below cheap and count a movie at
    # most once per genre.
    genre_sets = [set(genres.split('|')) for genres in group['genres']]

    # sum() over booleans yields a plain Python int, which json can serialise.
    user_vectors[user_id] = {
        genre: sum(genre in movie_genres for movie_genres in genre_sets)
        for genre in unique_genres
    }

In [14]:
# Persist the per-user genre counts as JSON (dictionary keys are written as
# strings, as JSON requires).
output_path = 'rating_user_data/user_category.json'
with open(output_path, 'w') as out_file:
    json.dump(user_vectors, out_file)