In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler 
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

In [2]:
# Load the raw ratings table (comma-separated, pandas' default delimiter).
# The trailing expression displays (rows, columns).
df = pd.read_csv("dataset/ratings.csv")
df.shape

(25000095, 4)

In [3]:
# Drop the timestamp column; none of the later cells shown here read it.
# errors='ignore' keeps the cell re-runnable: `df` is rebound in place, so a
# second execution would otherwise raise KeyError on the already-missing column.
df = df.drop(columns='timestamp', errors='ignore')
df.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [4]:
# Per-user rating summary: mean, standard deviation and number of ratings.
# Named aggregation assigns the final column names in the same step.
user_stats = (
    df.groupby('userId')['rating']
    .agg(mean_rating='mean', std_rating='std', rating_count='count')
    .reset_index()
)
user_stats.head()

Unnamed: 0,userId,mean_rating,std_rating,rating_count
0,1,3.814286,1.004235,70
1,2,3.630435,1.457728,184
2,3,3.697409,0.599854,656
3,4,3.378099,1.116927,242
4,5,3.752475,0.931729,101


In [5]:
# Cluster users with DBSCAN on their standardised rating statistics.
# Work on a copy: a plain assignment only aliases `user_stats`, so the
# 'cluster' column would be written into it as well, and re-running this cell
# would feed the previous labels back into the scaler as an extra feature.
user_stats_temp = user_stats.copy()
scaler = StandardScaler()
user_X = scaler.fit_transform(user_stats.drop('userId', axis=1))
dbscan = DBSCAN(eps=0.5, min_samples=50)
user_stats_temp['cluster'] = dbscan.fit_predict(user_X)
# DBSCAN assigns the label -1 to noise points; show how many users got it.
user_stats_temp[user_stats_temp['cluster'] == -1].shape

(1805, 5)

In [6]:
# Mean silhouette coefficient of the DBSCAN labelling on the scaled features.
# NOTE(review): the labels still contain -1 (noise), so noise users are scored
# as if they formed a cluster of their own - confirm this is intended.
user_silhouette_avg = silhouette_score(user_X, labels=user_stats_temp['cluster'])
user_silhouette_avg

0.6952284842821462

In [7]:
# Keep only the ratings of users that DBSCAN did not label as noise (-1).
is_clustered = user_stats_temp['cluster'] != -1
included_users = user_stats_temp.loc[is_clustered, 'userId']
filtered_df = df.loc[df['userId'].isin(included_users)]
filtered_df.shape

(22441380, 3)

In [8]:
# Number of ratings per user (noise users already removed).
user_counts = filtered_df['userId'].value_counts()
# Number of ratings per movie, keeping only movies with more than 10 ratings.
# NOTE(review): this counts over the unfiltered `df`, unlike user_counts which
# uses `filtered_df` - confirm the asymmetry is intended.
movie_counts = df['movieId'].value_counts().loc[lambda counts: counts > 10]

In [9]:
# Inspect the per-movie rating counts: number of movies kept, then the first rows.
print(movie_counts.shape)
movie_counts.head()

(23357,)


movieId
356     81491
318     81482
296     79672
593     74127
2571    72674
Name: count, dtype: int64

In [10]:
# Inspect the per-user rating counts: number of users kept, then the first rows.
print(user_counts.shape)
user_counts.head()

(160736,)


userId
120184    1703
81942     1699
83073     1673
154484    1672
15555     1667
Name: count, dtype: int64

In [11]:
# value_counts() orders by descending count, so the first 1600 entries of each
# series are the most active users and the most rated movies.
top_users = user_counts.head(1600).index
top_movies = movie_counts.head(1600).index

# Restrict the ratings to the intersection of both selections.
user_mask = filtered_df['userId'].isin(top_users)
movie_mask = filtered_df['movieId'].isin(top_movies)
final_df = filtered_df[user_mask & movie_mask]
final_df.shape

(1068207, 3)

In [12]:
# Reshape the long ratings table into a wide user x movie matrix of ratings.
rating_matrix = final_df.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)

In [13]:
# Display the user x movie rating matrix built in the previous cell.
rating_matrix

movieId,1,2,3,5,6,7,9,10,11,12,...,166461,166528,168250,168252,171763,174055,176371,177593,177765,187593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
187,3.5,3.5,3.0,,,,,,,,...,,,,,,,,,,
426,2.5,,,,3.0,,,3.0,1.0,,...,,,0.5,,,,3.0,,,
653,5.0,3.5,,4.0,,4.0,,,4.0,,...,,2.5,,,,,,,,
757,3.0,3.0,,,,,,,,,...,,,,,,,,,,
803,5.0,,,,,,,,3.5,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161342,0.5,2.0,,,3.5,,1.0,3.5,,,...,4.0,3.0,,,2.5,4.5,4.0,4.5,,
161560,4.0,3.5,,,3.5,,,3.0,2.5,,...,,3.0,,3.0,3.5,,,4.0,,3.0
161675,3.5,2.5,,,5.0,,1.5,2.5,,,...,,,,3.0,,2.5,3.0,3.5,,3.0
161928,3.0,4.0,,,4.0,,,4.0,,,...,,,,,,,,,,


In [14]:
# Dense movie index: original movieId -> column position in rating_matrix.
new_movie_id = {
    old_movie_id: position
    for position, old_movie_id in enumerate(rating_matrix.columns.to_list())
}

In [15]:
# Dense user index: original userId -> row position in rating_matrix.
new_user_id = {
    old_user_id: position
    for position, old_user_id in enumerate(rating_matrix.index.to_list())
}

In [16]:
# Rewrite both id columns with the dense 0-based indices.
# .map() is a direct per-value dictionary lookup. .replace() with a 1600-entry
# dict whose keys and values overlap (e.g. movieId 1 -> 0, 2 -> 1) is slow and
# easy to misread as chained substitution.
# assign() builds a new frame instead of writing into a slice of filtered_df,
# which avoids SettingWithCopyWarning.
final_df = final_df.assign(
    userId=final_df['userId'].map(new_user_id),
    movieId=final_df['movieId'].map(new_movie_id),
)
# Every id in final_df is a key of the dictionaries (both were derived from a
# pivot of final_df), so a missing value here means the cell was run on
# already-remapped data.
assert final_df[['userId', 'movieId']].notna().all().all(), \
    "unmapped ids found - was this cell executed twice?"

In [17]:
# Check that remapping the ids did not change the number of rows or columns.
final_df.shape

(1068207, 3)

In [18]:
# Persist the remapped ratings as a semicolon-separated file without the index.
final_df.to_csv(
    'dataset/clean_ratings.csv',
    sep=';',
    index=False,
)

In [19]:
# Load the movie metadata (comma-separated, pandas' default delimiter).
# The trailing expression displays (rows, columns).
movies_df = pd.read_csv("dataset/movies.csv")
movies_df.shape

(62423, 3)

In [20]:
# Keep only the movies that received a dense index, then swap that index in.
# Filtering on new_movie_id (rather than top_movies) guarantees every kept row
# has a mapping. A top movie with no rating in final_df would otherwise keep
# its raw movieId, which could collide with a dense index.
# Sorting by the dense index makes row order equal to index order, which the
# positional title/genre lists built later in the notebook rely on.
movies_df = movies_df[movies_df['movieId'].isin(list(new_movie_id))]
movies_df = (
    movies_df
    .assign(movieId=movies_df['movieId'].map(new_movie_id))
    .sort_values('movieId')
)

In [21]:
# Check how many movies remain after restricting to the selected ones.
movies_df.shape

(1600, 3)

In [22]:
# Preview the movie metadata with the remapped movieId column.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,Jumanji (1995),Adventure|Children|Fantasy
2,2,Grumpier Old Men (1995),Comedy|Romance
4,3,Father of the Bride Part II (1995),Comedy
5,4,Heat (1995),Action|Crime|Thriller


In [23]:
# Collect every distinct genre label that appears in the selected movies.
genres_types = set()
for genre_field in movies_df['genres']:
    genres_types.update(genre_field.split('|'))

# Sort before numbering: a set of strings has no stable iteration order across
# interpreter runs (string hashing is randomised), so list(set) would hand out
# different genre ids every time the notebook is executed.
genres = sorted(genres_types)
genre_dict = {genre: i for i, genre in enumerate(genres)}

# Positional lists: entry k describes the k-th row of movies_df.
movie_titles = movies_df['title'].tolist()
movie_genre = [
    [genre_dict[genre] for genre in genre_field.split('|')]
    for genre_field in movies_df['genres']
]

In [None]:
import json

# Read the existing master config, add the movie metadata, and write it back.
with open('config/master.json', 'r') as file:
    data = json.load(file)

data.update({
    'movieGenreNames': genres,
    'movieTitles': movie_titles,
    'movieGenreIds': movie_genre,
})

with open('config/master.json', 'w') as file:
    json.dump(data, file, indent=4)

In [None]:
# Export the genre names on their own as a small JSON document.
data = {'genres': genres}
with open('dataset/genres.json', 'w') as file:
    json.dump(data, file, indent=4)