In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict


In [2]:
# Load the ratings table and the movie table from CSV files in the working directory.
df_ratings, df_movies = (
    pd.read_csv(csv_path)
    for csv_path in ('ratings_without_timestamp.csv', 'df_movies_final.csv')
)

In [3]:
# Preview the first five rating rows (userId, movieId, rating).
df_ratings.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [4]:
# Collect the distinct identifiers on each side of the user/item relation.
# Item IDs come from the movie catalogue (df_movies) rather than from the
# ratings, so movies that were never rated are still included.
movies = df_movies['movieId']
users = df_ratings['userId']

# set(...) consumes the Series directly; no explicit add-loop is needed.
itemIDs = set(movies)
userIDs = set(users)

In [5]:
# Report how many distinct movie IDs and user IDs were collected.
item_total, user_total = len(itemIDs), len(userIDs)
print(f'itemIDs: {item_total}')
print(f'userIDs: {user_total}')

itemIDs: 9742
userIDs: 610


In [6]:
# Number of rating rows per movieId, as a plain {movieId: count} dict.
ratings_per_movie = df_ratings.groupby('movieId').size()
interactions_count = ratings_per_movie.to_dict()

In [7]:
# Mean and median of the 'rating' column for every movieId that has ratings;
# the resulting frame is indexed by movieId.
ratings_by_movie = df_ratings.groupby('movieId')['rating']
movie_stats = ratings_by_movie.agg(['mean', 'median'])

In [8]:
# Display the per-movie rating statistics (columns: mean, median; index: movieId).
movie_stats

Unnamed: 0_level_0,mean,median
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.920930,4.0
2,3.431818,3.5
3,3.259615,3.0
4,2.357143,3.0
5,3.071429,3.0
...,...,...
193581,4.000000,4.0
193583,3.500000,3.5
193585,3.500000,3.5
193587,3.500000,3.5


In [9]:
# Attach the rating statistics to the movie table. A left join keeps every
# movie; movies with no entry in movie_stats get NaN for 'mean' and 'median'.
df_movies_with_score = pd.merge(
    df_movies,
    movie_stats,
    how='left',
    on='movieId',
)

In [10]:
# Look up each movie's rating count; movies missing from interactions_count
# map to NaN, which becomes 0 before the column is cast to int.
looked_up_counts = df_movies_with_score['movieId'].map(interactions_count)
df_movies_with_score['interactions'] = looked_up_counts.fillna(0).astype(int)

In [11]:
# Count missing values per column after the merge.
df_movies_with_score.isna().sum()

movieId                0
title                  0
year                   0
(no genres listed)     0
Action                 0
Adventure              0
Animation              0
Children               0
Comedy                 0
Crime                  0
Documentary            0
Drama                  0
Fantasy                0
Film-Noir              0
Horror                 0
IMAX                   0
Musical                0
Mystery                0
Romance                0
Sci-Fi                 0
Thriller               0
War                    0
Western                0
mean                  18
median                18
interactions           0
dtype: int64

In [12]:
# Inspect the distinct median values; NaN entries (rows with no rating
# statistics from the left merge) are included in the result.
median_values = df_movies_with_score['median'].to_numpy()
np.unique(median_values)

array([0.5 , 0.75, 1.  , 1.25, 1.5 , 1.75, 2.  , 2.25, 2.5 , 2.75, 3.  ,
       3.25, 3.5 , 3.75, 4.  , 4.25, 4.5 , 4.75, 5.  ,  nan,  nan,  nan,
        nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,
        nan,  nan,  nan,  nan])

In [13]:
# Movies without any rating come out of the left merge with NaN in 'mean' and
# 'median'. Fill only those two columns, so that a missing value in any other
# column (title, year, genre flags) is never silently overwritten with 0.
# NOTE(review): 0 acts as a "no ratings" placeholder here, not as an observed
# rating; such rows also have interactions == 0 — confirm downstream consumers
# treat it that way.
df_movies_with_score = df_movies_with_score.fillna({'mean': 0, 'median': 0})

In [14]:
# Preview the first five rows of the enriched movie table.
df_movies_with_score.head(n=5)

Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,mean,median,interactions
0,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,3.92093,4.0,215
1,2,Jumanji,1995,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,3.431818,3.5,110
2,3,Grumpier Old Men,1995,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,3.259615,3.0,52
3,4,Waiting to Exhale,1995,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,2.357143,3.0,7
4,5,Father of the Bride Part II,1995,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,3.071429,3.0,49


In [15]:
# Persist the enriched movie table; the DataFrame index is not written.
df_movies_with_score.to_csv(
    path_or_buf="df_movies_with_score.csv",
    index=False,
)