In [1]:
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import ssm
from ssm.util import find_permutation
from ssm.plots import gradient_cmap, white_to_color_cmap

color_names = [
    "windows blue",
    "red",
    "amber",
    "faded green",
    "dusty purple",
    "orange"
]
colors = sns.xkcd_palette(color_names)
cmap = gradient_cmap(colors)
plt.rcParams["figure.figsize"] = (30, 6)

# Provide data

In [2]:
# https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset

ratings=pd.read_csv('../data/rating.csv')
# links=pd.read_csv('links.csv')
# tags=pd.read_csv('tags.csv')
# genome_tags=pd.read_csv('genome-tags.csv')
# genome_scores=pd.read_csv('genome-scores.csv')
movies=pd.read_csv('../data/movie.csv')

In [3]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


In [4]:
genres = pd.DataFrame({k: {g: True for g in v} for k, v in  movies.set_index('movieId').genres.apply(lambda gs: gs.split("|")).to_dict().items()}).fillna(False).transpose()

In [5]:
genres

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
1,True,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131254,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
131256,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
131258,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
131260,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True


In [6]:
# ratings_pivot = ratings.pivot('userId', 'movieId', 'rating')

# Train models

The artificial user has the ID `0`

In [7]:
import pandas as pd
import numpy as np

# metrics for comparison
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# funk-svd package imports
from funk_svd.dataset import fetch_ml_ratings
from funk_svd.utils import _timer
from funk_svd import SVD as FSVD

In [126]:
ratings.rename(columns={"userId": 'u_id',  "movieId": "i_id"})

Unnamed: 0,u_id,i_id,rating,timestamp,u_id.1,i_id.1
0,1.0,2.0,3.5,2005-04-02 23:53:47,,
1,1.0,29.0,3.5,2005-04-02 23:31:16,,
2,1.0,32.0,3.5,2005-04-02 23:33:39,,
3,1.0,47.0,3.5,2005-04-02 23:32:07,,
4,1.0,50.0,3.5,2005-04-02 23:29:40,,
...,...,...,...,...,...,...
11,,,5.0,,113283.0,87960.0
12,,,5.0,,114508.0,27762.0
13,,,5.0,,115565.0,55460.0
14,,,5.0,,123903.0,93279.0


In [8]:
fsvd = FSVD(lr=0.001, reg=0.005, n_epochs=100, n_factors=15,
          early_stopping=True, shuffle=False, min_rating=1, max_rating=5)

fsvd.fit(X=ratings.rename(columns={"userId": 'u_id',  "movieId": "i_id"}))

Preprocessing data...

Epoch 1/100  | took 1.8 sec
Epoch 2/100  | took 1.5 sec
Epoch 3/100  | took 1.5 sec
Epoch 4/100  | took 1.5 sec
Epoch 5/100  | took 1.4 sec
Epoch 6/100  | took 1.4 sec
Epoch 7/100  | took 1.5 sec
Epoch 8/100  | took 1.4 sec
Epoch 9/100  | took 1.4 sec
Epoch 10/100 | took 1.4 sec
Epoch 11/100 | took 1.5 sec
Epoch 12/100 | took 1.5 sec
Epoch 13/100 | took 1.5 sec
Epoch 14/100 | took 1.5 sec
Epoch 15/100 | took 1.4 sec
Epoch 16/100 | took 1.5 sec
Epoch 17/100 | took 1.4 sec
Epoch 18/100 | took 1.5 sec
Epoch 19/100 | took 1.4 sec
Epoch 20/100 | took 1.5 sec
Epoch 21/100 | took 1.5 sec
Epoch 22/100 | took 1.5 sec
Epoch 23/100 | took 1.4 sec
Epoch 24/100 | took 1.5 sec
Epoch 25/100 | took 1.4 sec
Epoch 26/100 | took 1.4 sec
Epoch 27/100 | took 1.5 sec
Epoch 28/100 | took 1.4 sec
Epoch 29/100 | took 1.5 sec
Epoch 30/100 | took 1.5 sec
Epoch 31/100 | took 1.5 sec
Epoch 32/100 | took 1.5 sec
Epoch 33/100 | took 1.5 sec
Epoch 34/100 | took 1.5 sec
Epoch 35/100 | took 1.5 s

<funk_svd.svd.SVD at 0x7fd181f95e70>

In [9]:
all_movies = ratings.movieId.drop_duplicates().values
all_movies

array([     2,     29,     32, ..., 121021, 110167, 110510])

In [10]:
preds = fsvd.predict(pd.DataFrame({"u_id": [0 for _ in range(all_movies.shape[0])], "i_id": all_movies}))

In [11]:
k=50

res = pd.DataFrame({"movieId": all_movies, "pred": preds})
gen_at_k = genres.loc[res.sort_values("pred")[-k:].movieId.values].sum() / k

In [12]:
gen_at_k

Adventure             0.10
Animation             0.02
Children              0.00
Comedy                0.18
Fantasy               0.04
Romance               0.16
Drama                 0.58
Action                0.10
Crime                 0.14
Thriller              0.14
Horror                0.00
Mystery               0.10
Sci-Fi                0.02
IMAX                  0.00
Documentary           0.22
War                   0.10
Musical               0.00
Western               0.00
Film-Noir             0.00
(no genres listed)    0.00
dtype: float64

In [13]:
ratings.userId.value_counts().sort_values()[-10:]

59477     4988
83090     5169
131904    5330
34576     5356
74142     5447
125794    5491
121535    5520
82418     5646
8405      7515
118205    9254
Name: userId, dtype: int64

In [22]:
np.random.choice(ratings.userId.unique(), 10, replace=False).tolist()

[49750, 133810, 61792, 126170, 85275, 26201, 103329, 133627, 121793, 115338]

In [24]:
users = np.random.choice(ratings.userId.unique(), 16, replace=False)
all_movies = ratings.movieId.unique()

In [26]:
movies_available = [{"u_id": u, "i_id": i}  for u in users for i in np.setdiff1d(all_movies, ratings.loc[ratings.userId == u].movieId.unique())]

In [29]:
pd.DataFrame(movies_available)

Unnamed: 0,u_id,i_id
0,7642,1
1,7642,2
2,7642,3
3,7642,4
4,7642,7
...,...,...
426273,77656,131254
426274,77656,131256
426275,77656,131258
426276,77656,131260


In [30]:
movies_available = pd.DataFrame([{"u_id": u, "i_id": i} for u in users for i in np.setdiff1d(all_movies, ratings.loc[ratings.userId == u].movieId.unique())])

In [43]:
selected = movies_available.groupby("u_id").apply(lambda df: np.random.choice(df.index))

In [48]:
selected.shape[0]

16

In [55]:
pd.concat([movies_available.loc[selected].reset_index(drop=True), pd.DataFrame({"rating": np.random.choice([4, 5], selected.shape[0], p=[.15, .85])})], axis=1)

Unnamed: 0,u_id,i_id,rating
0,7642,25922,5
1,19779,2586,5
2,41549,70789,5
3,54417,2327,4
4,56721,49973,5
5,61274,5960,5
6,68973,4842,5
7,77656,40467,4
8,91829,121403,5
9,93917,1531,5


In [60]:
new_scores = pd.concat([movies_available.loc[selected].reset_index(drop=True), pd.DataFrame({"rating": np.random.choice([4, 5], selected.shape[0], p=[.15, .85])})], axis=1)
ratings = pd.concat([ratings, new_scores],  axis=0)

In [79]:
res_tmp = pd.concat([movies_available,  pd.DataFrame({"pred": fsvd.predict(movies_available)})], axis=1)
res_tmp = pd.concat([res_tmp,  genres.loc[res_tmp.i_id].reset_index(drop=True)], axis=1)
res_tmp.groupby("u_id").apply(lambda df:  df.drop(["u_id", "i_id",  "pred"],  axis=1).mean())

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
u_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7642,0.084997,0.03791,0.041618,0.307736,0.052257,0.15059,0.488668,0.129013,0.107848,0.153849,0.096947,0.055666,0.064169,0.007192,0.089567,0.043903,0.037947,0.024499,0.012062,0.009065
19779,0.085376,0.037916,0.041621,0.307931,0.052326,0.150578,0.488341,0.129431,0.107871,0.154396,0.096905,0.055695,0.064191,0.007224,0.089456,0.043867,0.037953,0.024554,0.012052,0.009058
41549,0.084961,0.03771,0.041203,0.307542,0.051645,0.150165,0.488657,0.129319,0.10806,0.154372,0.096943,0.055889,0.064153,0.007249,0.089769,0.043795,0.037485,0.024489,0.012094,0.00909
54417,0.085343,0.037934,0.041866,0.307594,0.052277,0.150764,0.488466,0.129494,0.107886,0.154321,0.096914,0.055647,0.06426,0.007302,0.089425,0.043889,0.038009,0.024566,0.012058,0.009062
56721,0.085095,0.037862,0.041573,0.30788,0.052182,0.150547,0.488567,0.12888,0.107737,0.153809,0.096941,0.055631,0.06399,0.007197,0.089631,0.043785,0.037937,0.024516,0.012071,0.009072
61274,0.084446,0.037673,0.041772,0.30823,0.052111,0.151145,0.489078,0.12742,0.107005,0.15295,0.096966,0.055269,0.062639,0.006956,0.08986,0.043689,0.038162,0.024514,0.012069,0.009099
68973,0.085389,0.037996,0.041777,0.30779,0.052259,0.150713,0.488489,0.129338,0.107775,0.15397,0.096882,0.055703,0.064238,0.007262,0.089507,0.043836,0.038034,0.02452,0.012054,0.009059
77656,0.085361,0.037938,0.041724,0.307441,0.052334,0.150253,0.48806,0.129447,0.107966,0.154189,0.09702,0.055783,0.064367,0.007235,0.089597,0.043786,0.037976,0.024442,0.012071,0.009072
91829,0.08499,0.037844,0.041782,0.307704,0.051947,0.150626,0.488485,0.12921,0.107606,0.153889,0.096842,0.055322,0.063986,0.007276,0.089641,0.043883,0.037994,0.024567,0.01204,0.009077
93917,0.085544,0.037907,0.041799,0.307712,0.05224,0.150732,0.488493,0.129514,0.107997,0.154361,0.096883,0.055682,0.064289,0.00726,0.089473,0.04382,0.037945,0.024473,0.01205,0.009056


In [88]:
res_tmp.groupby("u_id").apply(lambda df:  df.sort_values("pred")[-100:].drop(["u_id", "i_id",  "pred"],  axis=1).mean()).to_dict("index")

{7642: {'Adventure': 0.25,
  'Animation': 0.0,
  'Children': 0.01,
  'Comedy': 0.12,
  'Fantasy': 0.13,
  'Romance': 0.23,
  'Drama': 0.53,
  'Action': 0.51,
  'Crime': 0.2,
  'Thriller': 0.43,
  'Horror': 0.05,
  'Mystery': 0.13,
  'Sci-Fi': 0.16,
  'IMAX': 0.16,
  'Documentary': 0.01,
  'War': 0.09,
  'Musical': 0.04,
  'Western': 0.01,
  'Film-Noir': 0.0,
  '(no genres listed)': 0.0},
 19779: {'Adventure': 0.05,
  'Animation': 0.06,
  'Children': 0.01,
  'Comedy': 0.25,
  'Fantasy': 0.03,
  'Romance': 0.1,
  'Drama': 0.54,
  'Action': 0.09,
  'Crime': 0.17,
  'Thriller': 0.18,
  'Horror': 0.0,
  'Mystery': 0.13,
  'Sci-Fi': 0.06,
  'IMAX': 0.0,
  'Documentary': 0.23,
  'War': 0.11,
  'Musical': 0.0,
  'Western': 0.01,
  'Film-Noir': 0.05,
  '(no genres listed)': 0.0},
 41549: {'Adventure': 0.35,
  'Animation': 0.05,
  'Children': 0.06,
  'Comedy': 0.24,
  'Fantasy': 0.2,
  'Romance': 0.44,
  'Drama': 0.61,
  'Action': 0.21,
  'Crime': 0.01,
  'Thriller': 0.08,
  'Horror': 0.0,
  'My

In [84]:
res_tmp.sort_values("pred")[-100:]

Unnamed: 0,u_id,i_id,pred,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
374097,91829,1197,4.819244,True,False,False,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
200420,103110,68954,4.819812,True,True,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
187207,103110,527,4.820026,False,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,False,False
333902,115565,70186,4.820141,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
321339,115565,1262,4.821101,True,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320483,115565,356,5.000000,False,False,False,True,False,True,True,...,False,False,False,False,False,True,False,False,False,False
267308,41549,539,5.000000,False,False,False,True,False,True,True,...,False,False,False,False,False,False,False,False,False,False
267902,41549,1210,5.000000,True,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
267883,41549,1183,5.000000,False,False,False,False,False,True,True,...,False,False,False,False,False,True,False,False,False,False


In [95]:
genres.columns
GENRE1 = 'Documentary'
GENRE2 = 'IMAX'

In [100]:
genres.loc[genres[GENRE1] | genres[GENRE1]].index.values

array([    37,     77,     99, ..., 131074, 131100, 131110])

In [108]:
c = genres.cov().abs()

c[c ==  c.min()]

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
Adventure,,,,,,,,,,,,,,,,,,,,
Animation,,,,,,,,,,,,,,,,,,,,
Children,,,,,,,,,,,,,0.000155,,,,,,,
Comedy,,,,,,,,,,,,,,,,,,,,
Fantasy,,,,,,,,,,,,,,,,,,,,
Romance,,,,,,,,,,,,,,,,,,,,
Drama,,,,,,,,,,,,,,,,,,,,
Action,,,,,,,,,,,,,,,,,,,,
Crime,,,,,,,,,,,,,,,,,,,,
Thriller,,,,,,,,,,,,,,,,,,,,


In [125]:
genres.cov().loc[genres.sum().sort_values() > 3000,  genres.sum().sort_values() > 3000]

Unnamed: 0,Comedy,Romance,Drama,Action,Thriller
Comedy,0.212754,0.023025,-0.056841,-0.013256,-0.032723
Romance,0.023025,0.128409,0.020315,-0.009699,-0.013568
Drama,-0.056841,0.020315,0.249892,-0.019025,-0.006446
Action,-0.013256,-0.009699,-0.019025,0.112394,0.020892
Thriller,-0.032723,-0.013568,-0.006446,0.020892,0.129709


In [118]:
(genres.sum().sort_values() > 100).index.values

array(['IMAX', '(no genres listed)', 'Film-Noir', 'Western', 'Animation',
       'Musical', 'Children', 'War', 'Fantasy', 'Mystery', 'Sci-Fi',
       'Adventure', 'Documentary', 'Horror', 'Crime', 'Action', 'Romance',
       'Thriller', 'Comedy', 'Drama'], dtype=object)

In [140]:
pd.melt(tmp.reset_index(), id_vars='userId', value_vars=tmp.columns, var_name='movieId', value_name='rating')

Unnamed: 0,userId,movieId,rating
0,1.0,1.0,
1,2.0,1.0,
2,3.0,1.0,4.0
3,4.0,1.0,
4,5.0,1.0,
...,...,...,...
7673,7.0,31696.0,
7674,8.0,31696.0,
7675,9.0,31696.0,
7676,10.0,31696.0,


In [133]:
tmp = ratings[:1000].pivot('userId', 'movieId', 'rating')

In [134]:
tmp

movieId,1.0,2.0,3.0,6.0,7.0,10.0,11.0,15.0,16.0,17.0,...,7454.0,7482.0,7757.0,8368.0,8482.0,8507.0,8636.0,8690.0,8961.0,31696.0
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,3.5,,,,,,,,,...,4.0,3.0,4.0,4.0,3.5,5.0,4.5,3.5,4.0,4.0
2.0,,,4.0,,,,,,,,...,,,,,,,,,,
3.0,4.0,,,,,,,,,,...,,,,,,,,,,
4.0,,,,3.0,,4.0,,,,,...,,,,,,,,,,
5.0,,3.0,,,,,5.0,,,3.0,...,,,,,,,,,,
6.0,5.0,,3.0,,5.0,,,,,5.0,...,,,,,,,,,,
7.0,,,3.0,,3.0,,4.0,2.0,3.0,2.0,...,,,,,,,,,,
8.0,4.0,,5.0,3.0,,4.0,,,,,...,,,,,,,,,,
9.0,,,,,,,,,,,...,,,,,,,,,,
10.0,4.0,,,,,,4.0,,,,...,,,,,,,,,,


In [142]:
print("START")

ratings = pd.read_csv('../data/rating.csv').rename(columns={"userId": 'u_id',  "movieId": "i_id"})
movies = pd.read_csv('./../data/movie.csv').rename(columns={"movieId": "i_id"})
genres = pd.DataFrame({k: {g: True for g in v} for k, v in movies.set_index('i_id').genres.apply(lambda gs: gs.split("|")).to_dict().items()}).fillna(False).transpose()

print("Read data - DONE")

np.random.seed(2022)

users = np.random.choice(ratings.u_id.unique(), 16, replace=False)
all_movies = genres.index.values

movies_tmp = ratings.loc[ratings.u_id.isin(users)].pivot('u_id', 'i_id', 'rating')
movies_available = pd.melt(movies_tmp.reset_index(),
                           id_vars='u_id',
                           value_vars=movies_tmp.columns,
                           var_name='i_id', value_name='rating_real')

movies_available = pd.concat([movies_available, genres.loc[movies_available.i_id].reset_index(drop=True)], axis=1)

print("Prepare variables - DONE")

saturation_list = []

START
Read data - DONE
Prepare variables - DONE


In [143]:
movies_available

Unnamed: 0,u_id,i_id,rating_real,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,3501,1,,True,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,8388,1,,True,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,8512,1,,True,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,14549,1,4.0,True,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,15961,1,,True,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26715,74069,99728,,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
26716,83325,99728,,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
26717,100206,99728,,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
26718,110161,99728,3.5,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [149]:
lr = 0.010676587674837098
reg = 0.00010676587674837098
n_epochs = 20
n_factors = 10
import time
t = time.localtime()
RESULT_DIR = f'../../data/benchmark_rs/saturation-{t.tm_year}-{t.tm_mon}-{t.tm_mday}'


def get_rating(df, u, i):
    tmp = df.loc[(df.u_id == u) & (df.i_id == i)]
    if tmp.empty:
        return np.nan
    else:
        return tmp.values[0]

In [150]:
fsvd = FSVD(lr=lr, reg=reg, n_epochs=n_epochs, n_factors=n_factors,
            early_stopping=True, shuffle=False, min_rating=1, max_rating=5)
fsvd.fit(ratings)
preds = fsvd.predict(movies_available)

# calculate the saturation
res_tmp = pd.concat([movies_available, pd.DataFrame({"pred": preds})], axis=1)
# with open(f"{RESULT_DIR}/predictions_step_{i}.json", "w") as f:
#     json.dump(res_tmp, f)

saturation = res_tmp.groupby("u_id").apply(lambda df:  df.sort_values("pred")[-100:].drop(["u_id", "i_id",  "pred"],  axis=1).mean()).to_dict("index")

# append saturation to file
saturation_list.append(saturation)

# Add  selected movies to ratings list with 4s and 5s
genre_tmp = GENRE1 if (7 % 50) < 25 else GENRE2

selected = movies_available.loc[movies_available.i_id.isin(genres.index[genres[genre_tmp]]), :].groupby("u_id").apply(lambda df: np.random.choice(df.index))
new_scores = pd.concat([movies_available.loc[selected, ["u_id", "i_id"]].reset_index(drop=True),
                        pd.DataFrame({"rating": np.random.choice([4, 5], selected.shape[0], p=[.15, .85])})],
                       axis=1)
movies_available.loc[selected, 'rating'] = new_scores.rating.values
ratings = pd.concat([ratings, new_scores], axis=0)

# with open(f"{RESULT_DIR}/all_saturation.json", "w") as f:
# json.dump(saturation_list,  f)


Preprocessing data...

Epoch 1/20  | took 1.1 sec
Epoch 2/20  | took 1.1 sec
Epoch 3/20  | took 1.1 sec
Epoch 4/20  | took 1.1 sec
Epoch 5/20  | took 1.1 sec
Epoch 6/20  | took 1.0 sec
Epoch 7/20  | took 1.1 sec
Epoch 8/20  | took 1.1 sec
Epoch 9/20  | took 1.0 sec
Epoch 10/20 | took 1.1 sec
Epoch 11/20 | took 1.1 sec
Epoch 12/20 | took 1.1 sec
Epoch 13/20 | took 1.1 sec
Epoch 14/20 | took 1.0 sec
Epoch 15/20 | took 1.1 sec
Epoch 16/20 | took 1.1 sec
Epoch 17/20 | took 1.1 sec
Epoch 18/20 | took 1.1 sec
Epoch 19/20 | took 1.1 sec
Epoch 20/20 | took 1.0 sec

Training took 29 sec


In [151]:
preds

[3.833141426599853,
 4.091701615508929,
 4.166004044448721,
 3.9513336256637484,
 5,
 3.012117540541257,
 3.728638470242163,
 4.475027603366256,
 2.261228517640956,
 4.373897572386687,
 3.781207571071707,
 3.02933051038009,
 4.422962809331908,
 4.047696856443267,
 2.8823060371871474,
 4.279027178564912,
 3.7886320401195963,
 2.405906321844129,
 2.74184001138551,
 3.3725005495425098,
 3.2876733936258833,
 2.102744894006144,
 2.0922180477159587,
 3.838255957725468,
 1.4653316273805623,
 2.9784182763581537,
 3.353839176110328,
 3.432012689985191,
 3.3770276569190525,
 3.587764855450672,
 1.9180256754923237,
 3.5550180581889803,
 3.8599939079571937,
 3.4154503315140547,
 3.2395058710926947,
 3.3858155824232115,
 3.4600548282535475,
 2.5049070170255443,
 1.8294347971526994,
 3.8312360271556503,
 1.2139727331541734,
 2.7981979951452822,
 3.12593882821028,
 3.810548362665532,
 3.4213296934975146,
 2.821216779288808,
 2.311287246921545,
 3.8479994730949976,
 4.414616782288007,
 4.1997873419649

In [156]:
import json
json.dumps(saturation_list)

'[{"3501": {"rating_real": 4.777777777777778, "Adventure": 0.12, "Animation": 0.01, "Children": 0.03, "Comedy": 0.21, "Fantasy": 0.01, "Romance": 0.26, "Drama": 0.7, "Action": 0.31, "Crime": 0.19, "Thriller": 0.39, "Horror": 0.04, "Mystery": 0.12, "Sci-Fi": 0.06, "IMAX": 0.02, "Documentary": 0.0, "War": 0.09, "Musical": 0.01, "Western": 0.02, "Film-Noir": 0.0, "(no genres listed)": 0.0}, "8388": {"rating_real": 4.0, "Adventure": 0.33, "Animation": 0.06, "Children": 0.03, "Comedy": 0.41, "Fantasy": 0.12, "Romance": 0.06, "Drama": 0.35, "Action": 0.47, "Crime": 0.29, "Thriller": 0.21, "Horror": 0.01, "Mystery": 0.1, "Sci-Fi": 0.15, "IMAX": 0.1, "Documentary": 0.02, "War": 0.09, "Musical": 0.02, "Western": 0.04, "Film-Noir": 0.01, "(no genres listed)": 0.0}, "8512": {"rating_real": 4.4, "Adventure": 0.18, "Animation": 0.1, "Children": 0.11, "Comedy": 0.33, "Fantasy": 0.09, "Romance": 0.3, "Drama": 0.66, "Action": 0.12, "Crime": 0.18, "Thriller": 0.13, "Horror": 0.01, "Mystery": 0.1, "Sci-