In [None]:
# Report how much RAM the runtime has and, on a high-RAM runtime, describe
# the last device that TensorFlow lists.
from psutil import virtual_memory
from tensorflow.python.client import device_lib

dev_sec = device_lib.list_local_devices()
# psutil reports bytes; divide by 1e9 to express the total in gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
  # NOTE(review): assumes the last listed device is the accelerator -- confirm.
  print(dev_sec[-1].physical_device_desc)
else:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

Your runtime has 27.4 gigabytes of available RAM

You are using a high-RAM runtime!
device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute capability: 7.0


In [None]:
!wget "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
!unzip ml-100k.zip
!ls

--2020-11-11 16:01:28--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2020-11-11 16:01:29 (5.22 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base    

In [None]:
# install category_encoders library
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 21.1MB/s eta 0:00:01[K     |████████▏                       | 20kB 6.5MB/s eta 0:00:01[K     |████████████▏                   | 30kB 7.3MB/s eta 0:00:01[K     |████████████████▎               | 40kB 8.4MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 7.1MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 7.7MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 8.2MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 5.2MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime, timedelta
import math
import random
from collections import deque
import category_encoders as ce

  import pandas.util.testing as tm


In [None]:
# Functions

def BackwardDifferenceEncoder(data):
  """Backward-difference encode the 'gender' and 'occupation' columns.

  A fresh ``category_encoders.BackwardDifferenceEncoder`` is fitted on
  ``data`` and applied to it in one step.

  Args:
    data: frame containing 'gender' and 'occupation' columns.

  Returns:
    Whatever the encoder's ``fit_transform`` returns for ``data``.
  """
  # NOTE(review): the encoder is re-fitted on every call, so frames encoded in
  # separate calls (e.g. train and test) only share a coding if the fit
  # happens to produce the same category mapping for both -- confirm.
  encoder = ce.BackwardDifferenceEncoder(cols=['gender', 'occupation'])
  return encoder.fit_transform(data)

def minmax_scalar(data):
  """Add a 'scaled_age' column holding the min-max scaled 'age' column.

  A new MinMaxScaler is fitted on ``data[['age']]`` on every call. The column
  is written onto ``data`` itself, and that same object is returned.
  """
  age_column = data[['age']]
  data['scaled_age'] = MinMaxScaler().fit_transform(age_column)
  return data

In [None]:
# Ratings from ml-100k/ua.base: tab-separated, column names supplied here.
rating_data = pd.read_csv(
    "ml-100k/ua.base",
    sep='\t',
    names="userId,movieId,rating,timestamp".split(","),
)
# TODO: decide whether the rows need to be sorted by ascending timestamp.
rating_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
90565,943,1047,2,875502146
90566,943,1074,4,888640250
90567,943,1188,3,888640250
90568,943,1228,3,888640275


In [None]:
# User attributes from ml-100k/u.user ('|'-separated), indexed by userId so
# they can be joined onto the rating frames.
user_data = pd.read_csv(
    "ml-100k/u.user",
    sep='|',
    names="userId,age,gender,occupation,zipcode".split(","),
).set_index('userId')
user_data

Unnamed: 0_level_0,age,gender,occupation,zipcode
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
...,...,...,...,...
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209


In [None]:
# Ratings from ml-100k/ua.test, read with the same layout as ua.base.
raw_test = pd.read_csv(
    "ml-100k/ua.test",
    sep='\t',
    names="userId,movieId,rating,timestamp".split(","),
)
raw_test

Unnamed: 0,userId,movieId,rating,timestamp
0,1,20,4,887431883
1,1,33,4,878542699
2,1,61,4,878542420
3,1,117,3,874965739
4,1,155,2,878542201
...,...,...,...,...
9425,943,232,4,888639867
9426,943,356,4,888639598
9427,943,570,1,888640125
9428,943,808,4,888639868


In [None]:
# Only ratings of 3 or higher count as "highly rated": collect the index
# labels of the test rows rated below 3 ...
indexNames = raw_test.loc[raw_test['rating'] < 3].index

# ... and drop exactly those rows, leaving raw_test itself untouched.
test_raw_data = raw_test.drop(index=indexNames)
test_raw_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,20,4,887431883
1,1,33,4,878542699
2,1,61,4,878542420
3,1,117,3,874965739
5,1,160,4,875072547
...,...,...,...,...
9423,943,186,5,888639478
9424,943,215,5,888639000
9425,943,232,4,888639867
9426,943,356,4,888639598


In [None]:
# For every user, collect the distinct movieIds they rated 3 or higher in the
# test set; the result has one 'actual' list per userId.
test_precision = (
    test_raw_data
    .copy()
    .groupby('userId')['movieId']
    .agg(actual=lambda movie_ids: list(set(movie_ids)))
)
test_precision

Unnamed: 0_level_0,actual
userId,Unnamed: 1_level_1
1,"[160, 33, 265, 202, 171, 20, 117, 189, 61]"
2,"[290, 312, 292, 297, 13, 50, 280, 281, 251]"
3,"[328, 331, 334, 343]"
4,"[288, 354, 260, 356, 294, 357, 264, 361, 303, 50]"
5,"[1, 2, 98, 363, 17]"
...,...
939,"[121, 258, 993, 1190, 106, 689, 409, 476, 1054]"
940,"[193, 66, 289, 873, 205, 14, 655, 272, 568, 315]"
941,"[257, 258, 993, 7, 15, 147, 117, 181, 475, 124]"
942,"[323, 261, 423, 200, 487, 584, 427, 615, 117, ..."


In [None]:
# One row per test user (the first row seen for each userId), with the user
# attributes joined on, then gender/occupation backward-difference encoded.
test_user_data = (
    test_raw_data
    .drop_duplicates(subset=["userId"])
    .join(user_data, on='userId')
)
encoded_test_raw = BackwardDifferenceEncoder(test_user_data)
encoded_test_raw

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,intercept,userId,movieId,rating,timestamp,age,gender_0,occupation_0,occupation_1,occupation_2,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,occupation_8,occupation_9,occupation_10,occupation_11,occupation_12,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,zipcode
0,1,1,20,4,887431883,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711
10,1,2,13,4,888551922,53,0.5,0.047619,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,94043
23,1,3,328,5,889237455,23,-0.5,0.047619,0.095238,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,32067
30,1,4,50,5,892003526,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,43537
40,1,5,1,4,875635748,33,0.5,0.047619,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,15213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9380,1,939,106,3,880262019,26,0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,33319
9390,1,940,14,3,885921710,32,-0.5,0.047619,0.095238,0.142857,0.190476,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,02215
9400,1,941,7,4,875048952,20,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,97229
9410,1,942,117,4,891282816,48,0.5,0.047619,0.095238,0.142857,0.190476,0.238095,0.285714,0.333333,0.380952,0.428571,0.47619,0.52381,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,78209


In [None]:
# Add the 'scaled_age' column to the encoded test frame, then move the old row
# labels into an 'index' column so the rows are numbered 0..n-1.
# Note: minmax_scalar writes 'scaled_age' onto encoded_test_raw itself as well.
scaled_test_raw = minmax_scalar(encoded_test_raw).reset_index()
scaled_test_raw

Unnamed: 0,index,intercept,userId,movieId,rating,timestamp,age,gender_0,occupation_0,occupation_1,occupation_2,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,occupation_8,occupation_9,occupation_10,occupation_11,occupation_12,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,zipcode,scaled_age
0,0,1,1,20,4,887431883,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711,0.257576
1,10,1,2,13,4,888551922,53,0.5,0.047619,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,94043,0.696970
2,23,1,3,328,5,889237455,23,-0.5,0.047619,0.095238,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,32067,0.242424
3,30,1,4,50,5,892003526,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,43537,0.257576
4,40,1,5,1,4,875635748,33,0.5,0.047619,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,15213,0.393939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,9380,1,939,106,3,880262019,26,0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,33319,0.287879
939,9390,1,940,14,3,885921710,32,-0.5,0.047619,0.095238,0.142857,0.190476,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,02215,0.378788
940,9400,1,941,7,4,875048952,20,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,97229,0.196970
941,9410,1,942,117,4,891282816,48,0.5,0.047619,0.095238,0.142857,0.190476,0.238095,0.285714,0.333333,0.380952,0.428571,0.47619,0.52381,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,78209,0.621212


In [None]:
# Number of ratings each movie received in rating_data, as a one-column frame
# ('userIdCount') indexed by movieId.
movie_rated_by_user = (
    rating_data
    .groupby('movieId')['userId']
    .count()
    .to_frame(name='userIdCount')
)
movie_rated_by_user

Unnamed: 0_level_0,userIdCount
movieId,Unnamed: 1_level_1
1,392
2,121
3,85
4,198
5,79
...,...
1678,1
1679,1
1680,1
1681,1


In [None]:
# Load the movie table (ml-100k/u.item) and derive per-movie genre information.
# Columns after IMDb_URL are the genre flag columns.
item_column_name = "movieId,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western"
item = pd.read_csv("ml-100k/u.item",sep='|',names=item_column_name.split(","),encoding='latin-1')
# Keep only the genre flag columns, indexed by movieId.
col_to_removed = ['movie_title', 'release_date', 'video_release_date', 'IMDb_URL']
clear_item = item.drop(col_to_removed, axis=1).set_index('movieId')

# Number of genre columns (topics) in the whole recommendation system.
N = len(clear_item.columns)

# Per movieId, the row sum of the genre flag columns.
genre_num = clear_item.sum(axis = 1)

# Map each movieId to the sub-frame of clear_item rows that carry that index value.
df_dict = dict(list(clear_item.groupby(clear_item.index)))

# Collect, for every movie, the names of the genre columns that equal 1.
movieid = []
genre_list = []

for u, v in df_dict.items():
    # Columns in which at least one row of this movie's sub-frame equals 1.
    check = v.columns[(v == 1).any()]
    # Movies with no flagged genre are left out of the result entirely.
    if len(check) > 0:
      movieid.append(u)
      genre_list.append(check.to_list())

d = {'movieId' : movieid, 'Genres' : genre_list}

# Assemble the result: one 'Genres' list per movieId (movieId is the index).
genre_list_by_movieid = pd.DataFrame(d)
genre_list_by_movieid.set_index('movieId', inplace=True)
genre_list_by_movieid

Unnamed: 0_level_0,Genres
movieId,Unnamed: 1_level_1
1,"[Animation, Children, Comedy]"
2,"[Action, Adventure, Thriller]"
3,[Thriller]
4,"[Action, Comedy, Drama]"
5,"[Crime, Drama, Thriller]"
...,...
1678,[Drama]
1679,"[Romance, Thriller]"
1680,"[Drama, Romance]"
1681,[Comedy]


In [None]:
# Sorted array of the distinct movieIds that have an entry in genre_list_by_movieid.
movies_unique = np.unique(genre_list_by_movieid.index)
movies_unique

array([   1,    2,    3, ..., 1680, 1681, 1682])

In [None]:
# Attach each rating row's user attributes (user_data is indexed by userId).
raw = rating_data.join(user_data, on='userId')
raw.head()

Unnamed: 0,userId,movieId,rating,timestamp,age,gender,occupation,zipcode
0,1,1,5,874965758,24,M,technician,85711
1,1,2,3,876893171,24,M,technician,85711
2,1,3,4,878542960,24,M,technician,85711
3,1,4,3,876893119,24,M,technician,85711
4,1,5,3,889751712,24,M,technician,85711


In [None]:
# Names of the columns of `raw` stored with object dtype (the string-valued,
# categorical candidates).
object_cols = [name for name, dtype in raw.dtypes.items() if dtype == "object"]
object_cols

['gender', 'occupation', 'zipcode']

In [None]:
# Backward-difference encode 'gender' and 'occupation' in the joined rating frame.
encoded_raw = BackwardDifferenceEncoder(raw)
encoded_raw

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,intercept,userId,movieId,rating,timestamp,age,gender_0,occupation_0,occupation_1,occupation_2,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,occupation_8,occupation_9,occupation_10,occupation_11,occupation_12,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,zipcode
0,1,1,1,5,874965758,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711
1,1,1,2,3,876893171,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711
2,1,1,3,4,878542960,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711
3,1,1,4,3,876893119,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711
4,1,1,5,3,889751712,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90565,1,943,1047,2,875502146,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841
90566,1,943,1074,4,888640250,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841
90567,1,943,1188,3,888640250,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841
90568,1,943,1228,3,888640275,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841


In [None]:
# Add the min-max scaled 'scaled_age' column (minmax_scalar returns the same
# frame it was given, with the new column written onto it).
encoded_raw = minmax_scalar(encoded_raw)
encoded_raw

Unnamed: 0,intercept,userId,movieId,rating,timestamp,age,gender_0,occupation_0,occupation_1,occupation_2,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,occupation_8,occupation_9,occupation_10,occupation_11,occupation_12,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,zipcode,scaled_age
0,1,1,1,5,874965758,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711,0.257576
1,1,1,2,3,876893171,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711,0.257576
2,1,1,3,4,878542960,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711,0.257576
3,1,1,4,3,876893119,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711,0.257576
4,1,1,5,3,889751712,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711,0.257576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90565,1,943,1047,2,875502146,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841,0.227273
90566,1,943,1074,4,888640250,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841,0.227273
90567,1,943,1188,3,888640250,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841,0.227273
90568,1,943,1228,3,888640275,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841,0.227273


In [None]:
# Drop the identifier and raw columns so that only the encoded gender and
# occupation columns and scaled_age remain as test features.
drop_col = ['userId','index','intercept','movieId','rating','timestamp','age','zipcode']
test_data = scaled_test_raw.drop(columns=drop_col)
test_data

Unnamed: 0,gender_0,occupation_0,occupation_1,occupation_2,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,occupation_8,occupation_9,occupation_10,occupation_11,occupation_12,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,scaled_age
0,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.257576
1,0.5,0.047619,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.696970
2,-0.5,0.047619,0.095238,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.242424
3,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.257576
4,0.5,0.047619,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.393939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.287879
939,-0.5,0.047619,0.095238,0.142857,0.190476,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.378788
940,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.196970
941,0.5,0.047619,0.095238,0.142857,0.190476,0.238095,0.285714,0.333333,0.380952,0.428571,0.47619,0.52381,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.621212


In [None]:
# User x movie matrix of ratings: one row per userId, one column per movieId.
# Pairs with no rating come out as NaN (they are filled in a later cell).
data_pivoted = encoded_raw.pivot_table(index='userId', columns='movieId', values='rating')
data_pivoted

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,1642,1643,1644,1645,1646,1647,1648,1649,1650,1651,1652,1654,1655,1656,1657,1658,1659,1660,1661,1662,1663,1664,1665,1666,1667,1668,1669,1670,1671,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,2.0,5.0,5.0,5.0,5.0,5.0,3.0,4.0,5.0,,1.0,4.0,4.0,3.0,4.0,3.0,2.0,4.0,1.0,3.0,3.0,5.0,,2.0,1.0,2.0,2.0,3.0,4.0,3.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,4.0,,,,,,,,,2.0,,,,4.0,,,,,3.0,,,,,,4.0,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,,,,,,3.0,,,4.0,3.0,,,,4.0,,,,,,,,,,,4.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
941,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
942,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Measure how sparse the user x movie matrix is before filling it.
number_NaN = data_pivoted.isna().sum()
# Missing cells per column = rows - non-null cells; summed over all columns.
na_count = (len(data_pivoted)-data_pivoted.count()).sum()
# Total number of cells. np.prod replaces the np.product alias, which was
# deprecated and later removed from NumPy.
number_value = np.prod(data_pivoted.shape)
percentage_of_sparsity = (na_count / number_value) * 100

print('Before process sparsity...')
print('Sparsity Percentage: ',percentage_of_sparsity)

# Largest and smallest rating present anywhere in the matrix (NaN is skipped).
maximum = data_pivoted.max()
max_rating = maximum.max()
print('Max. Rating: ',max_rating)

minimum = data_pivoted.min()
min_rating = minimum.min()
print('Min. Rating: ',min_rating)

# Replace every missing rating with 0.
filled_data_pivoted = data_pivoted.fillna(0)
filled_data_pivoted

Before process sparsity...
Sparsity Percentage:  94.28306317224663
Max. Rating:  5.0
Min. Rating:  1.0


movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,1642,1643,1644,1645,1646,1647,1648,1649,1650,1651,1652,1654,1655,1656,1657,1658,1659,1660,1661,1662,1663,1664,1665,1666,1667,1668,1669,1670,1671,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,2.0,5.0,5.0,5.0,5.0,5.0,3.0,4.0,5.0,0.0,1.0,4.0,4.0,3.0,4.0,3.0,2.0,4.0,1.0,3.0,3.0,5.0,0.0,2.0,1.0,2.0,2.0,3.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Repeat the report on the filled matrix. The "sparsity" printed here is the
# share of missing (NaN) cells, so it drops to 0 once NaN has been replaced.
print('After process sparsity...')
number_NaN = filled_data_pivoted.isna().sum()
filled_na_count = number_NaN.sum()
filled_number_value = filled_data_pivoted.shape[0] * filled_data_pivoted.shape[1]
after_filled_percentage_of_sparsity = (filled_na_count / filled_number_value) * 100

column_maxima = filled_data_pivoted.max()
column_minima = filled_data_pivoted.min()
after_filled_maximum = column_maxima.max()
after_filled_minimum = column_minima.min()

print("Sparsity after filled: " ,after_filled_percentage_of_sparsity)
print("Maximum rating: " ,after_filled_maximum )
print("Minimum rating: " ,after_filled_minimum)

After process sparsity...
Sparsity after filled:  0.0
Maximum rating:  5.0
Minimum rating:  0.0


Do we need to preprocess the data to determine which movies each user has rated highly? No. Doing so would make the approach resemble supervised learning or content-based filtering, since it would require prior information about the interactions between users and movies.

In [None]:
# Training features: the encoded rating frame without identifier/raw columns.
# (No 'index' column here, unlike the test frame, which was reset_index()'ed.)
drop_col = ['userId','intercept','movieId','rating','timestamp','age','zipcode']
train_data = encoded_raw.drop(columns=drop_col)

In [None]:
# Small 10-row slices for quick experiments.
try_test_data = test_data[:10]
try_test_precision = test_precision[:10]
# NOTE(review): try_train is sliced from test_data, not train_data. This may be
# deliberate (test_data has one row per user) -- confirm.
try_train = test_data[:10]

In [None]:
"""
Precision computation
"""

def precision_per_user(user):
  """Return the precision of the recommendation list for one test user.

  Args:
    user: positional row number into `test_precision` (used with `.iloc`).

  Returns:
    float: number of distinct items present in both the 'actual' and the
    'predictions' entries of that row, divided by the length of the
    recommendation list. 0.0 when the recommendation list is empty.
  """
  row = test_precision.iloc[user]
  recommended = row['predictions']

  # length of recommendation list
  L = len(recommended)
  if L == 0:
    # Nothing was recommended, so there can be no hits; avoids ZeroDivisionError.
    return 0.0

  # np.intersect1d returns the sorted *unique* common values, so a movie that
  # appears twice in the recommendations still counts as a single hit.
  hits = float(len(np.intersect1d(row['actual'], recommended)))

  return hits / L

"""
Novelty computation
"""

def novelty_per_user(user, num_users=943):
  """Return the mean novelty of the movies recommended to one test user.

  Each recommended movie contributes log2(num_users / rate_number), where
  rate_number is looked up in `movie_rated_by_user` by movie id. Movies whose
  lookup or logarithm fails contribute 0 but still count in the average.

  Args:
    user: positional row number into `test_precision` (used with `.iloc`).
    num_users: numerator of the novelty ratio; defaults to the 943 used
      throughout this notebook.

  Returns:
    float: summed novelty divided by the length of the recommendation list,
    or 0.0 when the list is empty.
  """
  movie_list = test_precision['predictions'].iloc[user]
  if len(movie_list) == 0:
    # An empty list has no novelty; avoids ZeroDivisionError below.
    return 0.0

  sum_novelty = 0
  for u in movie_list:
    try:
      rate_number = movie_rated_by_user.loc[u]
      sum_novelty += math.log2(num_users / rate_number)
    except Exception:
      # Best effort: an unknown movie id or a non-positive count adds nothing.
      # `Exception` (instead of a bare except) lets KeyboardInterrupt and
      # SystemExit propagate, so a long training run can still be interrupted.
      continue

  return sum_novelty / len(movie_list)

"""
Diversity computation
"""

# Look up how many genres (topics) are attached to a single movie.
def check_genre_num(movieid):
  """Return the `genre_num` entry stored under label `movieid`."""
  return genre_num.loc[movieid]

# Look up the genres attached to a single movie.
def check_genre_list(movieid):
  """Return the 'Genres' entry of `genre_list_by_movieid` stored under label `movieid`."""
  genres_column = genre_list_by_movieid['Genres']
  return genres_column.loc[movieid]

# Flatten the genres of every movie in a recommendation list (z_Lu), duplicates kept.
def total_genre_per_list(user_movies_list):
  """Return (all_genres, count) for the movies in `user_movies_list`.

  `all_genres` concatenates the genres of each movie in list order, so a
  genre shared by several movies appears once per movie.
  """
  all_genres = [
      genre
      for movie in user_movies_list
      for genre in check_genre_list(movie)
  ]
  return all_genres, len(all_genres)

# Collect the distinct genres covered by a recommendation list (S_Lu).
def get_unique_genre_list_by_user(user_movies_list):
  """Return (distinct_genres, count) for the movies in `user_movies_list`.

  The order of `distinct_genres` is whatever `set` iteration yields.
  """
  collected = []
  for movie in user_movies_list:
    collected.extend(check_genre_list(movie))

  distinct = list(set(collected))
  return distinct, len(distinct)

def H_lu(lu):
  """Return the base-10 entropy of the genre distribution of movie list `lu`.

  For every distinct genre j, q_j is its share among all genre occurrences in
  the list; the result is the sum of -(q_j * log10(q_j)). An input with no
  genres yields 0.
  """
  distinct_topics, _ = get_unique_genre_list_by_user(lu)
  all_topics, _ = total_genre_per_list(lu)

  entropy = 0
  for topic in distinct_topics:
    # share of this topic among every topic occurrence in the list
    share = all_topics.count(topic) / len(all_topics)
    entropy += -(share * math.log10(share))

  return entropy

def Div_lu(Lu):
  """Return (number of distinct genres in `Lu` / N) * H_lu(Lu).

  `N` is a notebook-level value defined outside this cell.
  """
  _, distinct_count = get_unique_genre_list_by_user(Lu)
  entropy = H_lu(Lu)
  return distinct_count / N * entropy

# Diversity of the recommendation list of one test user.
def diversity_per_user(user):
  """Return the diversity score of the movies recommended to one test user.

  For each recommended movie the share p = (genres of that movie) / (total
  genre occurrences in the list) is formed; the score is
  -(sum of p * log10(p)) * Div_lu(list).

  The previous implementation overwrote the logarithm on every iteration and
  multiplied the *sum* of shares by the logarithm of the *last* share only;
  the dot product of shares and log-shares is now accumulated per movie.

  Args:
    user: positional row number into `test_precision` (used with `.iloc`).

  Returns:
    The diversity score, or 0.0 when the list carries no genres at all.
  """
  movie_list = test_precision['predictions'].iloc[user]

  # total number of topics in recommendation list per user
  zlu, n_zlu = total_genre_per_list(movie_list)
  if n_zlu == 0:
    # Empty list (or movies without genres): no diversity, and no division by zero.
    return 0.0

  dot_part = 0.0
  for item in movie_list:
    share = check_genre_num(item) / n_zlu
    if share > 0:
      # A movie with zero genres contributes nothing (p*log p -> 0) and
      # would otherwise make log10 raise a math domain error.
      dot_part += share * math.log10(share)

  diversity_lu = -(dot_part) * Div_lu(movie_list)

  return diversity_lu

In [None]:
class DQNAgent:
  """Q-network agent that scores every movie for a given feature-vector state.

  Actions exchanged with callers (returned by `act`, stored in `memory`) are
  movie ids taken from `movies_unique`; they are translated to network output
  positions internally through `_action_index`.
  """

  def __init__(self):
    # Replay buffer of (state, action, reward, next_state, done) tuples.
    self.memory = deque(maxlen = 1000)
    self.gamma = 0.80            # discount factor for the bootstrapped value
    self.epsilon = 0.1           # current exploration probability
    self.epsilon_min = 0.01
    self.epsilon_decay = 0.999
    self.lr = 1e-3
    self.layer_size = 1024       # width of the first hidden layer
    self.input_dims = train_data.shape[1]
    # Output position -> movie id, and the reverse lookup.
    self.action_ids = np.asarray(movies_unique)
    self.output_size = len(movies_unique)
    self._action_index = {
        movie: pos for pos, movie in enumerate(self.action_ids.tolist())
    }
    self.q_eval = self.Model(self.lr, self.input_dims, self.output_size)

  def Model(self, lr, input_dims, output_size):
    """Build and compile the Q-network (one output per movie, MSE loss, SGD)."""
    model = keras.Sequential()

    model.add(layers.Dense(self.layer_size, input_shape=(input_dims,), activation='relu'))
    model.add(layers.Dropout(0.2))

    # NOTE(review): this layer has no activation, so it is linear -- confirm that is intended.
    model.add(layers.Dense(units = 2048))
    model.add(layers.Dropout(0.2))

    model.add(layers.Dense(output_size))
    opt = keras.optimizers.SGD(learning_rate=lr)
    model.compile(loss='MSE', optimizer=opt)

    return model

  def act(self, state):
    """Pick one movie id per row of `state` with an epsilon-greedy policy.

    With probability `epsilon` the whole batch gets uniformly random movies;
    otherwise each row gets the movie with the highest predicted Q-value.
    Both branches return movie ids (previously the random branch returned raw
    output positions while the greedy branch returned movie ids).
    """
    if np.random.random() < self.epsilon:
      positions = np.random.randint(self.output_size, size = (len(state)))
    else:
      q_values = self.q_eval.predict(state)
      positions = np.argmax(q_values, axis = 1)
    return self.action_ids[positions]

  def memory_replay(self, batch_size):
    """Run one Q-learning update on a random mini-batch from the replay buffer.

    For every sampled transition the target of the taken action becomes
    `reward` for terminal transitions and
    `reward + gamma * max_a Q(next_state, a)` otherwise; the targets of all
    other actions are left at the network's current prediction. The network
    is then fitted on the whole mini-batch in a single `train_on_batch` call.

    Args:
      batch_size: requested mini-batch size; clamped to the buffer length.

    Returns:
      The exploration rate after applying one decay step (unchanged when the
      buffer is empty).
    """
    batch_size = min(batch_size, len(self.memory))
    if batch_size <= 0:
      return self.epsilon

    mini_batch = random.sample(self.memory, batch_size)
    # reshape(batch, -1) keeps the feature count generic instead of assuming 22.
    states = np.array([t[0] for t in mini_batch]).reshape(batch_size, -1)
    new_states = np.array([t[3] for t in mini_batch]).reshape(batch_size, -1)
    # Stored actions are movie ids; convert them to output positions.
    action_pos = np.array([self._action_index[t[1]] for t in mini_batch])
    rewards = np.array([t[2] for t in mini_batch], dtype=np.float64)
    terminal = np.array([bool(t[4]) for t in mini_batch])

    # Copy so that writing the targets never touches the predictor's buffer.
    q_target = np.array(self.q_eval.predict(states))
    q_next = np.asarray(self.q_eval.predict(new_states))

    # Bootstrap only from non-terminal transitions.
    bootstrap = self.gamma * q_next.max(axis=1)
    q_target[np.arange(batch_size), action_pos] = (
        rewards + np.where(terminal, 0.0, bootstrap)
    )

    self.q_eval.train_on_batch(states, q_target)

    if self.epsilon > self.epsilon_min:
      self.epsilon *= self.epsilon_decay
    return self.epsilon

# Notebook-wide agent instance; get_users_predictions and the training loop read this global.
agent = DQNAgent()

In [None]:
def get_users_predictions(test, n):
  """Return the `n` movies with the highest predicted Q-value for one user.

  Args:
    test: model input; only the first row of the prediction is used.
    n: number of movies to return.

  Returns:
    list of entries of `movies_unique`, best first. Empty when `n <= 0`
    (a plain `[-n:]` slice with n == 0 would return every movie).
  """
  if n <= 0:
    return []
  scores = agent.q_eval.predict(test)[0]
  # argsort is ascending: keep the last n positions and reverse them.
  top_positions = scores.argsort()[-n:][::-1]
  return [movies_unique[i] for i in top_positions]

In [None]:
# memory = agent.memory
# mini_batch = random.sample(memory, 256)
# replay_size = len(mini_batch)
# states = np.array([a[0] for a in mini_batch])
# new_states = np.array([a[3] for a in mini_batch])

# Q = agent.q_eval.predict(states)
# Q_new = agent.q_eval.predict(new_states)

In [None]:
epoch = 1000
batch_size = 256
loss_record = [None] * epoch

# Per-epoch history of the averaged metrics and of the rewards derived from them.
all_avg_precision = []
all_avg_novelty = []
all_avg_diversity = []
reward_precision = []
reward_novelty = []
reward_diversity = []

for e in range(epoch):
  # --- 1. Recommend to every test user with the current network -------------
  recs = []

  # NOTE(review): `i` is an index label used positionally (slice and .iloc);
  # this presumably relies on test_data having a default 0..n-1 index -- confirm.
  for i in test_data.index:
    temp = test_data[i:i+1]
    predictions = get_users_predictions(temp, len(test_precision.actual.iloc[i]))
    recs.append(predictions)

  test_precision['predictions'] = recs
  loss = []
  precision = []
  novelty = []
  diversity = []

  # --- 2. Score the recommendations ------------------------------------------
  for ul in test_data.index:
    precision.append(precision_per_user(ul))
    novelty.append(novelty_per_user(ul))
    diversity.append(diversity_per_user(ul))

  # Means are taken once over the full lists; recomputing them after every
  # user gave the same final values at quadratic cost.
  p = np.mean(precision)
  n = np.mean(novelty)
  d = np.mean(diversity)
  r_precision = p * 1000
  r_novelty = 1 - n
  r_diversity = d * 10

  all_avg_precision.append(p)
  all_avg_novelty.append(n)
  all_avg_diversity.append(d)
  reward_precision.append(r_precision)
  reward_novelty.append(r_novelty)
  reward_diversity.append(r_diversity)

  # --- 3. Collect transitions and train --------------------------------------
  # Defined up front so the progress line works even if no batch is processed.
  eps = agent.epsilon
  for i in range(0, len(test_data) - 1, batch_size):
    index = min(i + batch_size, len(test_data) - 1)
    state = np.array(test_data[i: index])
    action = agent.act(state)
    # The "next state" of row k is simply row k + 1 of test_data.
    next_state = np.array(test_data[i + 1: index + 1])

    for no, a in enumerate(action):
      if a >= 1682:
        r = 0
      else:
        # Every transition of this epoch shares the same epoch-level reward.
        r =  r_precision + r_novelty + r_diversity
      # A transition is flagged as terminal when its reward is below 1.
      agent.memory.append((state[no], a, r, next_state[no], r  < 1))
    eps = agent.memory_replay(min(batch_size, len(agent.memory)))

  print('\nepoch: %d, epsilon: %f, precision: %f, novelty: %f, diversity: %f, r_p: %f, r_n: %f, r_d: %f,' %(e ,eps, p, n, d, r_precision, r_novelty, r_diversity))


epoch: 0, epsilon: 0.099601, precision: 0.003128, novelty: 5.809673, diversity: 0.337363, r_p: 3.128314, r_n: -4.809673, r_d: 3.373635,

epoch: 1, epsilon: 0.099203, precision: 0.003128, novelty: 5.808868, diversity: 0.337093, r_p: 3.128314, r_n: -4.808868, r_d: 3.370927,

epoch: 2, epsilon: 0.098807, precision: 0.002996, novelty: 5.790252, diversity: 0.335808, r_p: 2.995758, r_n: -4.790252, r_d: 3.358080,

epoch: 3, epsilon: 0.098412, precision: 0.003128, novelty: 5.779617, diversity: 0.334858, r_p: 3.128314, r_n: -4.779617, r_d: 3.348584,

epoch: 4, epsilon: 0.098019, precision: 0.003128, novelty: 5.780428, diversity: 0.334581, r_p: 3.128314, r_n: -4.780428, r_d: 3.345807,

epoch: 5, epsilon: 0.097627, precision: 0.003128, novelty: 5.775265, diversity: 0.333192, r_p: 3.128314, r_n: -4.775265, r_d: 3.331917,

epoch: 6, epsilon: 0.097237, precision: 0.003128, novelty: 5.776166, diversity: 0.331337, r_p: 3.128314, r_n: -4.776166, r_d: 3.313368,

epoch: 7, epsilon: 0.096849, precision: 

In [None]:
# Per-user precision values left over from the last epoch that ran (the whole list is displayed).
precision

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.1111111111111111,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.1,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.2,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.1,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.1111111111111111,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [None]:
# One row per epoch: the averaged metrics next to the rewards derived from them.
save_data = pd.DataFrame(
    dict(zip(
        ['average overall precision',
         'average overall novelty',
         'average overall diversity',
         'reward precision',
         'reward novelty',
         'reward diversity'],
        [all_avg_precision,
         all_avg_novelty,
         all_avg_diversity,
         reward_precision,
         reward_novelty,
         reward_diversity],
    ))
)

In [None]:
# Colab-only: write the per-epoch history to CSV in the working directory,
# then trigger a browser download of that file.
from google.colab import files
save_data.to_csv('tf2_DQN_run5_save_data.csv') 
files.download('tf2_DQN_run5_save_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Extremes of the per-user metric lists left over from the last epoch that ran.
print("last epoch max precision:" ,max(precision))
print("last epoch min novelty:" ,min(novelty))
print("last epoch max diversity:" ,max(diversity))

last epoch max precision: 0.2222222222222222
last epoch min novelty: 4.432876304167915
last epoch max diversity: 0.23390353836981276


# **Extra**

In [None]:
def truncat_user(user_id):
    """Return the rows of `raw` whose 'userId' equals `user_id`; prints the row count."""
    user_rows = raw[raw['userId'] == user_id]
    print('Extract done...\n', len(user_rows))
    return user_rows

In [None]:
# Bucket the numeric "age" column into labelled age groups.
# NOTE(review): `income_df` is not created anywhere in the visible part of this
# notebook -- this cell looks carried over from another analysis; confirm.
age_group = []
for age in income_df["age"]:
    # The branches are tested in ascending order, so each one only needs its
    # upper bound. The last bounded bucket is "< 65": the previous "<= 65"
    # put age 65 into "55-64" instead of "65 and over".
    if age < 25:
        age_group.append("<25")
    elif age <= 34:
        age_group.append("25-34")
    elif age <= 44:
        age_group.append("35-44")
    elif age <= 54:
        age_group.append("45-54")
    elif age < 65:
        age_group.append("55-64")
    else:
        age_group.append("65 and over")
# Work on a copy so the original frame keeps its numeric "age" column.
new_income_df = income_df.copy()
new_income_df["age_group"] = age_group
del new_income_df["age"]

In [None]:
def convert(data, num_users, num_movies):
    '''Build a user-by-movie rating matrix as a list of per-user rows.

    data: 2-D array whose columns 0, 1, 2 hold user id, movie id (1-based)
        and rating.
    num_users: user ids 1..num_users are scanned.
    num_movies: length of every returned row.

    Returns a list of uint32 arrays of length num_movies, one per user that
    has at least one non-zero rating, in ascending user-id order. Users
    without ratings are skipped, so list position is not the user id.
    '''
    user_ids = data[:, 0]
    rows = []

    for user in range(1, num_users + 1):
        belongs_to_user = user_ids == user
        row = np.zeros(num_movies, dtype=np.uint32)
        # movie ids are 1-based, row positions 0-based
        row[data[belongs_to_user, 1] - 1] = data[belongs_to_user, 2]
        if row.any():
            rows.append(row)

    return rows

In [None]:
def get_dataset():
    '''Load ml-100k/ua.base and ml-100k/ua.test and turn each into a user-movie matrix.

    Returns (training_set, test_set), each being the list produced by
    convert(); both share the same user and movie counts, taken as the
    largest ids seen in either file.
    '''
    def _read_ratings(path):
        # tab-separated file without header, loaded as an unsigned-int array
        frame = pd.read_csv(path, sep='\t', header=None, engine='python', encoding='latin-1')
        return np.array(frame, dtype=np.uint32)

    train_raw = _read_ratings("ml-100k/ua.base")
    test_raw = _read_ratings("ml-100k/ua.test")

    num_users = int(max(train_raw[:, 0].max(), test_raw[:, 0].max()))
    num_movies = int(max(train_raw[:, 1].max(), test_raw[:, 1].max()))

    train_matrix = convert(train_raw, num_users, num_movies)
    test_matrix = convert(test_raw, num_users, num_movies)
    return train_matrix, test_matrix

In [None]:
# get_dataset() returns a (training_set, test_set) tuple; the bare name displays all of it.
nani= get_dataset()
nani

In [None]:
# Same loading steps as inside get_dataset(), run at top level so the array can be inspected.
training_set=pd.read_csv("ml-100k/ua.base", sep='\t', header=None, engine='python', encoding='latin-1')
training_set=np.array(training_set, dtype=np.uint32)
training_set

array([[        1,         1,         5, 874965758],
       [        1,         2,         3, 876893171],
       [        1,         3,         4, 878542960],
       ...,
       [      943,      1188,         3, 888640250],
       [      943,      1228,         3, 888640275],
       [      943,      1330,         3, 888692465]], dtype=uint32)

In [None]:
# Load the ua.test split the same way as the training split.
test_set=pd.read_csv("ml-100k/ua.test", sep='\t', header=None, engine='python', encoding='latin-1')
test_set=np.array(test_set, dtype=np.uint32)

In [None]:
# Display the loaded test array.
test_set

array([[        1,        20,         4, 887431883],
       [        1,        33,         4, 878542699],
       [        1,        61,         4, 878542420],
       ...,
       [      943,       570,         1, 888640125],
       [      943,       808,         4, 888639868],
       [      943,      1067,         2, 875501756]], dtype=uint32)

In [None]:
# Largest user id / movie id seen in either split (column 0 = user, column 1 = movie).
num_users = int(max(training_set[:, 0].max(), test_set[:, 0].max()))
num_movies = int(max(training_set[:, 1].max(), test_set[:, 1].max()))

In [None]:
# Rebinds both names from 2-D arrays to the lists returned by convert().
# Not re-runnable as is: convert() indexes its input with data[:, k], which a
# list does not support, so reload the arrays before running this cell again.
training_set=convert(training_set,num_users, num_movies)
test_set=convert(test_set,num_users, num_movies)

In [None]:
# Display the converted training rows (one uint32 array per user).
training_set

[array([5, 3, 4, ..., 0, 0, 0], dtype=uint32),
 array([4, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([4, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([4, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([3, 3, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([1, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([5, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([5, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([3, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([5, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 2,