In [46]:
import pandas as pd
import numpy as np
import os
import re
from tqdm import tqdm

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split


In [2]:
df_users = pd.read_table('/content/drive/MyDrive/Machine Learning/ml-100k/u.user', sep='|')
df_users.rename(columns = {'1':'user id', '24':'age', 'M':'gender', 'technician':'occupation', '85711':'zip code'}, inplace = True)

In [3]:
df_users.head()

Unnamed: 0,user id,age,gender,occupation,zip code
0,2,53,F,other,94043
1,3,23,M,writer,32067
2,4,24,M,technician,43537
3,5,33,F,other,15213
4,6,42,M,executive,98101


In [4]:
mat = df_users.to_numpy()
mat

array([[2, 53, 'F', 'other', '94043'],
       [3, 23, 'M', 'writer', '32067'],
       [4, 24, 'M', 'technician', '43537'],
       ...,
       [941, 20, 'M', 'student', '97229'],
       [942, 48, 'F', 'librarian', '78209'],
       [943, 22, 'M', 'student', '77841']], dtype=object)

In [5]:
occupation = mat[:,3]
occupation

array(['other', 'writer', 'technician', 'other', 'executive',
       'administrator', 'administrator', 'student', 'lawyer', 'other',
       'other', 'educator', 'scientist', 'educator', 'entertainment',
       'programmer', 'other', 'librarian', 'homemaker', 'writer',
       'writer', 'artist', 'artist', 'engineer', 'engineer', 'librarian',
       'writer', 'programmer', 'student', 'artist', 'student', 'student',
       'administrator', 'homemaker', 'student', 'student', 'other',
       'entertainment', 'scientist', 'engineer', 'administrator',
       'librarian', 'technician', 'programmer', 'marketing', 'marketing',
       'administrator', 'student', 'writer', 'educator', 'student',
       'programmer', 'executive', 'programmer', 'librarian', 'none',
       'programmer', 'educator', 'healthcare', 'engineer',
       'administrator', 'marketing', 'educator', 'educator', 'student',
       'student', 'student', 'engineer', 'engineer', 'scientist',
       'administrator', 'student', 'scien

In [6]:
mlb1 = MultiLabelBinarizer()
mlb1.fit([occupation])
mlb1.classes_



array(['administrator', 'artist', 'doctor', 'educator', 'engineer',
       'entertainment', 'executive', 'healthcare', 'homemaker', 'lawyer',
       'librarian', 'marketing', 'none', 'other', 'programmer', 'retired',
       'salesman', 'scientist', 'student', 'technician', 'writer'],
      dtype=object)

In [7]:
# df_users['occupation'] = df_users['occupation'].apply(lambda x: mlb.transform([x])[0])


In [8]:
for i in range(df_users.shape[0]):
  x = [df_users['occupation'][i]]
  y = mlb1.transform([x])
  df_users['occupation'][i] = y[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_users['occupation'][i] = y[0]


In [9]:
df_users['occupation'][0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [10]:

df_users

Unnamed: 0,user id,age,gender,occupation,zip code
0,2,53,F,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",94043
1,3,23,M,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",32067
2,4,24,M,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",43537
3,5,33,F,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",15213
4,6,42,M,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",98101
...,...,...,...,...,...
937,939,26,F,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",33319
938,940,32,M,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",02215
939,941,20,M,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",97229
940,942,48,F,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",78209


In [11]:
df_users['age'].max()

73

In [12]:
df_users['age'].min()

7

Therefore users' age ranges from 7 to 73

In [13]:
for i in range(df_users.shape[0]):
  x = df_users['age'][i]
  df_users['age'][i] = df_users['age'][i] // 10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_users['age'][i] = df_users['age'][i] // 10


In [14]:
df_users['age'].max()

7

In [15]:
age = df_users['age']

mlb2 = MultiLabelBinarizer()
mlb2.fit([age])
mlb2.classes_



array([0, 1, 2, 3, 4, 5, 6, 7])

In [16]:
x = [df_users['age'][0]]
y = mlb2.transform([x])
y[0]

array([0, 0, 0, 0, 0, 1, 0, 0])

In [17]:
df_users['age'] = df_users['age'].apply(lambda x: mlb2.transform([[x]])[0])


In [18]:
df_users['age']

0      [0, 0, 0, 0, 0, 1, 0, 0]
1      [0, 0, 1, 0, 0, 0, 0, 0]
2      [0, 0, 1, 0, 0, 0, 0, 0]
3      [0, 0, 0, 1, 0, 0, 0, 0]
4      [0, 0, 0, 0, 1, 0, 0, 0]
                 ...           
937    [0, 0, 1, 0, 0, 0, 0, 0]
938    [0, 0, 0, 1, 0, 0, 0, 0]
939    [0, 0, 1, 0, 0, 0, 0, 0]
940    [0, 0, 0, 0, 1, 0, 0, 0]
941    [0, 0, 1, 0, 0, 0, 0, 0]
Name: age, Length: 942, dtype: object

In [19]:
df_users

Unnamed: 0,user id,age,gender,occupation,zip code
0,2,"[0, 0, 0, 0, 0, 1, 0, 0]",F,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",94043
1,3,"[0, 0, 1, 0, 0, 0, 0, 0]",M,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",32067
2,4,"[0, 0, 1, 0, 0, 0, 0, 0]",M,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",43537
3,5,"[0, 0, 0, 1, 0, 0, 0, 0]",F,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",15213
4,6,"[0, 0, 0, 0, 1, 0, 0, 0]",M,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",98101
...,...,...,...,...,...
937,939,"[0, 0, 1, 0, 0, 0, 0, 0]",F,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",33319
938,940,"[0, 0, 0, 1, 0, 0, 0, 0]",M,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",02215
939,941,"[0, 0, 1, 0, 0, 0, 0, 0]",M,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",97229
940,942,"[0, 0, 0, 0, 1, 0, 0, 0]",F,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",78209


In [20]:
gender = df_users['gender']
gender


0      F
1      M
2      M
3      F
4      M
      ..
937    F
938    M
939    M
940    F
941    M
Name: gender, Length: 942, dtype: object

In [21]:
mlb3 = MultiLabelBinarizer()
mlb3.fit([gender])
mlb3.classes_

array(['F', 'M'], dtype=object)

In [22]:
df_users['gender'] = df_users['gender'].apply(lambda x: mlb3.transform([[x]])[0])


In [23]:
df_users

Unnamed: 0,user id,age,gender,occupation,zip code
0,2,"[0, 0, 0, 0, 0, 1, 0, 0]","[1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",94043
1,3,"[0, 0, 1, 0, 0, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",32067
2,4,"[0, 0, 1, 0, 0, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",43537
3,5,"[0, 0, 0, 1, 0, 0, 0, 0]","[1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",15213
4,6,"[0, 0, 0, 0, 1, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",98101
...,...,...,...,...,...
937,939,"[0, 0, 1, 0, 0, 0, 0, 0]","[1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",33319
938,940,"[0, 0, 0, 1, 0, 0, 0, 0]","[0, 1]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",02215
939,941,"[0, 0, 1, 0, 0, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",97229
940,942,"[0, 0, 0, 0, 1, 0, 0, 0]","[1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",78209


In [24]:
df_users

Unnamed: 0,user id,age,gender,occupation,zip code
0,2,"[0, 0, 0, 0, 0, 1, 0, 0]","[1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",94043
1,3,"[0, 0, 1, 0, 0, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",32067
2,4,"[0, 0, 1, 0, 0, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",43537
3,5,"[0, 0, 0, 1, 0, 0, 0, 0]","[1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",15213
4,6,"[0, 0, 0, 0, 1, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",98101
...,...,...,...,...,...
937,939,"[0, 0, 1, 0, 0, 0, 0, 0]","[1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",33319
938,940,"[0, 0, 0, 1, 0, 0, 0, 0]","[0, 1]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",02215
939,941,"[0, 0, 1, 0, 0, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",97229
940,942,"[0, 0, 0, 0, 1, 0, 0, 0]","[1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",78209


In [25]:
df_users

Unnamed: 0,user id,age,gender,occupation,zip code
0,2,"[0, 0, 0, 0, 0, 1, 0, 0]","[1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",94043
1,3,"[0, 0, 1, 0, 0, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",32067
2,4,"[0, 0, 1, 0, 0, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",43537
3,5,"[0, 0, 0, 1, 0, 0, 0, 0]","[1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",15213
4,6,"[0, 0, 0, 0, 1, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",98101
...,...,...,...,...,...
937,939,"[0, 0, 1, 0, 0, 0, 0, 0]","[1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",33319
938,940,"[0, 0, 0, 1, 0, 0, 0, 0]","[0, 1]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",02215
939,941,"[0, 0, 1, 0, 0, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",97229
940,942,"[0, 0, 0, 0, 1, 0, 0, 0]","[1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",78209


In [26]:
## generating feature_map
user_feature_map = {}
# for idx, row in tqdm(df_users.iterrows()):
#     user_feature_map[row["user id"]] = list(row["age"]) + list(row["gender"]) + [row["occupation"]]

for i in range(df_users.shape[0]):
  user_feature_map[df_users['user id'][i]] = list(df_users['age'][i]) + list(df_users['gender'][i]) + list(df_users['occupation'][i])

In [27]:
df_users.head()

Unnamed: 0,user id,age,gender,occupation,zip code
0,2,"[0, 0, 0, 0, 0, 1, 0, 0]","[1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",94043
1,3,"[0, 0, 1, 0, 0, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",32067
2,4,"[0, 0, 1, 0, 0, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",43537
3,5,"[0, 0, 0, 1, 0, 0, 0, 0]","[1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",15213
4,6,"[0, 0, 0, 0, 1, 0, 0, 0]","[0, 1]","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",98101


In [28]:
df_rating = pd.read_table('/content/drive/MyDrive/Machine Learning/ml-100k/u.data')
df_rating.rename(columns = {'196':'user id', '242':'movie id', '3':'rating', '881250949':'timestamp'}, inplace = True)

In [29]:
df_rating.head()

Unnamed: 0,user id,movie id,rating,timestamp
0,186,302,3,891717742
1,22,377,1,878887116
2,244,51,2,880606923
3,166,346,1,886397596
4,298,474,4,884182806


In [30]:
df_rating['user_features'] = df_rating['user id'].apply(lambda x: np.array(user_feature_map.get(x)))


In [31]:
df_rating['user_features'][0]

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0])

In [32]:
df_rating['user_features'][0].shape[0]

31

In [33]:
df_rating = df_rating.drop(df_rating[df_rating['user_features'].isnull()].index)

In [34]:
df_rating = df_rating.drop(df_rating[df_rating['user id'].isnull()].index)

In [35]:
df_rating['movie id'].isnull().sum()

0

In [36]:
df_rating['user id'].unique().sum()

445095

In [37]:
df_rating['user_features'].isnull().sum()

0

In [39]:
# count = 0
# for i in range(df_rating['user_features'].shape[0]):
#     count = min(count, df_rating['user_features'][i].shape[0])
# print(count)

In [40]:
df_rating

Unnamed: 0,user id,movie id,rating,timestamp,user_features
0,186,302,3,891717742,"[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,22,377,1,878887116,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
2,244,51,2,880606923,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
3,166,346,1,886397596,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, ..."
4,298,474,4,884182806,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
99994,880,476,3,880175444,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
99995,716,204,5,879795543,"[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ..."
99996,276,1090,1,874795795,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
99997,13,225,2,882399156,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, ..."


In [41]:
len(df_rating['user_features'][0])

31

In [42]:
import tensorflow as tf
tf.compat.v1.disable_v2_behavior()
import tensorflow.keras as keras
from tensorflow.keras.layers import Input, Embedding, Dot, Concatenate, Add, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

Instructions for updating:
non-resource variables are not supported in the long term


In [47]:
df_rating_train, df_rating_test = train_test_split(df_rating, test_size=0.1, stratify=df_rating['user id'], random_state=93)


X_train =[np.array([np.array(row) for row in df_rating_train['user_features']]),
          np.array([row for row in df_rating_train['user id']]),
          np.array([row for row in df_rating_train['movie id']])
          
          ]

y_train = df_rating_train['rating'].values



X_test =[np.array([np.array(row) for row in df_rating_test['user_features']]),
          np.array([row for row in df_rating_test['user id']]),
          np.array([row for row in df_rating_test['movie id']])
          
          ]

y_test = df_rating_test['rating'].values

In [48]:
X_train[0][2]

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0])

In [49]:
df_rating_train.head()

Unnamed: 0,user id,movie id,rating,timestamp,user_features
83985,545,233,4,879899380,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
60588,405,1073,1,885548578,"[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
74594,807,257,4,893084232,"[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
56121,16,15,5,877722001,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
44153,501,125,3,883348435,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."


In [50]:
y_train[0]

4

In [51]:
def create_model(n_users, user_embed_size_dot, item_embed_size_concat, n_items, item_embed_size, user_feature_len, regularization=1e-4):
  user_features = Input(shape=(user_feature_len, ), name="user_features")
  user_inp = Input(shape=(1, ), dtype='int32', name="user_embed")
  user_embed = Embedding(n_users,
                         user_embed_size_dot,
                         name='user_embed_mat',
                         embeddings_initializer='glorot_uniform',
                         embeddings_regularizer=keras.regularizers.l2(regularization))(user_inp)

  user_embed_bias = Embedding(n_users, 
                              1,
                              name='user_embed_bias_mat',
                              embeddings_initializer='glorot_uniform')(user_inp)
  

  # item embeddings

  item_inp = Input(shape=(1, ), dtype='int32', name='item_embed')
  item_embed = Embedding(n_items,
                         item_embed_size,
                         name='item_embed_mat',
                         embeddings_initializer='glorot_uniform',
                         embeddings_regularizer=keras.regularizers.l2(regularization))(item_inp)
  
  item_embed_bias = Embedding(n_items,
                              1,
                              name='item_embed_bias_mat',
                              embeddings_initializer='glorot_uniform')(item_inp)

  
  user_item_dot = Dot(axes=2, name='user_item_dot')([user_embed, item_embed])
  user_item_dot = Add()([user_item_dot, user_embed_bias, item_embed_bias])




  hidden1 = Dense(8, activation='relu')(user_features)
  hidden1 = BatchNormalization()(hidden1)
  hidden1 = Dropout(0.2)(hidden1)


  output = Dense(1, activation='relu')(hidden1)

  model =Model([user_features, user_inp, item_inp], output)
  
  return model



In [52]:
N_USERS = df_rating['user id'].max() + 1
N_ITEMS = df_rating['movie id'].max() + 1
USER_EMBEDDING_SIZE_DOT = 20
ITEM_EMBEDDING_SIZE_CONCAT = 31
ITEM_EMBEDDING_SIZE = 20
USER_FEATURE_LEN = 31


model = create_model(N_USERS, USER_EMBEDDING_SIZE_DOT, ITEM_EMBEDDING_SIZE_CONCAT, N_ITEMS, ITEM_EMBEDDING_SIZE, USER_FEATURE_LEN)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_features (InputLayer)     [(None, 31)]         0           []                               
                                                                                                  
 dense (Dense)                  (None, 8)            256         ['user_features[0][0]']          
                                                                                                  
 batch_normalization (BatchNorm  (None, 8)           32          ['dense[0][0]']                  
 alization)                                                                                       
                                                                                                  
 dropout (Dropout)              (None, 8)            0           ['batch_normalization[0][0]']

In [53]:
model.compile(Adam(1e-3), loss="mse", metrics=["mae"])

In [54]:
# callbacks defined

# learning rate schedule
def step_decay(epoch):
    initial_lrate = 0.001
    drop = 0.5
    epochs_drop = 5
    lrate = initial_lrate * (drop**((1 + epoch)/epochs_drop))
    return lrate

lrate_scheduler = LearningRateScheduler(step_decay)
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
model_chkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

# model fitting
callbacks=[early_stop, model_chkpoint, lrate_scheduler]
model.fit(X_train, y_train, batch_size=32, epochs=50, validation_split=0.1)

Train on 80778 samples, validate on 8976 samples
Epoch 1/50

  updates = self.state_updates


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fa0b624bc70>

In [55]:
model.predict(X_test)

  updates=self.state_updates,


array([[3.5071754],
       [3.536338 ],
       [3.446598 ],
       ...,
       [3.4785929],
       [3.551259 ],
       [3.6669233]], dtype=float32)

In [56]:
df_rating_test['prediction'] = [t[0] for t in model.predict(X_test)]

In [57]:
df_rating_test.head()

Unnamed: 0,user id,movie id,rating,timestamp,user_features,prediction
77331,497,176,4,879310762,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",3.507175
11971,210,257,5,887730789,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ...",3.536338
4387,221,895,2,885081339,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",3.446598
27676,215,50,5,891436543,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",3.556072
90952,933,166,3,874854062,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",3.507175
