In [None]:
# %pip installs into the environment of the running kernel; `!pip` may hit a different interpreter.
%pip install tensorflow-text

In [3]:
# Install the Kaggle API token and download the dataset.
# -p keeps mkdir from failing when ~/.kaggle already exists (e.g. when the cell is re-run).
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d aminesedkaoui/epinions-ratings-500k

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading epinions-ratings-500k.zip to /content
  0% 0.00/3.25M [00:00<?, ?B/s]
100% 3.25M/3.25M [00:00<00:00, 86.8MB/s]


In [4]:
from scipy.sparse import csr_matrix
import tensorflow as tf 
import numpy as np
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
from tensorflow.keras import datasets, layers, models

In [6]:
# -o overwrites files from a previous run instead of stopping at an interactive prompt.
!unzip -o "/content/epinions-ratings-500k.zip" -d "/content/"

Archive:  /content/epinions-ratings-500k.zip
  inflating: /content/epinions_rating_500k.csv  


In [8]:
# Load the ratings and discard the 'Unnamed: 0' column that came with the CSV.
df = pd.read_csv('/content/epinions_rating_500k.csv').drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Item_ID,User_ID,rating
0,139431556,1312460676,5
1,139431556,204358,5
2,139431556,368725,5
3,139431556,277629,5
4,139431556,246386,5


In [None]:
# Dimensions of the ratings frame as (rows, columns).
df.shape

In [None]:
# Number of ratings per item; sort=False leaves the counts unsorted by frequency.
df["Item_ID"].value_counts(sort=False)

In [6]:
# Number of distinct users in the ratings frame.
df['User_ID'].nunique()

23982

In [9]:
def DatasetToUserItemDataFrame(dataframe,userID,itemID,rating):
  """Build a user x item rating matrix from a long-format ratings table.

  Args:
    dataframe: DataFrame with one row per rating.
    userID: Name of the column holding the user identifiers.
    itemID: Name of the column holding the item identifiers.
    rating: Name of the column holding the rating values (cast to int).

  Returns:
    A tuple ``(user_item, df_user_item)``:
      * ``user_item``: scipy CSR matrix with one row per distinct user and one
        column per distinct item, both in order of first appearance.
      * ``df_user_item``: the same matrix as a dense DataFrame whose index and
        columns carry the original user / item identifiers.
    Cells without a rating are 0; repeated (user, item) rows are summed by the
    sparse-matrix constructor.
  """
  # factorize() gives every row the position of its ID among the distinct IDs
  # (order of first appearance), so each rating lands in the right cell even
  # when the rows of one item or user are not stored contiguously.
  userKeys, userLabels = pd.factorize(dataframe[userID])
  itemKeys, itemLabels = pd.factorize(dataframe[itemID])

  # Explicit shape keeps the matrix aligned with the label arrays.
  user_item = csr_matrix(
      (dataframe[rating].values.astype(int), (userKeys, itemKeys)),
      shape=(len(userLabels), len(itemLabels)))

  # Dense copy for label-based inspection; this allocates users * items cells.
  df_user_item = pd.DataFrame(user_item.toarray(), index=userLabels, columns=itemLabels)

  return user_item,df_user_item

In [None]:
# Sparse user x item matrix plus its dense, ID-labelled DataFrame counterpart.
user_item_matrix, df_mat = DatasetToUserItemDataFrame(
    dataframe=df,
    userID='User_ID',
    itemID='Item_ID',
    rating='rating',
)

In [None]:
# Shape of the sparse matrix returned by DatasetToUserItemDataFrame.
user_item_matrix.shape

In [None]:
# Preview the first rows of the dense user-item frame.
df_mat.head()

In [20]:
# Write one (user, item) cell with a single .loc call; the chained form
# `.loc[user][item] = ...` assigns into an intermediate row and may leave df_mat unchanged.
df_mat.loc[1312460676, 139431556] = 5

In [35]:
#Train/Test data split
from sklearn.model_selection import train_test_split
# Fixed seed so that a kernel restart reproduces the same train/test rows.
X_train, X_test = train_test_split(df_mat, random_state=42)

In [None]:
# X_train comes from splitting a DataFrame, which has no .toarray();
# np.asarray yields the dense ndarray and is a no-op if the cell is run again.
X_train = np.asarray(X_train)

In [None]:
# Shape of the training split.
X_train.shape

In [None]:
# Total number of cells (rows * columns) in the training matrix.
X_train.shape[0] * X_train.shape[1]

In [None]:
# Percentage of non-zero cells in the training matrix.
np.count_nonzero(X_train) / X_train.size * 100

In [11]:
from tensorflow.keras.preprocessing.text import one_hot  # kept: a later exploratory cell still calls one_hot

# Encode the raw user / item IDs as dense integer codes 0..n-1 for the embedding layers.
# factorize() assigns one distinct code per distinct ID in a single vectorised pass.
# The previous per-ID one_hot()/replace() loop hashed the IDs, so distinct IDs could
# collide, already-encoded values could be replaced a second time, and every
# iteration rescanned the whole column.
for id_column in ('User_ID', 'Item_ID'):
    df[id_column] = pd.factorize(df[id_column])[0]
   

In [35]:
# Exploratory probe: what one_hot returns for a single ID string
# (a one-element list, per the recorded output).
one_hot('1312460566',df['User_ID'].nunique())

[267]

In [34]:
# Number of distinct user IDs currently held in df.
df['User_ID'].nunique()

23982

In [None]:
# Preview the ratings frame to inspect the current ID values.
df.head()

In [None]:
# Exploratory probe: replace() returns a new Series and the result is not assigned,
# so df itself is left unchanged by this cell.
df['User_ID'].replace(1312460676,5555)

In [None]:
# Preview the ratings frame again.
df.head()

In [13]:
#GMF
def Create_GMF(num_users,num_items,SIZE_,output_activation='sigmoid'):
  """Build a Generalized Matrix Factorization (GMF) Keras model.

  The user and item IDs are embedded separately, the two embeddings are
  multiplied element-wise, and one dense unit maps the product to a score.

  Args:
    num_users: Size of the user embedding table; must exceed the largest user index.
    num_items: Size of the item embedding table; must exceed the largest item index.
    SIZE_: Dimension of both embeddings.
    output_activation: Activation of the final dense unit. The default
      'sigmoid' bounds predictions to (0, 1); pass e.g. 'linear' or 'relu'
      when the targets are raw ratings, or rescale the targets instead.

  Returns:
    An uncompiled ``tf.keras.Model`` taking ``[user_ID, item_ID]`` and
    returning one value per (user, item) pair.
  """
  input_userID = layers.Input(shape=[1], name='user_ID')
  input_itemID = layers.Input(shape=[1], name='item_ID')

  user_emb_GMF = layers.Embedding(num_users, SIZE_, name='user_emb_GMF')(input_userID)
  item_emb_GMF = layers.Embedding(num_items, SIZE_, name='item_emb_GMF')(input_itemID)

  # Each embedding comes out as (batch, 1, SIZE_); flatten to (batch, SIZE_).
  u_GMF = layers.Flatten()(user_emb_GMF)
  i_GMF = layers.Flatten()(item_emb_GMF)

  # Element-wise product of the user and item vectors.
  dot_layer = layers.Multiply()([u_GMF, i_GMF])

  out_layer = layers.Dense(1, activation=output_activation, name='output')(dot_layer)

  GMF = tf.keras.Model([input_userID, input_itemID], out_layer)

  return GMF

def Train_GMF(model,X_train,Y_train,nbrEpochs):
  """Compile ``model`` for rating regression and fit it.

  Args:
    model: Keras model to train; it is compiled and fitted in place.
    X_train: Model inputs, forwarded unchanged to ``model.fit``.
    Y_train: Training targets, forwarded unchanged to ``model.fit``.
    nbrEpochs: Number of training epochs.

  Returns:
    The same ``model`` object after fitting.
  """
  # The loss is MAE (a regression objective), so track MSE next to it;
  # classification accuracy carries no information for continuous targets.
  model.compile(optimizer = 'adam',
                loss = 'mae',
                metrics=['mse'])
  model.fit(X_train,Y_train,epochs = nbrEpochs)
  return model

def Fill_Cf_Matrix(model,dataframe,userID,itemID):
  """Replace a zero entry of ``dataframe`` with a prediction from ``model``.

  Iterates over every (user, item) pair taken from the ``userID`` and
  ``itemID`` columns of ``dataframe`` and returns ``dataframe``.

  NOTE(review): the loop variables ``user`` and ``item`` are only passed to
  ``model.predict``; the cell that is read and written is always
  ``dataframe.loc[itemID][userID]``, i.e. it is addressed by the two label
  arguments themselves. Presumably the lookup was meant to use ``user`` and
  ``item`` -- confirm the intended layout of ``dataframe`` (long ratings
  table vs. user-item matrix) before relying on this function.
  """
  for user in dataframe[userID].unique():
    for item in dataframe[itemID].unique():
      # NOTE(review): same cell on every iteration (see docstring).
      if dataframe.loc[itemID][userID] == 0:
        # Chained indexing on the left-hand side: the assignment may go to a
        # temporary object instead of ``dataframe``.
        dataframe.loc[itemID][userID] = model.predict([user,item],verbose=1)

  return dataframe


In [16]:
# Embedding tables need max index + 1 rows; 64 is the embedding dimension.
GMF = Create_GMF(
    num_users=df['User_ID'].max() + 1,
    num_items=df['Item_ID'].max() + 1,
    SIZE_=64,
)

In [14]:
from sklearn.model_selection import train_test_split
# Stratify on the rating so both splits keep the same rating distribution;
# the fixed seed makes the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df[['User_ID','Item_ID']],
    df['rating'],
    stratify=df['rating'],
    random_state=42,
)

In [None]:
# Training targets (the 'rating' column of the training split).
y_train

In [17]:
# Fit the GMF model for 5 epochs on the (user, item) -> rating pairs.
# Train_GMF returns the model it was given, so GMF_trained and GMF are the same object.
GMF_trained = Train_GMF(
    model=GMF,
    X_train=[X_train['User_ID'], X_train['Item_ID']],
    Y_train=y_train,
    nbrEpochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
#Autoencoder

# Dense autoencoder over one row of X_train: n -> 512 -> 256 -> 512 -> n.
# NOTE(review): X_train is rebound by several cells above (user-item matrix split
# vs. the (User_ID, Item_ID) split); confirm it holds the user-item matrix here.
# `shape` must be a tuple: `(X_train.shape[1])` is just an int, hence the trailing comma.
encoder_input = layers.Input(shape=(X_train.shape[1],),name='user_item')
flat = layers.Flatten()(encoder_input)
hid_encoder = layers.Dense(512,activation="relu")(flat)
encoder_output = layers.Dense(256,activation="relu")(hid_encoder)  # bottleneck representation

decoder_input = layers.Dense(512,activation="relu")(encoder_output)
decoder_output = layers.Dense(X_train.shape[1],activation="relu")(decoder_input)

autoencoder = tf.keras.Model(inputs = encoder_input, outputs = decoder_output)

In [None]:
# Reconstruction is a regression task: optimise MAE and track MSE.
# Classification accuracy is not meaningful for continuous reconstructed values.
autoencoder.compile(optimizer = 'adam',
                    loss = 'mae',
                    metrics=['mse'])

In [None]:
# Print the autoencoder's layer-by-layer summary.
autoencoder.summary()

In [None]:
# The input doubles as the target: the network learns to reconstruct each row.
autoencoder.fit(
    x=X_train,
    y=X_train,
    epochs=5,
)