In [None]:
!pip install tensorflow-text

In [2]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d aminesedkaoui/epinions-ratings-500k

Downloading epinions-ratings-500k.zip to /content
  0% 0.00/3.25M [00:00<?, ?B/s]
100% 3.25M/3.25M [00:00<00:00, 150MB/s]


In [4]:
from scipy.sparse import csr_matrix
import tensorflow as tf 
import numpy as np
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
import random
from keras import datasets, layers, models

In [4]:
!unzip "/content/epinions-ratings-500k.zip" -d "/content/"

Archive:  /content/epinions-ratings-500k.zip
  inflating: /content/epinions_rating_500k.csv  


In [None]:
df = pd.read_csv('/content/epinions_rating_500k.csv')
df = df.drop(['Unnamed: 0'],axis=1)
df.head()

In [6]:
def DatasetToUserItemDataFrame(dataframe,userID,itemID,rating):
  #Setting new item IDs from string to int 
  itemKeys = [] 
  i = 0
  for item in dataframe[itemID].value_counts(sort=False):
    temp = np.full((item),i)
    itemKeys = np.append(itemKeys,temp)
    i += 1

  #Setting new user IDs from string to int
  userKeysDic = {}
  userKeys = np.zeros((dataframe[userID].size))
  i = 0
  for user in dataframe[userID].unique():
    userKeysDic[user] = i
    i += 1
  i = 0
  for user in dataframe[userID]:
    userKeys[i] = userKeysDic[user]
    i += 1

  #Converting arrays from float to int 
  userKeys = userKeys.astype(int)
  itemKeys = itemKeys.astype(int)

  
  user_item = csr_matrix((dataframe[rating].values.astype(int),(userKeys,itemKeys))) #Creating sparse matrix
  user_item_matrix = user_item.toarray() #Converting sparse matrix into array
  df_user_item = pd.DataFrame(user_item_matrix,index = dataframe[userID].unique()  ,columns = dataframe[itemID].unique() ) 

  return user_item,df_user_item

In [7]:
def preprocess_df(df):
  users = df['User_ID'].unique()
  items = df['Item_ID'].unique()
  df_train = df.copy()

  New_User_IDs = random.sample(range(10000,50000),df['User_ID'].nunique())
  New_Item_IDs = random.sample(range(100000,200000),df['Item_ID'].nunique())
  i = 0
  for d in users :
    df_train['User_ID'].replace({d : New_User_IDs[i]}, inplace=True)
    i+=1

  i = 0
  for d in items :
    df_train['Item_ID'].replace({d : New_Item_IDs[i]}, inplace=True)
    i+=1
  
  df_train['one']=df_train['rating'].apply(lambda x: 1 if x==1 else 0)
  df_train['two']=df_train['rating'].apply(lambda x: 1 if x==2 else 0)
  df_train['three']=df_train['rating'].apply(lambda x: 1 if x==3 else 0)
  df_train['four']=df_train['rating'].apply(lambda x: 1 if x==4 else 0)
  df_train['five']=df_train['rating'].apply(lambda x: 1 if x==5 else 0)
  df_train['six']=df_train['rating'].apply(lambda x: 1 if x==6 else 0)

  return df_train

In [205]:
from numpy.core.fromnumeric import size
#GMF

def Create_GMF(num_users,num_items,SIZE_):

  input_userID = layers.Input(shape=[1], name='user_ID')
  input_itemID = layers.Input(shape=[1], name='item_ID')

  user_emb_GMF = layers.Embedding(num_users, SIZE_, name='user_emb_GMF')(input_userID)
  item_emb_GMF = layers.Embedding(num_items, SIZE_, name='item_emb_GMF')(input_itemID)

  u_GMF = layers.Flatten()(user_emb_GMF)
  i_GMF = layers.Flatten()(item_emb_GMF)

  dot_layer = layers.Multiply()([u_GMF, i_GMF])

  out_layer = layers.Dense(6, activation='softmax', name='output')(dot_layer)

  GMF = tf.keras.Model([input_userID, input_itemID], out_layer)
  
  return GMF

def Train_GMF(model,X_train,Y_train,nbrEpochs):
  model.compile(optimizer = 'adam',
                    loss = tf.keras.losses.categorical_crossentropy ,
                    metrics=['accuracy'])
  model.fit(X_train,Y_train,epochs = nbrEpochs)
  return model

def user_item_ID_lists(userIDs,itemIDs):

  item_s = pd.Series()
  user_s = pd.Series()
  for user in userIDs:
    temp = pd.Series(itemIDs)
    item_s = item_s.append(temp)
    temp = []
    temp = [user for item in itemIDs]
    temp = pd.Series(temp)
    user_s = user_s.append(temp)
  return user_s,item_s

def Fill_Cf_Matrix(model,userList,itemList,userIDs,itemIDs):

  prediction = model.predict([userList,itemList],verbose = 0)
  i = 0
  row = []
  matrix = []
  print("pred done")
  
  while i < userList.shape[0]:
    result = np.where(prediction[i] == np.amax(prediction[i]))[0][0] + 1
    row.append(result)
    if (i % 10000) == 0:
        print( str(i) +"== row ==")
    if len(row) == itemIDs.size:
      matrix.append(row)
      row = []
       
    i += 1

  matrix_arr = np.array(matrix)
  dataframe = pd.DataFrame(matrix_arr, index = userIDs, columns = itemIDs)
  # for user in userIDs:
  #   user_s = pd.Series(user)
  #   for item in itemIDs:
  #     item_s = pd.Series(item)
  #     if dataframe.loc[user][item] == 0:
  #       prediction = model.predict([user_s,item_s],verbose = 0)
  #       result = np.where(prediction[0] == np.amax(prediction[0]))[0][0] + 1
  #       dataframe.loc[user][item] = result
  return dataframe


In [8]:
df_train = preprocess_df(df)

In [9]:
GMF = Create_GMF(df_train['User_ID'].max() + 1,df_train['Item_ID'].max() + 1,64)

In [182]:
from sklearn.model_selection import train_test_split
df_train_sample = df_train[:2000]
X_train, X_test,y_train,y_test= train_test_split(df_train_sample[['User_ID','Item_ID']],df_train_sample[['one','two','three','four','five','six']],stratify=df_train_sample[['one','two','three','four','five','six']])
#X_train, X_test,y_train,y_test= train_test_split(df_train[['User_ID','Item_ID']],df_train['rating'],stratify=df_train['rating'])

In [None]:
X_train

In [131]:
GMF_trained = Train_GMF(GMF,[X_train['User_ID'],X_train['Item_ID']],y_train,5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [60]:
user_item_matrix , df_mat = DatasetToUserItemDataFrame(df_train_sample,'User_ID','Item_ID','rating')

In [15]:
GMF_trained.evaluate([X_test['User_ID'],X_test['Item_ID']],y_test)



[0.15614736080169678, 0.9539999961853027]

In [191]:
New_User_IDs = df_train_sample.User_ID.unique()
New_Item_IDs = df_train_sample.Item_ID.unique()

In [192]:
New_Item_IDs.size * New_User_IDs.size

98838

In [201]:
user_s,item_s = user_item_ID_lists(New_User_IDs,New_Item_IDs)



In [204]:
df_mat_filled = Fill_Cf_Matrix(GMF_trained,user_s,item_s,New_User_IDs,New_Item_IDs)

pred done
98838
0== row ==
1000== row ==
2000== row ==
3000== row ==
4000== row ==
5000== row ==
6000== row ==
7000== row ==
8000== row ==
9000== row ==
10000== row ==
11000== row ==
12000== row ==
13000== row ==
14000== row ==
15000== row ==
16000== row ==
17000== row ==
18000== row ==
19000== row ==
20000== row ==
21000== row ==
22000== row ==
23000== row ==
24000== row ==
25000== row ==
26000== row ==
27000== row ==
28000== row ==
29000== row ==
30000== row ==
31000== row ==
32000== row ==
33000== row ==
34000== row ==
35000== row ==
36000== row ==
37000== row ==
38000== row ==
39000== row ==
40000== row ==
41000== row ==
42000== row ==
43000== row ==
44000== row ==
45000== row ==
46000== row ==
47000== row ==
48000== row ==
49000== row ==
50000== row ==
51000== row ==
52000== row ==
53000== row ==
54000== row ==
55000== row ==
56000== row ==
57000== row ==
58000== row ==
59000== row ==
60000== row ==
61000== row ==
62000== row ==
63000== row ==
64000== row ==
65000== row ==
66000==

In [175]:
user_s_100K , item_s_100k = user_s, item_s

In [172]:
df_mat_filled.to_csv('epinions_user_item_filled_100K_matrix_100k_leaning.csv',index = False)

In [173]:
df_mat_filled.shape

(2895, 996)

In [137]:
#Autoencoder

encoder_input = layers.Input(shape=(df_mat_filled.shape[1]),name='user_item')
flat = layers.Flatten()(encoder_input)
hid_encoder = layers.Dense(900,activation="relu")(flat)
encoder_output = layers.Dense(800,activation="relu")(hid_encoder)

decoder_input = layers.Dense(900,activation="relu")(encoder_output)
decoder_output = layers.Dense(df_mat_filled.shape[1],activation="relu")(decoder_input)

autoencoder = tf.keras.Model(inputs = encoder_input, outputs = decoder_output)

In [138]:
autoencoder.compile(optimizer = 'adam',
                    loss = 'mse',
                    metrics=['accuracy'])

In [139]:
autoencoder.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 user_item (InputLayer)      [(None, 996)]             0         
                                                                 
 flatten_8 (Flatten)         (None, 996)               0         
                                                                 
 dense_24 (Dense)            (None, 900)               897300    
                                                                 
 dense_25 (Dense)            (None, 800)               720800    
                                                                 
 dense_26 (Dense)            (None, 900)               720900    
                                                                 
 dense_27 (Dense)            (None, 996)               897396    
                                                                 
Total params: 3,236,396
Trainable params: 3,236,396
Non-tra

In [140]:
X_train , X_test = train_test_split(df_mat_filled)

In [None]:
X_train

In [142]:
autoencoder.fit(X_train, X_train, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ff136ee1890>