<a href="https://colab.research.google.com/github/AmineSdk/RecommenderSystem/blob/main/CF_Social_Autoencoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow-text

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d aminesedkaoui/yelp-user-user-trust
! kaggle datasets download -d aminesedkaoui/yelp-review-ratings

In [3]:
from sklearn import preprocessing
from scipy.sparse import csr_matrix
import tensorflow as tf 
import numpy as np
import tensorflow_hub as hub
import pandas as pd
import random
from keras import datasets, layers, models
from sklearn.model_selection import train_test_split
from progressbar import progressbar

In [None]:
DATA = ['yelp_GMF_rating_mat','yelp_rating_mat','yelp_user_mat']
MODEL_PATH = []
PATH = ''
METRICS = ['accuracy','mae','rmse','precision']

In [None]:
!unzip "/content/yelp-user-user-trust.zip" -d "/content/"
!unzip "/content/yelp-review-ratings.zip" -d "/content/"

In [None]:
df_user_user = pd.read_csv('/content/yelp_user_user_trust.csv')
df_rating = pd.read_csv('/content/yelp_review_ratings.csv')
df_rating.rename(columns = {'text': 'reviewText', 'stars': 'rating', 'business_id': 'itemID', 'user_id': 'userID'}, inplace = True)
df_rating = df_rating.drop(['Unnamed: 0'],axis=1)
df_rating.head()

In [6]:
def DatasetToUserItemDataFrame(dataframe,userID,itemID,rating):
  #Setting new item IDs from string to int 
  itemKeys = [] 
  i = 0
  for item in dataframe[itemID].value_counts(sort=False):
    temp = np.full((item),i)
    itemKeys = np.append(itemKeys,temp)
    i += 1

  #Setting new user IDs from string to int
  userKeysDic = {}
  userKeys = np.zeros((dataframe[userID].size))
  i = 0
  for user in dataframe[userID].unique():
    userKeysDic[user] = i
    i += 1
  i = 0
  for user in dataframe[userID]:
    userKeys[i] = userKeysDic[user]
    i += 1

  #Converting arrays from float to int 
  userKeys = userKeys.astype(int)
  itemKeys = itemKeys.astype(int)

  
  user_item = csr_matrix((dataframe[rating].values.astype(int),(userKeys,itemKeys))) #Creating sparse matrix
  user_item_matrix = user_item.toarray() #Converting sparse matrix into array
  df_user_item = pd.DataFrame(user_item_matrix,index = dataframe[userID].unique()  ,columns = dataframe[itemID].unique() ) 

  return df_user_item

In [7]:
def getUsersRatings(df,usersList): #returns df_rating containing only users that have friends 
  temp = df.copy()
  for user in list(df['userID']):
    if user not in usersList:
      temp = temp.drop(temp.loc[temp['userID'] == user].index)
  return temp

In [None]:
def genIndexColumn(df,min_index,max_index,min_column,max_column): #generates random int index and columns for a given df
  New_User_IDs = random.sample(range(min_index,max_index),df.index.size)
  New_Item_IDs = random.sample(range(min_column,max_column),df.columns.size)

  df_new = pd.DataFrame(df.to_numpy(),index = New_User_IDs,columns = New_Item_IDs)
  return df_new

In [None]:
def generateIDs(df,index,columns,min_index,max_index,min_column,max_column):
  users = df[index].unique()
  items = df[columns].unique()
  df_train = df.copy()

  New_User_IDs = random.sample(range(min_index,max_index),df[index].nunique())
  New_Item_IDs = random.sample(range(min_column,max_column),df[columns].nunique())
  i = 0
  for d in users :
    df_train[index].replace({d : New_User_IDs[i]}, inplace=True)
    i+=1

  i = 0
  for d in items :
    df_train[columns].replace({d : New_Item_IDs[i]}, inplace=True)
    i+=1
  return df_train

In [8]:
def processGmfData(df,index,column,min_user_id,max_user_id,min_item_id,max_item_id):
  
  
  df.rename(columns = {'text': 'reviewText', 'stars': 'rating', 'business_id': 'itemID', 'user_id': 'userID'}, inplace = True)
  df = generateIDs(df,index,column,min_user_id,max_user_id,min_item_id,max_item_id)
  
  df['one'] = df['rating'].apply(lambda x: 1 if x==1 else 0)
  df['two'] = df['rating'].apply(lambda x: 1 if x==2 else 0)
  df['three'] = df['rating'].apply(lambda x: 1 if x==3 else 0)
  df['four'] = df['rating'].apply(lambda x: 1 if x==4 else 0)
  df['five'] = df['rating'].apply(lambda x: 1 if x==5 else 0)
  df['six'] = df['rating'].apply(lambda x: 1 if x==6 else 0)

  return df

In [10]:
#GMF
def Create_GMF(num_users,num_items,SIZE_):

  input_userID = layers.Input(shape=[1], name='user_ID')
  input_itemID = layers.Input(shape=[1], name='item_ID')

  user_emb_GMF = layers.Embedding(num_users, SIZE_, name='user_emb_GMF')(input_userID)
  item_emb_GMF = layers.Embedding(num_items, SIZE_, name='item_emb_GMF')(input_itemID)

  u_GMF = layers.Flatten()(user_emb_GMF)
  i_GMF = layers.Flatten()(item_emb_GMF)

  dot_layer = layers.Multiply()([u_GMF, i_GMF])

  out_layer = layers.Dense(6, activation='softmax', name='output')(dot_layer)

  GMF = tf.keras.Model([input_userID, input_itemID], out_layer)
  
  return GMF

def Train_GMF(model,X_train,Y_train,nbrEpochs):
  model.compile(optimizer = 'adam',
                    loss = tf.keras.losses.categorical_crossentropy ,
                    metrics= METRICS)
  model.fit(X_train,Y_train,epochs = nbrEpochs)
  return model

def user_item_ID_lists(userIDs,itemIDs):
  i = 0
  item_s = pd.Series()
  user_s = pd.Series()
  for user in progressbar(userIDs):
    temp = pd.Series(itemIDs)
    item_s = item_s.append(temp)
    temp = []
    temp = [user for item in itemIDs]
    temp = pd.Series(temp)
    user_s = user_s.append(temp)

  return user_s,item_s

def Fill_Cf_Matrix(model,userList,itemList,userIDs,itemIDs):

  prediction = model.predict([userList,itemList],verbose = 1)
  i = 0
  row = []
  matrix = []
  print("pred done")

  while i < userList.shape[0]:
    result = np.where(prediction[i] == np.amax(prediction[i]))[0][0] + 1
    row.append(result)
    if len(row) == itemIDs.size:
      matrix.append(row)
      row = []
       
    i += 1

  matrix_arr = np.array(matrix)
  dataframe = pd.DataFrame(matrix_arr, index = userIDs, columns = itemIDs)
  
  return dataframe

def loadDataset(fileID):
  dataset = pd.read_csv(DATA[fileID])
  return dataset

def saveDataframe(df,fileID):
  file_path = PATH+'GMF_filled_'+DATA[fileID]
  df.to_csv(file_path)

def filledGMFmat(SIZE_,fileID,nbrEpochs):

  df_og = loadDataset(fileID)
  df = processGmfData(df_og,'userID','itemID',1000,2000+df_og['userID'].nunique(),60000,61000+df_og['itemID'].nunique())
  model = Create_GMF(df['userID'].max() + 1,df['itemID'].max() + 1,SIZE_)
  x_train ,x_test,y_train,y_test = train_test_split(df[['userID','itemID']],df[['one','two','three','four','five','six']],stratify=df[['one','two','three','four','five','six']])
  model_trained = Train_GMF(model,x_train,y_train,nbrEpochs)
  model_trained.evaluate(x_test,y_test)
  if save == 1:
    user_s,item_s = user_item_ID_lists(df['userID'].unique(),df['itemID'].unique())
    df_mat_filled = Fill_Cf_Matrix(model,user_s,item_s,df_og['userID'].unique(),df_og['itemID'].unique())
    saveDataframe(df_mat_filled,fileID)
  
  return df_mat_filled

def getAutoCFmodel(x_train,mid_layer_ratio,nb_layers):
  #mid_layer_ratio [0 - 1]
  layer_ratio =  mid_layer_ratio + nb_layers*0.1

  encoder_input = layers.Input(shape=(x_train.shape[1]),name='user_item')
  flat = layers.Flatten()(encoder_input)
  for i in range(nb_layers):
    hid_encoder = layers.Dense(layer_ratio*x_train.shape[1],activation="relu")(hid_encoder)
    layer_ratio -= 0.1
  hid_encoder(flat)
  encoder_output = layers.Dense(mid_layer_ratio*x_train.shape[1],activation="relu")(hid_encoder)
  for i in range(nb_layers):
    decoder_input = layers.Dense(layer_ratio*x_train.shape[1],activation="relu")(decoder_input)
    layer_ratio += 0.1
  decoder_input(encoder_output)
  decoder_output = layers.Dense(x_train.shape[1],activation="relu")(decoder_input)

  autoencoder = tf.keras.Model(inputs = encoder_input, outputs = decoder_output)

  return autoencoder

def trainAutoCF(x_train,y_train,nbrEpochs,lossF,OF,mid_layer_ratio,nb_layers,arc):
  
  if arc == 0:
    model = getAutoCFmodel(x_train,mid_layer_ratio,nb_layers)
  else:
    model = getSS_HAEmodel(x_train,mid_layer_ratio,nb_layers)
  model.compile(optimizer = OF,
                    loss = lossF,
                    metrics= METRICS)
  model.fit(x_train,y_train,epochs = nbrEpochs)
  
  return model

def evaluateAutoCF(input_fileID):



def createAutoCF(input_fileID,target_fileID,nbrEpochs,lossF,OF,mid_layer_ratio,nb_layers):

  sparseDf = loadDataset(input_fileID)
  df_mat_filled = loadDataset(target_fileID)
  df_mat = DatasetToUserItemDataFrame(sparseDf,'userID','itemID','rating')
  x_train,x_test,y_train,y_test = train_test_split(df_mat,df_mat_filled)
  model_trained = trainAutoCF(0,x_train,y_train,nbrEpochs,lossF,OF,mid_layer_ratio,nb_layers)
  model_trained.save(PATH+'autoCFmodel')

  return model_trained

def createSS_HAE(input_rating_fileID,target_rating_fileID,trust_fileID,nbrEpochs,lossF,OF,mid_layer_ratio,nb_layers)):

  sparseDf = loadDataset(input_rating_fileID)
  df_rating_filled = loadDataset(target_rating_fileID)
  df_trust_mat = loadDataset(trust_fileID)
  trust_users_list = list(df_trust_mat.index)
  df_rating_filled = getUsersRatings(df_rating_filled,trust_users_list) 
  df_rating_filled = orgDataframe(df_rating_filled,trust_users_list)

  #input & target for autoencoder training
  df_rating_mat = DatasetToUserItemDataFrame(sparseDf,'userID','itemID','rating')
  df_rating_mat_filled = orgMatDataframe(df_rating_mat_filled,trust_users_list)

  #data split
  x_rating_train,x_rating_test,y_rating_train,y_rating_test = train_test_split(df_rating_mat,df_rating_mat_filled)
  x_train,x_test = train_test_split(df_trust_mat)

  model_trained = trainAutoCF(1,[x_rating_train,x_train],[y_rating_train,x_train],nbrEpochs,lossF,OF,mid_layer_ratio,nb_layers)
  model_trained.save(PATH+'SS_HAEmodel')

  return model_trained

def getSS_HAEmodel(x_train,mid_layer_ratio,nb_layers):
  
  #mid_layer_ratio [0 - 1]
  layer_ratio =  mid_layer_ratio + nb_layers*0.1

  #Social_Autoencoder

  rating_input = layers.Input(shape=(x_train.shape[1]),name='user_item')
  social_input = layers.Input(shape=(x_train.shape[1]),name='user_user')

  flat_rating = layers.Flatten()(rating_input)
  flat_social = layers.Flatten()(social_input)

  #dropout = layers.Dropout(.2)(flat)
  SharedLayer_encoder = layers.Concatenate()([flat_rating,flat_social])
  for i in range(nb_layers):
    hid_encoder = layers.Dense(layer_ratio*x_train.shape[1],activation="relu")(hid_encoder)
    layer_ratio -= 0.1
  hid_encoder(SharedLayer_encoder)
  encoder_output = layers.Dense(mid_layer_ratio,activation="relu")(hid_encoder)
  for i in range(nb_layers):
    hid_decoder = layers.Dense(layer_ratio*x_train.shape[1],activation="relu")(hid_decoder)
    layer_ratio += 0.1
  hid_decoder(encoder_input)
  SharedLayer_decoder =  layers.Dense(df_mat_rating.shape[1]+df_mat_trust.shape[1],activation="relu")(hid_decoder)
  rating_decoded , social_decoded = tf.split(SharedLayer_decoder,[df_mat_rating.shape[1],df_mat_trust.shape[1]],1)

  rating_output = layers.Dense(df_mat_rating.shape[1],activation="relu",name='rating_output')(rating_decoded)
  social_output = layers.Dense(df_mat_trust.shape[1],activation="relu",name='social_output')(social_decoded)

  autoencoder = tf.keras.Model(inputs = [rating_input,social_input], outputs = [rating_output,social_output])

  return autoencoder
  

In [11]:
def orgDataframe(df_to_org,org_list):

  df_org = pd.DataFrame(columns = ['userID','itemID','rating','reviewText'])
  for user in org_list:
    df_temp = df_to_org[(df_to_org['userID'] == user )]
    df_org = pd.concat([df_org,df_temp])

  return df_org

def orgMatDataframe(df_to_org,org_list):
  
  df_org = pd.DataFrame(index = list(df_to_org.index),columns = list(df_to_org.columns))
  for user in org_list:
    df_org.loc[user] = list(df_to_org.loc[user])
  
  return df_org


In [30]:
df_test = pd.DataFrame(index = ['aa','bb','cc'], columns=['aaa','zeze','gdfg'])
list(df_test.loc['aa'])


[nan, nan, nan]

In [None]:
def getItemsScore(userID,modelID,fileID):
  
  listeItemScore = pd.DataFrame()
  
  if modelID == 0:
    listItemScore = predSeCF(userID,modelID,fileID)   #CF
  elif modelID == 1:
    listItemScore = predSeCF(userID,modelID,fileID)   #CF + Sentiment
  elif modelID == 2:
    listItemScore = predSSeCF(userID,modelID,fileID)  #CF + Social
  elif modelID == 3:
    listItemScore = predSSeCF(userID,modelID,fileID)  #CF + Sentiment + Social

  return listeItemScore

In [None]:
def loadDataFrame(df):
  df_loaded = pd.read_csv(DATA[df])
  return  df_loaded

In [None]:
def getModel(modelID):
  model = tf.keras.models.load_model(MODEL_PATH[modelID])
  return model

In [None]:
def predSeCF(userID,modelID,fileID):
  
  model = getModel(modelID)
  df_user_item_mat = pd.read_csv(DATA[fileID])
  SparseScoresVec = df_user_item_mat.loc[userID].to_numpy()
  listItemScores = df_user_item_mat.loc[userID].to_frame()
  listItemScores = listItemScores.reset_index()
  listItemScores.set_axis(['itemID','score'],axis='columns',inplace=True)
  PredScoresVec = model.predict(SparseScoresVec)
  listItemScores.replace(SparseScoresVec,PredScoresVec,inplace = True)

  return listItemScores

In [None]:
def predSSeCF(userID,modelID,ratings_fileID,trust_fileID):

  model = getModel(modelID)
  df_user_item_mat = loadDataset(fileID)
  df_user_user_mat = pd.read_csv(trust_fileID)
  SparseScoresVec = df_user_item_mat.loc[userID].to_numpy()
  SparseTrustVec = df_user_user_mat.loc[userID].to_numpy()
  listItemScores = df_user_item_mat.loc[userID].to_frame()
  listItemScores = listItemScores.reset_index()
  listItemScores.set_axis(['itemID','score'],axis='columns',inplace=True)
  PredScoresVec,SparseTrustVec = model.predict([SparseScoresVec,SparseTrustVec])
  listItemScores.replace(SparseScoresVec,PredScoresVec,inplace = True)

  return listItemScores

In [12]:
df_train = df_rating[:5000]
df_train['userID'].nunique()

4843

In [13]:
df_train['itemID'].nunique()

2663

In [12]:
df_user_user.set_index(['Unnamed: 0'],inplace=True)

In [18]:
df_train_filtered = getUsersRatings(df_train,list(df_user_user.index))

In [19]:
df_train_filtered.head()

Unnamed: 0,userID,itemID,rating,reviewText
13,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,4.0,Good food--loved the gnocchi with marinara\nth...
17,j2wlzrntrbKwyOcOiB3l3w,rBdG_23USc7DletfZ11xGA,4.0,The hubby and I have been here on multiple occ...
34,3MpDvy5gEdsbZh9-p92dHg,8QnuWGVNBhzyYXGSeRdi4g,4.0,After my ROTD yesterday of a different Sweet ...
40,yobeeTUBfaTBcnk26mXNuA,hKameFsaXh9g8WQbv593UA,4.0,Food was good- atmosphere/decor is like a fish...
49,ZGjgfSvjQK886kiTzLwfLQ,EtKSTHV5Qx_Q7Aur9o4kQQ,5.0,"On a scale of one to things that are awesome, ..."


In [20]:
df_train_filtered = orgDataframe(df_train_filtered,df_user_user)

In [21]:
df_train_filtered.head()

Unnamed: 0,userID,itemID,rating,reviewText
3190,-0KrCHEsOcjJ6N4k_k1A9A,gEJ3a8-m6VB827e6GHromw,4.0,This calzone shop offers an array of calzones ...
605,-G7Zkl1wIWBBmD0KRy_sCw,IbndcMURguByburM72o3SA,5.0,"Just finished a State Street Kitchen, Grilled ..."
2450,-NbeVN5tnwdyYAvdNkKMjw,LUXRw-mr9emGL2gw4otvVA,4.0,I love driving on the Benjamin Franklin Bridge...
1647,-QmEKJ_CzZnT9biZHddfZQ,Fc_8eLlcq5yy4DzrEeqEaQ,3.0,The food is probably one of the more mediocre ...
968,-THLGnsYKu3yQAsy_tt1fw,qs9d6iGos9UO4BzhVmJ9UA,5.0,They just opened today and WOW! Aren't we luck...


In [None]:
mat = []
for user in df_user_user.index:
  mat.append(list(df_mat.loc[user]))


df_train_r = pd.DataFrame(mat,index= df_user_user.index,columns = df_mat.columns)

In [None]:
df_train = processGmfData(df_train_filtered,'userID','itemID',5000,9999,20000,30000)

In [None]:
df_train.head()

In [None]:
df_mat = DatasetToUserItemDataFrame(df_train_filtered,'userID','itemID','rating')

In [None]:
df_mat.head()

Unnamed: 0,gEJ3a8-m6VB827e6GHromw,IbndcMURguByburM72o3SA,LUXRw-mr9emGL2gw4otvVA,Fc_8eLlcq5yy4DzrEeqEaQ,qs9d6iGos9UO4BzhVmJ9UA,McQo2QJGt1BI9oPu6KLDjg,Dm8r2MmXeXHRaJyPUOQv-Q,qKswqEe67vJJSW05W7Rx-A,mm4gSCCJXuAZFWDkESMjmw,IUgQmteI-mbt_qjL2ZENKA,...,9kjDMcm9zesdk5b-MQp0iw,KuLi91b-N_0XY_3IA72RJQ,0Ga7YzSAIVZ8jrWwGE9m6Q,fBCv5Euudl9VieR870gwNg,U0zDLXmqyUwRnPrF8KCepQ,LGi-u2zVoSciM0F_2nSg8g,hC95TvOugXXRF7JEOPZHQQ,sVtPuuA57YjHAHVmN2gzpA,tz3Vj8nR3CKe3nZrZgJxVw,iapJoe3OzxeZqOGUJxnG3w
-0KrCHEsOcjJ6N4k_k1A9A,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-G7Zkl1wIWBBmD0KRy_sCw,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-NbeVN5tnwdyYAvdNkKMjw,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-QmEKJ_CzZnT9biZHddfZQ,0,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-THLGnsYKu3yQAsy_tt1fw,0,0,0,5,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df_item_scores = df_mat.loc['-0KrCHEsOcjJ6N4k_k1A9A'].to_frame()


In [None]:
arr1 = df_item_scores.to_numpy()

In [None]:
df_item_scores = df_item_scores.reset_index()
df_item_scores.set_axis(['itemID','score'],axis='columns',inplace=True)
df_item_scores.head()

Unnamed: 0,itemID,score
0,gEJ3a8-m6VB827e6GHromw,4
1,IbndcMURguByburM72o3SA,0
2,LUXRw-mr9emGL2gw4otvVA,0
3,Fc_8eLlcq5yy4DzrEeqEaQ,0
4,qs9d6iGos9UO4BzhVmJ9UA,0


In [None]:
df_item_ = df_mat.loc['-G7Zkl1wIWBBmD0KRy_sCw']
df_item_scores.replace(arr1,df_item_.to_numpy(),inplace = True)

In [None]:
df_item_scores

In [None]:
df_item_scores

Unnamed: 0_level_0,score
index,Unnamed: 1_level_1
0,4
1,0
2,0
3,0
4,0
...,...
555,0
556,0
557,0
558,0


In [None]:
GMF = Create_GMF(df_train['userID'].max() + 1,df_train['itemID'].max() + 1,64)

In [None]:
df_train_sample = df_train
X_train, X_test,y_train,y_test= train_test_split(df_train_sample[['userID','itemID']],df_train_sample[['one','two','three','four','five','six']],stratify=df_train_sample[['one','two','three','four','five','six']])
#X_train, X_test,y_train,y_test= train_test_split(df_train[['User_ID','Item_ID']],df_train['rating'],stratify=df_train['rating'])

In [None]:
GMF_trained = Train_GMF(GMF,[X_train['userID'],X_train['itemID']],y_train,5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
GMF_trained.evaluate([X_test['userID'],X_test['itemID']],y_test)



[1.4921581745147705, 0.4197530746459961]

In [None]:
New_User_IDs = df_train_filtered['userID'].unique()
New_Item_IDs = df_train_filtered['itemID'].unique()
New_Item_IDs.size * New_User_IDs.size

328160

In [None]:
user_s,item_s = user_item_ID_lists( df_train['userID'].unique(),df_train['itemID'].unique())

In [None]:
df_mat_filled = Fill_Cf_Matrix(GMF_trained,user_s,item_s,New_User_IDs,New_Item_IDs)

In [None]:
df_mat_filled.head()

In [None]:
df_mat.head()

In [None]:
df_mat_rating =df_mat_filled

In [None]:
df_mat_trust = df_user_user

In [None]:
df_mat_trust.head()

Unnamed: 0_level_0,-0KrCHEsOcjJ6N4k_k1A9A,-G7Zkl1wIWBBmD0KRy_sCw,-NbeVN5tnwdyYAvdNkKMjw,-QmEKJ_CzZnT9biZHddfZQ,-THLGnsYKu3yQAsy_tt1fw,-ZHlPAvlVdgtiu6DiCq7Yg,-hKniZN2OdshWLHYuj21jQ,-iC2-qwz19U7Xr4afUB9sg,-mXobpJ3z3X6nMriCklbog,-qoyKSF2G3PkR_7XNoJfpQ,...,zBWSORvlWDmwtcx6627qWQ,zHvS1F_6wOPSLUk6vpDEGA,zK-HYsRHsUrbhvWv6pq0Ew,zTwwciNRMedBvUS3-_8h6g,zfD3xhVNkGJs-AOOSslqtQ,zmLUS4Tqn-qzkg3ec6U9eg,zp-XjxYQPY1w8Le6GzI25Q,ztVQFPr9khc_TjsBny-3rA,zwIhFlA84tTLvSSjHc6IgA,zxuxd6Hz2tKcpgZ71dYEcw
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-0KrCHEsOcjJ6N4k_k1A9A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-G7Zkl1wIWBBmD0KRy_sCw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-NbeVN5tnwdyYAvdNkKMjw,1,1,0,1,0,0,1,1,1,0,...,1,1,1,1,1,1,1,0,1,0
-QmEKJ_CzZnT9biZHddfZQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-THLGnsYKu3yQAsy_tt1fw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df_mat_rating.head()

Unnamed: 0,gEJ3a8-m6VB827e6GHromw,IbndcMURguByburM72o3SA,LUXRw-mr9emGL2gw4otvVA,Fc_8eLlcq5yy4DzrEeqEaQ,qs9d6iGos9UO4BzhVmJ9UA,McQo2QJGt1BI9oPu6KLDjg,Dm8r2MmXeXHRaJyPUOQv-Q,qKswqEe67vJJSW05W7Rx-A,mm4gSCCJXuAZFWDkESMjmw,IUgQmteI-mbt_qjL2ZENKA,...,9kjDMcm9zesdk5b-MQp0iw,KuLi91b-N_0XY_3IA72RJQ,0Ga7YzSAIVZ8jrWwGE9m6Q,fBCv5Euudl9VieR870gwNg,U0zDLXmqyUwRnPrF8KCepQ,LGi-u2zVoSciM0F_2nSg8g,hC95TvOugXXRF7JEOPZHQQ,sVtPuuA57YjHAHVmN2gzpA,tz3Vj8nR3CKe3nZrZgJxVw,iapJoe3OzxeZqOGUJxnG3w
-0KrCHEsOcjJ6N4k_k1A9A,4,3,3,4,1,2,3,4,3,4,...,5,4,4,4,4,2,3,3,4,4
-G7Zkl1wIWBBmD0KRy_sCw,2,5,4,4,3,4,4,4,4,4,...,1,4,4,4,4,5,5,5,1,3
-NbeVN5tnwdyYAvdNkKMjw,2,5,4,4,2,5,5,4,4,4,...,5,4,3,4,4,5,2,5,1,4
-QmEKJ_CzZnT9biZHddfZQ,5,4,4,4,4,5,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
-THLGnsYKu3yQAsy_tt1fw,3,3,3,4,5,4,3,4,5,4,...,5,4,5,5,4,3,1,5,4,5


In [None]:
def to_binary(x):
  if (x > 0):
    x = 1
  return x

In [None]:
def df_to_binary(df):
  for col in df.columns:
    df[col] = df[col].apply(to_binary)
  return df

In [None]:
#Social_Autoencoder

rating_input = layers.Input(shape=(df_mat_rating.shape[1]),name='user_item')
social_input = layers.Input(shape=(df_mat_trust.shape[1]),name='user_user')

flat_rating = layers.Flatten()(rating_input)
flat_social = layers.Flatten()(social_input)

#dropout = layers.Dropout(.2)(flat)
SharedLayer_encoder = layers.Concatenate()([flat_rating,flat_social])
hid_encoder = layers.Dense(256,activation="relu")(SharedLayer_encoder)
# hid_encoder1 = layers.Dense(256,activation="relu")(hid_encoder)
# hid_encoder2 = layers.Dense(128,activation="relu")(hid_encoder1)
# hid_encoder3 = layers.Dense(10,activation="relu")(hid_encoder2)
encoder_output = layers.Dense(128,activation="relu")(hid_encoder)

hid_decoder = layers.Dense(256,activation="relu")(encoder_output)
# decoder_hidden1 = layers.Dense(256,activation="relu")(decoder_input)
# decoder_hidden2 = layers.Dense(512,activation="relu")(decoder_hidden1)
# decoder_hidden3 = layers.Dense(25,activation="relu")(decoder_hidden2)
SharedLayer_decoder =  layers.Dense(df_mat_rating.shape[1]+df_mat_trust.shape[1],activation="relu")(hid_decoder)
rating_decoded , social_decoded = tf.split(SharedLayer_decoder,[df_mat_rating.shape[1],df_mat_trust.shape[1]],1)

rating_output = layers.Dense(df_mat_rating.shape[1],activation="relu",name='rating_output')(rating_decoded)
social_output = layers.Dense(df_mat_trust.shape[1],activation="relu",name='social_output')(social_decoded)

autoencoder = tf.keras.Model(inputs = [rating_input,social_input], outputs = [rating_output,social_output])

In [None]:
def rmse (y_true,y_pred):
  y_pred = tf.cast(y_pred, tf.float32)
  y_true = tf.cast(y_true, tf.float32)
  rmse = y_true - y_pred
  rmse = tf.square(rmse)
  rmse = tf.math.reduce_mean(rmse)
  rmse = tf.math.sqrt(rmse)
  return rmse

opt = tf.keras.optimizers.Adam(learning_rate=0.00001)

In [None]:
autoencoder.compile(optimizer ='adam',
                    loss = tf.keras.losses.MeanAbsoluteError(),
                    metrics=['accuracy'])

In [None]:
autoencoder.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_item (InputLayer)         [(None, 560)]        0           []                               
                                                                                                  
 user_user (InputLayer)         [(None, 586)]        0           []                               
                                                                                                  
 flatten_10 (Flatten)           (None, 560)          0           ['user_item[0][0]']              
                                                                                                  
 flatten_11 (Flatten)           (None, 586)          0           ['user_user[0][0]']              
                                                                                            

In [None]:
X_rat_train,X_rat_test,y_rat_train,y_rat_test = train_test_split(df_mat,df_mat_rating)
X_soc_train,X_soc_test,y_soc_train,y_soc_test = train_test_split(df_mat_trust,df_mat_trust)

In [None]:
y_soc_train

In [None]:
autoencoder.fit([X_rat_train,X_soc_train],[y_rat_train,y_soc_train], epochs = 55)

In [None]:
autoencoder.evaluate([X_rat_test,X_soc_test],[y_rat_test,y_soc_test])



[0.8006726503372192,
 0.7313845157623291,
 0.06928811967372894,
 0.0,
 0.6870748400688171]

In [None]:
df_mat

In [None]:
mat_pred = autoencoder.predict(df_mat)

In [None]:
mat_pred

In [None]:
df_mat_pred = pd.DataFrame(mat_pred, index = df_mat_sample.index, columns = df_mat_sample.columns)

In [None]:
df_mat_pred.max()

189722    6.472934
122571    6.492408
100211    6.818763
101960    6.571887
105337    6.085056
            ...   
137594    5.982568
198330    7.050071
136306    6.202019
197491    7.210152
124494    5.497391
Length: 1369, dtype: float32

In [None]:
scaler = preprocessing.MinMaxScaler(feature_range=(1, 5))
d = scaler.fit_transform(df_mat_pred)
scaled_df = pd.DataFrame(d,index = df_mat_pred.index,columns = df_mat_pred.columns)
scaled_df

In [None]:
for col in df_mat_pred.columns:
 df_mat_pred[col] = df_mat_pred[col].apply(lambda x: 5 if x> 5.5 else x)

In [None]:
df_mat_pred

In [None]:
res = np.where(df_mat_pred.iloc[:,:].values == 1 )
res[0].size