<a href="https://colab.research.google.com/github/AmineSdk/RecommenderSystem/blob/main/Notebook/CF_Social_Autoencoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download CellPhones -f CellPhonesRating.csv
! kaggle datasets download Aamazon-Cellphones-35k-GMF-filled-mat -f Aamazon_Cellphones_35k_GMF_filled_mat.csv

mkdir: cannot create directory ‘/root/.kaggle’: File exists
CellPhonesRating.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
Aamazon_Cellphones_35k_GMF_filled_mat.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [2]:
from sklearn import preprocessing
from scipy.sparse import csr_matrix
import tensorflow as tf 
import numpy as np
import tensorflow_hub as hub
import pandas as pd
import random
from keras import datasets, layers, models
from sklearn.model_selection import train_test_split
from progressbar import progressbar

In [3]:
def amazonPreprocess(df):
  """Clean the raw Amazon review frame.

  Keeps the first 35000 rows, drops every row containing a missing value,
  renames 'reviewerID'/'productID' to 'userID'/'itemID' and keeps only the
  first row of each (itemID, userID) pair.

  Returns a new DataFrame; the frame passed in is not modified.
  """
  column_aliases = {'reviewerID':'userID', 'productID' : 'itemID'}

  cleaned = df[:35000].dropna(how='any',axis=0)
  cleaned = cleaned.rename(columns = column_aliases)
  cleaned = cleaned.drop_duplicates(subset =['itemID', 'userID'] , keep = 'first')
  print("DATASET PREPROCESSED")

  return cleaned


In [4]:
def yelpPreprocess(df):
  """Clean a raw Yelp review frame.

  Drops rows with any missing value, removes the 'funny', 'cool',
  'review_id', 'useful' and 'date' columns, renames
  'reviewer_id'/'business_id'/'stars' to 'userID'/'itemID'/'rating' and
  keeps only the first row of each (itemID, userID) pair.

  Returns a new DataFrame; the frame passed in is not modified.
  Raises KeyError if one of the dropped columns, or 'userID'/'itemID' after
  renaming, is absent.
  """
  # NOTE(review): the user column is expected to be named 'reviewer_id' here;
  # confirm against the actual Yelp export.
  df = df.dropna(how='any',axis=0)
  df = df.drop(columns=['funny', 'cool','review_id','useful','date'])
  # The original called `df.df.rename`, which raises AttributeError.
  df = df.rename(columns = {'reviewer_id':'userID', 'business_id':'itemID','stars':'rating'})
  df = df.drop_duplicates(subset =['itemID', 'userID'] , keep = 'first')
  print("DATASET PREPROCESSED")

  return df


In [5]:
def DatasetToUserItemDataFrame(dataframe,userID,itemID,rating):
  """Pivot a long (user, item, rating) table into a dense user x item frame.

  Parameters
  ----------
  dataframe : DataFrame holding one rating per row.
  userID, itemID, rating : names of the user, item and rating columns.

  Returns
  -------
  DataFrame indexed by the distinct users and with one column per distinct
  item, both in order of first appearance. Cells hold the rating cast to int,
  0 where no rating exists. Repeated (user, item) pairs are summed by the
  sparse-matrix constructor.

  The ID columns must not contain missing values (callers drop them first).
  """
  # factorize assigns every row the integer code of its own label, so the
  # result is correct whatever the row order. The previous implementation
  # derived item codes from value_counts block sizes, which is only right when
  # rows are already grouped by item.
  user_codes, user_labels = pd.factorize(dataframe[userID])
  item_codes, item_labels = pd.factorize(dataframe[itemID])

  ratings = dataframe[rating].values.astype(int)
  user_item = csr_matrix(
      (ratings,(user_codes,item_codes)),
      shape=(len(user_labels),len(item_labels)))  #Creating sparse matrix

  df_user_item = pd.DataFrame(user_item.toarray(),index = user_labels,columns = item_labels)

  return df_user_item

def getUsersRatings(df,usersList):
  """Return a copy of df restricted to rows whose 'userID' is in usersList.

  Used to keep only the ratings of users that have friends. The input frame is
  left untouched and the surviving rows keep their original index labels.
  """
  # One vectorised membership test replaces the per-row drop loop, which
  # rescanned the frame for every rating row.
  return df[df['userID'].isin(usersList)].copy()

def genIndexColumn(df,min_index,max_index,min_column,max_column):
  """Return a copy of df's values relabelled with random integer row/column IDs.

  Row labels are drawn without replacement from range(min_index, max_index)
  and column labels from range(min_column, max_column), using the `random`
  module (rows are sampled first, then columns).
  """
  row_count = df.index.size
  column_count = df.columns.size

  row_labels = random.sample(range(min_index,max_index),row_count)
  column_labels = random.sample(range(min_column,max_column),column_count)

  return pd.DataFrame(df.to_numpy(),index = row_labels,columns = column_labels)

def generateIDs(df,index,columns,min_index,max_index,min_column,max_column):
  """Return a copy of df whose user and item labels are replaced by random ints.

  Parameters
  ----------
  df : long-format ratings frame.
  index, columns : names of the user-ID and item-ID columns.
  min_index, max_index : new user IDs are sampled without replacement from
      range(min_index, max_index).
  min_column, max_column : same for item IDs.

  Every distinct label gets exactly one new ID; missing labels stay missing.
  The input frame is not modified.

  Raises ValueError (from random.sample) when a range holds fewer values than
  there are distinct labels.
  """
  users = df[index].dropna().unique()
  items = df[columns].dropna().unique()

  # Users are sampled before items, as before, so a seeded `random` yields the
  # same assignment.
  New_User_IDs = random.sample(range(min_index,max_index),len(users))
  New_Item_IDs = random.sample(range(min_column,max_column),len(items))

  # A single lookup per column replaces the label-by-label in-place replace().
  # That loop could remap an ID twice when a freshly assigned value equalled a
  # label still waiting to be processed, and chained in-place replace is not
  # guaranteed to write back into df_train.
  df_train = df.copy()
  df_train[index] = df[index].map(dict(zip(users,New_User_IDs)))
  df_train[columns] = df[columns].map(dict(zip(items,New_Item_IDs)))
  return df_train

def processGmfData(df,index,column,min_user_id,max_user_id,min_item_id,max_item_id):
  """Prepare a raw ratings frame for GMF training.

  Renames 'reviewerID'/'productID' to 'userID'/'itemID' (in place, so the
  caller's frame is renamed too), swaps the labels in the `index` and `column`
  columns for random integer IDs via generateIDs, drops rows with missing
  values and repeated (itemID, userID) pairs, and adds the indicator columns
  'one'..'five' (1 when 'rating' equals that star value, else 0).

  Returns the processed copy.
  """
  df.rename(columns = {'reviewerID':'userID', 'productID' : 'itemID'},inplace = True)
  encoded = generateIDs(df,index,column,min_user_id,max_user_id,min_item_id,max_item_id)
  print("== PREPROCESSING DATA ...")
  encoded = encoded.dropna(how='any',axis=0)
  encoded = encoded.drop_duplicates(subset =['itemID', 'userID'] , keep = 'first')

  # One indicator column per star value, 1..5.
  for star, label in enumerate(('one','two','three','four','five'), start=1):
    encoded[label] = encoded['rating'].apply(lambda x, star=star: 1 if x==star else 0)
  print("== DATA PREPROCESSED ==")

  return encoded

In [6]:
#GMF
def getGMFmodel(num_users,num_items,SIZE_):
  """Build the uncompiled GMF rating classifier.

  Two scalar inputs ('user_ID', 'item_ID') are embedded into SIZE_-dimensional
  vectors, flattened, multiplied element-wise and fed to a 5-way softmax
  layer named 'output'.

  num_users / num_items are the embedding vocabulary sizes.
  """
  user_in = layers.Input(shape=[1], name='user_ID')
  item_in = layers.Input(shape=[1], name='item_ID')

  user_vec = layers.Embedding(num_users, SIZE_, name='user_emb_GMF')(user_in)
  item_vec = layers.Embedding(num_items, SIZE_, name='item_emb_GMF')(item_in)

  user_flat = layers.Flatten()(user_vec)
  item_flat = layers.Flatten()(item_vec)

  # Element-wise product of the two latent vectors.
  interaction = layers.Multiply()([user_flat, item_flat])
  class_probs = layers.Dense(5, activation='softmax', name='output')(interaction)

  return tf.keras.Model([user_in, item_in], class_probs)


def user_item_ID_lists(userIDs,itemIDs):
  """Enumerate every (user, item) pair as two aligned Series.

  The longer of the two ID arrays forms the inner (fast-changing) axis:

  * more users than items: for each item, all users in order;
  * otherwise: for each user, all items in order.

  Returns (user_s, item_s), each of length len(userIDs) * len(itemIDs). Both
  share an index that restarts at 0 for every inner block, matching what
  repeated appends of fresh Series produced. Values keep the dtype of the
  input arrays.
  """
  users = np.asarray(userIDs)
  items = np.asarray(itemIDs)

  # tile/repeat build the cross product in one step. Series.append, used
  # before, no longer exists in pandas 2.x and made the loop quadratic.
  if users.size > items.size:
    inner, outer = users.size, items.size
    user_values = np.tile(users, outer)
    item_values = np.repeat(items, inner)
  else:
    inner, outer = items.size, users.size
    item_values = np.tile(items, outer)
    user_values = np.repeat(users, inner)

  pair_index = np.tile(np.arange(inner), outer)
  user_s = pd.Series(user_values, index=pair_index)
  item_s = pd.Series(item_values, index=pair_index)

  return user_s,item_s


def Fill_Cf_Matrix(model,userList,itemList,userIDs,itemIDs):
  """Predict a rating for every (user, item) pair and return a user x item frame.

  Parameters
  ----------
  model : object whose predict([userList, itemList], verbose=1) returns one
      row of class scores per pair.
  userList, itemList : aligned pair lists laid out as user_item_ID_lists
      produces them (the longer ID array varies fastest).
  userIDs, itemIDs : arrays of labels for the rows and columns of the result.

  Returns
  -------
  DataFrame indexed by userIDs with columns itemIDs. Each cell is the 1-based
  position of the highest score in that pair's prediction row (first position
  on ties).

  Raises ValueError when the number of predictions is not
  len(userIDs) * len(itemIDs).
  """
  n_users = userIDs.size
  n_items = itemIDs.size

  prediction = np.asarray(model.predict([userList,itemList],verbose = 1))
  print("pred done")

  # argmax returns the first maximum, like np.where(row == row.max())[0][0].
  ratings = prediction.argmax(axis=1) + 1
  if ratings.size != n_users * n_items:
    raise ValueError(
        "expected %d predictions (%d users x %d items), got %d"
        % (n_users * n_items, n_users, n_items, ratings.size))

  if n_users > n_items:
    # Pairs are grouped per item (users vary fastest): reshape, then flip.
    matrix_arr = ratings.reshape(n_items, n_users).transpose()
  else:
    # Pairs are grouped per user (items vary fastest).
    matrix_arr = ratings.reshape(n_users, n_items)

  dataframe = pd.DataFrame(matrix_arr, index = userIDs, columns = itemIDs)

  return dataframe

def loadDataset(fileID):
  """Read the CSV registered at DATA[fileID] and return it as a DataFrame.

  Prints a confirmation line once the file has been read.
  """
  csv_path = DATA[fileID]
  loaded = pd.read_csv(csv_path)
  print("== FILE LOADED ==")
  return loaded

def saveDataframe(df,fileID):
  """Write df as CSV to PATH + 'GMF_filled_' + <file name of DATA[fileID]>.

  Only the file-name part of DATA[fileID] is used. The registry holds entries
  such as '/content/x.csv.zip', and prefixing the whole entry produced
  'GMF_filled_/content/x.csv.zip', i.e. a path inside a directory that does
  not exist. For entries without a directory part the target is unchanged.
  """
  import os

  source_name = os.path.basename(DATA[fileID])
  file_path = PATH+'GMF_filled_'+source_name
  df.to_csv(file_path)

def genGMFmat(df,df_test,modelID,fileID,nbrE,lossF,OF,emb,filter=None,cb=None):
  """Train a model on the one-hot encoded ratings in df_test and evaluate it.

  df_test must carry 'userID', 'itemID' and the indicator columns
  'one'..'five'. The rows are split 80/20, stratified on the indicator
  columns; the model is trained through trainModel with nbrE epochs, loss
  lossF, optimizer OF and embedding size emb, then evaluated on the held-out
  20%.

  df, fileID, filter and cb are accepted but not used by this function.

  Returns the trained model.
  """
  label_columns = ['one','two','three','four','five']
  pairs = df_test[['userID','itemID']]
  labels = df_test[label_columns]

  x_train ,x_test,y_train,y_test = train_test_split(pairs,labels,test_size=0.2,stratify=labels)

  # Vocabulary sizes passed on are the largest ID plus one.
  user_vocab = df_test['userID'].max() + 1
  item_vocab = df_test['itemID'].max() + 1
  train_inputs = [x_train['userID'],x_train['itemID']]
  model_trained = trainModel(modelID,nbrE,lossF,OF,train_inputs,y_train,
                             maxUserID=user_vocab,maxItemID =item_vocab,embed_size=emb)

  test_inputs = [x_test['userID'],x_test['itemID']]
  model_trained.evaluate(test_inputs,y_test)

  return model_trained

def FillSparseMat(model,df_prepro,df_og):
  """Build the full (user, item) pair lists for the IDs present in df_prepro.

  Returns the (user_s, item_s) pair produced by user_item_ID_lists for the
  distinct 'userID' and 'itemID' values. model and df_og are accepted but not
  used here.
  """
  distinct_users = df_prepro['userID'].unique()
  distinct_items = df_prepro['itemID'].unique()

  return user_item_ID_lists(distinct_users,distinct_items)


def trainModel(modelID,nbrEpochs,lossF,OF,x_train ,y_train ,mid_layer_ratio=None,nb_layers=None,maxUserID = None,maxItemID = None,embed_size = None,filter_size = None,bs = 32):
  """Build, compile and fit the model selected by modelID.

  Parameters
  ----------
  modelID : 'BLCNN', 'GMF', 'S-AutoCF' or 'SS-AutoCF'.
  nbrEpochs, bs : number of epochs and batch size given to fit().
  lossF, OF : loss and optimizer given to compile().
  x_train, y_train : training inputs and targets.
  mid_layer_ratio, nb_layers : autoencoder settings ('S-AutoCF', 'SS-AutoCF').
  maxUserID, maxItemID, embed_size : embedding settings ('GMF').
  embed_size, filter_size : settings forwarded to getBLCNNmodel ('BLCNN').

  The model is compiled with the module-level METRICS and fitted with 14% of
  the training data held out for validation.

  Returns the fitted model.
  Raises ValueError for an unrecognised modelID (previously this ended in an
  UnboundLocalError at compile time).
  """
  # NOTE(review): getBLCNNmodel is not defined in the cells visible here.
  if modelID =='BLCNN':
    model = getBLCNNmodel(embed_size,filter_size)
  elif modelID == 'GMF':
    model = getGMFmodel(maxUserID,maxItemID,embed_size)
  elif modelID == 'S-AutoCF':
    model = getAutoCFmodel(x_train,mid_layer_ratio,nb_layers)
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
  elif modelID == 'SS-AutoCF':
    model = getSS_HAEmodel(x_train,mid_layer_ratio,nb_layers)
  else:
    raise ValueError(
        "Unknown modelID %r; expected 'BLCNN', 'GMF', 'S-AutoCF' or 'SS-AutoCF'" % (modelID,))

  model.compile(optimizer = OF,
                    loss = lossF,
                    metrics= METRICS)
  model.fit(x_train,y_train,epochs = nbrEpochs,batch_size = bs,validation_split=0.14)

  return model

def getAutoCFmodel(x_train,mid_layer_ratio,nb_layers):
  """Build the uncompiled rating autoencoder.

  Parameters
  ----------
  x_train : 2-D training matrix; only its column count is used.
  mid_layer_ratio : bottleneck width as a fraction (0-1) of the input width.
  nb_layers : number of hidden layers on each side of the bottleneck; each
      step away from the bottleneck is 0.1 * input width wider.

  Returns a tf.keras.Model mapping an input named 'user_item' to a
  reconstruction of the same width.
  """
  n_features = x_train.shape[1]

  def layer_units(ratio):
    # Dense needs a positive integer width. int() truncates the fractional
    # product and the max() guards against a zero-width layer.
    return max(1, int(ratio*n_features))

  #mid_layer_ratio [0 - 1]
  layer_ratio =  mid_layer_ratio + nb_layers*0.1

  # The shape is passed as a 1-tuple; a bare int is not a valid shape everywhere.
  encoder_input = layers.Input(shape=(n_features,),name='user_item')
  flat = layers.Flatten()(encoder_input)
  hid_encoder = layers.Dense(layer_units(layer_ratio),activation="relu")(flat)
  for _ in range(nb_layers-1):
    layer_ratio -= 0.1
    hid_encoder = layers.Dense(layer_units(layer_ratio),activation="relu")(hid_encoder)

  encoder_output = layers.Dense(layer_units(mid_layer_ratio),activation="relu")(hid_encoder)
  # The decoder starts from the width of the last encoder layer and widens again.
  decoder_input = layers.Dense(layer_units(layer_ratio),activation="relu")(encoder_output)
  for _ in range(nb_layers-1):
    layer_ratio += 0.1
    decoder_input = layers.Dense(layer_units(layer_ratio),activation="relu")(decoder_input)

  decoder_output = layers.Dense(n_features,activation="relu")(decoder_input)

  autoencoder = tf.keras.Model(inputs = encoder_input, outputs = decoder_output)

  return autoencoder

def getSS_HAEmodel(x_train,mid_layer_ratio,nb_layers):
  """Build the uncompiled two-input (rating + social) autoencoder.

  Parameters
  ----------
  x_train : either a [rating_matrix, trust_matrix] pair (what trainModel
      receives from createSS_HAE) or a single 2-D matrix whose width is then
      used for both inputs.
  mid_layer_ratio : bottleneck width as a fraction (0-1) of the rating width.
  nb_layers : hidden layers on each side of the bottleneck; each step away
      from the bottleneck is 0.1 * rating width wider.

  Returns a tf.keras.Model with inputs ['user_item', 'user_user'] and outputs
  ['rating_output', 'social_output'], each as wide as its input.

  The previous body could not run: it read hid_encoder/hid_decoder before
  assigning them, called tensors as functions, sized the bottleneck with the
  raw ratio, and took output widths from undefined globals.
  """
  if isinstance(x_train,(list,tuple)):
    rating_dim = x_train[0].shape[1]
    social_dim = x_train[1].shape[1]
  else:
    rating_dim = social_dim = x_train.shape[1]

  def layer_units(ratio):
    # NOTE(review): hidden widths are scaled by the rating width, the closest
    # reading of the original `x_train.shape[1]`; confirm this is intended
    # rather than the concatenated width.
    return max(1, int(ratio*rating_dim))

  #Social_Autoencoder
  rating_input = layers.Input(shape=(rating_dim,),name='user_item')
  social_input = layers.Input(shape=(social_dim,),name='user_user')

  flat_rating = layers.Flatten()(rating_input)
  flat_social = layers.Flatten()(social_input)

  # Shared encoder: widest layer first, narrowing by 0.1 per layer.
  hidden = layers.Concatenate()([flat_rating,flat_social])
  for step in range(nb_layers,0,-1):
    hidden = layers.Dense(layer_units(mid_layer_ratio + step*0.1),activation="relu")(hidden)

  encoder_output = layers.Dense(layer_units(mid_layer_ratio),activation="relu")(hidden)

  # Shared decoder mirrors the encoder, as getAutoCFmodel does.
  hidden = encoder_output
  for step in range(1,nb_layers+1):
    hidden = layers.Dense(layer_units(mid_layer_ratio + step*0.1),activation="relu")(hidden)

  shared_decoder = layers.Dense(rating_dim+social_dim,activation="relu")(hidden)
  rating_decoded , social_decoded = tf.split(shared_decoder,[rating_dim,social_dim],1)

  rating_output = layers.Dense(rating_dim,activation="relu",name='rating_output')(rating_decoded)
  social_output = layers.Dense(social_dim,activation="relu",name='social_output')(social_decoded)

  autoencoder = tf.keras.Model(inputs = [rating_input,social_input], outputs = [rating_output,social_output])

  return autoencoder

def createAutoCF(modelID,input_fileID,target_fileID,nbrEpochs,lossF,OF,mid_layer_ratio,nb_layers,bs):
  """Load the sparse and filled rating data, train a model, return it with the test split.

  DATA[input_fileID] is read, cleaned with amazonPreprocess and pivoted into a
  user x item matrix (the network input). DATA[target_fileID] is read and
  indexed by its 'Unnamed: 0' column (the target). Both are split 80/20
  together and the training part goes to trainModel.

  Returns (trained model, x_test, y_test).
  """
  sparse_ratings = amazonPreprocess(loadDataset(input_fileID))
  filled_matrix = loadDataset(target_fileID).set_index('Unnamed: 0')
  sparse_matrix = DatasetToUserItemDataFrame(sparse_ratings,'userID','itemID','rating')

  # NOTE(review): the split pairs rows by position, so this presumably relies on
  # both matrices listing users in the same order; nothing here checks it.
  x_train,x_test,y_train,y_test = train_test_split(sparse_matrix,filled_matrix,test_size = 0.2)
  model_trained = trainModel(modelID,nbrEpochs,lossF,OF,x_train,y_train,mid_layer_ratio,nb_layers,bs = bs)

  return model_trained , x_test, y_test

def createSS_HAE(modelID,input_rating_fileID,target_rating_fileID,trust_fileID,nbrEpochs,lossF,OF,mid_layer_ratio,nb_layers):
  """Load rating and trust data, train the two-input autoencoder, return it with the test split.

  Parameters
  ----------
  modelID : forwarded to trainModel.
  input_rating_fileID, target_rating_fileID, trust_fileID : indices into DATA
      for the sparse ratings, the filled ratings and the trust matrix.
  nbrEpochs, lossF, OF, mid_layer_ratio, nb_layers : forwarded to trainModel.

  Returns (trained model, [x_rating_test, y_rating_test], x_test) where x_test
  is the held-out part of the trust matrix.

  Raises ValueError (from train_test_split) when the three matrices do not
  have the same number of rows.
  """
  sparseDf = loadDataset(input_rating_fileID)
  df_rating_filled = loadDataset(target_rating_fileID)
  df_trust_mat = loadDataset(trust_fileID)
  # NOTE(review): loadDataset does not set an index column, so these labels are
  # row positions unless the trust file is loaded differently; confirm they
  # are meant to be user IDs.
  trust_users_list = list(df_trust_mat.index)
  df_rating_filled = getUsersRatings(df_rating_filled,trust_users_list)
  df_rating_filled = orgDataframe(df_rating_filled,trust_users_list)

  #input & target for autoencoder training
  df_rating_mat = DatasetToUserItemDataFrame(sparseDf,'userID','itemID','rating')
  # The target matrix was previously read before ever being assigned
  # (NameError). It is assumed to be the pivot of the filtered filled ratings.
  # TODO confirm.
  df_rating_mat_filled = DatasetToUserItemDataFrame(df_rating_filled,'userID','itemID','rating')
  df_rating_mat_filled = orgMatDataframe(df_rating_mat_filled,trust_users_list)

  #data split
  # One joint split keeps rating, target and trust rows paired. Two separate
  # calls shuffle independently and would pair unrelated rows.
  (x_rating_train,x_rating_test,
   y_rating_train,y_rating_test,
   x_train,x_test) = train_test_split(df_rating_mat,df_rating_mat_filled,df_trust_mat)

  model_trained = trainModel(modelID,nbrEpochs,lossF,OF,[x_rating_train,x_train],[y_rating_train,x_train],mid_layer_ratio,nb_layers)

  return model_trained , [x_rating_test,y_rating_test] , x_test

def evaluateModel(model,x_test,y_test):
  """Evaluate model on (x_test, y_test) and return what model.evaluate returns.

  The result used to be discarded, so callers could only read the metrics
  from the progress output.
  """
  return model.evaluate(x_test,y_test)

def orgDataframe(df_to_org,org_list):
  """Return the rows of df_to_org grouped by user in the order given by org_list.

  For each user in org_list, the rows whose 'userID' equals that user are
  emitted in their original relative order. Users without rows contribute
  nothing; a user listed twice is emitted twice. The result always has the
  columns 'userID', 'itemID', 'rating', 'reviewText' (plus any others present
  in df_to_org).
  """
  template = pd.DataFrame(columns = ['userID','itemID','rating','reviewText'])
  # Collect the per-user slices and concatenate once; re-concatenating inside
  # the loop copies the accumulated frame on every iteration.
  per_user = [df_to_org[(df_to_org['userID'] == user )] for user in org_list]

  return pd.concat([template] + per_user)

def orgMatDataframe(df_to_org,org_list):
  """Copy the rows of the users in org_list into an otherwise empty frame.

  The result has the same index and columns as df_to_org (same order). Rows
  of users named in org_list carry their original values; every other row is
  left as missing values.

  Raises KeyError when a user in org_list is not in df_to_org's index.
  """
  row_labels = list(df_to_org.index)
  column_labels = list(df_to_org.columns)
  organized = pd.DataFrame(index = row_labels,columns = column_labels)

  for member in org_list:
    organized.loc[member] = list(df_to_org.loc[member])

  return organized

def getItemsScore(userID,modelID,fileID,trust_fileID=None):
  """Return the predicted item scores of userID from the model selected by modelID.

  modelID 0 (CF) and 1 (CF + Sentiment) use predSeCF; modelID 2 (CF + Social)
  and 3 (CF + Sentiment + Social) use predSSeCF, which also needs the trust
  matrix, supplied through the optional trust_fileID. Any other modelID
  yields an empty DataFrame.

  Raises ValueError when modelID is 2 or 3 and trust_fileID is not given.
  """
  # The result was previously stored under a differently spelled name than
  # the one returned, so callers always got the empty frame.
  if modelID in (0, 1):
    return predSeCF(userID,modelID,fileID)

  if modelID in (2, 3):
    if trust_fileID is None:
      raise ValueError("modelID %r needs a trust_fileID for the social input" % (modelID,))
    return predSSeCF(userID,modelID,fileID,trust_fileID)

  return pd.DataFrame()

def getModel(modelID):
  """Load and return the saved Keras model stored at MODEL_PATH[modelID]."""
  saved_location = MODEL_PATH[modelID]
  return tf.keras.models.load_model(saved_location)

def loadDataFrame(df):
  """Read the CSV registered at DATA[df] and return it as a DataFrame.

  Despite its name, the df argument is used as an index into DATA.
  """
  csv_path = DATA[df]
  return pd.read_csv(csv_path)

def predSeCF(userID,modelID,fileID):
  """Predict all item scores for one user with a single-input model.

  Parameters
  ----------
  userID : row label of the user in the user x item matrix.
  modelID : index into MODEL_PATH of the saved model.
  fileID : index into DATA of the user x item matrix CSV.

  Returns a DataFrame with columns 'itemID' and 'score', one row per item.

  Raises KeyError when userID is not a row label of the matrix.
  """
  model = getModel(modelID)
  # NOTE(review): the first CSV column is taken as the user index, matching how
  # createAutoCF indexes the filled matrix ('Unnamed: 0'); confirm for other files.
  df_user_item_mat = pd.read_csv(DATA[fileID], index_col=0)
  user_row = df_user_item_mat.loc[userID]
  SparseScoresVec = user_row.to_numpy()

  # predict() works on batches, so the single row gets a leading batch axis.
  PredScoresVec = model.predict(SparseScoresVec.reshape(1, -1))

  # Scores are written by position. The earlier value-based replace() mapped
  # every cell equal to an old score, including equal scores of other items.
  listItemScores = pd.DataFrame({
      'itemID': user_row.index,
      'score': np.asarray(PredScoresVec).reshape(-1),
  })

  return listItemScores

def predSSeCF(userID,modelID,ratings_fileID,trust_fileID):
  """Predict all item scores for one user with the rating + social model.

  Parameters
  ----------
  userID : row label of the user in both matrices.
  modelID : index into MODEL_PATH of the saved model.
  ratings_fileID : index into DATA of the user x item matrix CSV.
  trust_fileID : index into DATA of the user x user matrix CSV; a path
      string is also accepted and read directly.

  Returns a DataFrame with columns 'itemID' and 'score', one row per item.

  Raises KeyError when userID is not a row label of one of the matrices.
  """
  model = getModel(modelID)
  # The ratings file was previously looked up through an undefined name
  # (`fileID`), which raised NameError.
  # NOTE(review): the first CSV column is taken as the user index; confirm the
  # files are written that way.
  df_user_item_mat = pd.read_csv(DATA[ratings_fileID], index_col=0)
  trust_path = DATA[trust_fileID] if isinstance(trust_fileID, int) else trust_fileID
  df_user_user_mat = pd.read_csv(trust_path, index_col=0)

  user_row = df_user_item_mat.loc[userID]
  SparseScoresVec = user_row.to_numpy()
  SparseTrustVec = df_user_user_mat.loc[userID].to_numpy()

  # predict() works on batches, so each vector gets a leading batch axis. The
  # model returns (rating reconstruction, trust reconstruction); only the
  # first is needed.
  PredScoresVec, _ = model.predict([SparseScoresVec.reshape(1, -1),SparseTrustVec.reshape(1, -1)])

  # Scores are written by position, not by value-based replace().
  listItemScores = pd.DataFrame({
      'itemID': user_row.index,
      'score': np.asarray(PredScoresVec).reshape(-1),
  })

  return listItemScores
  

In [7]:
def datapreprocess(df_sample):
  """Replace user/item labels with random integer IDs and one-hot encode the rating.

  User IDs are drawn from range(1000, 2000 + number of distinct users) and
  item IDs from range(60000, 61000 + number of distinct items) via
  generateIDs. The indicator columns 'one'..'five' are then added (1 when
  'rating' equals that star value, else 0).

  Returns the frame produced by generateIDs with the indicator columns added.
  """
  print("== PREPROCESSING DATA ...")
  user_id_stop = 2000+df_sample['userID'].nunique()
  item_id_stop = 61000+df_sample['itemID'].nunique()
  encoded = generateIDs(df_sample,'userID','itemID',1000,user_id_stop,60000,item_id_stop)

  # One indicator column per star value, 1..5.
  for star, label in enumerate(('one','two','three','four','five'), start=1):
    encoded[label] = encoded['rating'].apply(lambda x, star=star: 1 if x==star else 0)
  print("== DATA PREPROCESSED ==")

  return encoded

In [11]:
# File registry: functions taking a fileID (loadDataset, loadDataFrame, predSeCF,
# saveDataframe) look their path up as DATA[fileID].
DATA = ['/content/CellPhonesRating.csv.zip','/content/Aamazon_Cellphones_35k_GMF_filled_mat.csv.zip','yelp_user_mat']
# Saved-model locations indexed by modelID in getModel; empty in this notebook.
MODEL_PATH = []
# Prefix prepended to the output file name in saveDataframe.
PATH = ''
rmse = tf.keras.metrics.RootMeanSquaredError()
# NOTE(review): `precision` is not referenced by any cell shown in this notebook.
precision = tf.keras.metrics.Precision()
# Metrics handed to model.compile() in trainModel.
METRICS = ['accuracy','mae',rmse]

In [None]:
# Train 'S-AutoCF': input DATA[0], target DATA[1], 20 epochs, MSE loss, 'adam'
# optimizer, mid_layer_ratio=0.4, nb_layers=1, batch size 128.
model,x_test,y_test = createAutoCF('S-AutoCF',0,1,20,tf.keras.losses.MeanSquaredError(),'adam',0.4,1,128)

In [20]:
# Evaluate the autoencoder trained above on its held-out split.
evaluateModel(model,x_test,y_test)



In [None]:
# NOTE(review): `df_sample` is not defined in any cell shown in this notebook, so
# this fails on a fresh kernel unless it is created elsewhere first.
df_test = datapreprocess(df_sample)

In [None]:
# Train 'GMF' for 5 epochs with categorical cross-entropy, 'adam' and embedding
# size 256. This rebinds `model`, replacing the autoencoder trained earlier.
model = genGMFmat(df_sample,df_test,'GMF',0,5,tf.keras.losses.CategoricalCrossentropy(),'adam',256)

In [None]:
# Build the full (user, item) pair lists from the encoded IDs in df_test.
user_s,item_s = FillSparseMat(model,df_test,df_sample)

In [None]:
# Predict a rating for every pair and label the result with the original
# (un-encoded) user and item IDs from df_sample.
df_mat_filled = Fill_Cf_Matrix(model,user_s,item_s,df_sample['userID'].unique(),df_sample['itemID'].unique())

In [None]:
# Persist the filled matrix (the index is written as the first CSV column).
df_mat_filled.to_csv("/content/Yelp_shopping_GMF_filled_mat.csv")

In [None]:
# Colab-only: send the saved CSV to the browser as a download.
# NOTE(review): this import sits outside the notebook's import cell.
from google.colab import files
files.download("/content/Yelp_shopping_GMF_filled_mat.csv")