In [1]:
!pip install tensorflow-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-text
  Downloading tensorflow_text-2.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[K     |████████████████████████████████| 4.6 MB 5.2 MB/s 
Collecting tensorflow<2.10,>=2.9.0
  Downloading tensorflow-2.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.7 MB)
[K     |████████████████████████████████| 511.7 MB 6.1 kB/s 
Collecting tensorflow-estimator<2.10.0,>=2.9.0rc0
  Downloading tensorflow_estimator-2.9.0-py2.py3-none-any.whl (438 kB)
[K     |████████████████████████████████| 438 kB 68.1 MB/s 
Collecting flatbuffers<2,>=1.12
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting keras<2.10.0,>=2.9.0rc0
  Downloading keras-2.9.0-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 55.6 MB/s 
Collecting tensorboard<2.10,>=2.9
  Downloading tensorboard-2.9.1-py3-none-any.whl (5.8 

In [2]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d aminesedkaoui/epinions-ratings-500k
! kaggle datasets download -d aminesedkaoui/epinions-user-item-matrix-75k

Downloading epinions-ratings-500k.zip to /content
  0% 0.00/3.25M [00:00<?, ?B/s]
100% 3.25M/3.25M [00:00<00:00, 266MB/s]
Downloading epinions-user-item-matrix-75k.zip to /content
  0% 0.00/3.46M [00:00<?, ?B/s]
100% 3.46M/3.46M [00:00<00:00, 246MB/s]


In [3]:
from sklearn import preprocessing

In [4]:
from scipy.sparse import csr_matrix
import tensorflow as tf 
import numpy as np
import tensorflow_hub as hub
import pandas as pd
import random
from keras import datasets, layers, models
from sklearn.model_selection import train_test_split

In [5]:
!unzip "/content/epinions-ratings-500k.zip" -d "/content/"
!unzip "/content/epinions-user-item-matrix-75k.zip" -d "/content/"

Archive:  /content/epinions-ratings-500k.zip
  inflating: /content/epinions_rating_500k.csv  
Archive:  /content/epinions-user-item-matrix-75k.zip
  inflating: /content/epinions_user_item_filled_75K_matrix_100k_leaning.csv  


In [6]:
df = pd.read_csv('/content/epinions_rating_500k.csv')
df = df.drop(['Unnamed: 0'],axis=1)
df.head()

Unnamed: 0,Item_ID,User_ID,rating
0,139431556,1312460676,5
1,139431556,204358,5
2,139431556,368725,5
3,139431556,277629,5
4,139431556,246386,5


In [7]:
def DatasetToUserItemDataFrame(dataframe,userID,itemID,rating):
  #Setting new item IDs from string to int 
  itemKeys = [] 
  i = 0
  for item in dataframe[itemID].value_counts(sort=False):
    temp = np.full((item),i)
    itemKeys = np.append(itemKeys,temp)
    i += 1

  #Setting new user IDs from string to int
  userKeysDic = {}
  userKeys = np.zeros((dataframe[userID].size))
  i = 0
  for user in dataframe[userID].unique():
    userKeysDic[user] = i
    i += 1
  i = 0
  for user in dataframe[userID]:
    userKeys[i] = userKeysDic[user]
    i += 1

  #Converting arrays from float to int 
  userKeys = userKeys.astype(int)
  itemKeys = itemKeys.astype(int)

  
  user_item = csr_matrix((dataframe[rating].values.astype(int),(userKeys,itemKeys))) #Creating sparse matrix
  user_item_matrix = user_item.toarray() #Converting sparse matrix into array
  df_user_item = pd.DataFrame(user_item_matrix,index = dataframe[userID].unique()  ,columns = dataframe[itemID].unique() ) 

  return user_item,df_user_item

In [8]:
def preprocess_df(df):
  users = df['User_ID'].unique()
  items = df['Item_ID'].unique()
  df_train = df.copy()

  New_User_IDs = random.sample(range(10000,50000),df['User_ID'].nunique())
  New_Item_IDs = random.sample(range(100000,200000),df['Item_ID'].nunique())
  i = 0
  for d in users :
    df_train['User_ID'].replace({d : New_User_IDs[i]}, inplace=True)
    i+=1

  i = 0
  for d in items :
    df_train['Item_ID'].replace({d : New_Item_IDs[i]}, inplace=True)
    i+=1
  
  df_train['one']=df_train['rating'].apply(lambda x: 1 if x==1 else 0)
  df_train['two']=df_train['rating'].apply(lambda x: 1 if x==2 else 0)
  df_train['three']=df_train['rating'].apply(lambda x: 1 if x==3 else 0)
  df_train['four']=df_train['rating'].apply(lambda x: 1 if x==4 else 0)
  df_train['five']=df_train['rating'].apply(lambda x: 1 if x==5 else 0)
  df_train['six']=df_train['rating'].apply(lambda x: 1 if x==6 else 0)

  return df_train

In [9]:
from numpy.core.fromnumeric import size
#GMF

def Create_GMF(num_users,num_items,SIZE_):

  input_userID = layers.Input(shape=[1], name='user_ID')
  input_itemID = layers.Input(shape=[1], name='item_ID')

  user_emb_GMF = layers.Embedding(num_users, SIZE_, name='user_emb_GMF')(input_userID)
  item_emb_GMF = layers.Embedding(num_items, SIZE_, name='item_emb_GMF')(input_itemID)

  u_GMF = layers.Flatten()(user_emb_GMF)
  i_GMF = layers.Flatten()(item_emb_GMF)

  dot_layer = layers.Multiply()([u_GMF, i_GMF])

  out_layer = layers.Dense(6, activation='softmax', name='output')(dot_layer)

  GMF = tf.keras.Model([input_userID, input_itemID], out_layer)
  
  return GMF

def Train_GMF(model,X_train,Y_train,nbrEpochs):
  model.compile(optimizer = 'adam',
                    loss = tf.keras.losses.categorical_crossentropy ,
                    metrics=['accuracy'])
  model.fit(X_train,Y_train,epochs = nbrEpochs)
  return model

def user_item_ID_lists(userIDs,itemIDs):
  i = 0
  item_s = pd.Series()
  user_s = pd.Series()
  for user in userIDs:
    temp = pd.Series(itemIDs)
    item_s = item_s.append(temp)
    temp = []
    temp = [user for item in itemIDs]
    temp = pd.Series(temp)
    user_s = user_s.append(temp)
    
    progress = user_s.size*100/(userIDs.size * itemIDs.size)
   
    if (progress  > 10) & (int(progress) < 25) & (i == 0):
      print("===== 10 % =====")
      i += 1
    elif (progress > 25) & (int(progress) < 35) & (i == 1):
      print("===== 25 % =====")
      i += 1
    elif (progress > 35) & (int(progress) < 50) & (i == 2):
      print("===== 35 % =====")
      i += 1
    elif (progress > 50) & (int(progress) < 65) & (i == 3):
      print("===== 50 % =====")
      i += 1
    elif (progress > 65) & (int(progress) < 75) & (i == 4):
      print("===== 65 % =====")
      i += 1
    elif (progress > 75) & (int(progress) < 80) & (i == 5):
      print("===== 75 % =====")
      i += 1
    elif (progress > 80) & (int(progress) < 90) & (i == 6):
      print("===== 80 % =====")
      i += 1
    elif (progress > 90) & (int(progress) < 95) & (i == 7):
      print("===== 90 % =====")
      i += 1
    elif (progress > 95) & (i == 8):
      print("===== 95 % =====")
      i += 1

  return user_s,item_s

def Fill_Cf_Matrix(model,userList,itemList,userIDs,itemIDs):

  prediction = model.predict([userList,itemList],verbose = 0)
  i = 0
  row = []
  matrix = []
  print("pred done")

  while i < userList.shape[0]:
    result = np.where(prediction[i] == np.amax(prediction[i]))[0][0] + 1
    row.append(result)
    if len(row) == itemIDs.size:
      matrix.append(row)
      row = []
       
    i += 1

  matrix_arr = np.array(matrix)
  dataframe = pd.DataFrame(matrix_arr, index = userIDs, columns = itemIDs)
  # for user in userIDs:
  #   user_s = pd.Series(user)
  #   for item in itemIDs:
  #     item_s = pd.Series(item)
  #     if dataframe.loc[user][item] == 0:
  #       prediction = model.predict([user_s,item_s],verbose = 0)
  #       result = np.where(prediction[0] == np.amax(prediction[0]))[0][0] + 1
  #       dataframe.loc[user][item] = result
  return dataframe


In [10]:
df_train = preprocess_df(df)

In [None]:
df_train

In [11]:
GMF = Create_GMF(df_train['User_ID'].max() + 1,df_train['Item_ID'].max() + 1,64)

In [76]:
df_train_sample = df_train[:30000]
X_train, X_test,y_train,y_test= train_test_split(df_train_sample[['User_ID','Item_ID']],df_train_sample[['one','two','three','four','five','six']],stratify=df_train_sample[['one','two','three','four','five','six']])
#X_train, X_test,y_train,y_test= train_test_split(df_train[['User_ID','Item_ID']],df_train['rating'],stratify=df_train['rating'])

In [13]:
GMF_trained = Train_GMF(GMF,[X_train['User_ID'],X_train['Item_ID']],y_train,5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [77]:
user_item_matrix , df_mat = DatasetToUserItemDataFrame(df_train_sample,'User_ID','Item_ID','rating')

In [19]:
GMF_trained.evaluate([X_test['User_ID'],X_test['Item_ID']],y_test)



[0.15684960782527924, 0.9495999813079834]

In [78]:
New_User_IDs = df_train_sample.User_ID.unique()
New_Item_IDs = df_train_sample.Item_ID.unique()
New_User_IDs

array([40599, 46276, 10123, ..., 34726, 26508, 15590])

In [79]:
New_Item_IDs.size * New_User_IDs.size

6234470

In [80]:
user_s,item_s = user_item_ID_lists(New_User_IDs,New_Item_IDs)



===== 10 % =====
===== 25 % =====
===== 35 % =====
===== 50 % =====
===== 65 % =====
===== 75 % =====
===== 80 % =====
===== 90 % =====
===== 95 % =====


In [81]:
df_mat_filled = Fill_Cf_Matrix(GMF_trained,user_s,item_s,New_User_IDs,New_Item_IDs)

pred done


In [82]:
df_mat_filled.head()

Unnamed: 0,182299,122045,112447,143480,135303,173399,162947,146806,133721,101681,...,120993,198900,104207,161301,160265,183349,122928,154546,103832,127876
40599,5,5,5,5,5,5,4,4,5,5,...,5,5,5,5,4,5,4,5,5,5
46276,5,5,5,5,5,3,5,4,5,5,...,5,5,4,5,4,3,4,5,5,5
10123,5,5,5,5,5,5,5,4,5,5,...,3,5,4,5,4,3,3,5,5,5
40336,5,5,4,5,5,5,5,5,5,5,...,5,5,3,5,5,4,3,5,5,5
29376,5,5,4,5,5,5,5,4,5,5,...,5,5,3,5,5,5,2,5,5,5


In [None]:
df_mat_filled1 = pd.read_csv('/content/epinions_user_item_filled_75K_matrix_100k_leaning.csv')

In [83]:
df_mat.head()

Unnamed: 0,182299,122045,112447,143480,135303,173399,162947,146806,133721,101681,...,120993,198900,104207,161301,160265,183349,122928,154546,103832,127876
40599,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5,0,0
46276,5,0,0,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10123,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40336,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29376,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df_mat_filled1

In [84]:
df_mat_sample =df_mat_filled

In [None]:
df_mat_sample.head()

In [None]:
def to_binary(x):
  if (x > 0):
    x = 1
  return x

In [None]:
def df_to_binary(df):
  for col in df.columns:
    df[col] = df[col].apply(to_binary)
  return df

In [None]:
df_to_binary(df_mat)
df_to_binary(df_mat_sample)

In [None]:
df_mat_sample[df_mat_sample.eq(6).any(1)]

In [94]:
#Autoencoder

encoder_input = layers.Input(shape=(df_mat_sample.shape[1]),name='user_item')
flat = layers.Flatten()(encoder_input)
#dropout = layers.Dropout(.2)(flat)
hid_encoder = layers.Dense(256,activation="relu")(flat)
# hid_encoder1 = layers.Dense(256,activation="relu")(hid_encoder)
# hid_encoder2 = layers.Dense(128,activation="relu")(hid_encoder1)
# hid_encoder3 = layers.Dense(10,activation="relu")(hid_encoder2)
encoder_output = layers.Dense(128,activation="relu")(hid_encoder)

decoder_input = layers.Dense(256,activation="relu")(encoder_output)
# decoder_hidden1 = layers.Dense(256,activation="relu")(decoder_input)
# decoder_hidden2 = layers.Dense(512,activation="relu")(decoder_hidden1)
# decoder_hidden3 = layers.Dense(25,activation="relu")(decoder_hidden2)
decoder_output = layers.Dense(df_mat_sample.shape[1],activation="relu")(decoder_input)

autoencoder = tf.keras.Model(inputs = encoder_input, outputs = decoder_output)

In [None]:
def rmse (y_true,y_pred):
  y_pred = tf.cast(y_pred, tf.float32)
  y_true = tf.cast(y_true, tf.float32)
  rmse = y_true - y_pred
  rmse = tf.square(rmse)
  rmse = tf.math.reduce_mean(rmse)
  rmse = tf.math.sqrt(rmse)
  return rmse

opt = tf.keras.optimizers.Adam(learning_rate=0.00001)

In [95]:
autoencoder.compile(optimizer ='adam',
                    loss = tf.keras.losses.MeanAbsoluteError(),
                    metrics= tf.keras.metrics.Accuracy())

In [None]:
autoencoder.summary()

In [89]:
X_train,X_test,y_train,y_test = train_test_split(df_mat,df_mat_sample)

In [None]:
X_train

In [None]:
autoencoder.fit(X_train, y_train, epochs = 2000)

In [None]:
autoencoder.evaluate(X_test,y_test)

In [None]:
df_mat

In [99]:
mat_pred = autoencoder.predict(df_mat)



In [None]:
mat_pred

In [134]:
df_mat_pred = pd.DataFrame(mat_pred, index = df_mat_sample.index, columns = df_mat_sample.columns)

In [None]:
df_mat_pred.max().max()

In [144]:
scaler = preprocessing.MinMaxScaler(feature_range=(1, 5))
d = scaler.fit_transform(df_mat_pred)
scaled_df = pd.DataFrame(d,index = df_mat_pred.index,columns = df_mat_pred.columns)
scaled_df

Unnamed: 0,182299,122045,112447,143480,135303,173399,162947,146806,133721,101681,...,120993,198900,104207,161301,160265,183349,122928,154546,103832,127876
40599,4.510973,4.432396,4.316348,2.597438,4.562206,3.664343,4.596420,2.867674,4.547574,4.298270,...,4.060149,4.562314,2.534608,4.541226,1.528912,2.489509,2.422373,4.604610,4.597714,4.511094
46276,4.516559,4.439597,4.314768,3.019688,4.572747,3.278452,4.605306,2.873655,4.549911,4.305522,...,4.218411,4.569626,2.384535,4.545772,1.557402,3.318374,2.126830,4.607321,4.601824,4.520795
10123,4.513538,4.435636,4.300281,3.133253,4.569669,3.375972,4.601048,2.870125,4.546978,4.305237,...,4.096610,4.567210,2.558227,4.544702,1.538216,3.238449,2.308135,4.605540,4.600068,4.516675
40336,4.509767,4.436713,4.287351,2.127617,4.568708,3.218948,4.603765,2.864950,4.549411,4.299565,...,2.099000,4.565569,2.501390,4.542957,1.546750,2.217687,2.120175,4.604241,4.600563,4.517136
29376,4.501371,4.418959,4.296894,4.225226,4.554612,3.324431,4.593110,2.857728,4.534137,4.290084,...,4.234924,4.552953,2.612328,4.529045,1.521717,4.332317,2.263026,4.591219,4.585756,4.505322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25878,4.500007,4.405708,4.287019,3.152949,4.553493,3.128870,4.590608,2.845322,4.534973,4.275596,...,3.680949,4.549696,2.983382,4.526473,1.532742,2.982426,2.970437,4.581278,4.585402,4.501994
49733,4.504502,4.422236,4.288691,3.573497,4.560736,4.432754,4.594699,2.862057,4.541256,4.287270,...,4.069270,4.557725,2.955292,4.538569,1.533016,3.107412,3.508357,4.591919,4.590443,4.506451
34726,4.503896,4.420152,4.273992,3.332493,4.557366,3.250015,4.590417,2.862921,4.536164,4.290868,...,4.282159,4.556949,3.146762,4.532780,1.512227,3.920702,2.771654,4.594379,4.588555,4.504986
26508,4.503896,4.420152,4.273992,3.332493,4.557366,3.250015,4.590417,2.862921,4.536164,4.290868,...,4.282159,4.556949,3.146762,4.532780,1.512227,3.920702,2.771654,4.594379,4.588555,4.504986


In [127]:
i = 0
for ind in df_mat_pred.index: 
  arr = df_mat_pred.iloc[i].to_numpy()
  count = arr[arr > 9].size
  i += 1
  if( count != 0):
    print("user "+str(i)+" has "+str(count)+" outliers")

user 31 has 1 outliers


In [141]:
for col in df_mat_pred.columns:
 df_mat_pred[col] = df_mat_pred[col].apply(lambda x: 5 if x> 5.5 else x)

In [None]:
df_mat_pred

In [None]:
res = np.where(df_mat_pred.iloc[:,:].values == 1 )
res[0].size