
## Concept
- sampling user-item to form pairs: 
    [U1,I3]: T
    [U2,I9]: F
    ...
- architecture:
    user-embedding->
    item-embedding-> aggregation -> T/F
    
- embedding layer: no need to one-hot processing
    
- U/I be one-hot, but sometimes can has attributes    

### architecture (pairwise model) example: 
CBOW, Rank , skip-gram ...

### advantage:
- can skip null value

### disadvantage:
- dataset become very large


### Content based:
e.g. use statistics dattribute 


In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding , Input , Activation , Dense
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import pickle ,  gc 
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error as mse

In [2]:
def load_data(fn):
    """
    user id | item id | rating | timestamp. 
    """
    with open(fn , 'r' , encoding = 'utf-8') as f:
        data = f.read()
    data=[row.split('\t') for row in data.split("\n") if row != '']
    x_u = [int(row[0]) for row in data]
    x_i = [int(row[1]) for row in data]
    x = [np.array(x_u), np.array(x_i)]
    y = np.array([float(row[2])-1 for row in data])
    return x,y

def meta(x,y,title='train'):
    user = set(x[0])
    item = set(x[1])
    y = set([int(row) for row in y])
    print(f"""
    type: {title} 
=============================
num user: {len(user)}
max userId: {max(user)}

num item: {len(item)}
max itemId: {max(item)}
min itemId: {min(item)}
    """)
    print('y set: ', y)
    return max(user) , max(item) , max(y)

In [3]:
traindir = 'MovieLens/u1.base' 
testdir ='MovieLens/u1.test'

x_train,y_train = load_data(traindir)
x_test ,y_test = load_data(testdir)

max_u , max_i, max_y = meta(x_train,  y_train)
tmp_u , tmp_i, tmp_y = meta(x_test, y_test , 'test')
max_u = max(max_u , tmp_u)
max_i = max(max_i , tmp_i)
max_y = max(max_y , tmp_y)


    type: train 
num user: 943
max userId: 943

num item: 1650
max itemId: 1682
min itemId: 1
    
y set:  {0, 1, 2, 3, 4}

    type: test 
num user: 459
max userId: 462

num item: 1410
max itemId: 1591
min itemId: 1
    
y set:  {0, 1, 2, 3, 4}


In [4]:
def get_model(max_u= max_u+1 , max_i = max_i+1 , max_y=max_y+1 ):
    input_u = Input(shape=(1,))
    input_i = Input(shape=(1,))
    
    emb_user = Embedding(max_u+1 , 128)
    emb_item = Embedding(max_i+1 , 128)
    linear = Dense( 128, activation='relu')
    linear2 = Dense( 64, activation='relu')
#     logit = Dense(max_y, activation= 'softmax')
    logit = Dense(1)
#     output  = Dense(1 )
    
    latent_u = emb_user(input_u)
    latent_i = emb_item(input_i)
    latent = tf.concat([latent_u , latent_i] , axis=-1)
    latent = linear(latent)
    latent = linear2(latent)
    output = logit(latent)
#     output
    model = Model(inputs = [input_u , input_i] , outputs = output)
    model.summary()
    return model
model = get_model()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 128)       120960      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 128)       215552      input_2[0][0]                    
______________________________________________________________________________________________

In [12]:
# checkpoint = ModelCheckpoint(
#     filepath='.',
#     save_weights_only=True,
#     monitor='val_loss',
#     mode='min',
#     save_best_only=True)
earlystop = EarlyStopping(monitor='val_loss',
                          patience = 2 , 
                          mode='auto')

In [13]:

# Model weights are saved at the end of every epoch, if it's the best seen
# so far.
model.compile(loss = 'mse',
             optimizer='adam',
             metrics=['acc'])

In [14]:
history = model.fit(x=x_train,
          y=y_train ,
          validation_data = (x_test , y_test),
          epochs = 2000 , batch_size= 32 , verbose = 2,
         callbacks=[ earlystop])

Epoch 1/2000
2500/2500 - 4s - loss: 1.0259 - acc: 0.1152 - val_loss: 0.9246 - val_acc: 0.1155
Epoch 2/2000
2500/2500 - 4s - loss: 0.8734 - acc: 0.1172 - val_loss: 0.9254 - val_acc: 0.1166
Epoch 3/2000
2500/2500 - 4s - loss: 0.8332 - acc: 0.1190 - val_loss: 0.8976 - val_acc: 0.1138
Epoch 4/2000
2500/2500 - 4s - loss: 0.7763 - acc: 0.1208 - val_loss: 0.8973 - val_acc: 0.1151
Epoch 5/2000
2500/2500 - 3s - loss: 0.7043 - acc: 0.1230 - val_loss: 0.9031 - val_acc: 0.1195
Epoch 6/2000
2500/2500 - 3s - loss: 0.6286 - acc: 0.1253 - val_loss: 0.9357 - val_acc: 0.1175


In [None]:
discretize = lambda x: np.floor(x+.5)

y_pred = model.predict(x_test)
# y_pred =np.argmax(y_pred.reshape(-1,5) , axis = -1)
y_pred = discretize(y_pred)
rmse = mse( y_test , y_pred , squared = False)
print(rmse)

In [108]:
np.floor([.5 ,3.4])

array([0., 3.])

In [None]:
plt.hist(y_train)