In [24]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

import sys
from tqdm import tqdm
sys.path.insert(0, "../src")
sys.path.insert(0, "../")
import shutil
from d2v_recommender import *
from config import config

In [2]:
recommender = D2V_Recommender()
recommender.load_rater_vec(config.rater_embeddings_path)
recommender.load_rated_vec(config.rated_embeddings_path)
print(recommender.mean_embeddings.shape, len(recommender.wv.index2word))

(135359, 1) 125474


In [14]:
train = pd.read_csv(config.train_data_path).sample(frac=1)
x_train = train.iloc[:,:2].values
y_train = train.iloc[:,2].values
train.head()


Unnamed: 0,rater,rated,m
6168461,81684,188927,0.0
14727985,32084,57368,0.0
4478599,24899,114815,0.0
9644512,104628,97280,0.0
7947499,61534,26742,0.0


In [15]:
max_rater_idx, max_rated_idx, _ = train.max()

offset_vector = np.zeros((1, config.d2v_params["embedding_size"]))
rater_embedding_matrix = np.vstack([offset_vector, np.stack(recommender.mean_embeddings.values[:,0])])

rated_id_to_emb_idx = {}
rated_embedding_matrix = np.zeros((int(max_rated_idx) + 1, config.d2v_params["embedding_size"]))
# unknown rated will have embedding of zero
for user_id_str in tqdm(recommender.wv.vocab.keys()):
    embedding_vector = recommender.wv[user_id_str]
    if embedding_vector is not None:
        user_id_int = int(user_id_str)
        rated_embedding_matrix[user_id_int] = embedding_vector

100%|██████████| 125474/125474 [00:02<00:00, 44344.49it/s]


In [16]:
 recommender.wv["10"] - rated_embedding_matrix[10], rater_embedding_matrix[10] - recommender.mean_embeddings.loc["10"].values[0]

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [36]:
# Keras model with two unmutable embedding layers
# We create them so as not to change the input data.

from keras.layers import Embedding, concatenate, Dense
from keras import Model, Input
from keras.metrics import AUC
from keras.callbacks import EarlyStopping

input_1 = Input(shape=(1,))
emb_1 = Embedding(
    int(max_rater_idx) + 1,
    config.d2v_params["embedding_size"],
    weights = [rater_embedding_matrix],
    trainable = False,
    input_length=1,
)
emb_1 = emb_1(input_1)

input_2 = Input(shape=(1,))
emb_2 = Embedding(
    int(max_rated_idx) + 1,
    config.d2v_params["embedding_size"],
    weights = [rated_embedding_matrix],
    trainable = False,
    input_length=1,
)
emb_2 = emb_2(input_2)

merge = concatenate([emb_1, emb_2])
dense1 = Dense(50, activation='relu')(merge)
dense2 = Dense(25, activation='relu')(dense1)
dense3 = Dense(1, activation="sigmoid")(dense2)

# dense = Dense


model = Model(inputs=[input_1, input_2], outputs=dense3)
model.compile(loss='binary_crossentropy', 
                optimizer='adam',
                metrics=['accuracy',AUC()])
model.summary()


Model: "functional_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 1, 100)       13536000    input_13[0][0]                   
__________________________________________________________________________________________________
embedding_13 (Embedding)        (None, 1, 100)       22097100    input_14[0][0]                   
______________________________________________________________________________________

In [40]:
subset = len(x_train)+1  # all data
# subset = 100000
early_stopping = EarlyStopping(
    monitor='val_loss', min_delta=0, patience=0, verbose=1,
    mode='auto', baseline=None, restore_best_weights=True
)
history = model.fit([x_train[:subset,0], x_train[:subset,1]], 
            y_train[:subset], 
            validation_split=0.1,
            epochs=500, 
            batch_size=128,
            callbacks=[early_stopping])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 00009: early stopping


In [38]:
plt.plot(history.history['val_auc_2'])


KeyError: 'val_auc_2'