In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

import sys
from tqdm import tqdm
sys.path.insert(0, "../src")
sys.path.insert(0, "../")
import shutil
from d2v_recommender import *
from config import config

In [2]:
# Load the training pairs and shuffle them once. Keras' `validation_split`
# holds out the *last* rows of the arrays it is given, so this shuffle is what
# makes that hold-out a random sample; the fixed seed keeps it reproducible
# across kernel restarts.
train = pd.read_csv(config.train_data_path).sample(frac=1, random_state=42)
# First two columns are the model inputs, the third column is the target.
x_train = train.iloc[:, :2].values
y_train = train.iloc[:, 2].values
train.head()


Unnamed: 0,rater,rated,m
6168461,81684,188927,0.0
14727985,32084,57368,0.0
4478599,24899,114815,0.0
9644512,104628,97280,0.0
7947499,61534,26742,0.0


In [3]:
# Largest id in each of the two input columns. Only those two columns are
# reduced (the same slice used for x_train), so the unpacking does not depend
# on how many other columns the frame carries, and IPython's `_` is left alone.
max_rater_idx, max_rated_idx = train.iloc[:, :2].max()

In [18]:
# Keras model with ONE trainable embedding table shared by both inputs:
# each of the two id inputs is looked up in the same layer, the two vectors
# are concatenated and passed through a small dense stack ending in a sigmoid.

from keras.layers import Embedding, concatenate, Dense
from keras import Model, Input
from keras.metrics import AUC
from keras.callbacks import EarlyStopping

# Both inputs index the same table, so it needs a row for the largest id of
# either column. Sizing it from max_rated_idx alone would leave any larger
# rater id without a row.
n_embeddings = int(max(max_rater_idx, max_rated_idx)) + 1

input_1 = Input(shape=(1,))
input_2 = Input(shape=(1,))
emb = Embedding(
    n_embeddings,
    config.d2v_params["embedding_size"],
    trainable=True,
    input_length=1,
)
emb_1_layer = emb(input_1)
emb_2_layer = emb(input_2)

merge = concatenate([emb_1_layer, emb_2_layer])
dense1 = Dense(50, activation='relu')(merge)
dense2 = Dense(25, activation='relu')(dense1)
dense3 = Dense(1, activation="sigmoid")(dense2)

model = Model(inputs=[input_1, input_2], outputs=dense3)
# Name the AUC metric explicitly: an unnamed AUC() receives an auto-numbered
# name (e.g. "auc_2"), which changes the keys of history.history whenever this
# cell is re-run in the same kernel.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', AUC(name="auc")])
model.summary()


Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1, 100)       22097100    input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 1, 200)       0           embedding_4[0][0]     

In [19]:
# Number of leading rows to train on; any value >= len(x_train) selects
# every row, which is what this setting does.
subset = len(x_train) + 1

# Monitor validation loss with zero patience and ask Keras to restore the
# best weights it has seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=1,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)

# One 1-D id array per model input (column 0 and column 1 of x_train).
first_ids = x_train[:subset, 0]
second_ids = x_train[:subset, 1]

history = model.fit(
    [first_ids, second_ids],
    y_train[:subset],
    validation_split=0.1,
    epochs=500,  # upper bound; the early-stopping callback may end sooner
    batch_size=128,
    callbacks=[early_stopping],
)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: smac attempted to use a functionality that requires module emcee, but it couldn't be loaded. Please install emcee and retry.
Epoch 1/500
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: smac attempted to use a functionality that requires module emcee, but it couldn't be loaded. Please install emcee and retry.
  1397/109853 [..............................] - ETA: 10:07:51 - loss: 0.4174 - accuracy: 0.8291 - auc_2: 0.7933

KeyboardInterrupt: 

In [None]:
# Write the fitted model to the location given by
# config.keras_model_no_pretraining.
model_path = config.keras_model_no_pretraining
model.save(model_path)

In [None]:
# Training vs. validation ROC AUC per epoch, with the best epoch marked.
metrics_by_epoch = history.history

# The AUC key carries the metric's (possibly auto-numbered) name, e.g. "auc"
# or "auc_2", so it is looked up rather than hard-coded.
auc_key = next(key for key in metrics_by_epoch if key.startswith("auc"))
train_auc = metrics_by_epoch[auc_key]
val_auc = metrics_by_epoch["val_" + auc_key]

# One x position per epoch that actually ran; early stopping decides how many.
epochs = range(1, len(train_auc) + 1)
# Epoch with the lowest validation loss, the quantity EarlyStopping monitors.
best_epoch = int(np.argmin(metrics_by_epoch["val_loss"])) + 1

plt.plot(epochs, val_auc, "g", epochs, train_auc, "r")
plt.legend(["Val", "Train"])
plt.axvline(x=best_epoch)
plt.xlabel("epochs")
plt.ylabel("ROC AUC")
plt.title("ROC AUC per epoch")
plt.show()


In [None]:
# Evaluation on the test set: same column layout as the training frame
# (first two columns are inputs, third is the target).
test = pd.read_csv(config.test_data_path).sample(frac=1)
x_test = test.iloc[:, :2].values
y_test = test.iloc[:, 2].values
results = model.evaluate([x_test[:, 0], x_test[:, 1]], y_test, batch_size=128)
# The model is compiled with a loss plus two metrics (accuracy and AUC), so
# `results` holds three values; label all of them.
print("test loss, test acc, test auc:", results)

In [None]:
# Score every test row; each of the two id columns feeds one model input.
test_inputs = [x_test[:, 0], x_test[:, 1]]
predictions = model.predict(test_inputs)
predictions.shape

In [None]:
from sklearn.metrics import roc_curve
#  https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/#:~:text=with%20sample%20code).-,ROC%20Curves%20and%20AUC%20in%20Python,probabilities%20for%20the%201%20class.

# Chance-level reference: the ROC of a random classifier is the diagonal, so
# it is drawn directly instead of being estimated from unseeded random scores
# (which made the figure differ from run to run).
ns_fpr, ns_tpr = np.array([0.0, 1.0]), np.array([0.0, 1.0])
# ROC of the model's test-set scores; roc_curve takes a 1-D score array, so
# the prediction array is flattened.
lr_fpr, lr_tpr, _ = roc_curve(y_test, predictions.ravel())
# plot the reference line and the model's ROC curve
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='Random Classifier')
plt.plot(lr_fpr, lr_tpr, marker='.', label='Keras model')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve (test set)')
# show the legend
plt.legend()
# show the plot
plt.show()