# Train models on the MDF dataset

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import Dense, Input, Embedding, Flatten, Concatenate, Lambda
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.model_selection import GridSearchCV, KFold
import rs_models

pd.options.display.max_columns = 1000
import warnings
warnings.filterwarnings("ignore")

In [None]:
def kfold_split(df, x, y, n_splits=6):
    """
    Yield the train/test partitions of df for every fold of a shuffled KFold.
    df: pandas DataFrame that holds both the features and the target
    x: list of feature column labels (selected with df[x])
    y: target column label (selected with df[y])
    n_splits: number of folds

    Yields one tuple (x_train, y_train, x_test, y_test) per fold.
    The shuffle uses a fixed random_state, so the folds are the same on every call.
    """
    kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)
    features, target = df[x], df[y]  # select the columns once instead of once per fold

    # KFold.split returns positional row numbers, so the rows are picked with
    # iloc; loc would look the numbers up as index labels, which is only
    # correct when df carries a default 0..n-1 index.
    for train_index, test_index in kf.split(features, target):
        x_train, x_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]
        yield x_train, y_train, x_test, y_test


def kfold_train(model, param, context_labels=None, n_splits=2):
    """
    Train and evaluate a model on n_splits folds and average the evaluation.
    model: function that returns a compiled model
    param: dictionary that contains model parameters (learning rate, epochs, batch size...);
           'epochs' and 'batch_size' are read here, the whole dictionary is passed to model
    context_labels: optional list of context column names; when given, those columns
                    are passed to the model as one extra input after user and item
    n_splits: number of folds
    returns: the output of net.evaluate averaged over the folds

    The data is read from the module-level DataFrame df, which has to exist
    before this function is called.
    """
    # None instead of a mutable [] default; copy so the caller's list is never shared
    context_labels = list(context_labels) if context_labels else []
    x_labels = ['user', 'item']
    y_labels = 'rating'

    def build_inputs(frame):
        # One input per user/item column, plus the context columns as a single
        # block. The result is a flat list, the form Keras documents for
        # models with several inputs.
        inputs = [frame[label] for label in x_labels]
        if context_labels:
            inputs.append(frame[context_labels])
        return inputs

    fold_results = []
    folds = kfold_split(df, x_labels + context_labels, y_labels, n_splits)
    for x_train, y_train, x_test, y_test in folds:
        net = model(param)  # build a new model for every fold
        net.fit(build_inputs(x_train), y_train, epochs=param['epochs'],
                batch_size=param['batch_size'], verbose=False)
        fold_results.append(
            net.evaluate(build_inputs(x_test), y_test, batch_size=512, verbose=False))

    # element-wise mean over the folds (one value per entry returned by evaluate)
    return np.mean(fold_results, axis=0)


## Load dataset

In [None]:
df = pd.read_csv('MDF_final.csv')
df = df.drop_duplicates()
df.reset_index(drop=True, inplace=True)  # restore a contiguous 0..n-1 index after rows were dropped

# labels that describe an item: every column whose name starts with "category"
item_labels = [c for c in df.columns if c.startswith("category")]
# Context labels: every column from the fourth one on that is not an item label
# (assumes the first three columns are user, item and rating - confirm against the CSV).
# Built in column order rather than through a set difference, so the order of
# the context features is the same on every run.
context_labels = [c for c in df.columns[3:] if c not in item_labels]

n_users = df.user.nunique()
n_items = df.item.nunique()
n_contexts = len(context_labels)

print(f"rating with value 1: {df[df.rating == 1]['rating'].count() * 100 / len(df)} %")
print(f"users: {n_users} \t items: {n_items} \t rating: {len(df)} \t items_features: {len(item_labels)} \t contexts_features: {n_contexts} \t ")

## NeuMF

In [None]:
# Parameters for the NeuMF run: dataset sizes plus training settings.
param = dict(
    n_users=n_users,
    n_items=n_items,
    n_contexts=n_contexts,
    learn_rate=0.001,
    batch_size=64,
    epochs=8,
)

# 10-fold evaluation of NeuMF; no context columns are passed.
# NOTE(review): the three values are the fold-averaged output of the model's
# evaluate call; Keras returns the loss first, so `std_dev` is presumably the
# mean loss rather than a standard deviation - confirm against rs_models.NeuMF.
std_dev, accuracy, auc = kfold_train(
    rs_models.NeuMF,
    param,
    n_splits=10,
)
print(f"NeuMF \t accuracy: {accuracy*100}% \t AUC: {auc}")

## ECAM NeuMF

In [None]:
# Parameters for the ECAM NeuMF run: dataset sizes plus training settings.
param = dict(
    n_users=n_users,
    n_items=n_items,
    n_contexts=n_contexts,
    learn_rate=0.001,
    batch_size=256,
    epochs=7,
)

# 10-fold evaluation of ECAM NeuMF, with the context columns as an extra input.
# NOTE(review): the three values are the fold-averaged output of the model's
# evaluate call; Keras returns the loss first, so `std_dev` is presumably the
# mean loss rather than a standard deviation - confirm against rs_models.ECAM_NeuMF.
std_dev, accuracy, auc = kfold_train(
    rs_models.ECAM_NeuMF,
    param,
    context_labels=context_labels,
    n_splits=10,
)
print(f"ECAM NeuMF \t accuracy: {accuracy*100}% \t AUC: {auc}")

## Classifier

In [None]:
from sklearn.model_selection import cross_validate

# Network built by rs_models.mobile_model, wrapped so scikit-learn can
# cross-validate it like any other estimator.
ffnet = KerasClassifier(
    build_fn=rs_models.mobile_model,
    neurons=200,
    layers=4,
    learn_rate=0.005,
    epochs=10,
    batch_size=64,
    verbose=False,
)

# This model is fed the item and context feature columns.
x = df[item_labels+context_labels]
y = df['rating']

# Shuffled 10-fold cross-validation with a fixed seed, scored on accuracy and ROC AUC.
scores = cross_validate(
    ffnet,
    x,
    y,
    cv=KFold(shuffle=True, n_splits=10, random_state=42),
    scoring=['accuracy', 'roc_auc'],
)

# Average each metric over the folds.
accuracy = np.mean(scores['test_accuracy'])
auc = np.mean(scores['test_roc_auc'])
print(f"Classifier \t accuracy: {accuracy*100}% \t AUC: {auc}")