# Train models

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import Dense, Input, Embedding, Flatten, Concatenate, Lambda
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier


from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split as sk_train_test_split
from sklearn.preprocessing import MinMaxScaler


import rs_models
from matplotlib import pyplot as plt
import math
from tqdm import tqdm

import implicit
from implicit.evaluation import AUC_at_k, precision_at_k, train_test_split
from implicit.als import AlternatingLeastSquares
from scipy.sparse import coo_matrix, csr_matrix


from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

# Let pandas display up to 1000 columns before truncating wide DataFrames.
pd.options.display.max_columns = 1000
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

## Load dataset

In [None]:
def load_dataset(name: str):
    """Load a dataset and the column names of its feature groups.

    Args:
        name: Dataset identifier, either ``'mdf'`` or ``'frappe'``.

    Returns:
        A tuple ``(df, df_mf, user_labels, item_labels, context_labels,
        social_labels)``. ``df`` is the feature dataset, ``df_mf`` is the
        frame read from the matrix-factorization CSV, and the four lists
        contain the column names of each feature group, taken from fixed
        column ranges of ``df``. ``social_labels`` is empty for ``'frappe'``.

    Raises:
        ValueError: If ``name`` is not one of the supported datasets.
    """
    # Dispatch on the argument, not on a notebook-level global, so the
    # function returns what the caller asked for.
    if name == 'mdf':
        df = pd.read_csv('Datasets/MDF_social/MDF_with_social_features.csv')
        df_mf = pd.read_csv('Datasets/MDF_matrix_factorization.csv')
        df = df.drop(columns='time')
        df = df.drop_duplicates()
        #df = df[df.item != 2]
        # df = df.drop(['place_type_food_and_drink', 'place_type_health', 'place_type_home', 'place_type_lodging','place_type_outdoors', 'place_type_point_of_interest_establishment','place_type_public_transport_station', 'place_type_school','place_type_service', 'place_type_store', 'place_type_workplace'], axis = 1)
        df = df.reset_index(drop=True)
        # Column ranges are computed after 'time' has been dropped.
        context_labels = list(df.columns[3:66])
        item_labels = list(df.columns[66:92])
        user_labels = list(df.columns[92:106])
        social_labels = list(df.columns[106:])
    elif name == 'frappe':
        df = pd.read_csv('Datasets/frappe dataset/frappe_final.csv')
        df_mf = pd.read_csv('Datasets/frappe dataset/frappe_matrix_factorization.csv')
        context_labels = list(df.columns[3:27])
        item_labels = list(df.columns[27:54])
        user_labels = list(df.columns[54:])
        social_labels = []
    else:
        # Fail with a clear message instead of an UnboundLocalError at return.
        raise ValueError(f"unknown dataset {name!r}: expected 'mdf' or 'frappe'")
    return df, df_mf, user_labels, item_labels, context_labels, social_labels

In [None]:
# Choose the dataset and load it along with its feature-column groups.
dataset = 'mdf'
(df, df_mf,
 user_labels, item_labels, context_labels, social_labels) = load_dataset(dataset)

# Cardinalities reused by the model cells below.
n_contexts = len(context_labels)
n_users = df['user'].nunique()
n_items = df['item'].nunique()

# Quick summary: share of positive ratings, dataset size, feature-group sizes.
print(f"rating with value 1: {df[df.rating == 1]['rating'].count() * 100 / len(df)} %")
print(f"users: {n_users} \t items: {n_items} \t rating: {len(df)}")
print(f"user_features: {len(user_labels)} \t items_features: {len(item_labels)} \t social_features: {len(social_labels)} \t contexts_features: {n_contexts} \t ")

In [None]:
n_splits = 10 # number of folds / repetitions used by the evaluation cells below
# Collected results, keyed by model name; each value is the list
# [accuracy, AUC, precision, recall] (0 is stored for metrics that are not computed).
models_eval_metrics = {}

## ALS matrix factorization
Alternating least squares (ALS) matrix factorization from the `implicit` library.

https://implicit.readthedocs.io/en/latest/als.html

In [None]:
# Sparse rating matrix with items on the rows and users on the columns.
ratings = coo_matrix((df_mf['rating'].astype(np.float32),
                     (df_mf['item'],
                      df_mf['user']))).tocsr()

# Average the AUC over `n_splits` independent random 80/20 splits.
# The split is drawn inside the loop: drawing it once outside would score
# every repetition on the same train/test partition.
auc_scores = []
for _ in range(n_splits):
    train, test = train_test_split(ratings, train_percentage=0.80)
    model = AlternatingLeastSquares(factors=64, regularization=10, iterations=1, calculate_training_loss=True)
    model.fit(train, show_progress=False)
    auc_scores.append(rs_models.mf_AUC(model, train, test))
auc = sum(auc_scores) / n_splits
print(f"ALS \t AUC: {auc}")
# Only the AUC is computed for ALS; the other three metric slots are stored as 0.
models_eval_metrics['ALS'] = [0, auc, 0, 0]

In [None]:
n_splits = 10
# Sparse rating matrix built from the full feature dataset, with items on the
# rows and users on the columns.
ratings = coo_matrix((df['rating'].astype(np.float32),
                     (df['item'],
                      df['user']))).tocsr()

# Average the AUC over `n_splits` independent random 80/20 splits.
# The split is drawn inside the loop: drawing it once outside would score
# every repetition on the same train/test partition.
auc_scores = []
for _ in range(n_splits):
    train, test = train_test_split(ratings, train_percentage=0.80)
    model = AlternatingLeastSquares(factors=128, regularization=5, iterations=10, calculate_training_loss=True)
    model.fit(train, show_progress=False)
    auc_scores.append(rs_models.mf_AUC2(model, train, test))
auc = sum(auc_scores) / n_splits
print(f"ALS \t AUC: {auc}")
# Stored under the 'ALS' key, replacing any earlier 'ALS' entry; only the AUC is
# computed, so the other three metric slots are stored as 0.
models_eval_metrics['ALS'] = [0, auc, 0, 0]

## NeuMF

In [None]:
# Settings passed to rs_models.kfold_train for the NeuMF model.
param = dict(
    n_users=n_users,
    n_items=n_items,
    n_contexts=n_contexts,
    learn_rate=0.001,
    batch_size=256,
    epochs=10,
)

# k-fold evaluation; the averaged metrics are recorded for the comparison plot.
neumf_scores = rs_models.kfold_train(rs_models.NeuMF, param, df, n_splits=n_splits)
std_dev, accuracy, auc, precision, recall = neumf_scores
models_eval_metrics['NeuMF'] = [accuracy, auc, precision, recall]
print(f"NeuMF \t accuracy: {accuracy*100}% \t AUC: {auc} \t precision: {precision} \t recall: {recall}")

## ECAM NeuMF
NeuMF model that also takes a physical context vector as input.

In [None]:
# Settings passed to rs_models.kfold_train for the ECAM NeuMF model.
param = dict(
    n_users=n_users,
    n_items=n_items,
    n_contexts=n_contexts,
    learn_rate=0.001,
    batch_size=256,
    epochs=10,
)

# Same k-fold evaluation as NeuMF, with the context columns passed as well.
ecam_scores = rs_models.kfold_train(
    rs_models.ECAM_NeuMF, param, df,
    context_labels=context_labels, n_splits=n_splits,
)
std_dev, accuracy, auc, precision, recall = ecam_scores
models_eval_metrics['ECAM NeuMF'] = [accuracy, auc, precision, recall]
print(f"ECAM NeuMF \t accuracy: {accuracy*100}% \t AUC: {auc} \t precision: {precision} \t recall: {recall}")

## Classifier

In [None]:
# Feed-forward classifier wrapped for scikit-learn and evaluated with shuffled k-fold CV.
ffnet = KerasClassifier(
    build_fn=rs_models.mobile_model,
    neurons=100, layers=3, learn_rate=0.005,
    epochs=10, batch_size=128, verbose=False,
)
x = df[item_labels + user_labels + social_labels + context_labels]
y = df['rating']

scores = cross_validate(
    ffnet, x, y,
    cv=KFold(shuffle=True, n_splits=n_splits, random_state=42),
    scoring=['accuracy', 'roc_auc', 'precision', 'recall'],
)

# Average each metric over the folds.
accuracy, auc, precision, recall = (
    np.average(scores[key])
    for key in ('test_accuracy', 'test_roc_auc', 'test_precision', 'test_recall')
)
models_eval_metrics['Classifier'] = [accuracy, auc, precision, recall]

print(f"Classifier \t accuracy: {accuracy*100}% \t AUC: {auc} \t precision: {precision} \t recall: {recall}")

## Plot all models results
Plot AUC of ALS, NeuMF, ECAM NeuMF and classifier.

The classifier uses the following features: user, item, physical context, and social context.

In [None]:
# Split the collected results into one list per metric, in insertion order.
n_models = len(models_eval_metrics)
models_name = list(models_eval_metrics.keys())
metric_rows = list(models_eval_metrics.values())
accuracy = [row[0] for row in metric_rows]
auc = [row[1] for row in metric_rows]
precision = [row[2] for row in metric_rows]
recall = [row[3] for row in metric_rows]

bar_width = 0.50
index = np.arange(n_models)
plt.figure(figsize=(10, 6))

# Only the AUC is plotted, one bar per model.
plt.bar(index, auc, bar_width, color='#408ec6', label='AUC')

# Write each AUC value just above its bar (arguments: x position, y position, text).
for position, score in enumerate(auc):
    plt.text(position - bar_width / 4, score + 0.01, str(round(score, 4)))

plt.xlabel('Models')
plt.ylabel('Scores')
plt.title('MDF prediction results')
plt.xticks(index, models_name)  # one tick label per model
plt.yticks(np.arange(0, 1., 0.1))
plt.legend()
plt.grid(axis='y', linestyle='--', linewidth=1)
plt.savefig(dataset + '_test_results.pdf', format='pdf', bbox_inches='tight')
plt.show()

## Classifier on all users

In [None]:
load_path = 'Datasets/MDF_social/social_datasets/'

# AUC of the classifier trained on the single dataset (without the layer feature).
single_df_auc = models_eval_metrics['Classifier'][1]
multi_df_auc = 0  # running sum of the per-user AUCs

for user in tqdm(range(31)):
    # Use dedicated names here: assigning to `df`, `x` or `y` would overwrite
    # the main dataset that later cells still read.
    user_df = pd.read_csv(f'{load_path}MDF_user{user}.csv')
    user_df = user_df.drop(columns='time')
    user_df = user_df.drop_duplicates()
    user_df = user_df.reset_index(drop=True)

    # Every column from index 3 onward is used as a feature.
    # NOTE(review): assumes the first three columns are user, item and rating - confirm.
    user_x = user_df.iloc[:, 3:]
    user_y = user_df['rating']

    ffnet = KerasClassifier(build_fn=rs_models.mobile_model, neurons=100, layers=3, learn_rate=0.01, epochs=20, batch_size=256, verbose=False)
    scores = cross_validate(ffnet, user_x, user_y, cv=KFold(shuffle=True, n_splits=2, random_state=42), scoring=['roc_auc'])
    user_auc = np.average(scores['test_roc_auc'])
    multi_df_auc = multi_df_auc + user_auc
    print(user_auc)

### Plot single model vs one model per user
Plot the AUC of the classifier trained on a single dataset (the same one used above) against the average AUC of 31 classifiers (one per user). The per-user datasets have an additional feature called `layer`.

In [None]:
# Single shared classifier vs. the mean AUC over the 31 per-user classifiers.
models_name = ['Single dataset', 'One dataset per user']
auc = [single_df_auc, multi_df_auc/31]

bar_width = 0.20
index = np.arange(len(models_name))
plt.figure(figsize=(10, 6))

plt.bar(index, auc, bar_width, color='#408ec6', label='AUC')

# Write each AUC value just above its bar (arguments: x position, y position, text).
for position, score in enumerate(auc):
    plt.text(position - bar_width / 5, score + 0.01, str(round(score, 4)))

plt.style.use("default")
plt.xlabel('Datasets')
plt.ylabel('Scores')
plt.title('AUC on users dataset')
plt.xticks(index, models_name)  # one tick label per bar
plt.legend(bbox_to_anchor=(0.55, 1))
plt.grid(axis='y', linestyle='--', linewidth=1)
plt.savefig('single_vs_users_datasets.pdf', format='pdf', bbox_inches='tight')
plt.show()

## Train Classifier only on some features
Train the classifier on different subsets of the available features:
- user + item
- user + item + social context
- user + item + physical context
- All available features

In [None]:
# Feature subsets to compare; all of them include the user and item columns.
base_labels = item_labels + user_labels
train_labels = [
    base_labels,
    base_labels + social_labels,
    base_labels + context_labels,
    base_labels + social_labels + context_labels,
]

# One hyper-parameter set per feature subset, in the same order as train_labels.
parameters = [
    {'neurons': 100, 'layers': 3, 'learn_rate': 0.001, 'epochs': 30, 'batch_size': 256},
    {'neurons': 100, 'layers': 3, 'learn_rate': 0.001, 'epochs': 30, 'batch_size': 64},
    {'neurons': 100, 'layers': 3, 'learn_rate': 0.01, 'epochs': 30, 'batch_size': 256},
    {'neurons': 100, 'layers': 3, 'learn_rate': 0.01, 'epochs': 20, 'batch_size': 256},
]
results = []

y = df['rating']
for x_labels, params in zip(train_labels, parameters):
    x = df[x_labels]
    ffnet = KerasClassifier(build_fn=rs_models.mobile_model, **params, verbose=False)
    scores = cross_validate(
        ffnet, x, y,
        cv=KFold(shuffle=True, n_splits=5, random_state=42),
        scoring=['accuracy', 'roc_auc', 'precision', 'recall'],
    )
    # Only the fold-averaged AUC is kept for the plot.
    auc = np.average(scores['test_roc_auc'])
    print(dataset, auc)
    results.append(auc)

### Plot classifier results on different features

In [None]:
# Bar labels, in the same order as the feature subsets that were trained.
models_name = ['U, I', 'U, I, S', 'U, I, P', 'U, I, S, P']
bar_width = 0.50
index = np.arange(len(results))
plt.figure(figsize=(10, 6))

plt.bar(index, results, bar_width, color='#408ec6', label='AUC')

# Write each AUC value just above its bar (arguments: x position, y position, text).
for position, score in enumerate(results):
    plt.text(position - bar_width / 3, score + 0.01, str(round(score, 4)))

plt.style.use("default")
plt.xlabel('Features')
plt.ylabel('Scores')
plt.title('moveCARS on different features')
plt.xticks(index, models_name)  # one tick label per bar
plt.legend()
# Boxed key explaining the abbreviations used on the x axis.
key_box = dict(facecolor='none', edgecolor='grey', boxstyle='round, pad=0.5')
plt.text(3.50, 0.44, 'U: user\nI: item\nP: physical context\nS: social context',
         color='black', bbox=key_box)
plt.grid(axis='y', linestyle='--', linewidth=1)
plt.savefig('moveCARS_diff_features.pdf', format='pdf', bbox_inches='tight')
plt.show()

## Convert to TFlite

In [None]:
# TFLite can't convert models with a dynamic input shape, so this model is built with a fixed input_dim.
def mobile_model_fixed_shape(neurons, layers, learn_rate, input_dim=107):
    """Build and compile a feed-forward binary classifier with a fixed input size.

    The network is a first ``Dense`` layer of ``neurons`` units that fixes the
    input size, followed by ``layers`` further ``Dense`` layers of ``neurons``
    units (all ReLU), and a single sigmoid output unit.

    Args:
        neurons: Number of units in every hidden layer.
        layers: Number of hidden layers added after the first one.
        learn_rate: Learning rate of the Adam optimizer.
        input_dim: Number of input features. Defaults to 107, the value that
            was previously hard-coded; it must match the number of feature
            columns the model is trained on.

    Returns:
        The compiled model (binary cross-entropy loss, AUC metric).
    """
    model = Sequential()
    model.add(Dense(neurons, input_dim=input_dim, activation='relu'))
    for _ in range(layers):
        model.add(Dense(neurons, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    model.compile(loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()],
                  optimizer=Adam(learning_rate=learn_rate))
    return model

In [None]:
# Build the fixed-shape model once to inspect its architecture.
model = mobile_model_fixed_shape(neurons=100, layers=3, learn_rate=0.01)
model.summary()

In [None]:
def model_to_tflite(model, name, x, y, epochs=10, batch_size=128):
    """Train ``model``, save it, and export a TFLite copy.

    The trained model is saved to ``saved_models/<name>`` and the converted
    model is written to ``saved_models/<name>.tflite``.

    Args:
        model: Compiled Keras model; it is trained in place.
        name: Base name of the saved model and of the ``.tflite`` file.
        x: Training inputs, passed to ``model.fit``.
        y: Training targets, passed to ``model.fit``.
        epochs: Number of training epochs (default 10).
        batch_size: Training batch size (default 128).
    """
    import os

    model.fit(x=x, y=y, epochs=epochs, batch_size=batch_size)

    # Create the output directory up front so both writes below can succeed.
    os.makedirs('saved_models', exist_ok=True)
    saved_model_path = f'saved_models/{name}'
    model.save(saved_model_path)  # save model to file

    converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_path)
    tflite_model = converter.convert()  # convert to tflite
    with open(f'{saved_model_path}.tflite', 'wb') as f:  # save tflite model on file
        f.write(tflite_model)

In [None]:
# Train the fixed-shape classifier on all four feature groups and export it as 'mobile'.
# NOTE(review): the column order here (user, item, context, social) differs from the
# order used in the cross-validation cells (item, user, social, context); the model's
# input size must equal the number of columns selected here - confirm.
model = mobile_model_fixed_shape(100, 3, 0.01)
mobile_features = df[user_labels + item_labels + context_labels + social_labels]
model_to_tflite(model, 'mobile', mobile_features, y=df['rating'])

In [None]:
# Model settings shared by the NeuMF and ECAM NeuMF exports.
param = dict(
    n_users=n_users,
    n_items=n_items,
    n_contexts=n_contexts,
    learn_rate=0.001,
)

# NeuMF is trained on the user and item columns, then exported.
model = rs_models.NeuMF(param)
neumf_inputs = [df['user'], df['item']]
model_to_tflite(model, 'NeuMF', neumf_inputs, y=df['rating'])

In [None]:
# ECAM NeuMF takes the context columns as a third input besides user and item.
model = rs_models.ECAM_NeuMF(param)
ecam_inputs = [df['user'], df['item'], df[context_labels]]
model_to_tflite(model, 'ECAM_NeuMF', ecam_inputs, y=df['rating'])