# Lesson 6b: Factorization Machines with Keras

In [None]:
# Stretch the notebook's `.container` element to the full browser width.
# `IPython.display` is the public import location for `display` and `HTML`;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Factorization Machine Model

In [2]:
# Colab only: mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Create the folder that the training cell later writes its checkpoint into
# (`-p` makes this a no-op when the folder already exists).
!mkdir -p /content/drive/MyDrive/Kaggle_competition/model


Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pathlib import Path
from zipfile import ZipFile
from tensorflow.keras import regularizers




In [4]:
## FACTORIZATION MODELS

# Download the MovieLens 1M archive (ml-1m.zip). The loaders below read
# ratings.dat, movies.dat and users.dat from the extracted "ml-1m" folder.
movielens_data_file_url = (
    "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
)
movielens_zipped_file = keras.utils.get_file(
    "ml-1m.zip", movielens_data_file_url, extract=False
)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-1m"

# Only extract the data the first time the script is run.
if not movielens_dir.exists():
    # The archive is bound to `archive`, not `zip`: at cell (module) scope a
    # `with ... as zip` would rebind the builtin `zip` for every later cell.
    with ZipFile(movielens_zipped_file, "r") as archive:
        # Extract files
        print("Extracting all the files now...")
        archive.extractall(path=keras_datasets_path)
        print("Done!")


def load_ratings(movielens_dir):
    """Read ``ratings.dat`` from ``movielens_dir`` into a DataFrame.

    The file has no header row and separates fields with ``::``; the columns
    are named ``uid``, ``mid``, ``rating`` and ``timestamp`` here.
    """
    column_names = ['uid', 'mid', 'rating', 'timestamp']
    ratings_path = movielens_dir / 'ratings.dat'
    # A multi-character separator requires the (slower) python parsing engine.
    return pd.read_csv(
        ratings_path,
        sep='::',
        header=None,
        engine='python',
        names=column_names,
    )

def load_movies(movielens_dir):
    """Read ``movies.dat`` from ``movielens_dir`` into a DataFrame.

    The file has no header row, separates fields with ``::`` and is decoded
    as latin-1; the columns are named ``mid``, ``movie_name`` and
    ``movie_genre`` here.
    """
    column_names = ['mid', 'movie_name', 'movie_genre']
    movies_path = movielens_dir / 'movies.dat'
    # A multi-character separator requires the (slower) python parsing engine.
    return pd.read_csv(
        movies_path,
        sep='::',
        header=None,
        engine='python',
        names=column_names,
        encoding='latin-1',
    )

def load_users(movielens_dir):
    """Read ``users.dat`` from ``movielens_dir`` into a DataFrame.

    The file has no header row and separates fields with ``::``; the columns
    are named ``uid`` and ``user_fea1`` .. ``user_fea4`` here. The sample
    output in this notebook suggests these are sex, age, occupation and zip
    code, in that order.
    """
    column_names = ['uid', 'user_fea1', 'user_fea2', 'user_fea3', 'user_fea4']
    users_path = movielens_dir / 'users.dat'
    # A multi-character separator requires the (slower) python parsing engine.
    return pd.read_csv(
        users_path,
        sep='::',
        header=None,
        engine='python',
        names=column_names,
    )

def text2seq(text, n_genre):
    """Encode '|'-separated genre strings as fixed-length integer sequences.

    A Keras ``Tokenizer`` (lower-cased, split on '|', no character filtering,
    ``num_words=n_genre``) is fitted on ``text`` and applied to the same
    ``text``. Every resulting sequence is then brought to length 3, padding
    with zeros at the end.

    NOTE(review): entries with more than three genres are cut to three by
    ``pad_sequences`` - confirm that the default truncation side is the one
    intended.
    """
    genre_tokenizer = Tokenizer(
        num_words=n_genre,
        filters='',
        lower=True,
        split='|',
    )
    genre_tokenizer.fit_on_texts(text)
    encoded = genre_tokenizer.texts_to_sequences(text)
    return pad_sequences(encoded, maxlen=3, padding='post')

# Passed to the Tokenizer as `num_words` when encoding the genre strings.
n_genre = 15


# Raw tables, straight from the extracted .dat files.
ratings = load_ratings(movielens_dir)
movies = load_movies(movielens_dir)
users = load_users(movielens_dir)


print("====== rating.dat ======")
print(ratings.head())
print("===== movies.dat ======")
print(movies.head())
print("====== users.dat ======")
print(users.head())

# Replace each '|'-separated genre string with a list of three integer ids.
movies['movie_genre'] = text2seq(movies.movie_genre.values, n_genre=n_genre).tolist()

# Join on the shared columns (uid for users, then mid for movies).
# NOTE: `ratings` is rebound here, so from this point on it is the merged
# frame, not the raw ratings table loaded above.
ratings = pd.merge(pd.merge(ratings, users), movies)

print("====== preprocessed data =======")
ratings.head()

Downloading data from http://files.grouplens.org/datasets/movielens/ml-1m.zip
Extracting all the files now...
Done!
   uid   mid  rating  timestamp
0    1  1193       5  978300760
1    1   661       3  978302109
2    1   914       3  978301968
3    1  3408       4  978300275
4    1  2355       5  978824291
   mid                          movie_name                   movie_genre
0    1                    Toy Story (1995)   Animation|Children's|Comedy
1    2                      Jumanji (1995)  Adventure|Children's|Fantasy
2    3             Grumpier Old Men (1995)                Comedy|Romance
3    4            Waiting to Exhale (1995)                  Comedy|Drama
4    5  Father of the Bride Part II (1995)                        Comedy
   uid user_fea1  user_fea2  user_fea3 user_fea4
0    1         F          1         10     48067
1    2         M         56         16     70072
2    3         M         25         15     55117
3    4         M         45          7     02460
4    5   

Unnamed: 0,uid,mid,rating,timestamp,user_fea1,user_fea2,user_fea3,user_fea4,movie_name,movie_genre
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),"[1, 0, 0]"
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),"[1, 0, 0]"
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),"[1, 0, 0]"
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),"[1, 0, 0]"
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),"[1, 0, 0]"


### Let us extract some additional features in order to have a better insight

In [None]:
# We extract some information from the dataset: the release year of each movie (if it is missing, we replace it with the mean year),
# the last movie seen by each user and the number of movies each user has rated. We also encode the sex as 1 and 0 and standardize the numerical features.


In [5]:
import re
from sklearn.preprocessing import StandardScaler
normal_transf= StandardScaler()
def preprocessing_and_extracting(df):
    """Derive the model features from the merged ratings/users/movies frame.

    Works on a copy of ``df`` and returns it with these changes:

    * ``user_fea1``: 'M' -> 1, 'F' -> 0.
    * ``year``: first "(YYYY)" group found in ``movie_name``; titles without
      one get the truncated mean of the years that were found.
    * ``last_seen``: 1 on the row(s) carrying the user's own latest
      timestamp, 0 elsewhere.
    * ``weight_rating``: 1 if the rating is above the user's mean rating,
      otherwise -1. It is derived from the target, so it must not be fed to
      the model as a feature.
    * ``age_difference``: ``|user_fea2 - (2003 - year)|`` computed on the raw
      ``user_fea2``, then standardized.
    * ``diff_timestamp``: timestamp minus the user's latest timestamp,
      standardized.
    * ``count_ratings``: number of ratings by the user, standardized.
    * ``user_fea2``: standardized with pandas' mean and std.
    * ``user_fea3``: standardized with the module-level ``normal_transf``.

    ``year`` itself is left on its raw scale.

    Every per-user statistic is computed from ``df`` itself, so the function
    gives the same result for any frame it is handed, not only for the global
    ``ratings``.
    """
    df = df.copy()

    df['user_fea1'] = df['user_fea1'].replace({'M': 1, 'F': 0})

    # Release year from the title; NaN where the title has no "(YYYY)".
    extracted_year = pd.to_numeric(
        df['movie_name'].str.extract(r'\((\d{4})\)', expand=False)
    )
    mean_year = int(extracted_year.mean())  # imputation value for missing years
    df['year'] = extracted_year.fillna(mean_year).astype(int)

    # Per-user statistics, broadcast back onto every row of that user.
    by_user = df.groupby('uid')
    last_timestamp = by_user['timestamp'].transform('max')
    mean_rating = by_user['rating'].transform('mean')
    rating_count = by_user['rating'].transform('count')

    # Compare against the row's own user. Testing membership in the set of
    # all users' maxima would also flag rows that merely share a timestamp
    # with some other user's last rating.
    df['last_seen'] = (df['timestamp'] == last_timestamp).astype(int)
    df['weight_rating'] = np.where(df['rating'] > mean_rating, 1, -1)

    # Must run before user_fea2 is standardized: the formula needs the raw value.
    # NOTE(review): 2003 is presumably the reference year of the dataset - confirm.
    df['age_difference'] = abs(df['user_fea2'] - (2003 - df['year']))

    df['user_fea2'] = (df['user_fea2'] - df['user_fea2'].mean()) / df['user_fea2'].std()

    df['diff_timestamp'] = df['timestamp'] - last_timestamp
    df['count_ratings'] = rating_count

    # Standardize the remaining numerical columns. The scaler is refitted for
    # every column, so after this loop it only holds the statistics of the
    # last one (user_fea3).
    for column in ('age_difference', 'diff_timestamp', 'count_ratings', 'user_fea3'):
        df[column] = normal_transf.fit_transform(df[[column]].to_numpy()).ravel()

    return df



In [22]:
# NOTE: `ratings` is rebound to the processed frame, so this cell is not
# idempotent. Running it a second time would pass already-transformed columns
# through the same transforms again; re-run the loading cell first.
ratings=preprocessing_and_extracting(ratings)

## Define input layers
The dataset contains both **numeric** and **categorical** features, which need to be treated differently.

* **numeric features** can be concatenated to inputs, with shape (None, num_of_numeric)
* **categorical features** can be encoded individually to inputs, with shape (None, 1) each.

In [6]:
# @title Testo del titolo predefinito
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *

def define_input_layers():
    """Create the Keras ``Input`` layers of the model and return them as a list.

    Order of the returned list (``fm_1d`` and ``fm_2d`` unpack it by position):

    1. numeric, shape (1,): input_fea3, input_fea2, input_year,
       input_age_difference
    2. single-level categorical, shape (1,): input_uid, input_last_seen,
       input_mid
    3. multi-level categorical, shape (3,): input_genre (at most 3 genres)

    The weight_rating, diff_timestamp and count_ratings columns produced by
    the preprocessing step have no input here.

    NOTE(review): ``df2xy`` and ``get_prediction`` feed ``user_fea2`` at
    position 0 and ``user_fea3`` at position 1, i.e. swapped relative to the
    names of the first two inputs. Training and inference agree with each
    other, so only the layer names are misleading.
    """
    numeric_names = (
        'input_fea3',
        'input_fea2',
        'input_year',
        'input_age_difference',
    )
    single_level_names = ('input_uid', 'input_last_seen', 'input_mid')

    numeric_inputs = [Input((1,), name=layer_name) for layer_name in numeric_names]
    single_level_inputs = [Input((1,), name=layer_name) for layer_name in single_level_names]
    multi_level_inputs = [Input((3,), name='input_genre')]

    return numeric_inputs + single_level_inputs + multi_level_inputs

# Only used for experimenting in the cells below: fm_model() creates its own
# input layers by calling define_input_layers() again.
inputs = define_input_layers()


## 1st order factorization machines
The 1st order term requires every feature to be mapped to a scalar, so for:

* numeric features: a dense layer converts the tensor to shape (None, 1)
* categorical features: an embedding layer converts the tensor to shape (None, 1, 1), and a reshape layer then reshapes it to (None, 1)

In [7]:
def Tensor_Mean_Pooling(name = 'mean_pooling', keepdims = False):
    """Return a ``Lambda`` layer that averages its input over axis 1.

    ``name`` becomes the layer name; ``keepdims`` is forwarded to ``K.mean``
    and decides whether the averaged axis is kept with size 1.
    """
    def _mean_over_axis_1(tensor):
        return K.mean(tensor, axis=1, keepdims=keepdims)

    return Lambda(_mean_over_axis_1, name=name)

def fm_1d(inputs, n_uid, n_mid, n_genre, reg_weight):
    """Build the first-order (linear) term of the factorization machine.

    Every feature is mapped to one scalar per sample and the scalars are
    added up:

    * numeric inputs go through their own ``Dense(1)`` layer;
    * uid / last_seen / mid go through a 1-dimensional, L2-regularized
      ``Embedding`` that is reshaped from (None, 1, 1) to (None, 1);
    * the three genre ids go through a 1-dimensional embedding that is
      averaged over the genre axis.

    Args:
        inputs: the eight input layers, in the order returned by
            ``define_input_layers``.
        n_uid, n_mid, n_genre: largest id of each categorical feature; the
            embedding tables get one extra row so that id 0 is valid too.
        reg_weight: L2 factor applied to every embedding table.

    Returns:
        The output tensor of the ``fm_1d_output`` Add layer, shape (None, 1).
    """
    (fea3_input, fea2_input, year_input, age_difference_input,
     uid_input, last_seen_input, mid_input, genre_input) = inputs

    # numeric features: (None, 1) -> (None, 1)
    numeric_terms = [
        Dense(1, name='num_dense_1d_fea3')(fea3_input),
        Dense(1, name='num_dense_1d_fea2')(fea2_input),
        Dense(1, name='num_dense_1d_year')(year_input),
        Dense(1, name='num_dense_1d_age_difference')(age_difference_input),
    ]

    # single-level categorical features: (None, 1) -> (None, 1, 1)
    single_level_embeddings = [
        Embedding(n_uid + 1, 1, name='cat_embed_1d_uid',
                  embeddings_regularizer=regularizers.l2(reg_weight))(uid_input),
        # last_seen is a 0/1 flag, so the table needs exactly two rows. A
        # table sized by n_uid would only add never-indexed rows to the
        # model and to the L2 penalty.
        Embedding(2, 1, name='cat_embed_1d_lastseen',
                  embeddings_regularizer=regularizers.l2(reg_weight))(last_seen_input),
        Embedding(n_mid + 1, 1, name='cat_embed_1d_mid',
                  embeddings_regularizer=regularizers.l2(reg_weight))(mid_input),
    ]

    # multi-level categorical feature: (None, 3) -> (None, 3, 1)
    # NOTE(review): mask_zero=True is set, but the pooling Lambda calls K.mean
    # without a mask, so the padding slots presumably still enter the
    # average - confirm.
    genre_embedding = Embedding(n_genre + 1, 1, mask_zero=True, name='cat_embed_1d_genre',
                                embeddings_regularizer=regularizers.l2(reg_weight))(genre_input)

    # bring everything to shape (None, 1)
    single_level_terms = [Reshape((1,))(embedded) for embedded in single_level_embeddings]
    genre_term = Tensor_Mean_Pooling(name='embed_1d_mean')(genre_embedding)

    # add all tensors
    y_fm_1d = Add(name='fm_1d_output')(numeric_terms + single_level_terms + [genre_term])

    return y_fm_1d

#y_1d = fm_1d(inputs, 10, 10, 10)

## 2nd order factorization machines
In 2nd order FM, each feature is mapped to a tensor of shape (None, 1, k); these tensors are then stacked into the concat_embed_2d layer, which has shape (None, p, k).
Here k is the latent dimension of the matrix factorization and p is the number of features.

The calculation of the interaction terms can be simplified using
\begin{equation*} \sum_{i<j} x_i x_j = \frac{1}{2} \left(\Big(\sum_i x_i\Big)^2 - \sum_i x_i^2\right) \end{equation*}

Hence, the sum of the 2nd order interactions = 0.5 * (square of the sum of concat_embed_2d - sum of the squared concat_embed_2d), where both sums run over the p dimension; the resulting tensor has shape (None, k).



In [8]:
def fm_2d(inputs, n_uid, n_mid, n_genre, k, reg_weight):
    """Build the second-order (pairwise interaction) term of the FM.

    Every feature is mapped to a k-dimensional vector, the vectors are stacked
    to shape (None, p, k), and the sum of all pairwise products is obtained as
    ``0.5 * ((sum_i v_i)^2 - sum_i v_i^2)`` per latent dimension, followed by
    a sum over the k latent dimensions.

    Args:
        inputs: the eight input layers, in the order returned by
            ``define_input_layers``.
        n_uid, n_mid, n_genre: largest id of each categorical feature; the
            embedding tables get one extra row so that id 0 is valid too.
        k: latent dimension of the factorization.
        reg_weight: L2 factor applied to every embedding table.

    Returns:
        ``(y_fm_2d, embed_2d)``: the interaction term with shape (None, 1),
        and the stacked feature vectors with shape (None, p, k).
    """
    (fea3_input, fea2_input, year_input, age_difference_input,
     uid_input, last_seen_input, mid_input, genre_input) = inputs

    # numeric features: (None, 1) -> (None, k)
    num_dense_2d = [
        Dense(k, name='num_dense_2d_fea3')(fea3_input),
        Dense(k, name='num_dense_2d_fea2')(fea2_input),
        Dense(k, name='num_dense_2d_year')(year_input),
        Dense(k, name='num_dense_2d_age_difference')(age_difference_input),
    ]
    num_dense_2d = [Reshape((1, k))(dense_out) for dense_out in num_dense_2d]  # (None, 1, k)

    # single-level categorical features: (None, 1) -> (None, 1, k)
    cat_sl_embed_2d = [
        Embedding(n_uid + 1, k, name='cat_embed_2d_uid',
                  embeddings_regularizer=regularizers.l2(reg_weight))(uid_input),
        # last_seen is a 0/1 flag, so the table needs exactly two rows. A
        # table sized by n_uid would only add never-indexed rows to the
        # model and to the L2 penalty.
        Embedding(2, k, name='cat_embed_2d_lastseen',
                  embeddings_regularizer=regularizers.l2(reg_weight))(last_seen_input),
        Embedding(n_mid + 1, k, name='cat_embed_2d_mid',
                  embeddings_regularizer=regularizers.l2(reg_weight))(mid_input),
    ]

    # multi-level categorical feature: (None, 3) -> (None, 3, k) -> mean -> (None, 1, k)
    genre_embed_2d = Embedding(n_genre + 1, k, name='cat_embed_2d_genre',
                               embeddings_regularizer=regularizers.l2(reg_weight))(genre_input)
    cat_ml_embed_2d = [
        Tensor_Mean_Pooling(name='cat_embed_2d_genure_mean', keepdims=True)(genre_embed_2d)
    ]

    # concatenate all feature vectors along the feature axis => (None, p, k)
    embed_2d = Concatenate(axis=1, name='concat_embed_2d')(
        num_dense_2d + cat_sl_embed_2d + cat_ml_embed_2d
    )

    # sum_{i<j} v_i * v_j = 0.5 * ((sum_i v_i)^2 - sum_i v_i^2), per latent dimension
    tensor_sum = Lambda(lambda x: K.sum(x, axis=1), name='sum_of_tensors')
    tensor_square = Lambda(lambda x: K.square(x), name='square_of_tensors')

    sum_of_embed = tensor_sum(embed_2d)                        # (None, k)
    square_of_embed = tensor_square(embed_2d)                  # (None, p, k)

    square_of_sum = Multiply()([sum_of_embed, sum_of_embed])   # (None, k)
    sum_of_square = tensor_sum(square_of_embed)                # (None, k)

    sub = Subtract()([square_of_sum, sum_of_square])
    sub = Lambda(lambda x: x*0.5)(sub)
    # Applied to the (None, k) tensor, the axis-1 sum collapses the latent dimension.
    y_fm_2d = Reshape((1,), name='fm_2d_output')(tensor_sum(sub))

    return y_fm_2d, embed_2d

## Put together

In [24]:
def fm_model(n_uid, n_mid, n_genre, k, dnn_dr, reg_weight):
    """Assemble the factorization machine and return three Keras models.

    The first- and second-order terms share one set of input layers. They are
    concatenated and passed through a final ``Dense(1)`` layer named
    ``fm_output``.

    ``dnn_dr`` is accepted but not used anywhere in this function.

    Returns:
        ``(fm_model_1d, fm_model_2d, fm_model)``: models over the same inputs
        whose outputs are the first-order term, the second-order term and the
        combined prediction, in that order.
    """
    model_inputs = define_input_layers()

    first_order = fm_1d(model_inputs, n_uid, n_mid, n_genre, reg_weight)
    second_order, _ = fm_2d(model_inputs, n_uid, n_mid, n_genre, k, reg_weight)

    # combine the two FM terms into a single prediction
    stacked_terms = Concatenate()([first_order, second_order])
    prediction = Dense(1, name='fm_output')(stacked_terms)

    first_order_model = Model(model_inputs, first_order)
    second_order_model = Model(model_inputs, second_order)
    full_model = Model(model_inputs, prediction)

    return first_order_model, second_order_model, full_model

In [25]:
# Hyper-parameters forwarded to fm_model(). The id counts are taken from the
# data; each embedding table adds one row on top of them.
# NOTE(review): 'dnn_dr' is not used inside fm_model().
# NOTE(review): the saved output of the `params` cell below shows k=20 while
# this cell sets k=30 - the saved output looks stale.
params = {
    'n_uid': ratings.uid.max(),
    'n_mid': ratings.mid.max(),
    'n_genre': 14,
    'k': 30,
    'dnn_dr': 0.5,
    'reg_weight' : 0.01
}

# NOTE: this rebinds the name `fm_model` from the builder function to the
# combined Keras model, so the cell cannot be re-run without first re-running
# the cell that defines the function.
fm_model_1d, fm_model_2d, fm_model = fm_model(**params)

In [11]:
params

{'n_uid': 6040,
 'n_mid': 3952,
 'n_genre': 14,
 'k': 20,
 'dnn_dr': 0.5,
 'reg_weight': 0.01}

## Prepare Data

### Split Data

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rating rows for validation; random_state pins the split.
train, val = train_test_split(ratings, test_size=0.2, random_state=7)

In [28]:
import numpy as np

def df2xy(ratings):
    """Split a processed ratings frame into model inputs and targets.

    Returns:
        ``(x, y)`` where ``x`` is a list of eight arrays, fed to the model by
        position: user_fea2, user_fea3, year, age_difference, uid, last_seen,
        mid, and the genre ids reshaped to (n_rows, 3). ``y`` is the
        ``rating`` column.

    NOTE(review): position 0 carries user_fea2 and position 1 user_fea3,
    whereas define_input_layers() names its first two inputs 'input_fea3' and
    'input_fea2'. get_prediction() uses the same order as this function, so
    training and inference agree; only the layer names are swapped relative
    to the data.
    """
    scalar_columns = [
        'user_fea2',
        'user_fea3',
        'year',
        'age_difference',
        'uid',
        'last_seen',
        'mid',
    ]
    x = [ratings[column].values for column in scalar_columns]

    # movie_genre holds one list of three ids per row: flatten, then regroup by 3.
    genre_ids = np.concatenate(ratings.movie_genre.values).reshape(-1, 3)
    x.append(genre_ids)

    y = ratings.rating.values
    return x, y

# Model-ready inputs and targets for the training and the validation split.
train_x, train_y = df2xy(train)
valid_x, valid_y = df2xy(val)


## Train Model

In [29]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Regress the rating with a mean-squared-error loss.
fm_model.compile(
    loss=tf.keras.losses.MeanSquaredError(), optimizer=keras.optimizers.Adam(learning_rate=0.001)
)

model_checkpoint_path = '/content/drive/MyDrive/Kaggle_competition/model/deepfm_weights.best'

# Stop after 3 epochs without improvement in val_loss.
# restore_best_weights=True rolls the in-memory model back to the best epoch.
# Without it the prediction cells would use the weights of the last epoch,
# which is `patience` epochs past the best one, because the checkpoint
# written below is never loaded again in this notebook.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Keep a copy of the best weights on Drive as well.
model_ckp = ModelCheckpoint(filepath=model_checkpoint_path,
                            monitor='val_loss',
                            save_weights_only=True,
                            save_best_only=True)

callbacks = [model_ckp, early_stop]

train_history = fm_model.fit(train_x, train_y,
                              epochs=30, batch_size=64,
                              validation_data=(valid_x, valid_y),
                              callbacks=callbacks)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30


### Predictions

In [30]:
# Kaggle file listing the users to score; the output cell reads its `user_id` column.
# NOTE(review): this path spells the folder "My Drive" while the other paths
# in the notebook use "MyDrive" - confirm that both resolve to the same place.
test =pd.read_csv('/content/drive/My Drive/Kaggle_competition/kaggle_baseline.csv')

# I have to produce the output for all of these users.


In [31]:
def create_test_dataset(user_id, ratings):
    """Build one candidate row per movie that ``user_id`` has not rated yet.

    ``ratings`` is normally the frame returned by
    ``preprocessing_and_extracting``. Its feature columns are reused as they
    are, so that the candidates sit on the same scale as the training rows:

    * ``year`` is taken from each candidate movie's own row. If the frame has
      no ``year`` column it is extracted from that movie's title instead,
      with missing years replaced by the mean.
    * ``user_fea2`` / ``user_fea3`` are copied from the user's existing rows.
      They are not standardized again: a column holding a single constant
      has zero spread, so re-standardizing it turns every value into NaN.
    * ``age_difference`` depends only on the user's age and the movie's year,
      so the already standardized value is looked up from any training row
      with the same ``(user_fea2, year)`` pair. Pairs that never occur get
      0.0, the mean of a standardized column. If the frame has no
      ``age_difference`` column, the raw formula
      ``|user_fea2 - (2003 - year)|`` is used.
    * ``last_seen`` is 0 for every candidate because the user has not seen
      any of them.

    Args:
        user_id: id of the user to build candidates for.
        ratings: merged ratings frame with at least uid, mid, movie_name,
            movie_genre, timestamp, user_fea2 and user_fea3.

    Returns:
        A DataFrame sorted by ``mid`` with the columns uid, mid, movie_name,
        movie_genre, timestamp, year, last_seen, user_fea2, user_fea3 and
        age_difference.

    Raises:
        IndexError: if ``user_id`` has no rows in ``ratings``.
    """
    user_rows = ratings[ratings['uid'] == user_id]
    if user_rows.empty:
        raise IndexError(
            f"user {user_id} has no rows in ratings; cannot look up the user features"
        )

    # Candidate movies: everything in the catalogue the user has not rated.
    seen_mids = user_rows['mid'].unique()
    unseen_rows = ratings[~ratings['mid'].isin(seen_mids)]

    has_year = 'year' in ratings.columns
    movie_columns = ['mid', 'movie_name', 'movie_genre', 'timestamp']
    if has_year:
        movie_columns.append('year')

    # One row per candidate movie, carrying that movie's own metadata.
    test_dataset = (
        unseen_rows[movie_columns]
        .drop_duplicates(subset=['mid'])
        .sort_values('mid')
        .reset_index(drop=True)
    )
    test_dataset.insert(0, 'uid', user_id)

    if not has_year:
        # Extract the year from the candidate's own title and fill the gaps.
        extracted_year = pd.to_numeric(
            test_dataset['movie_name'].str.extract(r'\((\d{4})\)', expand=False)
        )
        test_dataset['year'] = extracted_year.fillna(extracted_year.mean()).astype(int)

    # The user has never seen these movies.
    test_dataset['last_seen'] = np.zeros(len(test_dataset))

    # User features, exactly as stored in the frame.
    user_fea2 = user_rows['user_fea2'].iloc[0]
    test_dataset['user_fea2'] = user_fea2
    test_dataset['user_fea3'] = user_rows['user_fea3'].iloc[0]

    if has_year and 'age_difference' in ratings.columns:
        # Reuse the standardized training value for the same (age, year) pair.
        same_age = ratings.loc[ratings['user_fea2'] == user_fea2, ['year', 'age_difference']]
        diff_by_year = (
            same_age.drop_duplicates(subset=['year']).set_index('year')['age_difference']
        )
        test_dataset['age_difference'] = test_dataset['year'].map(diff_by_year).fillna(0.0)
    else:
        test_dataset['age_difference'] = abs(
            test_dataset['user_fea2'] - (2003 - test_dataset['year'])
        )

    return test_dataset


def get_prediction(test_dataset, top_k=25, model=None):
    """Score every candidate row and return the ids of the best movies.

    Args:
        test_dataset: candidate rows as built by ``create_test_dataset``. A
            ``prediction`` column holding the scores is added to it in place.
        top_k: number of movie ids to return (25 by default).
        model: object with a Keras-style ``predict`` method. Defaults to the
            notebook's global ``fm_model``.

    Returns:
        A list with the ``mid`` of the ``top_k`` highest-scoring rows, best
        first.
    """
    if model is None:
        model = fm_model  # trained model from the notebook's global scope

    # Same positional order as df2xy(), i.e. the order the model was trained with.
    features = [test_dataset.user_fea2.values,
                test_dataset.user_fea3.values,
                test_dataset.year.values,
                test_dataset.age_difference.values,
                test_dataset.uid.values,
                test_dataset.last_seen.values,
                test_dataset.mid.values,
                np.concatenate(test_dataset.movie_genre.values).reshape(-1, 3)]

    # verbose=0: this function is called once per user inside a tqdm loop,
    # where a progress bar per call would flood the cell output.
    predictions = model.predict(x=features, verbose=0)

    # We add the prediction column (flattened from (n, 1) to (n,))
    test_dataset['prediction'] = np.asarray(predictions).reshape(-1)

    # Sort and then retrieve the movie ids
    top_recommendations = test_dataset.sort_values(by='prediction', ascending=False).head(top_k)
    recommended_movie_ids = top_recommendations['mid'].tolist()

    return recommended_movie_ids




In [None]:
# Create the output
import csv
from tqdm import tqdm
from tensorflow.keras.models import load_model

# Open the file in write mode. newline='' is what the csv module asks for:
# the writer emits its own line endings, and without it extra blank lines can
# appear on platforms that translate '\n'.
with open('/content/drive/MyDrive/Kaggle_competition/solution.csv', 'w', encoding='UTF8', newline='') as f:
    # create the csv writer
    writer = csv.writer(f)
    # header row
    writer.writerow(['user_id', 'prediction'])

    # one row per user: the user id and the space-separated recommended movie ids
    for user_id in tqdm(test.user_id.unique(), desc="Processing Users"):
        test_set_user = create_test_dataset(user_id, ratings)
        relevant_items = get_prediction(test_set_user)
        list_relevants = ' '.join(str(elem) for elem in relevant_items)
        writer.writerow([str(user_id), list_relevants])