# Customer Model on GCP 

## Play with BQML

Re-create the BQML recommender from the reference below in a notebook.

Ref: https://cloud.google.com/architecture/building-a-recommendation-system-with-bigqueryml

From BQML Guide:
* https://cloud.google.com/bigquery-ml/docs/bigqueryml-mf-explicit-tutorial
* https://cloud.google.com/bigquery-ml/docs/bigqueryml-mf-implicit-tutorial


### Create dataset for data and model

In [None]:
from google.cloud import bigquery

# BigQuery client used by the Python cells below; jobs are issued with location "US".
BQ_LOCATION = "US"
client = bigquery.Client(location=BQ_LOCATION)

In [2]:
# Create the dataset that holds the sample data and the trained model.
# `exists_ok=True` keeps this cell idempotent: re-running the notebook returns the
# existing dataset instead of failing because it was already created.
dataset = client.create_dataset("bqml_recommender", exists_ok=True)

In [3]:
%%bigquery

-- Copy up to 100000 rows of the public 2017 GA sessions tables into our dataset.
CREATE OR REPLACE TABLE bqml_recommender.ga_clientid_sample AS (
  SELECT
    *
  FROM
    `bigquery-public-data.google_analytics_sample.ga_sessions_2017*`
  LIMIT
    100000
);

-- Fill clientId on every row with the row's fullvisitorId.
UPDATE
  bqml_recommender.ga_clientid_sample
SET
  clientId = fullvisitorId
WHERE
  TRUE;

Query complete after 0.05s: 100%|██████████| 1/1 [00:00<00:00, 1064.27query/s]


In [4]:
%%bigquery

-- Build the training table: one row per (userId, itemId) holding the summed
-- pageview duration, derived from the sampled GA sessions in three steps.
CREATE OR REPLACE TABLE bqml_recommender.aggregate_web_stats AS (
  WITH
    durations AS (
      --calculate pageview durations
      -- One row per hit, keyed by clientId-visitNumber-hitNumber. The duration is the
      -- `time` of the next hit in the same clientId-visitNumber partition minus this
      -- hit's `time`.
      SELECT
        CONCAT(clientId,'-',
             CAST(visitNumber AS STRING),'-',
             CAST(hitNumber AS STRING) ) AS visitorId_session_hit,
        LEAD(time, 1) OVER (
          PARTITION BY CONCAT(clientId,'-',CAST(visitNumber AS STRING))
          ORDER BY
          time ASC ) - time AS pageview_duration
      FROM
        `bqml_recommender.ga_clientid_sample`,
        UNNEST(hits) AS hit
    ),
    prodview_durations AS (
      --filter for product detail pages only
      -- Note: userId here is clientId-visitNumber, i.e. one id per client visit.
      -- A NULL duration coming from `durations` is replaced with 1.
     SELECT
        CONCAT(clientId,'-',CAST(visitNumber AS STRING)) AS userId,
        productSKU AS itemId,
        IFNULL(dur.pageview_duration,
         1) AS pageview_duration,
      FROM
        `bqml_recommender.ga_clientid_sample` t,
        UNNEST(hits) AS hits,
        UNNEST(hits.product) AS hits_product
      JOIN
        durations dur
      ON
        -- Same clientId-visitNumber-hitNumber key as built in `durations`.
        CONCAT(clientId,'-',
               CAST(visitNumber AS STRING),'-',
               CAST(hitNumber AS STRING)) = dur.visitorId_session_hit
      WHERE
      eCommerceAction.action_type = "2"
    ),
    aggregate_web_stats AS(
      --sum pageview durations by userId, itemId
      SELECT
        userId,
        itemId,
        SUM(pageview_duration) AS session_duration
      FROM
        prodview_durations
      GROUP BY
        userId,
        itemId )
    -- Materialise the aggregated rows as the table contents.
    SELECT
     *
   FROM
      aggregate_web_stats
);

Query complete after 0.00s: 100%|██████████| 9/9 [00:00<00:00, 4527.86query/s]                        


Querying directly with the `%%bigquery` cell magic returns the result to the workspace as a pandas DataFrame.

Alternatively, use `df = client.query(sql).to_dataframe()` to get the DataFrame.

In [5]:
%%bigquery

-- Peek at ten rows of the aggregated (userId, itemId, session_duration) table.
SELECT *
FROM bqml_recommender.aggregate_web_stats
LIMIT 10;

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 366.22query/s]                          
Downloading: 100%|██████████| 10/10 [00:01<00:00,  7.68rows/s]


Unnamed: 0,userId,itemId,session_duration
0,2976039134355396575-6,GGOEAXXX0808,11658
1,9592560811612376734-1,GGOEAXXX0808,9462
2,7770623646473799793-2,GGOEAXXX0808,3303
3,1380176935199575290-3,GGOEAXXX0808,3
4,6585359463084064386-1,GGOEAXXX0808,16573
5,6411725969215935190-1,GGOEAXXX0808,130769
6,191473448043211052-2,GGOEAXXX0808,1858
7,9575515287846808484-3,GGOEAXXX0808,1624
8,394966243363906458-1,GGOEAXXX0808,11073
9,5717006107607349418-1,GGOEAXXX0808,213289


In [6]:
# Same preview as the `%%bigquery` cell above, issued through the Python client.
sql = """
SELECT *
FROM bqml_recommender.aggregate_web_stats
LIMIT 10;
"""

# Submit the query, then convert its result into a pandas DataFrame.
query_job = client.query(sql)
df = query_job.to_dataframe()
df.head()

Unnamed: 0,userId,itemId,session_duration
0,2976039134355396575-6,GGOEAXXX0808,11658
1,9592560811612376734-1,GGOEAXXX0808,9462
2,7770623646473799793-2,GGOEAXXX0808,3303
3,1380176935199575290-3,GGOEAXXX0808,3
4,6585359463084064386-1,GGOEAXXX0808,16573
5,6411725969215935190-1,GGOEAXXX0808,130769
6,191473448043211052-2,GGOEAXXX0808,1858
7,9575515287846808484-3,GGOEAXXX0808,1624
8,394966243363906458-1,GGOEAXXX0808,11073
9,5717006107607349418-1,GGOEAXXX0808,213289


In [7]:
# Show the preview explicitly rather than via IPython's `_` (last output), which
# only works when the previous cell has just been executed in this kernel.
df.head()

Unnamed: 0,userId,itemId,session_duration
0,2976039134355396575-6,GGOEAXXX0808,11658
1,9592560811612376734-1,GGOEAXXX0808,9462
2,7770623646473799793-2,GGOEAXXX0808,3303
3,1380176935199575290-3,GGOEAXXX0808,3
4,6585359463084064386-1,GGOEAXXX0808,16573
5,6411725969215935190-1,GGOEAXXX0808,130769
6,191473448043211052-2,GGOEAXXX0808,1858
7,9575515287846808484-3,GGOEAXXX0808,1624
8,394966243363906458-1,GGOEAXXX0808,11073
9,5717006107607349418-1,GGOEAXXX0808,213289


### Create and Train Model

In [8]:
%%bigquery

-- Train an implicit-feedback matrix factorization model on the aggregated table,
-- using session_duration as the rating for each (userId, itemId) pair.
CREATE OR REPLACE MODEL bqml_recommender.retail_recommender
OPTIONS (
  model_type = 'matrix_factorization',
  user_col = 'userId',
  item_col = 'itemId',
  rating_col = 'session_duration',
  feedback_type = 'implicit'
) AS
SELECT
  *
FROM
  bqml_recommender.aggregate_web_stats;

Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1714.99query/s]                        


### Make Predictions

In [10]:
%%bigquery

DECLARE MY_USERID STRING DEFAULT "0824461277962362623-1";

-- Top five recommendations for MY_USERID, highest predicted confidence first.
SELECT
  *
FROM
  ML.RECOMMEND(
    MODEL `bqml_recommender.retail_recommender`,
    (SELECT MY_USERID AS userID)
  )
ORDER BY
  predicted_session_duration_confidence DESC
LIMIT
  5;

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 489.53query/s] 
Downloading: 100%|██████████| 5/5 [00:01<00:00,  4.03rows/s]


Unnamed: 0,predicted_session_duration_confidence,userId,itemId
0,37094.143599,0824461277962362623-1,GGOEYOCR077399
1,34792.471101,0824461277962362623-1,GGOEGETR014599
2,30125.491858,0824461277962362623-1,GGOEYOLR018699
3,28760.458195,0824461277962362623-1,GGOEGAAX0351
4,28178.903577,0824461277962362623-1,GGOEGAAX0568


## Play with TF2 and Keras

Ref: 
* https://www.kaggle.com/code/morrisb/how-to-recommend-anything-deep-recommender
* https://nipunbatra.github.io/blog/ml/2017/12/18/recommend-keras.html



### Matrix Factorisation With Keras And Gradient Descent

In [None]:
# Matrix-factorisation recommender: a user embedding and a movie embedding are
# combined with a dot product to predict the rating.
# NOTE(review): `df_filterd`, `df_train`, `df_test`, `Input`, `Embedding`, `Reshape`,
# `Dot`, `Model`, `np` and `mean_squared_error` are not defined or imported anywhere
# in the visible notebook -- confirm where they are expected to come from before
# running this cell.

# Create user- & movie-id mapping
# (each distinct raw id is mapped to its position in `.unique()`, giving 0-based indices)
user_id_mapping = {id:i for i, id in enumerate(df_filterd['User'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(df_filterd['Movie'].unique())}


# Create correctly mapped train- & testset
train_user_data = df_train['User'].map(user_id_mapping)
train_movie_data = df_train['Movie'].map(movie_id_mapping)

test_user_data = df_test['User'].map(user_id_mapping)
test_movie_data = df_test['Movie'].map(movie_id_mapping)


# Get input variable-sizes
# (`users` and `movies` size the embedding tables below; both embeddings share one width)
users = len(user_id_mapping)
movies = len(movie_id_mapping)
embedding_size = 10


##### Create model
# Set input layers
user_id_input = Input(shape=[1], name='user')
movie_id_input = Input(shape=[1], name='movie')

# Create embedding layers for users and movies
user_embedding = Embedding(output_dim=embedding_size, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)
movie_embedding = Embedding(output_dim=embedding_size, 
                            input_dim=movies,
                            input_length=1, 
                            name='item_embedding')(movie_id_input)

# Reshape the embedding layers
user_vector = Reshape([embedding_size])(user_embedding)
movie_vector = Reshape([embedding_size])(movie_embedding)

# Compute dot-product of reshaped embedding layers as prediction
y = Dot(1, normalize=False)([user_vector, movie_vector])

# Setup model
model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
model.compile(loss='mse', optimizer='adam')


# Fit model
# (a single epoch, holding out 10% of the training rows for validation)
model.fit([train_user_data, train_movie_data],
          df_train['Rating'],
          batch_size=256, 
          epochs=1,
          validation_split=0.1,
          shuffle=True)

# Test model
y_pred = model.predict([test_user_data, test_movie_data])
y_true = df_test['Rating'].values

#  Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Matrix-Factorization: {:.4f} RMSE'.format(rmse))

### Deep Learning With Keras


In [None]:
# Embedding + dense-layer recommender: the user and movie embeddings are
# concatenated and passed through dense layers to predict the rating.
# NOTE(review): this cell reuses `users`, `movies`, `train_user_data`,
# `train_movie_data`, `test_user_data` and `test_movie_data` from the previous cell,
# and `df_train`, `df_test`, the Keras symbols, `np` and `mean_squared_error` are not
# defined in the visible notebook -- confirm before running.

# Setup variables
# (unlike the dot-product model, the two embeddings may have different widths here)
user_embedding_size = 20
movie_embedding_size = 10


##### Create model
# Set input layers
user_id_input = Input(shape=[1], name='user')
movie_id_input = Input(shape=[1], name='movie')

# Create embedding layers for users and movies
user_embedding = Embedding(output_dim=user_embedding_size, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)
movie_embedding = Embedding(output_dim=movie_embedding_size, 
                            input_dim=movies,
                            input_length=1, 
                            name='item_embedding')(movie_id_input)

# Reshape the embedding layers
user_vector = Reshape([user_embedding_size])(user_embedding)
movie_vector = Reshape([movie_embedding_size])(movie_embedding)

# Concatenate the reshaped embedding layers
concat = Concatenate()([user_vector, movie_vector])

# Combine with dense layers
# NOTE(review): neither Dense layer is given an `activation`; if the Keras default is
# linear, this stack has no non-linearity -- confirm whether e.g. relu was intended.
dense = Dense(256)(concat)
y = Dense(1)(dense)

# Setup model
model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
model.compile(loss='mse', optimizer='adam')


# Fit model
# (a single epoch, holding out 10% of the training rows for validation)
model.fit([train_user_data, train_movie_data],
          df_train['Rating'],
          batch_size=256, 
          epochs=1,
          validation_split=0.1,
          shuffle=True)

# Test model
y_pred = model.predict([test_user_data, test_movie_data])
y_true = df_test['Rating'].values

#  Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Deep Learning: {:.4f} RMSE'.format(rmse))


## Play with Tensorflow Recommenders

Ref:
* https://blog.tensorflow.org/2020/09/introducing-tensorflow-recommenders.html
* https://blog.tensorflow.org/2020/11/tensorflow-recommenders-scalable-retrieval-feature-interaction-modelling.html
* https://www.tensorflow.org/recommenders/examples/basic_retrieval

In [13]:
!pip install --user tensorflow_recommenders

[0mCollecting tensorflow_recommenders
  Using cached tensorflow_recommenders-0.6.0-py3-none-any.whl (85 kB)
Collecting google-auth-oauthlib<0.5,>=0.4.1
  Using cached google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)
[0mInstalling collected packages: google-auth-oauthlib, tensorflow_recommenders
[0mSuccessfully installed google-auth-oauthlib-0.4.6 tensorflow_recommenders-0.6.0
[0m

In [None]:
import tensorflow as tf
 
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs


In [None]:
# Ratings data.
ratings = tfds.load("movie_lens/100k-ratings", split="train")
# Features of all the available movies.
movies = tfds.load("movie_lens/100k-movies", split="train")

In [None]:
# Keep only the two features the retrieval model reads from each rating.
ratings = ratings.map(
    lambda rating: {
        "movie_title": rating["movie_title"],
        "user_id": rating["user_id"],
    }
)

# Reduce each movie record to its title.
movies = movies.map(lambda movie: movie["movie_title"])

In [None]:
class TwoTowerMovielensModel(tfrs.Model):
  """Two-tower MovieLens retrieval model.

  A user tower and a movie tower each map a raw string feature to an embedding;
  a `tfrs.tasks.Retrieval` task turns a batch of paired embeddings into a loss
  and tracks top-k metrics against the candidate movies.

  The constructor reads the notebook-level `movies` dataset to build the
  candidate set for the metrics.
  """

  def __init__(self):
    """Set up the two towers and the retrieval task."""
    super().__init__()

    # How large the representation vectors are for inputs: larger vectors make
    # for a more expressive model but may cause over-fitting.
    embedding_dim = 32
    num_unique_users = 1000
    num_unique_movies = 1700
    eval_batch_size = 128

    # Set up user and movie representations.
    # NOTE(review): neither StringLookup layer is given a vocabulary or adapted in
    # this notebook -- confirm the lookups resolve ids as intended before relying
    # on the resulting embeddings.
    self.user_model = tf.keras.Sequential([
      # We first turn the raw user ids into contiguous integers by looking them
      # up in a vocabulary.
      tf.keras.layers.experimental.preprocessing.StringLookup(
          max_tokens=num_unique_users),
      # We then map the result into embedding vectors.
      tf.keras.layers.Embedding(num_unique_users, embedding_dim)
    ])

    self.movie_model = tf.keras.Sequential([
      tf.keras.layers.experimental.preprocessing.StringLookup(
          max_tokens=num_unique_movies),
      tf.keras.layers.Embedding(num_unique_movies, embedding_dim)
    ])

    # The `Task` object has two purposes: (1) it computes the loss and (2)
    # keeps track of metrics.
    self.task = tfrs.tasks.Retrieval(
        # In this case, our metrics are top-k metrics: given a user and a known
        # watched movie, how highly would the model rank the true movie out of
        # all possible movies?
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies.batch(eval_batch_size).map(self.movie_model)
        )
    )

  # Defined at class level (not nested inside `__init__`) so that it is a real
  # method of the model rather than a local function that is never used.
  def compute_loss(self, features, training=False):
    """Return the retrieval loss for one batch.

    Args:
      features: mapping with a "user_id" entry and a "movie_title" entry.
      training: accepted for interface compatibility; not used here.

    Returns:
      The value produced by the retrieval task for the batch.
    """
    # Compute user and item embeddings.
    user_embeddings = self.user_model(features["user_id"])
    movie_embeddings = self.movie_model(features["movie_title"])

    # Pass them into the task to get the resulting loss. The lower the loss is, the
    # better the model is at telling apart true watches from watches that did
    # not happen in the training data.
    return self.task(user_embeddings, movie_embeddings)
    

In [None]:
# Instantiate the model class defined above (the notebook defines
# `TwoTowerMovielensModel`; there is no `MovielensModel`).
model = TwoTowerMovielensModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# Train on the ratings in batches of 4096 without progress output.
model.fit(ratings.batch(4096), verbose=False)

In [None]:
# Brute-force top-k retrieval layer that embeds queries with the trained user tower.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# Index every movie as an (identifier, embedding) pair: the title batches are the
# identifiers and the movie tower provides the matching candidate embeddings.
index.index_from_dataset(
    tf.data.Dataset.zip(
        (movies.batch(100), movies.batch(100).map(model.movie_model))
    )
)

# Get recommendations.
_, titles = index(tf.constant(["42"]))
print(f"Recommendations for user 42: {titles[0, :3]}")
