In [1]:
# Load movies from HDFS, generate embeddings of the movie titles with BERT, then save the
# embeddings to HDFS and Redis.

import os
import subprocess
from time import localtime, strftime

import numpy as np
import redis

import tensorflow_hub as hub
import tensorflow_text as text

2023-09-26 08:37:35.328427: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-26 08:37:35.331505: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-26 08:37:35.393030: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-26 08:37:35.395819: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Connection settings and key names shared by the cells below.
HDFS_PATH = "hdfs://sparrow-recsys-dev:8020/sparrow_recsys/"
HDFS_PATH_MOVIE_EMBEDDINGS = HDFS_PATH + "movie-embeddings/"

REDIS_SERVER = "sparrow-recsys-dev"
REDIS_PORT = 6379
# The password is read from the REDIS_PASSWD environment variable so that a real
# credential never has to be committed with the notebook. The literal is only
# the fallback used when the variable is unset, which keeps existing runs working.
REDIS_PASSWD = os.environ.get("REDIS_PASSWD", "123456")
# Key that stores the currently active embedding version.
REDIS_KEY_MOVIE_EMBEDDING_VERSION = "sparrow_recsys:version:me"
# Prefix of the per-movie embedding keys: "<prefix>:<version>:<movie id>".
REDIS_KEY_PREFIX_MOVIE_EMBEDDING = "sparrow_recsys:me"

In [3]:
# Load movies from HDFS.
#
# Streams the movie part files through `hdfs dfs -cat` and keeps every line that
# splits into exactly three tab-separated fields. The result is a 2-D numpy
# array of byte strings, one row per movie.

cmd = "hdfs dfs -cat " + HDFS_PATH + "movies/*/part-*"
print(cmd)

movies = []
# The context manager closes the pipe and waits for the child process, so
# `returncode` is populated once the block exits.
with subprocess.Popen(cmd.split(" "), stdout=subprocess.PIPE) as cat_hdfs_movies:
    for line in cat_hdfs_movies.stdout:
        movie_info = line.strip().split(b"\t")
        if len(movie_info) == 3:
            movies.append(movie_info)

# A failed read must not go on to overwrite the stored embeddings with partial data.
if cat_hdfs_movies.returncode != 0:
    raise RuntimeError(f"'{cmd}' failed with exit code {cat_hdfs_movies.returncode}")

# Check for emptiness before indexing movies[0]; raising (rather than exit())
# stops "Run All" without shutting down the notebook kernel.
if len(movies) == 0:
    raise RuntimeError(f"No movies loaded from HDFS: {cmd}")

movies = np.array(movies)
print(f"HDFS movies count: {len(movies)}, first: {movies[0]}")

hdfs dfs -cat hdfs://sparrow-recsys-dev:8020/sparrow_recsys/movies/*/part-*
HDFS movies count: 983, first: [b'0' b'title' b'genres']


In [4]:
# Load the two TF Hub layers used to embed titles: the BERT text preprocessor
# first, then the small BERT encoder that consumes its output.
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/2"
encoder_model = hub.KerasLayer(tfhub_handle_encoder)

In [5]:
# Column 0 of `movies` holds the ids (as bytes), column 1 the titles.
movie_ids = [int(raw_id.decode('utf-8')) for raw_id in movies[:, 0]]
movie_titles = movies[:, 1]

# Embed all titles in a single batch: preprocess, then encode.
text_preprocessed = preprocess_model(movie_titles)
text_results = encoder_model(text_preprocessed)

# Summarise what the encoder returned.
print("text_results keys：", text_results.keys())
for output_name in ("default", "pooled_output", "sequence_output"):
    print(f"{output_name} shape: ", text_results[output_name].shape)
# encoder_outputs is a sequence, so report its length instead of a shape.
print("encoder_outputs shape: ", len(text_results["encoder_outputs"]))

text_results keys： dict_keys(['pooled_output', 'sequence_output', 'encoder_outputs', 'default'])
default shape:  (983, 128)
pooled_output shape:  (983, 128)
sequence_output shape:  (983, 128, 128)
encoder_outputs shape:  4


In [6]:
# Serialise each pooled embedding as a comma-separated string keyed by movie id.
# Building a dict removes duplicate ids (the last occurrence wins).
embedding_by_id = {
    movie_id: ','.join(str(value) for value in vector.numpy())
    for movie_id, vector in zip(movie_ids, text_results["pooled_output"])
}

# List of (movie id, embedding string) pairs ordered by movie id.
movie_embeddings = sorted(embedding_by_id.items())

print(f"Movie embedding sample: {movie_embeddings[0]}")

Movie embedding sample: (0, '0.033289623,0.93584985,-0.78879136,-0.8479012,-0.45617646,0.0041523413,0.052315157,0.9258627,0.065586776,-0.2704935,-0.99555856,0.22534114,0.99763864,0.9903045,-0.44187042,0.9429301,-0.5767987,-0.951814,0.9848147,-0.99357456,0.42498532,0.982719,-0.15330869,0.731322,0.9675594,0.73747975,-0.99981576,0.1222808,-0.47955528,0.9926778,-0.011694358,0.89508057,-0.7799539,-0.68458784,-0.9995591,0.7260162,0.081243195,0.06644778,-0.5784699,-0.99609244,-0.65765274,0.54004353,0.9745671,0.94613683,0.3980487,0.99889624,0.7613979,-0.98122907,0.00065292034,0.9893965,-0.83787817,-0.8747784,-0.15984374,-0.93225974,0.88025576,-0.03661433,0.86575645,-0.78201556,0.99807733,-0.9750251,0.99824923,-0.09797347,0.0053915936,-0.99980783,0.10235674,0.3789161,-0.9501642,0.9269745,-0.17737952,-0.89949137,0.9496383,0.9959522,0.97815263,0.13525155,0.90568894,0.7776421,0.009329109,-0.71928215,0.076657616,0.011706317,0.34641483,0.36574164,0.99943656,0.87314326,-0.42278737,0.8296881,-0.997685

In [7]:
# Save to HDFS.
#
# Writes the embeddings to a local tab-separated file ("<movie id>\t<embedding>"
# per line), replaces the HDFS embeddings directory with it, and always removes
# the local file afterwards.
tmp_file_name = 'movie-embeddings.csv'
hdfs_part_dir = f"{HDFS_PATH_MOVIE_EMBEDDINGS}0000/"

# Mode 'w' truncates any file left over from an earlier run.
with open(tmp_file_name, 'w') as tmp_file:
    for movie_id, embedding in movie_embeddings:
        tmp_file.write(f"{movie_id}\t{embedding}\n")

try:
    hdfs_steps = (
        # -f keeps `-rm` from failing when the directory does not exist yet.
        ["-rm", "-r", "-f", HDFS_PATH_MOVIE_EMBEDDINGS],
        ["-mkdir", "-p", hdfs_part_dir],
        ["-put", f"./{tmp_file_name}", f"{hdfs_part_dir}part-0"],
    )
    for hdfs_args in hdfs_steps:
        # check=True raises CalledProcessError on a non-zero exit code, so a
        # failed step stops the cell instead of being reported as a success.
        subprocess.run(["hdfs", "dfs", *hdfs_args], stdout=subprocess.PIPE, check=True)
finally:
    os.remove(tmp_file_name)

print(f"Movie embeddings is uploaded to HDFS: {HDFS_PATH_MOVIE_EMBEDDINGS}")

Movie embeddings is uploaded to HDFS: hdfs://sparrow-recsys-dev:8020/sparrow_recsys/movie-embeddings/


In [8]:
# Save to Redis.
#
# Each run writes its embeddings under a new timestamp-based version and then
# points the version key at that timestamp.
version = strftime("%Y%m%d%H%M%S", localtime())

# Keys have the form "<prefix>:<version>:<movie id>".
key_prefix = f"{REDIS_KEY_PREFIX_MOVIE_EMBEDDING}:{version}"
movie_embeddings_redis = [
    (f"{key_prefix}:{movie_id}", embedding)
    for movie_id, embedding in movie_embeddings
]

r = redis.Redis(host=REDIS_SERVER, port=REDIS_PORT, password=REDIS_PASSWD)
# Write all embeddings first, and only then update the version key.
r.mset(dict(movie_embeddings_redis))
r.set(REDIS_KEY_MOVIE_EMBEDDING_VERSION, version)

print(f"Movie embedding version is updated to: {version}")

Movie embedding version is updated to: 20230926083751
