In [9]:
import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import tensorflow as tf
from tensorflow.keras.layers import Embedding
import gcsfs
from google.cloud import bigquery
# BigQuery client used by the query cells of this notebook; it is created
# without an explicit project or credentials argument.
client = bigquery.Client()

In [7]:
# Read the node vocabulary asset from GCS: the file is split on newlines and
# every non-empty line becomes one entry of `tags_list`.
tags_path = "gs://edc-dev/kubeflowpipelines-default/tfx_pipeline_output/node2vec_sports_syn_0_1_0/Transform/transform_graph/18561/transform_fn/assets/node_vocab_txt"
fs = gcsfs.GCSFileSystem(project="res-nbcupea-dev-ds-sandbox-001")

with fs.open(tags_path, "r") as vocab_file:
    raw_vocab = vocab_file.read()

# Empty strings (e.g. produced by a trailing newline) are not vocabulary entries.
tags_list = list(filter(None, raw_vocab.split("\n")))
tags_list

['News',
 'TV',
 'Sports',
 'Talk',
 'covid-19',
 'new',
 'Entertainment',
 'Football',
 'Business & Finance',
 'coronavirus',
 'reports',
 '022ae9a1-d2ac-3238-b686-96c2a5ce26ba',
 'president',
 'news',
 'discuss',
 'trump',
 'joins',
 'today',
 'SportingEvent',
 'Comedy',
 'first',
 'mike',
 'show',
 'Subgenre:Talk',
 'Reality',
 'one',
 'us',
 'c04236ee-ca40-3a75-a008-230eaa805ba4',
 '39337ec8-062e-32ba-afc3-541adf683fce',
 'pandemic',
 'chris',
 'says',
 'nbc',
 'joe',
 'former',
 'cnbcs',
 'florio',
 '235b584f-ef32-38fe-8c51-e769f320257f',
 'day',
 'nbcs',
 'biden',
 '2020',
 'ddca41be-7eff-3441-96df-14b01bb41629',
 'world',
 'people',
 'watch',
 'talks',
 'get',
 'back',
 '2c1b4ec5-a425-32ed-a230-26638587fbec',
 'Soccer',
 'house',
 'ed68bf9f-dd4f-34b0-805f-142f3483c997',
 'season',
 'time',
 'look',
 'two',
 '38cfc6e9-47ba-33e0-a857-143193c2b7d6',
 'Drama',
 'c909dfb4-1532-3d0c-a532-0a5b3798efec',
 '700e1a8a-dcdd-3bea-a86b-7a03a595f26a',
 'a63d70f5-4640-348c-acc2-7b5c02ea5a65',
 

In [21]:
# Get the data mapping for title : keywords
#
# The query returns one row per lower-cased program_title with:
#   keywords - the distinct keywords of that title that also appear as a token,
#   index    - tok.row_number of every joined token row,
#   dataset  - the constant label "merlin".
# The JOIN is an inner join, so keywords without a matching token (and titles
# whose keywords match no token at all) do not appear in the result.
#
# NOTE(review): `ROW_NUMBER() OVER()` has no ORDER BY, so the numbering is not
# tied to any column of the token table. These numbers are later passed to the
# token embedding layer as ids - confirm they line up with the vocabulary order
# the model was trained with (including whether ids should start at 1 or 0).
# NOTE(review): `keywords` is aggregated with DISTINCT but `index` is not, so a
# (title, keyword) pair that occurs more than once repeats its id in `index`.
query = """
    WITH  titles AS (
        SELECT LOWER(program_title) AS program_title, k AS keywords
        FROM `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.synopsis_dylan_150tag_with_tokens_and_keywords`,
        UNNEST(keywords) k
    ),

    tok AS (
        SELECT ROW_NUMBER() OVER() row_number, tokens
        FROM `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.node2vec_token_edc_dev`
    )

    SELECT a.program_title, ARRAY_AGG(DISTINCT a.keywords) AS keywords, ARRAY_AGG(b.row_number) AS index,
        "merlin" AS dataset
    FROM titles a
    JOIN tok b
    ON a.keywords = b.tokens
    GROUP BY a.program_title
"""

# Run the query in the US location and materialise the result as a DataFrame.
df_map = client.query(query=query, location="US").to_dataframe()
df_map

Unnamed: 0,program_title,keywords,index,dataset
0,tras el corazón verde,"[Village, Jungle, Affair, Kidnapping, Smugglin...","[43231, 56288, 4121, 3064, 608, 87103, 4166, 8...",merlin
1,bedard zamana kya jane,"[Crime drama, Crime]","[2076, 277]",merlin
2,unnaruge naan irundhal,"[Romantic comedy, Comedy]","[20, 3177]",merlin
3,hitler's circle of evil,"[Politics & Government, Military & War, Docume...","[3090, 6259, 231]",merlin
4,a special edition of the rachel maddow show,"[News, Politics & Government]","[3090, 1]",merlin
...,...,...,...,...
359155,millonario,"[Comedy drama, Comedy-Drama]","[1527, 2385]",merlin
359156,jane bleibt jane,"[Comedy drama, Comedy-Drama]","[1527, 2385]",merlin
359157,ucitelka,"[Comedy-Drama, Comedy drama]","[1527, 2385]",merlin
359158,los recuerdos,"[Comedy-Drama, Comedy drama]","[1527, 2385]",merlin


In [26]:
# Title -> keyword / token-id mapping built from ContentMetadataView ("cmv").
#
# Stages of the query:
#   strip_str_array - TRIMs each array element and de-duplicates; elements equal
#                     to "" are filtered out.
#   titles_data     - ContentMetadataView joined to ContentOrdinalId on the
#                     lower-cased title, restricted to rows with a long synopsis
#                     and a content_ordinal_id; tags and subgenres are each
#                     collapsed into one comma-separated string per group.
#   keywords_table  - tags and subgenres joined with ",", falling back to the
#                     single keyword "movie" when the joined string is empty,
#                     then split back into an array.
#   titles / tok    - one row per (lower-cased title, keyword), and the token
#                     table numbered with ROW_NUMBER().
# The final SELECT yields one row per title with its distinct matched keywords,
# the row numbers of the joined tokens, and the constant label "cmv".
#
# NOTE(review): the `WHERE t != ""` filter is applied to the untrimmed value,
# so a whitespace-only element passes the filter and becomes "" after TRIM.
# NOTE(review): `cid.content_ordinal_id IS NOT NULL` in the WHERE clause removes
# the unmatched rows of the LEFT JOIN, so it behaves like an inner join.
# NOTE(review): `ROW_NUMBER() OVER()` has no ORDER BY, so the numbering is not
# tied to any column of the token table - confirm it matches the vocabulary
# order expected by the embedding model.
# NOTE(review): `keywords` is aggregated with DISTINCT but `index` is not.
TITLES_QUERY_keywords = """
    CREATE TEMP FUNCTION strip_str_array(val ANY TYPE) AS ((
      SELECT ARRAY_AGG(DISTINCT TRIM(t))
      FROM UNNEST(val) t
      WHERE t != ""
    ));
    
    WITH titles_data AS (
        SELECT DISTINCT
            TitleDetails_title, 
            TitleType, 
            cid.content_ordinal_id,
            STRING_AGG(DISTINCT TitleTags, ',') AS TitleTags,
            STRING_AGG(DISTINCT TitleSubgenres, ',') AS TitleSubgenres
        FROM `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.ContentMetadataView` cmv
        LEFT JOIN `res-nbcupea-dev-ds-sandbox-001.recsystem.ContentOrdinalId` cid
            ON LOWER(cmv.TitleDetails_title) = LOWER(cid.program_title)
        WHERE 
            TitleDetails_longsynopsis IS NOT NULL
            AND cid.content_ordinal_id IS NOT NULL
        GROUP BY 
            TitleDetails_title, 
            TitleType,
            cid.content_ordinal_id
        ),
    
    keywords_table AS (SELECT TitleDetails_title AS program_title, 
        strip_str_array(SPLIT(COALESCE(NULLIF(ARRAY_TO_STRING([TitleTags,TitleSubgenres], ","), ""), "movie"), ",")) AS keywords,
    FROM titles_data),
    
    titles AS (
        SELECT LOWER(program_title) AS program_title, k AS keywords
        FROM keywords_table,
        UNNEST(keywords) k
    ),

    
    tok AS (
        SELECT ROW_NUMBER() OVER() row_number, tokens
        FROM `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.node2vec_token_edc_dev`
    )
    
    SELECT a.program_title, ARRAY_AGG(DISTINCT a.keywords) AS keywords, ARRAY_AGG(b.row_number) AS index,
        "cmv" AS dataset
    FROM titles a
    JOIN tok b
    ON a.keywords = b.tokens
    GROUP BY a.program_title
    
    
"""
# Run the query in the US location and materialise the result as a DataFrame.
df_map2 = client.query(query=TITLES_QUERY_keywords, location="US").to_dataframe()
df_map2

Unnamed: 0,program_title,keywords,index,dataset
0,reservation road,"[Daughter, Thriller, Fight for justice, Invest...","[2266, 376, 74052, 2457, 76494, 2143, 1803, 44...",cmv
1,this christmas,"[Amusing, Daughter, Love, Interracial relation...","[734, 2266, 792, 95414, 1527, 374, 76454, 20, ...",cmv
2,poker after dark,"[Sports, Poker, Sports non-event, Card Games, ...","[3, 1513, 130, 1687, 2, 19]",cmv
3,rumble fish,"[Girlfriend, 1980s, Pool hall, New York State,...","[3997, 2045, 114825, 76458, 3442, 1298, 3504, ...",cmv
4,dear santa,"[Single woman, Kitchen, Heartwarming, Amusing,...","[87051, 19782, 2808, 734, 792, 151951, 102138,...",cmv
...,...,...,...,...
2260,"carson kressley on ""rupaul's drag race"", emmy ...","[Entertainment, TV]","[7, 2]",cmv
2261,chia head,"[Entertainment, TV]","[7, 2]",cmv
2262,wacky glue,"[Entertainment, TV]","[7, 2]",cmv
2263,baby toupee,"[Entertainment, TV]","[7, 2]",cmv


In [68]:
# Merge both title mappings. `df_map2` is concatenated last and keep="last" is
# used, so for a title present in both frames the `df_map2` row is retained.
df = (
    pd.concat([df_map, df_map2])
    .drop_duplicates(subset="program_title", keep="last")
    .reset_index(drop=True)
)
df

Unnamed: 0,program_title,keywords,index,dataset
0,tras el corazón verde,"[Village, Jungle, Affair, Kidnapping, Smugglin...","[43231, 56288, 4121, 3064, 608, 87103, 4166, 8...",merlin
1,bedard zamana kya jane,"[Crime drama, Crime]","[2076, 277]",merlin
2,unnaruge naan irundhal,"[Romantic comedy, Comedy]","[20, 3177]",merlin
3,hitler's circle of evil,"[Politics & Government, Military & War, Docume...","[3090, 6259, 231]",merlin
4,a special edition of the rachel maddow show,"[News, Politics & Government]","[3090, 1]",merlin
...,...,...,...,...
359541,"carson kressley on ""rupaul's drag race"", emmy ...","[Entertainment, TV]","[7, 2]",cmv
359542,chia head,"[Entertainment, TV]","[7, 2]",cmv
359543,wacky glue,"[Entertainment, TV]","[7, 2]",cmv
359544,baby toupee,"[Entertainment, TV]","[7, 2]",cmv


In [79]:
# Write titles to gcs bucket
# One title per line, in `df` row order, with no trailing newline.
# NOTE(review): a title that itself contains "\n" would occupy several lines of
# the file - confirm that cannot happen in the source tables.
fs = gcsfs.GCSFileSystem(project="res-nbcupea-dev-ds-sandbox-001")
content_titles = df["program_title"].to_list()
# Titles contain non-ASCII characters, so the encoding is pinned to UTF-8 instead
# of depending on the locale default of the machine running the notebook.
with fs.open("gs://edc-dev/content_titles_vocab", "w", encoding="utf-8") as fid:
    # A single write of the joined text; `writelines` on a str would iterate it
    # one character at a time.
    fid.write("\n".join(content_titles))

In [64]:
# Load the saved model stored at `token_embed_url` and keep a handle on its
# layer named "Embedding"; that layer is what turns token ids into vectors.
token_embed_url = "gs://edc-dev/kubeflowpipelines-default/tfx_pipeline_output/node2vec_sports_syn_0_1_1/Trainer/model/19130/serving_model_dir"
print("Loading model")
embed = (
    tf.keras.models
    .load_model(token_embed_url)
    .get_layer("Embedding")
)

100%|██████████| 360/360 [00:18<00:00, 19.66it/s]


In [73]:
batch_size = 1000
num_titles = df.shape[0]
batch_means = []
# Run the inference
# Rows are visited positionally in strides of `batch_size`. This gives exactly
# ceil(num_titles / batch_size) non-empty batches - there is no empty trailing
# batch when the row count is an exact multiple of `batch_size` - and it does
# not rely on `df` carrying a default 0..N-1 index or on `.loc` slices being
# inclusive of their end label.
# NOTE(review): the values in `index` are passed to the embedding layer as-is;
# confirm they use the same id convention as the layer was trained with.
for start in tqdm(range(0, num_titles, batch_size)):
    batch_ids = df["index"].iloc[start : start + batch_size].values
    # Each title has its own number of token ids, hence a ragged batch.
    dataset = tf.ragged.constant(batch_ids)
    y = embed(dataset)
    # Average over each title's tokens -> one vector per title.
    y = tf.math.reduce_mean(y, axis=1)
    batch_means.append(y)
# Row i of `weights` is the averaged vector for row i of `df`.
weights = tf.concat(batch_means, axis=0)

100%|██████████| 360/360 [00:16<00:00, 21.95it/s]


In [115]:
# Keep a local copy of the per-title vectors, stored under the key "weights".
np.savez(
    "content_titles_embed_weights.npz",
    weights=weights.numpy(),
)

In [165]:
# Build the lookup table for the title embedding layer:
#   row 0      - the mean of all title vectors (presumably the fallback vector
#                for an unknown title - TODO confirm with the consumer),
#   rows 1..N  - the per-title vectors in `df` row order, so the title in row i
#                of `df` is looked up with id i + 1.
title_vectors = weights.numpy()
assert title_vectors.shape[0] == df.shape[0], (
    "expected one vector per row of df, got %d vectors for %d titles"
    % (title_vectors.shape[0], df.shape[0])
)
embedding_matrix = np.vstack([title_vectors.mean(axis=0, keepdims=True), title_vectors])

# Both dimensions are taken from the matrix that is actually loaded, so the
# layer cannot drift out of sync with the vectors (no hard-coded width).
title_embed = tf.keras.layers.Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    trainable=False,
    weights=[embedding_matrix],
    input_length=1,
    name="Embedding",
)


class TitleEmbed(tf.keras.models.Model):
    """Keras model whose only operation is the frozen title-embedding lookup."""

    def __init__(self):
        super().__init__()
        # Uses the module-level `title_embed` layer defined above.
        self.embedding = title_embed

    def call(self, inputs):
        """Return the embedding vectors for the integer title ids in `inputs`."""
        return self.embedding(inputs)


my_title_embed = TitleEmbed()
# Run the model once on a few ids before exporting it (presumably so that it is
# built with a concrete input signature - confirm).
my_title_embed.predict(np.array([0, 1, 2]))
tf.keras.models.save_model(my_title_embed, "gs://edc-dev/content_title_embeddings_model")





INFO:tensorflow:Assets written to: gs://edc-dev/content_title_embeddings_model/assets


INFO:tensorflow:Assets written to: gs://edc-dev/content_title_embeddings_model/assets


In [144]:
# Smoke test: reload the exported model from GCS and look up a handful of title
# ids, passed as a column of shape (5, 1).
loaded_embed_model = tf.keras.models.load_model(
    "gs://edc-dev/content_title_embeddings_model"
)
loaded_embed_model(np.array([1, 3, 2, 5, 3]).reshape(-1, 1))





<tf.Tensor: shape=(5, 1, 32), dtype=float32, numpy=
array([[[ 0.12985791,  0.2832828 ,  0.50509584, -0.5418582 ,
          0.6364128 , -0.6614034 ,  0.9445423 , -0.18660134,
         -0.11616606, -0.10965705,  0.41396153,  0.96076554,
         -0.32583398, -0.31966394, -1.0485516 ,  0.37422   ,
         -0.28770265,  0.6234262 , -0.37877914, -0.10321245,
          0.24231797, -0.85194695, -0.6667286 ,  0.4043153 ,
         -0.6070905 , -1.0451252 ,  0.34790373,  0.16936222,
         -0.09113744, -0.76959187,  1.3112115 , -0.32723886]],

       [[ 0.1724453 , -0.6333292 , -0.6595114 ,  0.32417938,
         -1.2730378 ,  0.78159523,  0.11159571, -1.168035  ,
          0.8676192 , -0.3781639 ,  0.47728372, -0.27029607,
          0.15333055, -0.4790235 , -0.93399996, -0.21849036,
         -1.1306108 ,  0.0467308 , -0.7266465 , -0.4294022 ,
          0.5830752 ,  0.04852033, -0.45071438, -0.89420223,
          0.7965966 , -0.44303605,  0.3146656 , -0.9004598 ,
          0.09222154, -0.30251

In [167]:
# Fetch the layer named "Embedding" from the reloaded model and call it directly
# on a column of title ids, shape (4, 1).
loaded_embed_layer = loaded_embed_model.get_layer("Embedding")
loaded_embed_layer(np.array([1, 2, 3, 5]).reshape(-1, 1))

<tf.Tensor: shape=(4, 1, 32), dtype=float32, numpy=
array([[[ 0.12985791,  0.2832828 ,  0.50509584, -0.5418582 ,
          0.6364128 , -0.6614034 ,  0.9445423 , -0.18660134,
         -0.11616606, -0.10965705,  0.41396153,  0.96076554,
         -0.32583398, -0.31966394, -1.0485516 ,  0.37422   ,
         -0.28770265,  0.6234262 , -0.37877914, -0.10321245,
          0.24231797, -0.85194695, -0.6667286 ,  0.4043153 ,
         -0.6070905 , -1.0451252 ,  0.34790373,  0.16936222,
         -0.09113744, -0.76959187,  1.3112115 , -0.32723886]],

       [[-0.32916927,  0.2519995 ,  0.02463201,  1.117959  ,
         -0.30467778, -1.3053746 ,  0.15034293,  1.0459994 ,
          0.5621707 , -0.11459035, -0.8424166 ,  0.79762775,
         -0.34043026,  1.1693814 , -0.47431216, -0.49556887,
          0.05014077,  0.00929913, -0.49126247, -0.8592369 ,
         -0.48495468, -0.1500876 ,  0.34918898, -0.21528494,
         -0.35344005, -0.71355534, -0.411538  ,  1.00323   ,
          0.8500975 , -0.93726