In [18]:
import os
import numpy as np
import pandas as pd
import scipy.sparse
from scipy.sparse import coo_matrix, csc_matrix
from google.cloud import bigquery

client = bigquery.Client(location="US")
print("Client creating using default project: {}".format(client.project))

Client creating using default project: res-nbcupea-dev-ds-sandbox-001


# Load data

In [63]:
query = """
WITH titles AS (
    SELECT DISTINCT COALESCE(InSeasonSeries_Id, TitleId) as InSeasonSeries_Id,
        TitleDetails_LongSynopsis,
        InSeasonSeries_Tags,
        TitleTags,
        TitleSubgenres,
        TitleType
    FROM `{GOOGLE_CLOUD_PROJECT}.recsystem.ContentMetadataView`
),
melted AS (
    SELECT DISTINCT InSeasonSeries_Id,
        TitleDetails_LongSynopsis,
        TitleType,
        TRIM(tags) as tags
    FROM (
            SELECT DISTINCT InSeasonSeries_Id,
                TitleDetails_LongSynopsis,
                TitleType,
                tags
            FROM titles
                CROSS JOIN UNNEST(SPLIT(InSeasonSeries_Tags, ',')) tags
            UNION ALL
            SELECT DISTINCT InSeasonSeries_Id,
                TitleDetails_LongSynopsis,
                TitleType,
                tags
            FROM titles
                CROSS JOIN UNNEST(SPLIt(TitleSubgenres, ',')) tags
            UNION ALL
            SELECT DISTINCT InSeasonSeries_Id,
                TitleDetails_LongSynopsis,
                TitleType,
                tags
            FROM titles
                CROSS JOIN UNNEST(SPLIt(TitleTags, ',')) tags
        )
    WHERE tags <> ''
),
-- get tokens from long synopsis
token_table AS (
    SELECT COALESCE(InSeasonSeries_Id, TitleId) AS InSeasonSeries_Id,
        SPLIT(
            REGEXP_REPLACE(
                LOWER(TitleDetails_LongSynopsis),
                '[^a-zA-Z0-9 -]',
                ''
            ),
            ' '
        ) AS tokens,
        -- filter out non-alphabetical characters
    FROM `{GOOGLE_CLOUD_PROJECT}.recsystem.ContentMetadataView`
),
-- unnest token
token_clean AS (
    SELECT InSeasonSeries_Id,
        token,
        COUNT(*) AS token_count
    FROM token_table
        CROSS JOIN UNNEST(tokens) token
    GROUP BY InSeasonSeries_Id,
        token
)
SELECT distinct InSeasonSeries_Id,
    tags AS token,
    1 AS token_count
FROM melted
UNION ALL
(
    SELECT InSeasonSeries_Id,
        token,
        token_count
    FROM token_clean t
        LEFT OUTER JOIN `{GOOGLE_CLOUD_PROJECT}.recsystem.stop_words_en_sp` stop ON stop.string_field_0 = t.token
    WHERE stop.string_field_0 IS NULL
)
LIMIT 10

""".format(GOOGLE_CLOUD_PROJECT="res-nbcupea-dev-ds-sandbox-001")

df = client.query(query, location="US").to_dataframe()
df

Unnamed: 0,InSeasonSeries_Id,token,token_count
0,d24e3541-9e1c-3cbe-bac6-1719db78aa20,announced,3
1,b6ccd2c8-ffb1-3798-a82f-cb9603d8ef8c,charlie,6
2,b6ccd2c8-ffb1-3798-a82f-cb9603d8ef8c,events,6
3,b6ccd2c8-ffb1-3798-a82f-cb9603d8ef8c,think,6
4,c599fdb4-5eca-35ad-beed-b216622f3ce0,gm,3
5,c599fdb4-5eca-35ad-beed-b216622f3ce0,thomas,3
6,c599fdb4-5eca-35ad-beed-b216622f3ce0,expectations,3
7,fb1f5210-88fe-3710-b235-b334f6a4b443,weeks,3
8,fb1f5210-88fe-3710-b235-b334f6a4b443,warning,3
9,fb1f5210-88fe-3710-b235-b334f6a4b443,showed,3


In [65]:
replace_dict = {}
current_index = 0
df_new = df.copy()
for i in np.concatenate([df_new["InSeasonSeries_Id"].values, df_new["token"]]):
    if i in replace_dict:
        continue
    else:
        replace_dict[i] = current_index
        current_index += 1
df_new["InSeasonSeries_Id"] = df_new["InSeasonSeries_Id"].replace(replace_dict)
df_new["token"] = df_new["token"].replace(replace_dict)
df_new.to_csv("/home/jupyter/Exploration/data/adj_matrix_real2.csv", index=False)
df_new

Unnamed: 0,InSeasonSeries_Id,token,token_count
0,0,4,3
1,1,5,6
2,1,6,6
3,1,7,6
4,2,8,3
5,2,9,3
6,2,10,3
7,3,11,3
8,3,12,3
9,3,13,3


In [68]:
from tensorflow.python.framework import sparse_tensor
def my_map_values(op, *args):
    """
    Applies the `op` to the `.values` tensor of one or more `SparseTensor`s.
    For tensorflow versions below 2.4. For versions above, use function
    `tf.sparse.map_values`.
    """
    return sparse_tensor.SparseTensor(args[0].indices, 
                                      op(*[a.values for a in args]),
                                      args[0].dense_shape)

In [69]:
import pandas as pd
import tensorflow as tf
from tensorflow.python.ops.linalg.sparse import sparse_csr_matrix_ops
tf.random.set_seed(42)

def tf_sparse_multiply(a: tf.SparseTensor, b: tf.SparseTensor):
    a_sm = sparse_csr_matrix_ops.sparse_tensor_to_csr_sparse_matrix(
        a.indices, a.values, a.dense_shape)

    b_sm = sparse_csr_matrix_ops.sparse_tensor_to_csr_sparse_matrix(
        b.indices, b.values, b.dense_shape)

    c_sm = sparse_csr_matrix_ops.sparse_matrix_sparse_mat_mul(
        a=a_sm, b=b_sm, type=tf.float32)

    c = sparse_csr_matrix_ops.csr_sparse_matrix_to_sparse_tensor(
        c_sm, tf.float32)

    return tf.SparseTensor(
        c.indices, c.values, dense_shape=c.dense_shape)

def sample_from_sparse(W_sample):
    """Take a sample given unnormalized weight matrix."""
    # Normalize each row
    row_sum = tf.sparse.reduce_sum(W_sample, axis=1, keepdims=True)
    W_sample = W_sample.__div__(row_sum)
    W_sample = tf.sparse.reorder(W_sample) # Make sure the indices are sorted
    
    # uniform_sample = tf.random.uniform((num_nodes, 1), minval=0, maxval=1)
    cdf = tf.map_fn(lambda x: my_map_values(
        lambda y: tf.cumsum(y) - tf.random.uniform((1,), minval=0, maxval=1), x), 
                    W_sample) # map to each row
    is_pos = tf.greater_equal(cdf.values, 0)
    cdf_sample = tf.sparse.retain(cdf, is_pos)
    cdf_sample = tf.sparse.reorder(cdf_sample)
    
    # Materialize the samples: Take the first nonzero col of each row
    #out_sample = tf.constant([list(item)[0][1] for _, item in \
    #    itertools.groupby(cdf_sample.indices.numpy(), lambda x: x[0])])
    index = cdf_sample.indices # assuming sorted already
    indices = tf.concat([tf.constant([1], dtype="int64"), 
                         index[1:, 0] - index[:-1, 0]], axis=0)
    out_sample = index[:, 1][tf.greater(indices, 0)]
    
    return W_sample, cdf, cdf_sample, out_sample
    
def _random_walk_sampling_step_tf(W, s0, s1, p, q):
    print(f"W nnz: {W.indices.shape[0]}")
    # Get dimension
    num_nodes = W.shape[0]

    # alpha_1 / P
    P = tf.sparse.SparseTensor(tf.cast(tf.stack([tf.range(num_nodes, dtype="int64"), s0], axis=1), dtype="int64"), 
                               tf.ones(num_nodes), 
                               dense_shape=(num_nodes, num_nodes))
    
    # alpha_2 / R
    A_0 = tf.sparse.SparseTensor(W.indices, tf.ones(W.indices.shape[0], dtype="float32"), dense_shape=W.shape)
    A_i_1 = tf_sparse_multiply(P, A_0)

    I = tf.sparse.SparseTensor(tf.cast(tf.stack([tf.range(num_nodes, dtype="int64"), s1], axis=1), "int64"), 
                               tf.ones(num_nodes), 
                               dense_shape=(num_nodes, num_nodes)) # permutation matrix
    A_i = tf_sparse_multiply(I, A_0)
    
    #print("A_0:", tf.sparse.to_dense(A_0))
    #print("A_i_1:", tf.sparse.to_dense(A_i_1))
    #print("A_i:", tf.sparse.to_dense(A_i))
    #print("P:", tf.sparse.to_dense(P))
    ## intersection
    R = tf.sparse.minimum(A_i_1, A_i)
    is_nonzero = tf.not_equal(R.values, 0)
    R = tf.sparse.retain(R, is_nonzero)
    #print("R:", tf.sparse.to_dense(R))

    # alpha3 / Q
    Q = tf.sparse.add(A_i, P.__mul__(tf.constant([-1], dtype="float32")))
    Q = tf.sparse.add(Q, R.__mul__(tf.constant([-1], dtype="float32")))
    is_nonzero = tf.not_equal(Q.values, 0)
    Q = tf.sparse.retain(Q, is_nonzero)
    
    #print("Q:", tf.sparse.to_dense(Q))
    
    print(f"A_i nnz: {A_i.indices.shape[0]}")
    print(f"P nnz: {P.indices.shape[0]}")
    print(f"R nnz: {R.indices.shape[0]}")
    print(f"Q nnz: {Q.indices.shape[0]}")

    # Combine to get the final weight
    W_sample = tf.sparse.add(P.__mul__(tf.constant([1/p], dtype="float32")), R)
    W_sample = tf.sparse.add(W_sample, Q.__mul__(tf.constant([1/q], dtype="float32")))
    is_nonzero = tf.not_equal(W_sample.values, 0)
    W_sample = tf.sparse.retain(W_sample, is_nonzero)
    W_sample = tf.sparse.reorder(W_sample)

    # Make sure the orders of indices are the same
    W_new = tf_sparse_multiply(I, tf.cast(tf.sparse.reorder(W), dtype="float32"))
    W_new = tf.sparse.reorder(W_new)
    print(f"W_new nnz: {W_new.indices.shape[0]}")


    # Multiply the weights by creating a new sparse matrix
    W_sample = tf.sparse.SparseTensor(W_sample.indices, tf.multiply(W_sample.values, W_new.values), 
                                      dense_shape=W_sample.shape)
    
    print("W_sample:", tf.sparse.to_dense(W_sample))
    # Taking samples from the sparse matrix
    W_sample, cdf, cdf_sample, s_next = sample_from_sparse(W_sample)
    
    return W_sample, cdf, cdf_sample, s_next

p = 0.2
q = 0.8
df = pd.read_csv("/home/jupyter/Exploration/data/adj_matrix_real2.csv")
num_nodes = max(df[["InSeasonSeries_Id", "token"]].max().values) + 1
W = tf.sparse.SparseTensor(df[["InSeasonSeries_Id", "token"]].values, 1./df["token_count"].values, 
                           dense_shape=(num_nodes, num_nodes))
W = tf.sparse.reorder(W)
W = tf.cast(W, "float32")

if True:
    W = tf.sparse.maximum(W, tf.sparse.transpose(W))
else:
    indices = tf.sparse.to_dense(tf.sets.difference([tf.range(W.shape[0], dtype="int64")], 
                                                    [tf.unique(W.indices[:, 0]).y]))
    indices = tf.transpose(tf.concat([indices, indices], axis=0))
    terms = tf.sparse.SparseTensor(indices, tf.ones(indices.shape[0]), dense_shape=(num_nodes, num_nodes))
    W = tf.sparse.add(W, terms)
    
W = tf.sparse.reorder(W)

# First step
s0 = tf.range(W.shape[0], dtype="int64")
W_sample_1, cdf_1, cdf_sample_1, s1 = sample_from_sparse(W)
print("s0:", s0)
print("s1:", s1)

W_sample, cdf, cdf_sample, s_next = _random_walk_sampling_step_tf(W, s0, s1, p, q)
print("s_next:", s_next)
#print("W_sample:", tf.sparse.to_dense(W_sample))
#print("cdf:", tf.sparse.to_dense(cdf))
#print("masked cdf:", tf.sparse.to_dense(cdf_sample))
#print("samples:", out_sample)

s0: tf.Tensor([ 0  1  2  3  4  5  6  7  8  9 10 11 12 13], shape=(14,), dtype=int64)
s1: tf.Tensor([ 4  7 10 13  0  1  1  1  2  2  2  3  3  3], shape=(14,), dtype=int64)
W nnz: 20
A_i nnz: 32
P nnz: 14
R nnz: 0
Q nnz: 18
W_new nnz: 32
W_sample: tf.Tensor(
[[1.6666667  0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.8333334  0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         1.6666667  0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         1.6666667  0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         1.6666667  0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.    

In [45]:
tf.sparse.to_dense(W)

<tf.Tensor: shape=(13, 13), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
      dtype=float32)>

In [62]:
# Take several steps
p = 0.2
q = 0.8
walk_length = 3
df = pd.read_csv("/home/jupyter/Exploration/data/adj_matrix.csv")
W = tf.sparse.SparseTensor(df[["InSeasonSeries_Id", "cast"]].values, df["weight"].values, dense_shape=(5,5))

def sample_1_iteration(W, p, q, walk_length=80):
    W = tf.cast(W, "float32")

    # First step
    s0 = tf.range(W.shape[0], dtype="int64")
    _, _, _, s1 = sample_from_sparse(W)
    S = [s0, s1]

    for i in range(walk_length - 1):
        _, _, cdf_sample, s_next = _random_walk_sampling_step_tf(W, S[-2], S[-1], p, q)
        #print("masked cdf:", tf.sparse.to_dense(cdf_sample))
        #print(s_next)
        S.append(s_next)
        
    return S

S = sample_1_iteration(W, p, q, walk_length=walk_length)
S = tf.transpose(tf.stack(S, axis=0))
S

<tf.Tensor: shape=(5, 4), dtype=int64, numpy=
array([[0, 1, 2, 1],
       [1, 4, 0, 4],
       [2, 3, 2, 0],
       [3, 4, 3, 4],
       [4, 0, 4, 3]])>

In [73]:
tf.unique(W.indices[:, 0]).y

<tf.Tensor: shape=(6,), dtype=int64, numpy=array([0, 2, 3, 4, 5, 9])>

In [77]:
indices = tf.sparse.to_dense(tf.sets.difference([tf.range(W.shape[0], dtype="int64")], 
                                                [tf.unique(W.indices[:, 0]).y]))
indices = tf.transpose(tf.concat([indices, indices], axis=0))
indices

<tf.Tensor: shape=(4, 2), dtype=int64, numpy=
array([[1, 1],
       [6, 6],
       [7, 7],
       [8, 8]])>

In [261]:
# Writing to TFRecords, each row at a time (thus allowing batches)
ds = (tf.data.Dataset.from_tensor_slices(S)
      .map(tf.io.serialize_tensor))
writer = tf.data.experimental.TFRecordWriter('./data/temp.tfrecord')
writer.write(ds)

In [265]:
# Read from file
parse_tensor_f = lambda x: tf.io.parse_tensor(x, tf.int64)
ds2 = (tf.data.TFRecordDataset('./data/temp.tfrecord')
       .map(parse_tensor_f))
for x2 in ds2:
    tf.print(x2)


[0 2 0 ... 0 2 0]
[1 2 1 ... 3 0 3]
[2 0 2 ... 1 4 1]
[3 0 3 ... 2 3 2]
[4 1 0 ... 1 0 3]


In [68]:
df = pd.read_csv("/home/jupyter/Exploration/data/adj_matrix.csv")
W = coo_matrix((df["weight"].values, (df["content"].values, df["cast"].values)), dtype=np.float32, shape=(5,5))
W.toarray()

array([[0., 1., 1., 1., 1.],
       [1., 0., 1., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 0., 1., 0., 1.],
       [1., 1., 0., 1., 0.]], dtype=float32)

In [115]:
s0 = np.array([3, 5, 2, 1, 4])-1
s1 = np.array([2, 4, 1, 5, 3])-1
num_nodes = W.shape[0]
# alpha_1 / P
values = np.ones(num_nodes)
indices = (np.arange(num_nodes), s0)
P = coo_matrix((values, indices))
# alpha_2 / R
A_i = W.copy().tocsc()
A_i.data[:] = 1
R = A_i[s1, :].multiply(A_i[s0, :])
# alpha_3 / Q
Q = A_i[s1, :] - P - R

In [117]:
Q.toarray()

array([[0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [109]:
(A_i[s0, :] * A_i[s1, :]).toarray()

array([[3., 1., 2., 1., 2.],
       [3., 1., 2., 1., 2.],
       [2., 2., 2., 2., 2.],
       [3., 3., 2., 3., 2.],
       [2., 2., 2., 2., 2.]], dtype=float32)

In [112]:
A_i[s0, :]

<5x5 sparse matrix of type '<class 'numpy.float32'>'
	with 16 stored elements in Compressed Sparse Column format>

In [106]:
A_i[s1, :].toarray()

array([[1., 0., 1., 0., 1.],
       [1., 0., 1., 0., 1.],
       [0., 1., 1., 1., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 0., 1., 0.]], dtype=float32)

In [114]:
R.toarray()

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 1.],
       [0., 1., 0., 1., 0.],
       [1., 0., 0., 0., 0.]], dtype=float32)

In [6]:
import numpy as np
import tensorflow as tf
indices = np.array([[1, 2], [0, 2], [1, 2], [3, 0]])
values = np.ones(indices.shape[0])
out = tf.sparse.SparseTensor(indices, values, dense_shape=(5, 5))
out = tf.sparse.reorder(out)
tf.sparse.to_dense(out)

InvalidArgumentError: indices[2] = [1,2] is repeated [Op:SparseToDense]

In [7]:
path = "gs://edc-dev/kubeflowpipelines-default/asdf.json"
import json
json.dump({"asdf":1}, open(path, "w"))

FileNotFoundError: [Errno 2] No such file or directory: 'gs://edc-dev/kubeflowpipelines-default/asdf.json'

In [2]:
from google.cloud import storage
import json
client = storage.client.Client(project="res-nbcupea-dev-ds-sandbox-001")
bucket = client.get_bucket("edc-dev")
#blob = bucket.blob("output_pandas.json")
#blob.upload_from_string(data=json.dumps({"asdf":1}), 
#                       content_type="application/json")

In [30]:
set(map(tuple, np.random.randint(0, 10, size=(10, 2)).tolist()))

{(1, 1), (1, 4), (2, 8), (4, 1), (4, 8), (5, 1), (6, 4), (7, 8)}

In [4]:
# Grab the databack
blob = bucket.get_blob("output_pandas.json")
pandas_data = json.loads(blob.download_as_string())

In [8]:
import pandas as pd
df = pd.DataFrame(pandas_data)
df["InSeasonSeries_Id"] = df["InSeasonSeries_Id"].astype(int)
df["token"] = df["token"].astype(int)
df

Unnamed: 0,InSeasonSeries_Id,token,weight
0,17488,25580,0.333333
1,7750,5176,0.058824
2,39928,826,0.250000
3,34999,147,0.166667
4,384,9811,0.125000
...,...,...,...
1598144,26774,1086,0.333333
1598145,61914,19225,1.000000
1598146,39324,14,0.333333
1598147,26354,1392,0.250000


In [10]:
len(set(map(tuple, df[["InSeasonSeries_Id", "token"]].values.tolist())))

1596748

In [50]:
df.loc[df[["InSeasonSeries_Id", "token"]].duplicated(), :]

Unnamed: 0,InSeasonSeries_Id,token,weight
42008,49279,0,1.000000
79948,39560,0,0.333333
102677,14315,0,0.500000
105630,23411,0,1.000000
105973,50743,0,1.000000
...,...,...,...
1596100,20241,0,1.000000
1596860,68430,0,0.333333
1597160,39315,0,1.000000
1597694,13209,0,1.000000


In [51]:
out[39560]

'76f1d4f4-738d-3225-b6ae-a26212528a9a'

In [17]:
df.describe()

Unnamed: 0,InSeasonSeries_Id,token,weight
count,1598149.0,1598149.0,1598149.0
mean,28357.69,16989.9,0.4039867
std,22988.42,33224.74,0.3140326
min,11.0,0.0,5.413306e-05
25%,8143.0,383.0,0.2
50%,24606.0,1750.0,0.3333333
75%,45731.0,8282.0,0.5
max,154072.0,154098.0,1.0


In [78]:
query = """WITH titles AS (
    SELECT DISTINCT COALESCE(InSeasonSeries_Id, TitleId) as InSeasonSeries_Id,
        TitleDetails_LongSynopsis,
        InSeasonSeries_Tags,
        TitleTags,
        TitleSubgenres,
        TitleType
    FROM `res-nbcupea-dev-ds-sandbox-001.recsystem.ContentMetadataView`
),
melted AS (
    SELECT DISTINCT InSeasonSeries_Id,
        TitleDetails_LongSynopsis,
        TitleType,
        TRIM(tags) as tags
    FROM (
            SELECT DISTINCT InSeasonSeries_Id,
                TitleDetails_LongSynopsis,
                TitleType,
                tags
            FROM titles
                CROSS JOIN UNNEST(SPLIT(InSeasonSeries_Tags, ',')) tags
            UNION ALL
            SELECT DISTINCT InSeasonSeries_Id,
                TitleDetails_LongSynopsis,
                TitleType,
                tags
            FROM titles
                CROSS JOIN UNNEST(SPLIT(TitleSubgenres, ',')) tags
            UNION ALL
            SELECT DISTINCT InSeasonSeries_Id,
                TitleDetails_LongSynopsis,
                TitleType,
                tags
            FROM titles
                CROSS JOIN UNNEST(SPLIT(TitleTags, ',')) tags
        )
    WHERE tags <> ''
),
-- get tokens from long synopsis
token_table AS (
    SELECT COALESCE(InSeasonSeries_Id, TitleId) AS InSeasonSeries_Id,
        SPLIT(
            REGEXP_REPLACE(
                LOWER(TitleDetails_LongSynopsis),
                '[^a-zA-Z0-9 -]',
                ''
            ),
            ' '
        ) AS tokens,
        -- filter out non-alphabetical characters
    FROM `res-nbcupea-dev-ds-sandbox-001.recsystem.ContentMetadataView`
),
-- unnest token
token_clean AS (
    SELECT InSeasonSeries_Id,
        token,
        COUNT(*) AS token_count
    FROM token_table
        CROSS JOIN UNNEST(tokens) token
    GROUP BY InSeasonSeries_Id,
        token
),
intermediate_result AS (
    SELECT distinct InSeasonSeries_Id,
        RTRIM(LTRIM(tags)) AS token,
        1 AS token_count
    FROM melted
    UNION ALL
    (
        SELECT InSeasonSeries_Id,
            token,
            token_count
        FROM token_clean t
            LEFT OUTER JOIN `res-nbcupea-dev-ds-sandbox-001.recsystem.stop_words_en_sp` stop ON stop.string_field_0 = t.token
        WHERE stop.string_field_0 IS NULL
    )
)
SELECT InSeasonSeries_Id,
    token,
    SUM(token_count) AS token_count
FROM intermediate_result
GROUP BY InSeasonSeries_Id, token 
HAVING token NOT IN ("", " ")
"""

dff = client.query(query=query, location="US").to_dataframe()
dff

Unnamed: 0,InSeasonSeries_Id,token,token_count
0,6f19c28a-0dd0-319a-a0d3-6dcce227c60b,ethel,120
1,6f19c28a-0dd0-319a-a0d3-6dcce227c60b,stuns,32
2,6f19c28a-0dd0-319a-a0d3-6dcce227c60b,anna,184
3,6f19c28a-0dd0-319a-a0d3-6dcce227c60b,vera,54
4,6f19c28a-0dd0-319a-a0d3-6dcce227c60b,old,78
...,...,...,...
1595694,a80824e7-b8ec-351a-9ed2-ff99f2c783e8,difunde,14
1595695,a3f4a4f6-1db7-3bcd-9fe3-e152108d875b,pose,14
1595696,434fb351-b35a-342a-a1b6-40f184c2c7af,chantajeando,14
1595697,79bdd445-ec6a-31a9-a227-266506dd69b0,muscle,14


In [56]:
from IPython.display import display

In [81]:
W = tf.sparse.SparseTensor(np.array([[1,2], [0, 2], [2, 4]]), np.array([1, 2, 3]), dense_shape=(3, 3))
bool(tf.reduce_all(tf.sparse.reduce_max(W, axis=1) > 0))

True

In [86]:
int(tf.reduce_max(tf.constant([[1,2], [0, 2], [2, 4]]))) + 1 <= 5

True

In [77]:
empty_ones = df.loc[df[["InSeasonSeries_Id", "token"]].duplicated(), "InSeasonSeries_Id"].unique()
check_empty = [[]]*len(empty_ones)

full_list = []
for i, iid in enumerate(empty_ones):
    iid = 23411
    sub = dff.loc[dff["InSeasonSeries_Id"] == out[iid]]
    #sub = sub.loc[sub["token"].isin(["", " "])]
    #if sub.shape[0] <1:
    #    print(iid)
    display(sub)
    full_list.append(sub)
    break

#full_list = pd.concat(full_list)
#full_list

Unnamed: 0,InSeasonSeries_Id,token,token_count
350808,b6a5e0cf-850c-33ba-9ce0-015536c6c5df,TV,1
383703,b6a5e0cf-850c-33ba-9ce0-015536c6c5df,Entertainment,1
1465878,b6a5e0cf-850c-33ba-9ce0-015536c6c5df,fugaz,6
1465879,b6a5e0cf-850c-33ba-9ce0-015536c6c5df,verano,6
1470443,b6a5e0cf-850c-33ba-9ce0-015536c6c5df,uribe,6
1470444,b6a5e0cf-850c-33ba-9ce0-015536c6c5df,disputaban,6
1475146,b6a5e0cf-850c-33ba-9ce0-015536c6c5df,relacin,6
1475147,b6a5e0cf-850c-33ba-9ce0-015536c6c5df,maradona,6
1475148,b6a5e0cf-850c-33ba-9ce0-015536c6c5df,parte,6
1479644,b6a5e0cf-850c-33ba-9ce0-015536c6c5df,detalles,6


In [74]:
import tensorflow as tf

A = tf.convert_to_tensor(np.random.randint(-2, 10, size=(10, 2)))
B = tf.convert_to_tensor(np.random.rand(10))
print(A)
print(B)
mask = tf.reduce_all(A > 0, axis=1)
A = tf.boolean_mask(A, mask , axis=0)
B = tf.boolean_mask(B, mask, axis=0)
print(A)
print(B)

tf.Tensor(
[[ 9  4]
 [ 7  9]
 [ 8 -1]
 [ 0  1]
 [ 3 -2]
 [ 8  1]
 [ 9  4]
 [ 9 -1]
 [ 1  2]
 [ 7  4]], shape=(10, 2), dtype=int64)
tf.Tensor(
[0.28742401 0.46220533 0.56316589 0.26838281 0.02131566 0.68335294
 0.89996917 0.05704608 0.74942833 0.29260747], shape=(10,), dtype=float64)
tf.Tensor(
[[9 4]
 [7 9]
 [8 1]
 [9 4]
 [1 2]
 [7 4]], shape=(6, 2), dtype=int64)
tf.Tensor([0.28742401 0.46220533 0.68335294 0.89996917 0.74942833 0.29260747], shape=(6,), dtype=float64)


In [52]:


dff.loc[dff["InSeasonSeries_Id"] == "76f1d4f4-738d-3225-b6ae-a26212528a9a"]

Unnamed: 0,InSeasonSeries_Id,token,token_count
298340,76f1d4f4-738d-3225-b6ae-a26212528a9a,Entertainment,1
341728,76f1d4f4-738d-3225-b6ae-a26212528a9a,News,1
852428,76f1d4f4-738d-3225-b6ae-a26212528a9a,boss,3
897687,76f1d4f4-738d-3225-b6ae-a26212528a9a,first,3
987968,76f1d4f4-738d-3225-b6ae-a26212528a9a,news,3
987969,76f1d4f4-738d-3225-b6ae-a26212528a9a,on-air,3
1033200,76f1d4f4-738d-3225-b6ae-a26212528a9a,former,3
1033201,76f1d4f4-738d-3225-b6ae-a26212528a9a,office,3
1078191,76f1d4f4-738d-3225-b6ae-a26212528a9a,week,3
1078192,76f1d4f4-738d-3225-b6ae-a26212528a9a,steve,3


In [30]:
np.unique(np.concatenate([dff["InSeasonSeries_Id"].values, dff["token"].values])).shape

(154100,)

In [25]:
dff.iloc[[20777, 42008], :]

Unnamed: 0,InSeasonSeries_Id,token,token_count
20777,be9b4ce8-7b28-36bd-9612-86e8efa5b7c9,deliveries,21
42008,5eb34733-80da-3505-8c8a-575146e64718,-,48


In [41]:
path = "kubeflowpipelines-default_tfx_pipeline_output_node2vec_sports_syn_0_1_0_Transform_transform_graph_17258_transform_fn_assets_node_vocab_txt"
out = []
with open(path, "r") as fid:
    out = fid.readlines()
out = [kk.replace("\n", "") for kk in out]


['News',
 'TV',
 'Sports',
 'Talk',
 'covid-19',
 'new',
 'Entertainment',
 'Football',
 'Business & Finance',
 'coronavirus',
 'reports',
 '022ae9a1-d2ac-3238-b686-96c2a5ce26ba',
 'president',
 'discuss',
 'news',
 'trump',
 'joins',
 'today',
 'SportingEvent',
 'Comedy',
 'first',
 'mike',
 'show',
 'Subgenre:Talk',
 'Reality',
 'one',
 'us',
 'c04236ee-ca40-3a75-a008-230eaa805ba4',
 '39337ec8-062e-32ba-afc3-541adf683fce',
 'pandemic',
 'chris',
 'says',
 'joe',
 'nbc',
 'cnbcs',
 'former',
 'florio',
 '235b584f-ef32-38fe-8c51-e769f320257f',
 'day',
 'biden',
 '2020',
 'nbcs',
 'ddca41be-7eff-3441-96df-14b01bb41629',
 'world',
 'people',
 'talks',
 'watch',
 'get',
 'ed68bf9f-dd4f-34b0-805f-142f3483c997',
 'back',
 'house',
 'Soccer',
 'season',
 'time',
 '38cfc6e9-47ba-33e0-a857-143193c2b7d6',
 'look',
 'two',
 '2c1b4ec5-a425-32ed-a230-26638587fbec',
 'c909dfb4-1532-3d0c-a532-0a5b3798efec',
 'Drama',
 'a63d70f5-4640-348c-acc2-7b5c02ea5a65',
 '700e1a8a-dcdd-3bea-a86b-7a03a595f26a',
 

In [91]:
import psutil
psutil.virtual_memory().total / 1E9

16.823861248