In [1]:
import numpy as np
import pandas as pd
import gcsfs
from tqdm import tqdm
tqdm.pandas()
from google.cloud import bigquery
client = bigquery.Client()
from scipy.sparse import coo_matrix

# Make the data

In [316]:
# ContentMetadataView data
query = """
    CREATE TEMP FUNCTION strip_str_array(val ANY TYPE) AS ((
      SELECT ARRAY_AGG(DISTINCT TRIM(t))
      FROM UNNEST(val) t
      WHERE t != ""
    ));

    WITH source AS (
        SELECT ANY_VALUE(TitleDetails_title) AS title, TitleType AS type, 
            STRING_AGG(TitleDetails_LongSynopsis, " ") AS synopsis,
            LANGUAGE AS language,
            STRING_AGG(ARRAY_TO_STRING([TitleGenre, TitleSubgenres, TitleTags, LANGUAGE], ","), ",") AS tags
        FROM `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.ContentMetadataView`
        WHERE LANGUAGE IN ("eng", "spa")
        AND TitleType IN ("Programme", "Movie")
        AND TitleDetails_title NOT IN ("OnDemand Movie")
        GROUP BY LOWER(TitleDetails_title), TitleType, LANGUAGE
    ),

    test_data AS (
        SELECT title, type, language, synopsis, 
           strip_str_array(SPLIT(tags, ",")) AS tags
        FROM source
    )

    SELECT * FROM test_data
"""
table_id = f"{client.project}.metadata_enhancement.synopsis_cmv_full_tags_edc_dev"
job_config = bigquery.QueryJobConfig(destination=table_id, write_disposition="WRITE_TRUNCATE")

client.query(query=query, job_config=job_config,  location="US").result()

<google.cloud.bigquery.table.RowIterator at 0x7f3048d08ad0>

In [None]:
%%bigquery

    CREATE TEMP FUNCTION strip_str_array(val ANY TYPE) AS ((
      SELECT ARRAY_AGG(DISTINCT TRIM(t))
      FROM UNNEST(val) t
      WHERE t != ""
    ));

    WITH source AS (
        SELECT ANY_VALUE(TitleDetails_title) AS title, TitleType AS type, 
            STRING_AGG(TitleDetails_LongSynopsis, " ") AS synopsis,
            LANGUAGE AS language,
            STRING_AGG(ARRAY_TO_STRING([TitleGenre, TitleSubgenres, TitleTags, LANGUAGE], ","), ",") AS keywords
        FROM `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.ContentMetadataView`
        WHERE LANGUAGE IN ("eng", "spa")
        AND TitleType IN ("Programme", "Movie")
        AND TitleDetails_title NOT IN ("OnDemand Movie")
        GROUP BY LOWER(TitleDetails_title), TitleType, LANGUAGE
    ),

    test_data AS (
        SELECT title, type, language, synopsis, 
           strip_str_array(SPLIT(keywords, ",")) AS keywords
        FROM source cmv
        LEFT JOIN `res-nbcupea-dev-ds-sandbox-001.recsystem.ContentOrdinalId` cid
            ON LOWER(cmv.title) = LOWER(cid.program_title)
    )

    SELECT * FROM test_data

Query complete after 0.01s: 100%|██████████| 8/8 [00:00<00:00, 2766.01query/s]                        
Downloading:  25%|██▌       | 44444/176668 [01:19<07:42, 285.92rows/s]

In [322]:
# mapping Centroid to dataset
query = """
    WITH orig AS (
        SELECT title AS program_title, type AS program_type, language AS program_language, 
            synopsis AS program_longsynopsis, tags
        FROM `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.synopsis_cmv_full_tags_edc_dev`
    ),
    
    
    mapper AS (
        SELECT tags, centroid_word, cluster
        FROM `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.keyword_clusters`
    ),
    
    collections AS (
        SELECT a.program_title, a.program_type, a.program_language, a.program_longsynopsis,
        ARRAY_AGG(DISTINCT b.centroid_word) AS tags
        FROM orig a
        JOIN mapper b
        ON b.tags IN UNNEST(a.tags)
        GROUP BY a.program_title, a.program_type, a.program_language, a.program_longsynopsis
    )
    
    SELECT program_title, program_type, program_language, program_longsynopsis, tags
    FROM collections
"""

table_id = f"{client.project}.metadata_enhancement.synopsis_cmv_167_clustered_tags"
job_config = bigquery.QueryJobConfig(destination=table_id, write_disposition="WRITE_TRUNCATE")

client.query(query=query, job_config=job_config,  location="US").result()

<google.cloud.bigquery.table.RowIterator at 0x7f30485d3110>

In [324]:
%%bigquery
SELECT *
FROM metadata_enhancement.synopsis_cmv_167_clustered_tags
WHERE program_title = "The Office"

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 375.40query/s]                          
Downloading: 100%|██████████| 1/1 [00:01<00:00,  1.58s/rows]


Unnamed: 0,program_title,program_type,program_language,program_longsynopsis,tags
0,The Office,Programme,eng,Michael is overly confident about his team's c...,"[teens (ages 13-14), Christmas, Comedy, Sitcom..."


In [323]:
%%bigquery
SELECT DISTINCT t
FROM metadata_enhancement.synopsis_cmv_167_clustered_tags,
UNNEST(tags) t

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 655.31query/s]                         
Downloading: 100%|██████████| 165/165 [00:01<00:00, 121.97rows/s]


Unnamed: 0,t
0,Art
1,eng
2,Drama
3,Christmas
4,Comedy
...,...
160,Restaurant owner
161,Auction
162,Sitcom
163,Auto racing


In [11]:
# Full merlin dataset
query = """
    WITH movie_base AS (
        SELECT DISTINCT * FROM (
            SELECT program_type, program_val, program_title, program_longsynopsis, program_language,
                    tag_value
                    FROM content_metadata.merlin_tags tags
                    INNER JOIN content_metadata.merlin_program progs
                    ON tags.program_id = progs.program_val
                    INNER JOIN `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.node2vec_token_edc_dev` n2v
                    ON n2v.tokens = tag_value
                    WHERE program_type LIKE 'Movie'
                    AND program_title NOT LIKE 'OnDemand Movie'
                    AND (program_language LIKE 'eng'
                        OR
                        program_language LIKE 'spa')
                    AND program_longsynopsis IS NOT NULL
                    AND program_seriesid IS NULL
                    

            UNION ALL
            SELECT DISTINCT
                    program_type,
                    program_val,
                    program_title,
                    program_longsynopsis,
                    program_language,
                    program_type as tag_value
            FROM content_metadata.merlin_tags tags
            INNER JOIN content_metadata.merlin_program progs
            ON tags.program_id = progs.program_val
            WHERE program_type LIKE 'Movie'
            AND program_title NOT LIKE 'OnDemand Movie'
            AND (program_language LIKE 'eng'
                        OR
                program_language LIKE 'spa')
            AND program_longsynopsis IS NOT NULL
            AND program_seriesid IS NULL

            UNION ALL
            SELECT DISTINCT
                    program_type,
                    program_val,
                    program_title,
                    program_longsynopsis,
                    program_language,
                    program_language as tag_value
            FROM content_metadata.merlin_tags tags
            INNER JOIN content_metadata.merlin_program progs
            ON tags.program_id = progs.program_val
            WHERE program_type LIKE 'Movie'
            AND program_title NOT LIKE 'OnDemand Movie'
            AND (program_language LIKE 'eng'
                        OR
                program_language LIKE 'spa')
            AND program_longsynopsis IS NOT NULL
            AND program_seriesid IS NULL

        )
    ),

    movie_final AS (
        SELECT
            program_type,
            program_title,
            program_longsynopsis,
            program_language,
            ARRAY_AGG(DISTINCT mb.tag_value) tags
        FROM movie_base mb
        GROUP BY
            program_type,
            program_title,
            program_longsynopsis,
            program_language
    ),
    series_base AS (
        SELECT DISTINCT * FROM (
            SELECT program_type, program_val, program_title, program_longsynopsis, 
                    program_language,
                    program_seriesid,
                    tag_Value,
                    FROM content_metadata.merlin_tags tags
                    INNER JOIN content_metadata.merlin_program progs
                    ON tags.program_id = progs.program_val
                    INNER JOIN `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.node2vec_token_edc_dev` n2v
                    ON tag_value = n2v.tokens
                    WHERE (
                        program_type LIKE 'Episode'
                            OR program_type LIKE 'SeriesMaster'
                    )
                    AND (program_language LIKE 'eng'
                        OR
                        program_language LIKE 'spa')
                    AND program_longsynopsis IS NOT NULL

            UNION ALL
            SELECT program_type, program_val, program_title, program_longsynopsis, program_language,
                    program_seriesid, "Episode" AS tag_value
                    FROM content_metadata.merlin_tags tags
                    INNER JOIN content_metadata.merlin_program progs
                    ON tags.program_id = progs.program_val
                    WHERE (
                        program_type LIKE 'Episode'
                            OR program_type LIKE 'SeriesMaster'
                    )
                    AND (program_language LIKE 'eng'
                        OR
                        program_language LIKE 'spa')
                    AND program_longsynopsis IS NOT NULL

            UNION ALL
            SELECT program_type, program_val, program_title, program_longsynopsis, program_language,
                    program_seriesid, program_language AS tag_value
                    FROM content_metadata.merlin_tags tags
                    INNER JOIN content_metadata.merlin_program progs
                    ON tags.program_id = progs.program_val
                    WHERE (
                        program_type LIKE 'Episode'
                            OR program_type LIKE 'SeriesMaster'
                    )
                    AND (program_language LIKE 'eng'
                        OR
                        program_language LIKE 'spa')
                    AND program_longsynopsis IS NOT NULL

        )
    ),
    
    series_final AS (
        SELECT
            "Episode" AS program_type,
            program_title,
            STRING_AGG(DISTINCT program_longsynopsis, " ") AS program_longsynopsis,
            program_language,
            ARRAY_AGG(DISTINCT sb.tag_value) tags
        FROM series_base sb
        GROUP BY
            program_title,
            program_language
    )
    SELECT * FROM movie_final
    UNION ALL
    SELECT * FROM series_final
"""
table_id = f"{client.project}.metadata_enhancement.synopsis_full_tags_edc_dev"
job_config = bigquery.QueryJobConfig(destination=table_id, write_disposition="WRITE_TRUNCATE")

client.query(query=query, job_config=job_config,  location="US").result()

<google.cloud.bigquery.table.RowIterator at 0x7f7788b743d0>

In [21]:
query = """
    WITH movie_base AS (
        SELECT DISTINCT * FROM (
            SELECT program_type, program_val, program_title, program_longsynopsis, program_language,
                    tag_value, 
                    CASE 
                        WHEN tag_type IN ("Genre", "KidsTheme") OR tag_value = "not for kids" THEN tag_value
                        ELSE CAST(NULL AS STRING)
                    END AS genre
                    FROM content_metadata.merlin_tags tags
                    INNER JOIN content_metadata.merlin_program progs
                    ON tags.program_id = progs.program_val
                    INNER JOIN `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.node2vec_token_edc_dev` n2v
                    ON n2v.tokens = tag_value
                    WHERE program_type LIKE 'Movie'
                    AND program_title NOT LIKE 'OnDemand Movie'
                    AND (program_language LIKE 'eng'
                        OR
                        program_language LIKE 'spa')
                    AND program_longsynopsis IS NOT NULL
                    AND program_seriesid IS NULL

            UNION ALL
            SELECT DISTINCT
                    program_type,
                    program_val,
                    program_title,
                    program_longsynopsis,
                    program_language,
                    program_type AS tag_value,
                    CAST(NULL AS STRING) AS tag_type
            FROM content_metadata.merlin_tags tags
            INNER JOIN content_metadata.merlin_program progs
            ON tags.program_id = progs.program_val
            WHERE program_type LIKE 'Movie'
            AND program_title NOT LIKE 'OnDemand Movie'
            AND (program_language LIKE 'eng'
                        OR
                program_language LIKE 'spa')
            AND program_longsynopsis IS NOT NULL
            AND program_seriesid IS NULL

            UNION ALL
            SELECT DISTINCT
                    program_type,
                    program_val,
                    program_title,
                    program_longsynopsis,
                    program_language,
                    program_language as tag_value,
                    CAST(NULL AS STRING) AS tag_type
            FROM content_metadata.merlin_tags tags
            INNER JOIN content_metadata.merlin_program progs
            ON tags.program_id = progs.program_val
            WHERE program_type LIKE 'Movie'
            AND program_title NOT LIKE 'OnDemand Movie'
            AND (program_language LIKE 'eng'
                        OR
                program_language LIKE 'spa')
            AND program_longsynopsis IS NOT NULL
            AND program_seriesid IS NULL

        )
    ),

    movie_final AS (
        SELECT
            program_type,
            program_title,
            program_longsynopsis,
            program_language,
            ARRAY_AGG(DISTINCT genre IGNORE NULLS) AS genre,
            ARRAY_AGG(DISTINCT mb.tag_value) tags
        FROM movie_base mb
        GROUP BY
            program_type,
            program_title,
            program_longsynopsis,
            program_language
    ),
    series_base AS (
        SELECT DISTINCT * FROM (
            SELECT program_type, program_val, program_title, program_longsynopsis, 
                    program_language,
                    program_seriesid,
                    tag_value,
                    CASE 
                        WHEN tag_type IN ("Genre", "KidsTheme") OR tag_value = "not for kids" THEN tag_value
                        ELSE CAST(NULL AS STRING)
                    END AS genre
                    FROM content_metadata.merlin_tags tags
                    INNER JOIN content_metadata.merlin_program progs
                    ON tags.program_id = progs.program_val
                    INNER JOIN `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.node2vec_token_edc_dev` n2v
                    ON tag_value = n2v.tokens
                    WHERE (
                        program_type LIKE 'Episode'
                            OR program_type LIKE 'SeriesMaster'
                    )
                    AND (program_language LIKE 'eng'
                        OR
                        program_language LIKE 'spa')
                    AND program_longsynopsis IS NOT NULL

            UNION ALL
            SELECT program_type, program_val, program_title, program_longsynopsis, program_language,
                    program_seriesid, "Episode" AS tag_value, CAST(NULL AS STRING) AS genre
                    FROM content_metadata.merlin_tags tags
                    INNER JOIN content_metadata.merlin_program progs
                    ON tags.program_id = progs.program_val
                    WHERE (
                        program_type LIKE 'Episode'
                            OR program_type LIKE 'SeriesMaster'
                    )
                    AND (program_language LIKE 'eng'
                        OR
                        program_language LIKE 'spa')
                    AND program_longsynopsis IS NOT NULL

            UNION ALL
            SELECT program_type, program_val, program_title, program_longsynopsis, program_language,
                    program_seriesid, program_language AS tag_value, CAST(NULL AS STRING) AS genre
                    FROM content_metadata.merlin_tags tags
                    INNER JOIN content_metadata.merlin_program progs
                    ON tags.program_id = progs.program_val
                    WHERE (
                        program_type LIKE 'Episode'
                            OR program_type LIKE 'SeriesMaster'
                    )
                    AND (program_language LIKE 'eng'
                        OR
                        program_language LIKE 'spa')
                    AND program_longsynopsis IS NOT NULL

        )
    ),
    
    series_final AS (
        SELECT
            "Episode" AS program_type,
            program_title,
            STRING_AGG(DISTINCT program_longsynopsis, " ") AS program_longsynopsis,
            program_language,
            ARRAY_AGG(DISTINCT sb.genre IGNORE NULLS) AS genre,
            ARRAY_AGG(DISTINCT sb.tag_value) tags
        FROM series_base sb
        GROUP BY
            program_title,
            program_language
    )
    SELECT * FROM movie_final
    UNION ALL
    SELECT * FROM series_final
"""
table_id = f"{client.project}.metadata_enhancement.synopsis_full_tags_genre_edc_dev"
job_config = bigquery.QueryJobConfig(destination=table_id, write_disposition="WRITE_TRUNCATE")

client.query(query=query, job_config=job_config,  location="US").result()

<google.cloud.bigquery.table.RowIterator at 0x7f774ae118d0>

# Clustering the keywords / tags

In [288]:
# Cluster the keywords
query = """
SELECT DISTINCT tags
FROM `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.synopsis_full_tags_genre_edc_dev`,
UNNEST(tags) tags
"""

df_words = client.query(query=query, location="US").to_dataframe()
df_words

Unnamed: 0,tags
0,DJ
1,Music
2,Creative
3,California
4,Movie
...,...
5000,Chiméres
5001,President Aristide
5002,Port-au-Prince
5003,Seminola College


In [289]:
special_labels = ["eng", "spa", "Episode", "Movie", 'older teens (ages 15+)',
                'tweens (ages 10-12)',
                'teens (ages 13-14)',
                'big kids (ages 8-9)',
                'little kids (ages 5-7)',
                'preschoolers (ages 2-4)',
                'not for kids']
custom_kept_labels = ['Documentary', 'Comedy', 'Drama', 'Thriller', 'Horror', 'Romance',
       'Sitcom', 'Action & Adventure', 'Fantasy', 'Crime', 'Mystery',
       "Children's/Family Entertainment", 'History', 'Science fiction',
       'Sports', 'Educational', 'Animated', 'Military & War',
       'Western', 'Gay and Lesbian']

df_words = df_words.loc[~df_words["tags"].isin(special_labels)]
df_words

Unnamed: 0,tags
0,DJ
1,Music
2,Creative
3,California
5,Pursuit
...,...
5000,Chiméres
5001,President Aristide
5002,Port-au-Prince
5003,Seminola College


In [290]:
import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub

model = hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim128/2")

dataset = tf.data.Dataset.from_tensor_slices(df_words["tags"].values).batch(50)
res = []
for batch in tqdm(dataset):
     res.append(model(batch))
df_words["embed"] = list(tf.concat(res, axis=0).numpy())
df_words.to_pickle("./scratch/hub_embed_keywords_embed2.pkl")
df_words





  0%|          | 0/100 [00:00<?, ?it/s]



100%|██████████| 100/100 [00:00<00:00, 675.41it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,tags,embed
0,DJ,"[-0.011037577, 0.06778954, -0.081553265, -0.04..."
1,Music,"[-0.057594366, 0.16248569, -0.12246627, 0.0655..."
2,Creative,"[-0.0642648, 0.20921557, -0.056612816, 0.06104..."
3,California,"[-0.00453534, 0.03185876, 0.19379067, -0.04166..."
5,Pursuit,"[-0.110649586, 0.09063735, 0.026033552, -0.002..."
...,...,...
5000,Chiméres,"[-0.20667349, -0.004094101, 0.026103506, 0.037..."
5001,President Aristide,"[-0.08246881, -0.0019505324, 0.032192685, 0.02..."
5002,Port-au-Prince,"[-0.12202282, 0.05105583, -0.09921123, 0.10520..."
5003,Seminola College,"[-0.1269239, 0.018460747, -0.020410243, 0.0417..."


In [291]:
## Clustering
from sklearn.cluster import KMeans
kmeans_clust = KMeans(n_clusters=140, random_state=42)
# Assign clusters
df_words["cluster"] = kmeans_clust.fit_predict(np.stack(df_words["embed"].values, axis=0))
df_words = df_words.sort_values("cluster").reset_index(drop=True)
# Figure out a "centeroid word"
# get the centroid
def centroid_word_func(pdf):
    values = np.stack(pdf["embed"].values, axis=0)
    centroid = values.mean(axis=0, keepdims=True)
    dist = np.sum((values - centroid)**2, axis=1)
    min_index = np.argmin(dist)
    pdf["centroid_word"] = pdf["tags"].iloc[min_index]
    pdf["num_words"] = pdf.shape[0]
    return pdf

df_words = df_words.groupby(by="cluster", as_index=False).progress_apply(centroid_word_func)
df_words

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
100%|██████████| 140/140 [00:00<00:00, 576.18it/s]


Unnamed: 0,tags,embed,cluster,centroid_word,num_words
0,Candy cane,"[-0.08472573, -0.14358519, 0.10053414, -0.0518...",0,Toy factory,11
1,Toy,"[-0.06649739, 0.09958365, 0.0536462, 0.0273038...",0,Toy factory,11
2,Toy factory,"[-0.048868295, 0.13744843, 0.12971462, 0.04136...",0,Toy factory,11
3,Toy maker,"[-0.053410415, 0.023874693, 0.012954429, -0.06...",0,Toy factory,11
4,Toys,"[-0.09476787, 0.15568548, -0.095866, 0.0462267...",0,Toy factory,11
...,...,...,...,...,...
4989,Affliction,"[-0.106540464, 0.015190207, -0.09322527, 0.031...",139,Punk,39
4990,Street punk,"[-0.082332194, 0.12206013, -0.004195919, 0.096...",139,Punk,39
4991,Punk rocker,"[-0.042677656, -0.020117873, -0.091137476, -0....",139,Punk,39
4992,Undertaker,"[-0.07260105, -0.020444859, -0.20298453, 0.053...",139,Punk,39


In [292]:
# Go through the custom list
for tt in custom_kept_labels:
    df_words[tt] = df_words["tags"].str.contains(tt[:4])
    
indices = [np.where(ii)[0][0]  if len(np.where(ii)[0]) > 0 else -1 for ii in df_words.iloc[:, 5:].values]
for jj, ii in enumerate(indices):
    if ii < 0: continue
    df_words.loc[jj, "centroid_word"] = custom_kept_labels[ii]
df_words

Unnamed: 0,tags,embed,cluster,centroid_word,num_words,Documentary,Comedy,Drama,Thriller,Horror,...,Mystery,Children's/Family Entertainment,History,Science fiction,Sports,Educational,Animated,Military & War,Western,Gay and Lesbian
0,Candy cane,"[-0.08472573, -0.14358519, 0.10053414, -0.0518...",0,Toy factory,11,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Toy,"[-0.06649739, 0.09958365, 0.0536462, 0.0273038...",0,Toy factory,11,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Toy factory,"[-0.048868295, 0.13744843, 0.12971462, 0.04136...",0,Toy factory,11,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Toy maker,"[-0.053410415, 0.023874693, 0.012954429, -0.06...",0,Toy factory,11,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Toys,"[-0.09476787, 0.15568548, -0.095866, 0.0462267...",0,Toy factory,11,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4989,Affliction,"[-0.106540464, 0.015190207, -0.09322527, 0.031...",139,Punk,39,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4990,Street punk,"[-0.082332194, 0.12206013, -0.004195919, 0.096...",139,Punk,39,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4991,Punk rocker,"[-0.042677656, -0.020117873, -0.091137476, -0....",139,Punk,39,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4992,Undertaker,"[-0.07260105, -0.020444859, -0.20298453, 0.053...",139,Punk,39,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [293]:
df_words = df_words.drop(columns=custom_kept_labels)
df_words

Unnamed: 0,tags,embed,cluster,centroid_word,num_words
0,Candy cane,"[-0.08472573, -0.14358519, 0.10053414, -0.0518...",0,Toy factory,11
1,Toy,"[-0.06649739, 0.09958365, 0.0536462, 0.0273038...",0,Toy factory,11
2,Toy factory,"[-0.048868295, 0.13744843, 0.12971462, 0.04136...",0,Toy factory,11
3,Toy maker,"[-0.053410415, 0.023874693, 0.012954429, -0.06...",0,Toy factory,11
4,Toys,"[-0.09476787, 0.15568548, -0.095866, 0.0462267...",0,Toy factory,11
...,...,...,...,...,...
4989,Affliction,"[-0.106540464, 0.015190207, -0.09322527, 0.031...",139,Punk,39
4990,Street punk,"[-0.082332194, 0.12206013, -0.004195919, 0.096...",139,Punk,39
4991,Punk rocker,"[-0.042677656, -0.020117873, -0.091137476, -0....",139,Punk,39
4992,Undertaker,"[-0.07260105, -0.020444859, -0.20298453, 0.053...",139,Punk,39


In [294]:
# Adding the special words
df_special = pd.DataFrame({"tags": [kk for kk in special_labels], 
                           "embed" : [[]] * len(special_labels),
                           "cluster": np.arange(140, 140+len(special_labels)),
                           "centroid_word": [kk for kk in special_labels],
                           "num_words": np.ones(len(special_labels), dtype=int)}
                          )
df_final_map = df_words.append(df_special)
df_final_map



Unnamed: 0,tags,embed,cluster,centroid_word,num_words
0,Candy cane,"[-0.08472573, -0.14358519, 0.10053414, -0.0518...",0,Toy factory,11
1,Toy,"[-0.06649739, 0.09958365, 0.0536462, 0.0273038...",0,Toy factory,11
2,Toy factory,"[-0.048868295, 0.13744843, 0.12971462, 0.04136...",0,Toy factory,11
3,Toy maker,"[-0.053410415, 0.023874693, 0.012954429, -0.06...",0,Toy factory,11
4,Toys,"[-0.09476787, 0.15568548, -0.095866, 0.0462267...",0,Toy factory,11
...,...,...,...,...,...
6,teens (ages 13-14),[],146,teens (ages 13-14),1
7,big kids (ages 8-9),[],147,big kids (ages 8-9),1
8,little kids (ages 5-7),[],148,little kids (ages 5-7),1
9,preschoolers (ages 2-4),[],149,preschoolers (ages 2-4),1


In [295]:
df_final_map["centroid_word"].unique().shape

(167,)

In [296]:
df_final_map.to_gbq("metadata_enhancement.keyword_clusters",
    project_id=client.project,
    if_exists='replace')

1it [00:11, 11.49s/it]


# Mapping centroid words to dataset

In [25]:
query = """
    WITH orig AS (
        SELECT program_title, program_type, program_language, program_longsynopsis, genre, tags
        FROM `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.synopsis_full_tags_genre_edc_dev`
    ),
    
    
    mapper AS (
        SELECT tags, centroid_word, cluster
        FROM `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.keyword_clusters`
    ),
    
    collections AS (
        SELECT a.program_title, a.program_type, a.program_language, a.program_longsynopsis,
        ANY_VALUE(a.genre) AS genre,
        ANY_VALUE(a.tags) AS keywords,
        ARRAY_AGG(DISTINCT b.centroid_word) AS tags
        FROM orig a
        JOIN mapper b
        ON b.tags IN UNNEST(a.tags)
        GROUP BY a.program_title, a.program_type, a.program_language, a.program_longsynopsis
    )
    
    SELECT program_title, program_type, program_language, program_longsynopsis, genre, keywords, tags
    FROM collections
    WHERE program_title IS NOT NULL 
        AND program_type IS NOT NULL
        AND program_longsynopsis IS NOT NULL
        AND ARRAY_LENGTH(tags) > 0
"""

table_id = f"{client.project}.metadata_enhancement.synopsis_genres_167_clustered_tags"
job_config = bigquery.QueryJobConfig(destination=table_id, write_disposition="WRITE_TRUNCATE")

client.query(query=query, job_config=job_config, location="US").result()


<google.cloud.bigquery.table.RowIterator at 0x7f7771fc8810>

In [311]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
embeddings = embed([
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding",
    "Un adelanto y los comentarios de Kevin Costner y el elenco del programa. Un secreto sobre John sale a la luz; Beth le muestra a Jenkins una noche difícil; un tiempo especial con Tate conduce a casi un desastre. Con el calor del verano la naturaleza de Yellowstone se ha mostrado en toda su plenitud, en tan solo unos pocos meses ha florecido de manera extraordinaria. Pero ahora en tan solo unas semanas volverá la nieve y el frío."
])

print(embeddings)













tf.Tensor(
[[-0.07796571 -0.07574748  0.02141027 ... -0.01937356  0.05496128
   0.00755278]
 [-0.06348769 -0.03529344  0.03835971 ... -0.00586085 -0.01329273
   0.10802048]
 [ 0.06036134  0.01039356 -0.07722514 ... -0.06877957  0.05509286
  -0.03794941]], shape=(3, 512), dtype=float32)


In [312]:
summ = 0
for ii in embed.variables:
    summ += (np.prod(ii.shape))
print(summ)



68927232


# How does raw content to content look?

In [330]:
%%bigquery tag_index

SELECT DISTINCT tags
FROM `res-nbcupea-dev-ds-sandbox-001.metadata_enhancement.synopsis_cmv_167_clustered_tags`,
UNNEST(tags) tags

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 672.87query/s]                         
Downloading: 100%|██████████| 165/165 [00:01<00:00, 109.85rows/s]


In [335]:
tag_list = tag_index["tags"].tolist()
tag_list

['Art',
 'eng',
 'Drama',
 'Christmas',
 'Comedy',
 'Romance',
 'Army',
 'Car',
 'Crime',
 'Race',
 'Action & Adventure',
 'Mountain',
 'Spirit',
 'Comedian',
 'Animal',
 'Soldiers',
 'Grandmother',
 'Murder',
 'Bounty hunter',
 'American South',
 'Pervert',
 'Su-Su',
 'Thrilling',
 'Thriller',
 'Mystery',
 'Extortion',
 'Zombies',
 'Warrior',
 'Austria',
 'older teens (ages 15+)',
 'Dracula',
 'Confusion',
 'Running away',
 '5th century',
 'City',
 'Reality',
 'Prison',
 'Aviation',
 'History',
 'War',
 'Military & War',
 'Poetry',
 'teens (ages 13-14)',
 'Sexy woman',
 'Russian mob',
 'Family',
 '1870s',
 "Children's/Family Entertainment",
 'White House',
 'New Zealand',
 'Adolescent boy',
 'Fantasy',
 'Political turmoil',
 'Emperor',
 'Equestrian',
 'Science',
 'Richard Nixon',
 'John Holmes',
 'Death of parent',
 'Missouri',
 'Professionalism',
 'Action figure',
 'Racism',
 'Family member',
 'Road',
 'Chicago',
 'Amusement park',
 'Computer',
 'Lounge',
 'Christianity',
 'Mother/da

In [9]:
%%bigquery unscored_titles

WITH cid AS (
    SELECT DISTINCT program_title, content_ordinal_id
    FROM `res-nbcupea-dev-ds-sandbox-001.recsystem.ContentOrdinalId`
)

SELECT a.program_title, a.program_type, a.program_longsynopsis, a.program_language, 
    STRING_AGG(DISTINCT t, " ") AS keywords, b.content_ordinal_id
FROM `metadata_enhancement.synopsis_cmv_167_clustered_tags` a,
UNNEST(a.tags) t
JOIN cid b
ON LOWER(a.program_title) = LOWER(b.program_title)
GROUP BY a.program_title, a.program_type, a.program_language, a.program_longsynopsis, b.content_ordinal_id


Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 491.08query/s] 
Downloading: 100%|██████████| 2286/2286 [00:02<00:00, 952.35rows/s] 


In [341]:
def tags_to_index(tags, tags_list):
    col = np.array([tags_list.index(t) for t in tags], dtype=int)
    data = np.ones(len(col), dtype=int)
    row = np.zeros(len(col), dtype=int)
    return coo_matrix((data, (row, col)), shape=(1, len(tags_list))).A[0]

df_test["labels"] = df_test["tags"].progress_apply(lambda x: tags_to_index(x, tag_list))
df_test

100%|██████████| 2286/2286 [00:00<00:00, 11166.91it/s]


Unnamed: 0,program_title,program_type,program_longsynopsis,program_language,tags,content_ordinal_id,labels
0,Christmas Matchmakers,Movie,Two overworked personal assistants hatch a pla...,eng,"[Art, eng, Drama, Christmas, Comedy, Romance]",1678,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,John Wick 2,Movie,Legendary hit man John Wick comes out of retir...,eng,"[Art, Army, Car, Crime, Race, eng, Action & Ad...",2270,"[1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,Ashes in the Snow,Movie,"During World War II, a 16-year-old artist and ...",eng,"[Art, Comedy, Austria, Reality, Romance, Priso...",111,"[1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Alice in Wonderland,Movie,Little Alice (Charlotte Henry) meets the Chesh...,eng,"[Art, Aviation, 1870s, Children's/Family Enter...",62,"[1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, ..."
4,Thomas Kinkade's Christmas Cottage,Movie,"Inspired by his mentor (Peter O'Toole), a youn...",eng,"[Art, Adolescent boy, Death of parent, Mountai...",954,"[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ..."
...,...,...,...,...,...,...,...
2281,Fievel's American Tails,Programme,Fievel and the Mousekewitz family. Animated. F...,eng,"[Children's/Family Entertainment, Adolescent b...",348,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
2282,DreamWorks Nursery Rhymes,Programme,"You'll feel great moving around to ""Ready To G...",eng,"[Children's/Family Entertainment, Adolescent b...",306,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2283,Baby Einstein,Programme,A children's safari that encourages vocal part...,eng,"[Children's/Family Entertainment, Adolescent b...",120,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2284,Stone Age,Programme,"It's not Billy's day today, and with everythin...",eng,"[Children's/Family Entertainment, Adolescent b...",1142,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."


In [345]:
def cosine_sim(P):
    P = P / np.sqrt(np.sum(P**2, axis=1, keepdims=True))
    cos_sim_c2c = P @ P.T
    cos_sim_c2c = np.nan_to_num(cos_sim_c2c, nan=-1)
    return cos_sim_c2c

labels = np.stack(df_test["labels"].values, axis=0)
similarity = cosine_sim(labels)

# Slice out top 15 recommendations
score = list(np.sort(similarity, axis=1)[:, ::-1][:, 1:(15+1)])
sim_c2c_argsort = np.argsort(similarity, axis=1)[:, ::-1][:, 1:]
titles = list(np.take(df_test["program_title"].values, sim_c2c_argsort[:, :15]))
titles_type = list(np.take(df_test["program_type"].values, sim_c2c_argsort[:, :15]))
synopsis = list(np.take(df_test["program_longsynopsis"].values, sim_c2c_argsort[:, :15]))
dict_list = [{"program_title": tt, "program_type": ttype, "program_longsynopsis": syn,  "score": sc} \
             for tt, ttype, syn, sc in zip(titles, titles_type, synopsis, score)]

df_test["top15"] = dict_list
# Calculate a type match
df_test["type_match"] = (np.stack(titles_type) == df_test["program_type"][:, None]).mean(axis=1)

df_test



Unnamed: 0,program_title,program_type,program_longsynopsis,program_language,tags,content_ordinal_id,labels,top15,type_match
0,Christmas Matchmakers,Movie,Two overworked personal assistants hatch a pla...,eng,"[Art, eng, Drama, Christmas, Comedy, Romance]",1678,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","{'program_title': ['A Christmas Princess', 'A ...",1.000000
1,John Wick 2,Movie,Legendary hit man John Wick comes out of retir...,eng,"[Art, Army, Car, Crime, Race, eng, Action & Ad...",2270,"[1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","{'program_title': ['Precious Cargo', 'John Wic...",1.000000
2,Ashes in the Snow,Movie,"During World War II, a 16-year-old artist and ...",eng,"[Art, Comedy, Austria, Reality, Romance, Priso...",111,"[1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","{'program_title': ['Schindler's List', 'Cesar ...",1.000000
3,Alice in Wonderland,Movie,Little Alice (Charlotte Henry) meets the Chesh...,eng,"[Art, Aviation, 1870s, Children's/Family Enter...",62,"[1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, ...","{'program_title': ['Essential Killing', 'Curio...",1.000000
4,Thomas Kinkade's Christmas Cottage,Movie,"Inspired by his mentor (Peter O'Toole), a youn...",eng,"[Art, Adolescent boy, Death of parent, Mountai...",954,"[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...",{'program_title': ['Girl With a Pearl Earring'...,1.000000
...,...,...,...,...,...,...,...,...,...
2281,Fievel's American Tails,Programme,Fievel and the Mousekewitz family. Animated. F...,eng,"[Children's/Family Entertainment, Adolescent b...",348,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","{'program_title': ['My Life Me', 'Fievel's Ame...",0.866667
2282,DreamWorks Nursery Rhymes,Programme,"You'll feel great moving around to ""Ready To G...",eng,"[Children's/Family Entertainment, Adolescent b...",306,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","{'program_title': ['Timmy the Tooth', 'Blue Ba...",0.866667
2283,Baby Einstein,Programme,A children's safari that encourages vocal part...,eng,"[Children's/Family Entertainment, Adolescent b...",120,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","{'program_title': ['Little Baby Bum', 'Guess W...",0.933333
2284,Stone Age,Programme,"It's not Billy's day today, and with everythin...",eng,"[Children's/Family Entertainment, Adolescent b...",1142,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","{'program_title': ['My Life Me', 'Fievel's Ame...",0.866667


In [346]:
from IPython.display import display
def query_shows_c2c(df_titles, show_name, show_res=True):
    pdf = df_titles.loc[df_titles["program_title"]==show_name, :]
    pdf_query = pdf[["program_title", "program_type", "program_longsynopsis"]]
    pdf_res = pd.DataFrame(pdf[f"top15"].values[0])
    if show_res:
        print("Query")
        display(pdf_query)
        display(pdf_res)
    
    return pdf_query, pdf_res


In [348]:
important_titles = ["The Office", "30 Rock", "Punky Brewster", "Parks and Recreation", "WWE Monday Night RAW", 
            "Yellowstone", "Saturday Night Live", "Law & Order: Special Victims Unit", 
            "Mr. Mercedes", "Happy Feet Two", "Zombie Tidal Wave"]

for ti in important_titles:
    _ = query_shows_c2c(df_test, ti)

Query


Unnamed: 0,program_title,program_type,program_longsynopsis
2250,The Office,Programme,Michael is overly confident about his team's c...


Unnamed: 0,program_title,program_type,program_longsynopsis,score
0,Grace Under Fire,Programme,Grace's Valentine's Day date with suitor Ryan ...,1.0
1,Everybody Loves Raymond,Programme,"After having a fight with Frank, Marie moves i...",1.0
2,The Jeff Foxworthy Show,Programme,A fishing trip with Karen and two friends goes...,1.0
3,George Lopez,Programme,George and Benny try to fill jailed Jim's spot...,0.912871
4,Grounded for Life,Programme,"When Sean receives a small inheritance, he mus...",0.912871
5,Perfect Harmony,Programme,When Reverend Jax is visited by his charismati...,0.912871
6,Odd Mom Out,Programme,Jill falls in love with the down-to-earth moms...,0.894427
7,Dennis the Menace,Programme,The widowed Grandpa Perkins visits but resists...,0.894427
8,Roseanne,Programme,Dan's 15-year-old secret is exposed during pre...,0.845154
9,The Goode Family,Programme,Gerald tries to inspire his children by adopti...,0.8


Query


Unnamed: 0,program_title,program_type,program_longsynopsis
2046,30 Rock,Programme,Jack meets with German TV executives to close ...


Unnamed: 0,program_title,program_type,program_longsynopsis,score
0,Humor Me,Movie,An aging father refuses to engage emotionally ...,0.6
1,The Big Wedding,Movie,"A long-divorced couple (Robert De Niro, Diane ...",0.564288
2,Saturday Night Live,Programme,Talented actors and comedians who are accompan...,0.562254
3,Fatal Attraction,Movie,A New York lawyer (Michael Douglas) with a wif...,0.55
4,The TV Set,Movie,A network picks up Mike Klein's (David Duchovn...,0.536745
5,The Hi-Lo Country,Movie,A rift develops between longtime friends (Wood...,0.536745
6,What to Expect When You're Expecting,Movie,Pregnancy hormones wreak havoc on a baby-crazy...,0.536656
7,Love Actually,Movie,"A prime minister, an office worker (Hugh Grant...",0.527046
8,Superstore,Programme,Amy and Jonah return from a four-month suspens...,0.51465
9,Pirate Radio,Movie,In 1966 a young man joins a host of rock-music...,0.512878


Query


Unnamed: 0,program_title,program_type,program_longsynopsis
2251,Punky Brewster,Programme,Punky makes a date with two boys for the same ...


Unnamed: 0,program_title,program_type,program_longsynopsis,score
0,Good Times,Programme,Florida and James fear that there is more betw...,0.83666
1,Saved by the Bell,Programme,Coming Soon Zack's antics get Slater thrown ou...,0.8
2,Everybody Hates Chris,Programme,Chris is put in charge of doing the family's l...,0.774597
3,Saved by the Bell: The College Years,Programme,"Zack, Slater, Alex and Kelly (Mark-Paul Gossel...",0.774597
4,Sunnyside,Programme,Garrett helps Mei Lin and Jun Ho prepare for a...,0.717137
5,Angels Sing,Movie,"During the holidays, a mysterious stranger (Wi...",0.717137
6,Christmas for a Dollar,Movie,"During the Great Depression, the Kamp family s...",0.707107
7,I'll Be Homeless for Christmas,Movie,After discovering that his latest mark is a si...,0.707107
8,The Munsters,Programme,Herman is fired from his job at the funeral pa...,0.707107
9,Modern Family,Programme,Luke and Manny's hot-tempered basketball coach...,0.7


Query


Unnamed: 0,program_title,program_type,program_longsynopsis
1813,Parks and Recreation,Programme,Leslie must chose between Ben and her dream of...


Unnamed: 0,program_title,program_type,program_longsynopsis,score
0,Cheers,Programme,Diane Chambers ends up as a waitress in a Bost...,0.762713
1,Frasier,Programme,Frasier must decide whether or not to attend h...,0.707107
2,Modern Family,Programme,Luke and Manny's hot-tempered basketball coach...,0.632456
3,Beware the Gonzo,Movie,Eddie starts an underground movement to give a...,0.612372
4,George Lopez,Programme,George and Benny try to fill jailed Jim's spot...,0.612372
5,Inside the Rain,Movie,Facing expulsion from college over a misunders...,0.612372
6,Ping Pong Summer,Movie,"In 1985 a summer vacation in Ocean City, Md., ...",0.583333
7,Two and a Half Men,Programme,Charlie unknowingly sleeps with an elderly man...,0.583333
8,The Tonight Show Starring Jimmy Fallon,Programme,Actor Jesse Eisenberg; actress Hailee Steinfel...,0.566947
9,The Humbling,Movie,"Following a breakdown and suicide attempt, an ...",0.566947


Query


Unnamed: 0,program_title,program_type,program_longsynopsis
2227,WWE Monday Night RAW,Programme,Stone Cold Steve Austin vs. Vader. The Underta...


Unnamed: 0,program_title,program_type,program_longsynopsis,score
0,The Big Break,Programme,"Both teams try their luck in ""Take a Chance"" a...",0.75
1,The Men in Blazers Show,Programme,A review of the weekend's Premier League match...,0.75
2,WrestleMania Rewind,Programme,The Rock goes one-on-one with Hulk Hogan at Wr...,0.707107
3,Celebrating Latinas in music,Programme,"On the September 22nd edition of ""E! Pop on Pe...",0.707107
4,The Rich Eisen Show,Programme,Rich Eisen's daily national sports talk progra...,0.707107
5,WrestleMania,Programme,The historic first WrestleMania from Madison S...,0.707107
6,The Best of WWE,Programme,Beer bashes in the ring. Shocking Stunners on ...,0.707107
7,WWE The Bump,Programme,Featuring WWE Superstar special guests and mor...,0.707107
8,The 50 Greatest,Programme,It's WWE's top 10 greatest female Superstars o...,0.707107
9,Royal Rumble,Programme,Twenty Superstars compete in the first-ever Ro...,0.707107


Query


Unnamed: 0,program_title,program_type,program_longsynopsis
2264,Yellowstone,Programme,Rip stumbles upon a dangerous and dire situati...


Unnamed: 0,program_title,program_type,program_longsynopsis,score
0,The Capture,Programme,Coming Soon Rachel and Shaun come face to face...,0.866025
1,Mr. Mercedes,Programme,Suspicions are confirmed when another victim i...,0.866025
2,Law & Order,Programme,"When a notorious hustler is found dead, Cyrus ...",0.774597
3,Suits,Programme,Harvey finds his past coming back to haunt him...,0.774597
4,All We Had,Movie,Set during the worldwide financial crisis in 2...,0.774597
5,Debris,Programme,Two agents from two different continents and t...,0.774597
6,Safe Harbour,Programme,"In the middle of the Timor Sea, five Australia...",0.774597
7,Treadstone,Programme,Doug solves a problem; Edwards has a breakthro...,0.774597
8,Five Bedrooms,Programme,"Ben, Ainsley, Harry and Liz begin their housew...",0.774597
9,Brave New World,Programme,Danger awaits Bernard and Lenina in the Savage...,0.774597


Query


Unnamed: 0,program_title,program_type,program_longsynopsis
2048,Saturday Night Live,Programme,Talented actors and comedians who are accompan...


Unnamed: 0,program_title,program_type,program_longsynopsis,score
0,Late Night With Seth Meyers,Programme,Musician Jeff Goldblum; comic Jacqueline Novak...,0.612672
1,30 Rock,Programme,Jack meets with German TV executives to close ...,0.562254
2,Beyond the Sea,Movie,An older Bobby Darin (Kevin Spacey) tells his ...,0.543075
3,A Little Late With Lilly Singh,Programme,"Actors Mackenzie Davis, Natalia Reyes, Diego B...",0.536088
4,Burlesque,Movie,With help from a savvy stage manager and a gen...,0.524304
5,Dateline NBC,Programme,A man disappears after a duck hunting trip in ...,0.510113
6,Hunky Dory,Movie,"In the summer of 1976, a British drama teacher...",0.502895
7,Mamma Mia!,Movie,Hoping to meet her real father and have him wa...,0.500278
8,The Kids Are Alright,Movie,"Interviews, TV clips and concert footage make ...",0.494451
9,Parks and Recreation,Programme,Leslie must chose between Ben and her dream of...,0.493915


Query


Unnamed: 0,program_title,program_type,program_longsynopsis
1733,Law & Order: Special Victims Unit,Programme,When detective Benson gets into the middle of ...


Unnamed: 0,program_title,program_type,program_longsynopsis,score
0,Chicago P.D.,Programme,A string of bombings is targeting members of t...,0.948683
1,Covert Affairs,Programme,Annie begins her deep cover operation on her o...,0.816497
2,Blindspot,Programme,Jane and Weller's romantic getaway is cut shor...,0.777778
3,The InBetween,Programme,Cassie visits the hospital where Damien's fian...,0.755929
4,The Blacklist,Programme,Ressler is forced to confront past trauma and ...,0.755929
5,Chicago Fire,Programme,Dawson tries to find balance with Lt. Casey; M...,0.755929
6,Bite Club,Programme,Zoe and Dan suspect that two brothers are resp...,0.745356
7,Law & Order,Programme,"When a notorious hustler is found dead, Cyrus ...",0.745356
8,Treadstone,Programme,Doug solves a problem; Edwards has a breakthro...,0.745356
9,Miami Vice,Programme,Miami police detective Sonny Crockett reluctan...,0.745356


Query


Unnamed: 0,program_title,program_type,program_longsynopsis
2269,Mr. Mercedes,Programme,Suspicions are confirmed when another victim i...


Unnamed: 0,program_title,program_type,program_longsynopsis,score
0,The Capture,Programme,Coming Soon Rachel and Shaun come face to face...,1.0
1,Treadstone,Programme,Doug solves a problem; Edwards has a breakthro...,0.894427
2,Safe Harbour,Programme,"In the middle of the Timor Sea, five Australia...",0.894427
3,The Fall,Programme,The killer's latest attack gives Gibson renewe...,0.894427
4,Sacred Lies: The Singing Bones,Programme,Harper has a breakthrough connecting the Cherr...,0.866025
5,Yellowstone,Programme,Rip stumbles upon a dangerous and dire situati...,0.866025
6,Ray Donovan,Programme,Ray makes an incriminating video; Ezra wanders...,0.816497
7,Save Me,Programme,Nelly confronts Melon and questions him over J...,0.816497
8,The Purge,Programme,Jane considers the morality of Purge; Miguel s...,0.816497
9,Briarpatch,Programme,"Sad thing, funerals. Jake throws a party; Alle...",0.816497


Query


Unnamed: 0,program_title,program_type,program_longsynopsis
234,Happy Feet Two,Movie,"Reluctant to dance, the son of Mumble (Elijah ..."


Unnamed: 0,program_title,program_type,program_longsynopsis,score
0,The Land Before Time XIV: Journey of the Brave,Movie,Littlefoot goes on a journey to find his fathe...,0.734847
1,Happy Feet,Movie,"Born without the ability to sing, a young empe...",0.69282
2,The Land Before Time: The Wisdom of Friends,Movie,Littlefoot and friends help two clumsy dinosau...,0.642364
3,Trolls World Tour,Movie,When a rock 'n' roll king and queen set out to...,0.639602
4,The Land Before Time,Movie,An orphaned dinosaur and his new friends face ...,0.57735
5,Golden Winter,Movie,A boy helps abandoned golden retriever puppies...,0.571548
6,The Jungle Bunch,Movie,A penguin raised by tigers defends his jungle ...,0.56
7,Mamma Mia!,Movie,Hoping to meet her real father and have him wa...,0.557086
8,"Curious George: Go West, Go Wild",Movie,"While farm-sitting, Ted goes fishing and Georg...",0.554322
9,Amanda and Jack Go Glamping,Movie,With his marriage and career against the ropes...,0.550598


Query


Unnamed: 0,program_title,program_type,program_longsynopsis
435,Zombie Tidal Wave,Movie,Zombies wreak bloodthirsty havoc after a tidal...


Unnamed: 0,program_title,program_type,program_longsynopsis,score
0,Zombie Tidal Wave,Movie,Zombies wreak bloodthirsty havoc after a tidal...,1.0
1,Killer High,Movie,Sabrina's perfectly planned high school reunio...,0.92582
2,Neverknock,Movie,After Grace and her friends ignore the warning...,0.857143
3,Campus Code,Movie,When one of their classmates disintegrates rig...,0.857143
4,The Mummy's Curse,Movie,"Shipped to Louisiana, mummy Kharis (Lon Chaney...",0.845154
5,The Invisible Man's Revenge,Movie,A criminal (Jon Hall) haunts his enemies' mans...,0.845154
6,The Invisible Woman,Movie,A screwy professor's experiments with a gorgeo...,0.845154
7,Scorched Earth,Movie,A post-apocalyptic bounty hunter tries to brin...,0.845154
8,Body Bags,Movie,Filmmaker John Carpenter introduces a trio of ...,0.845154
9,Android Cop,Movie,A cop and his android partner enter a forbidde...,0.845154
