In [1]:
import logging, re
import apache_beam as beam
from apache_beam.io import WriteToText
from apache_beam.io.gcp.bigquery import ReadFromBigQuery, WriteToBigQuery

    
'''
5/6/2021
The following 'MakeHasGenre' DoFn, applied with ParDo, creates a junction table for the genres of a particular titleID.
It takes in the titleID and its genres as a comma-separated string and creates a record for each unique titleID
and genre combination. The FARM_FINGERPRINT method would not work for the creation of a unique id for genreID,
so that was performed afterwards in BigQuery with MD5.
'''
class MakeHasGenre(beam.DoFn):
    """Explode a title's comma-separated genres into junction-table rows.

    Each input element is a dict with 'titleID' and 'genres' keys. One
    {'titleID': ..., 'genreID': ...} dict is produced per distinct genre
    name of that title; at this stage 'genreID' carries the genre name itself.
    """

    def process(self, element):
        """Return the junction rows for a single title.

        Args:
            element: dict with 'titleID' and 'genres'; 'genres' is a
                comma-separated string or None.

        Returns:
            A list of {'titleID', 'genreID'} dicts. The list is empty when
            'genres' is None or contains no non-blank genre name.
        """
        titleID = element['titleID']
        genres = element['genres']
        if genres is None:
            return []

        # Trim each token and drop blanks: ''.split(',') is [''], which would
        # otherwise become a row with an empty genre.
        names = (token.strip() for token in genres.split(','))
        # dict.fromkeys removes repeated genres while keeping first-seen order,
        # so each (titleID, genre) pair is emitted once.
        unique_names = dict.fromkeys(name for name in names if name)
        return [{'titleID': titleID, 'genreID': name} for name in unique_names]
        
def run():
    """Build datamart.has_genre_Beam from datamart.movie_title on the DirectRunner.

    Reads (titleID, genres) for every title, expands each row with
    MakeHasGenre, writes the rows to text output under the 'output.txt'
    prefix, and loads them into BigQuery. Returns once the pipeline has
    finished.
    """
    PROJECT_ID = 'coastal-well-303101'
    BUCKET = 'gs://allnaturalbrandy2021/temp'

    options = {
        'project': PROJECT_ID
    }
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    # couldn't limit to 500 results because it breaks referential integrity
    sql = 'SELECT titleID, genres FROM datamart.movie_title'

    dataset_id = 'datamart'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'has_genre_Beam'
    schema_id = 'titleID:STRING,genreID:STRING'

    # Leaving the `with` block runs the pipeline and waits for it to finish.
    with beam.Pipeline('DirectRunner', options=opts) as p:
        # ReadFromBigQuery is already a PTransform, so it is applied directly
        # instead of being wrapped in beam.io.Read.
        query_results = p | 'Read from BQ' >> ReadFromBigQuery(
            query=sql, use_standard_sql=True, gcs_location=BUCKET)

        has_genre_pcoll = query_results | 'Make Genres Junction Table' >> beam.ParDo(MakeHasGenre())

        has_genre_pcoll | 'has_genre results' >> WriteToText('output.txt')

        # The table is rebuilt in full from movie_title on every run, so
        # replace its contents; the default (append) would duplicate every
        # row when this cell is executed again.
        has_genre_pcoll | 'Write has_genre to BQ' >> WriteToBigQuery(
            table=table_id,
            schema=schema_id,
            custom_gcs_temp_location=BUCKET,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    
if __name__ == '__main__':
    # Keep the root logger at ERROR so only failures are reported while the
    # pipeline runs.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.ERROR)
    run()

  temp_location = pcoll.pipeline.options.view_as(
  temp_location = p.options.view_as(GoogleCloudOptions).temp_location


In [2]:
import logging, re
import apache_beam as beam
from apache_beam.io import WriteToText
from apache_beam.io.gcp.bigquery import ReadFromBigQuery, WriteToBigQuery


'''
5/6/2021
The following 'MakeGenre' DoFn, applied with ParDo, takes in a dictionary containing a genre from the previously made has_genre_Beam table.
It simply creates a record of genreID and genre and returns it.
The FARM_FINGERPRINT method would not work for the creation of a unique id for genreID,
so that was performed afterwards in BigQuery with MD5.
'''
class MakeGenre(beam.DoFn):
    """Turn one 'genreID' value into a row for the genre table."""

    def process(self, element):
        """Return a single-row list for the element's genre, or None.

        The value found under 'genreID' is copied into both the 'genreID'
        and the 'genre' field of the output record. Elements whose
        'genreID' is None produce no output.
        """
        genre_value = element['genreID']
        if genre_value is None:
            return None
        return [dict(genreID=genre_value, genre=genre_value)]


def run():
    """Build datamart.genre_Beam from the distinct genres in datamart.has_genre_Beam.

    Reads the distinct genreID values, turns each into a (genreID, genre)
    record with MakeGenre, writes the records to text output under the
    'output.txt' prefix, and loads them into BigQuery. Returns once the
    pipeline has finished.

    NOTE(review): the 'genre' column is a copy of whatever has_genre_Beam
    holds in genreID when this runs, so this presumably has to run before the
    MD5 UPDATE on has_genre_Beam for 'genre' to contain readable names.
    """
    PROJECT_ID = 'coastal-well-303101'
    BUCKET = 'gs://allnaturalbrandy2021/temp'

    options = {
        'project': PROJECT_ID
    }
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    # couldn't limit to 500 results because it breaks referential integrity
    sql = 'SELECT DISTINCT genreID FROM datamart.has_genre_Beam'

    dataset_id = 'datamart'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'genre_Beam'
    schema_id = 'genreID:STRING,genre:STRING'

    # Leaving the `with` block runs the pipeline and waits for it to finish.
    with beam.Pipeline('DirectRunner', options=opts) as p:
        # ReadFromBigQuery is already a PTransform, so it is applied directly
        # instead of being wrapped in beam.io.Read.
        query_results = p | 'Read from BQ' >> ReadFromBigQuery(
            query=sql, use_standard_sql=True, gcs_location=BUCKET)

        genre_pcoll = query_results | 'Make Genre Table' >> beam.ParDo(MakeGenre())

        genre_pcoll | 'genre results' >> WriteToText('output.txt')

        # The table is rebuilt in full on every run, so replace its contents;
        # the default (append) would duplicate every genre when this cell is
        # executed again.
        genre_pcoll | 'Write genre to BQ' >> WriteToBigQuery(
            table=table_id,
            schema=schema_id,
            custom_gcs_temp_location=BUCKET,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    
if __name__ == '__main__':
    # Keep the root logger at ERROR so only failures are reported while the
    # pipeline runs.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.ERROR)
    run()

5/6/2021
Making the unique genreID for the has_genre table in BigQuery because the FARM_FINGERPRINT method wasn't working in Apache Beam.

In [1]:
%%bigquery
-- Replace each raw genre name with the Base64 encoding of its MD5 digest.
-- An MD5 digest is 16 bytes, so its Base64 form is always 22 characters
-- followed by '=='. Values already in that form are skipped, which keeps a
-- second execution of this cell from hashing the previous hash.
UPDATE datamart.has_genre_Beam SET genreID = TO_BASE64(MD5(genreID))
WHERE genreID IS NOT NULL
  AND NOT REGEXP_CONTAINS(genreID, r'^[A-Za-z0-9+/]{22}==$')

In [2]:
%%bigquery
-- Sample ten junction rows to eyeball the genreID values.
SELECT
  hg.*
FROM
  datamart.has_genre_Beam AS hg
LIMIT 10

Unnamed: 0,titleID,genreID
0,tt7790104,q0sUgssro/Huf9kgHIofrQ==
1,tt4661864,q0sUgssro/Huf9kgHIofrQ==
2,tt13674766,l6CehafhsQNAnvyT76T+0Q==
3,tt0047942,Be/XJhWFrYDFZd0WPdJQZQ==
4,tt8348492,eatwvuFWL8cO/I7gp9+Z3g==
5,tt0200437,eatwvuFWL8cO/I7gp9+Z3g==
6,tt7444348,eatwvuFWL8cO/I7gp9+Z3g==
7,tt1062254,eatwvuFWL8cO/I7gp9+Z3g==
8,tt0650734,l6CehafhsQNAnvyT76T+0Q==
9,tt6703230,eatwvuFWL8cO/I7gp9+Z3g==


5/6/2021
Making the unique genreID for the genre table in BigQuery because the FARM_FINGERPRINT method wasn't working in Apache Beam.

In [3]:
%%bigquery
-- Derive genreID from the genre name column, which this statement never
-- modifies, instead of from genreID itself. genre_Beam is loaded with
-- genreID equal to genre, so the first run yields the same ids as hashing
-- genreID, and running the cell again no longer hashes the previous hash.
UPDATE datamart.genre_Beam SET genreID = TO_BASE64(MD5(genre))
WHERE genre IS NOT NULL

In [4]:
%%bigquery
-- Sample ten genre rows to eyeball the genreID / genre pairs.
SELECT
  g.*
FROM
  datamart.genre_Beam AS g
LIMIT 10

Unnamed: 0,genreID,genre
0,ubugNXvD+okp0UsfNKXjXg==,religious
1,wmKxjmFKXvQVHQbm9Sh+Ug==,Romance
2,dwsWjmDT3e0AzmDdUe2C6Q==,Period action/adventure
3,zvxPfLyMNHYuD3ZwPn4XTg==,Family
4,bpOlt3LkfC86g0DTAg8/TA==,Drama
5,oYPlJ7+wT1YsIcnGL9EwXQ==,erotic
6,kHwdl0nxCYp9LobhCIvBLA==,swashbuckling
7,xA61FV83ph39653iHaHxUQ==,sentiment
8,IlCZgpXdTHK5xs9VSv7vDg==,folklore
9,EUHOZPJr3pXclvl6TMranA==,Biographical Drama


In [5]:
%%bigquery
-- Key check: titleID is unique in movie_title when both counts are equal.
SELECT
  COUNT(*),
  COUNT(DISTINCT titleID)
FROM
  datamart.movie_title

Unnamed: 0,f0_,f1_
0,7821715,7821715


In [6]:
%%bigquery
-- Referential check: list joined rows that have no movie_title titleID,
-- i.e. has_genre_Beam rows without a matching title. An empty result means
-- every junction row points at an existing title.
SELECT *
FROM datamart.movie_title AS mt
FULL OUTER JOIN datamart.has_genre_Beam AS hgb
  ON hgb.titleID = mt.titleID
WHERE mt.titleID IS NULL

Unnamed: 0,titleID,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runTimeMinutes,genres,titleID_1,genreID


In [7]:
%%bigquery
-- Referential check: list joined rows that have no genre_Beam genreID,
-- i.e. has_genre_Beam rows whose genreID is missing from the genre table.
-- An empty result means every junction row points at an existing genre.
SELECT
  gb.genreID AS genreID,
  hgb.genreID AS has_genreID
FROM datamart.genre_Beam AS gb
FULL OUTER JOIN datamart.has_genre_Beam AS hgb
  ON hgb.genreID = gb.genreID
WHERE gb.genreID IS NULL

Unnamed: 0,genreID,has_genreID


In [8]:
%%bigquery
-- Key check: (titleID, genreID) is unique in has_genre_Beam when the row
-- count equals the number of distinct concatenated pairs.
SELECT
  COUNT(*),
  COUNT(DISTINCT CONCAT(titleID, genreID))
FROM
  datamart.has_genre_Beam

Unnamed: 0,f0_,f1_
0,12421361,12421361


In [9]:
%%bigquery
-- Key check: genreID is unique in genre_Beam when both counts are equal.
SELECT
  COUNT(*),
  COUNT(DISTINCT genreID)
FROM
  datamart.genre_Beam

Unnamed: 0,f0_,f1_
0,234,234
