In [16]:
import logging
import argparse

import apache_beam as beam
import apache_beam.io.gcp.bigquery as bq

In [17]:
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib

In [18]:
# Number of integers (0 .. N-1) fed into the demo pipeline.
N=9
# The pipeline is built on the InteractiveRunner; the following cells inspect it
# with ib.show_graph() and ib.show().
p = beam.Pipeline(InteractiveRunner())
# Create the integers from range(N), then square each one.
squares = (p | "Create Elements" >> beam.Create(range(N))
             | "Squares" >> beam.Map(lambda x: x**2))

In [19]:
# Render the transform graph of pipeline `p`.
ib.show_graph(p)

In [20]:
# Display the elements of the `squares` PCollection.
ib.show(squares)

In [95]:
# --- Source table coordinates -------------------------------------------------
PROJECT_ID = "my-bq-demo"
DATASET_ID = "input"
TABLE_ID = "input_for_transpose"

# --- Schemas (BigQuery-style field definitions) -------------------------------
# Input table columns: ID, CLASS and SALES.
INPUT_SCHEMA = {
    'fields': [
        {'name': 'ID', 'type': 'INTEGER', 'mode': 'REQUIRED'},
        {'name': 'CLASS', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'SALES', 'type': 'FLOAT', 'mode': 'NULLABLE'},
    ]
}

# Field definition of the key column on its own (same content as the first
# entry of INPUT_SCHEMA['fields']).
ID_SCHEMA = {'name': 'ID', 'type': 'INTEGER', 'mode': 'REQUIRED'}

# Empty placeholder; not populated by any of the cells shown here.
OUTPUT_SCHEMA = {}

# --- Pivot configuration ------------------------------------------------------
key_field = ['ID']                   # column(s) identifying an output row
pivot_field = ['CLASS']              # column(s) whose values become column-name prefixes
value_field = ['SALES']              # column(s) holding the values to spread out
value_schema = {'SALES': 'FLOAT'}    # value column name -> BigQuery type

# Destination table reference.
table_id = 'my-bq-demo:output.out111'

In [47]:
class GetPivotValues(beam.DoFn):
    """Emit a ``(pivot field name, pivot value)`` pair for every pivot field of a row.

    The pivot field names are read from the notebook-level ``pivot_field`` list.
    """

    def process(self, element):
        """Yield one key/value pair per configured pivot field.

        Args:
            element: A row mapping (e.g. a dict) that has every name in
                ``pivot_field`` as a key.

        Yields:
            ``(field, element[field])`` tuples, one for each entry of
            ``pivot_field``. Nothing is yielded when ``pivot_field`` is empty.

        Raises:
            KeyError: If the row lacks one of the pivot fields.
        """
        # Debug trace of each incoming row (this is what prints the row dump
        # in the pipeline cell's output).
        print(element)
        # Yield inside the loop so that every pivot field is emitted, not just
        # the last one, and so that an empty configuration emits nothing
        # instead of a non-pair value that a downstream GroupByKey would reject.
        for field in pivot_field:
            yield (field, element[field])

In [68]:
class FoldPivotValues(beam.DoFn):
    """Turn the grouped values of a pivot field into column definitions.

    For every distinct pivot value and every entry of the notebook-level
    ``value_schema`` mapping, one ``{'name', 'type', 'mode'}`` field dict is
    produced, named ``"<pivot value>_<value column>"``.
    """

    def process(self, element):
        """Build the field definitions for one grouped pivot field.

        Args:
            element: An indexable pair whose second item (``element[1]``) is an
                iterable of hashable pivot values, e.g. the
                ``(key, values)`` output of a GroupByKey. ``element[0]`` is
                not read.

        Returns:
            A list of field dicts (keys ``name``, ``type``, ``mode``); each
            list item becomes one output element of the ParDo.
        """
        # De-duplicate, then sort so the column order is the same on every
        # run and every worker (raw set iteration order is not stable for
        # strings across interpreter processes). key=str keeps the sort
        # usable even if a None pivot value is mixed in with strings.
        unique_field_values = sorted(set(element[1]), key=str)

        rt_l = [
            {
                'name': f"{piv}_{val}",
                'type': val_type,
                'mode': 'NULLABLE',
            }
            for piv in unique_field_values
            for val, val_type in value_schema.items()
        ]
        # Debug trace of the generated column definitions.
        print(rt_l)
        return rt_l

In [None]:
class NonKeySchema(beam.CombineFn):
  """Combine individual field definitions into a single table schema.

  Each input is expected to be one field dict with at least a ``'name'`` key
  (for example ``{'name': 'AAA_SALES', 'type': 'FLOAT', 'mode': 'NULLABLE'}``).
  The combined output has the same ``{'fields': [...]}`` shape as
  ``INPUT_SCHEMA``.

  NOTE(review): the original cell was an unfinished, non-runnable draft; the
  output shape and the de-duplication by name are the reviewer's reading of
  its intent (merging schemas) and should be confirmed.
  """

  def create_accumulator(self):
    # Field dicts are not hashable, so they are collected in a list.
    return []

  def add_input(self, accumulator, input):
    """Append one field definition to the accumulator."""
    accumulator.append(input)
    return accumulator

  def merge_accumulators(self, accumulators):
    """Concatenate the partial lists of field definitions."""
    merged = []
    for accum in accumulators:
      merged.extend(accum)
    return merged

  def extract_output(self, accumulator):
    """Return ``{'fields': [...]}`` with one entry per distinct field name.

    When the same name occurs more than once, the first definition seen is
    kept. Field order follows the order in which definitions were
    accumulated.
    """
    fields_by_name = {}
    for field in accumulator:
      fields_by_name.setdefault(field['name'], field)
    return {'fields': list(fields_by_name.values())}


In [74]:
def getDynamicMap(element, key_schema):
    """Debug stub: print ``element`` and return ``None``.

    Args:
        element: Any printable value.
        key_schema: Accepted but currently not used.
    """
    print(element)
    return None

In [81]:
import apache_beam as beam

class PercentagesFn(beam.CombineFn):
  """Compute, for each distinct input, its share of the total input count.

  The accumulator is a plain dict mapping an input value to the number of
  times it has been seen, e.g. {'🥕': 3, '🍅': 6, '🍆': 1}.
  """

  def create_accumulator(self):
    """Start with no tallies."""
    return dict()

  def add_input(self, accumulator, input):
    """Count one more occurrence of `input` (first sighting starts at 0)."""
    accumulator[input] = accumulator.get(input, 0) + 1
    return accumulator

  def merge_accumulators(self, accumulators):
    """Sum the per-value tallies of all partial accumulators into a new dict.

    For example [{'🥕': 1, '🍅': 2}, {'🥕': 1, '🍅': 1, '🍆': 1}]
    becomes {'🥕': 2, '🍅': 3, '🍆': 1}.
    """
    combined = {}
    for partial in accumulators:
      for key, tally in partial.items():
        combined[key] = combined.get(key, 0) + tally
    return combined

  def extract_output(self, accumulator):
    """Convert tallies to fractions of the grand total.

    For example {'🥕': 3, '🍅': 6, '🍆': 1} (total 10)
    becomes {'🥕': 0.3, '🍅': 0.6, '🍆': 0.1}.
    """
    grand_total = sum(accumulator.values())
    return {key: tally / grand_total for key, tally in accumulator.items()}

# Run PercentagesFn over a small in-memory list of produce emoji and print
# the single combined result.
with beam.Pipeline() as pipeline:
  percentages = (
      pipeline
      # Ten input elements: three carrots, six tomatoes, one eggplant.
      | 'Create produce' >> beam.Create(
          ['🥕', '🍅', '🍅', '🥕', '🍆', '🍅', '🍅', '🍅', '🥕', '🍅'])
      # Combine the whole PCollection with PercentagesFn.
      | 'Get percentages' >> beam.CombineGlobally(PercentagesFn())
      # Print each output element.
      | beam.Map(print))

{'🥕': 0.3, '🍅': 0.6, '🍆': 0.1}


In [96]:
# Build the dynamic (pivoted) output schema from sample rows.
# NOTE: this rebinds the notebook-level name `p`, which the earlier squares
# demo also uses for its pipeline.
with beam.Pipeline(InteractiveRunner()) as p:
        # Despite the step label, nothing is read from BigQuery here: the
        # step creates three in-memory sample rows with ID, CLASS and SALES keys.
        input_table_rows = ( p 
                    | "Read BigQuery table" >>  beam.Create(
                         [{"ID":123, "CLASS": "AAA", "SALES":101.44},
                        {"ID":123, "CLASS": "BBB", "SALES":345.44},
                        {"ID":1234, "CLASS": "AAA", "SALES":458.44}]
                        )
                    )
        
        # Rows -> (pivot field, pivot value) pairs -> grouped per pivot field
        # -> one column definition per distinct pivot value and value column.
        pivoted_schema = ( input_table_rows
                        | "Get Pivot Schema" >> beam.ParDo(GetPivotValues()) 
                        | "Group by pivot field" >> beam.GroupByKey()
                        | "Fold pivot values to columns" >> beam.ParDo(FoldPivotValues())
                       )
        # Single-element PCollection holding the key column definition.
        key_schema = (p 
                      | "Create Key Schema" >> beam.Create(
                          [ID_SCHEMA]
                      )
                     )
        dynamic_schema = ( 
                # Merge the pivot-derived column definitions and the key column
                # definition into one PCollection of field dicts.
                (pivoted_schema, key_schema)
                | "Convert dynamic schema map" >> beam.Flatten()
        )

{'ID': 123, 'CLASS': 'AAA', 'SALES': 101.44}
{'ID': 123, 'CLASS': 'BBB', 'SALES': 345.44}
{'ID': 1234, 'CLASS': 'AAA', 'SALES': 458.44}
[{'name': 'AAA_SALES', 'type': 'FLOAT', 'mode': 'NULLABLE'}, {'name': 'BBB_SALES', 'type': 'FLOAT', 'mode': 'NULLABLE'}]


In [91]:
# Display the sample input rows.
ib.show(input_table_rows)

In [92]:
# Display the column definitions derived from the pivot values.
ib.show(pivoted_schema)

In [94]:
# Display the flattened schema (pivot-derived columns plus the key column).
ib.show(dynamic_schema)

In [59]:
# Scratch value: a hand-written (pivot field, list of values) pair, with a
# duplicate 'AAA' in the values.
t = ("CLASS" , ['AAA', 'BBB', 'AAA'])

In [61]:
# Index 1 of the pair is the list of values.
t[1]

['AAA', 'BBB', 'AAA']