In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, concat, col, lit, from_json, window
from pyspark.sql.functions import sum as F_sum
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, TimestampType, MapType, IntegerType
from time import sleep

# Describe the application in one fluent chain: the standalone master to
# connect to, the application name, and the driver/executor resource settings.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("BigqueryExample")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)
# The Spark session is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# This configuration is required whenever GCS is used: register the Hadoop
# filesystem implementations that handle the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
gcs_filesystem_impls = {
    "fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    "fs.AbstractFileSystem.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
}
for property_name, impl_class in gcs_filesystem_impls.items():
    conf.set(property_name, impl_class)

# Cloud Storage bucket the connector uses for temporary BigQuery export data.
bucket = "temp_de_jads"
spark.conf.set("temporaryGcsBucket", bucket)

# Read the source trades table from BigQuery.
df = spark.read.format("bigquery").load("glass-sylph-325109.ass2.trades")

# Group the trades by one-day window over time_frame.end, taker side and
# broker, then sum size and count within each group.
day_window = window(col('time_frame.end'), '1 days')
grouped_trades = df.groupBy(day_window, col('taker_side'), col('broker'))
summed_trades = grouped_trades.agg({'size': 'sum', 'count': 'sum'})

# Rename the aggregate columns and add the mean, defined here as the summed
# size divided by the summed count.
trades_by_day = summed_trades.select(
    col('sum(size)').alias('size'),
    col('sum(count)').alias('count'),
    col('broker'),
    col('taker_side'),
    col('window').alias('day_frame'),
).withColumn('mean', col('size') / col('count'))

# Write the daily aggregates to BigQuery, overwriting the destination table.
daily_writer = trades_by_day.write.format('bigquery')
daily_writer = daily_writer.option('table', 'glass-sylph-325109.ass2.trades_by_day')
daily_writer.mode("overwrite").save()

# Print the schema of the source table (not of the aggregated result).
df.printSchema()

In [None]:
# Stop the Spark session now that the job is finished.
spark.stop()