# Start Spark Session & Import data from Google Cloud Bucket

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Spark configuration: standalone master URL, application name and
# driver/executor sizing, built as one fluent chain.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("GCSMarvel")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; an existing
# session is reused if one is already running.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the Google Cloud Storage connector classes for the gs:// scheme
# on the underlying Hadoop configuration.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set(
    "fs.gs.impl",
    "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
)
conf.set(
    "fs.AbstractFileSystem.gs.impl",
    "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
)

# Location of the input CSV in Google Cloud Storage.
# Replace the bucket name with your own and upload marvel.csv to it first.
gsc_file_path = "gs://marvel_data/marvel.csv"

# Load the CSV into a DataFrame, treating the first row as the header.
df = spark.read.csv(gsc_file_path, header=True)
df.printSchema()

# Project a single column and preview it, then preview the full DataFrame.
newDf = df.select("Title")
newDf.show()

df.show()

root
 |-- Title: string (nullable = true)
 |-- Distributor(s): string (nullable = true)
 |-- Release date(United States): string (nullable = true)
 |-- Bud�get (mil�lions): string (nullable = true)
 |-- Opening weekend(North America): string (nullable = true)
 |-- North America: string (nullable = true)
 |-- Other territories: string (nullable = true)
 |-- Worldwide: string (nullable = true)

+--------------------+
|               Title|
+--------------------+
|     Howard the Duck|
|               Blade|
|               X-Men|
|            Blade II|
|          Spider-Man|
|           Daredevil|
|                  X2|
|                Hulk|
|        The Punisher|
|        Spider-Man 2|
|      Blade: Trinity|
|             Elektra|
|      Fantastic Four|
|X-Men: The Last S...|
|         Ghost Rider|
|        Spider-Man 3|
|Fantastic Four: R...|
|            Iron Man|
| The Incredible Hulk|
|  Punisher: War Zone|
+--------------------+
only showing top 20 rows

+--------------------+----

# Data Preprocessing 

# Save Processed data in BigQuery

In [None]:
# Use the Cloud Storage bucket for temporary BigQuery export data used by the connector.
bucket = "marvel_temp"  # use your bucket
spark.conf.set('temporaryGcsBucket', bucket)

# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Saving the data to BigQuery (rows are appended to the target table).
# Change the project name before `.marvel` to yours! You can find this when
# you click on DE2024.
# NOTE: the call chain is wrapped in parentheses instead of using backslash
# continuations, because a comment placed after a trailing backslash is a
# SyntaxError and prevented this cell from running.
(
    df.write.format('bigquery')
    .option('table', 'endless-mile-435507-h9.marvel.processed_data')
    .mode("append")
    .save()
)