# Start Spark Session & Import data from Google Cloud Bucket

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Describe how this application connects to the Spark master and which
# resources it requests.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("GCSMarvel")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)
# The Spark session is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the GCS connector classes so Hadoop can resolve the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")

# Google Cloud Storage path of the input CSV.
# Replace the bucket name with your own and upload marvel_clean.csv to it first.
gsc_file_path = 'gs://marvel_data_onno/marvel_clean.csv'
# Load the CSV into a data frame, using the first row as column names.
df = spark.read.option("header", "true").format("csv").load(gsc_file_path)
df.printSchema()

# Preview a single column first, then the full data frame.
newDf = df.select('Title')
newDf.show()

df.show()

root
 |-- Title: string (nullable = true)
 |-- Distributor: string (nullable = true)
 |-- ReleaseDateUS: string (nullable = true)
 |-- Budget: string (nullable = true)
 |-- OpeningWeekendNorthAmerica: string (nullable = true)
 |-- NorthAmerica: string (nullable = true)
 |-- OtherTerritories: string (nullable = true)
 |-- Worldwide: string (nullable = true)

+--------------------+
|               Title|
+--------------------+
|     Howard the Duck|
|               Blade|
|               X-Men|
|            Blade II|
|          Spider-Man|
|           Daredevil|
|                  X2|
|                Hulk|
|        The Punisher|
|        Spider-Man 2|
|      Blade: Trinity|
|             Elektra|
|      Fantastic Four|
|X-Men: The Last S...|
|         Ghost Rider|
|        Spider-Man 3|
|Fantastic Four: R...|
|            Iron Man|
| The Incredible Hulk|
|  Punisher: War Zone|
+--------------------+
only showing top 20 rows

+--------------------+------------------+-------------------+-

# Data Preprocessing 
Pipeline 2: This pipeline is designed to find out which movies are most worth developing a sequel for. We obtain this insight by looking at which movies have been the most profitable. Profitability is measured in two ways: the ratio of revenue to budget (as in pipeline 1) and the revenue-budget delta (revenue minus budget). First, the ratio and delta are calculated, after which all movies are sorted in descending order of these values. Finally, the top 5 movies are deemed the most worth developing a sequel for, from a financial standpoint.

In [None]:
# Import functions
from pyspark.sql import functions as F

In [None]:
# Keep only the columns needed for the budget/revenue analysis.
budgetDF = df.select(["Title", "Distributor", "Budget", "Worldwide"])

# Preview the rows, then confirm the column types.
budgetDF.show()

budgetDF.printSchema()

+--------------------+------------------+---------+---------+
|               Title|       Distributor|   Budget|Worldwide|
+--------------------+------------------+---------+---------+
|     Howard the Duck|Universal Pictures| 37000000| 37962774|
|               Blade|   New Line Cinema| 45000000|131183530|
|               X-Men|  20th Century Fox| 75000000|296339527|
|            Blade II|   New Line Cinema| 54000000|155010032|
|          Spider-Man|     Sony Pictures|139000000|821708551|
|           Daredevil|  20th Century Fox| 78000000|179179718|
|                  X2|  20th Century Fox|110000000|407711549|
|                Hulk|Universal Pictures|137000000|245360480|
|        The Punisher|   Lionsgate Films| 33000000| 54700105|
|        Spider-Man 2|     Sony Pictures|200000000|788976453|
|      Blade: Trinity|   New Line Cinema| 65000000|128905366|
|             Elektra|  20th Century Fox| 43000000| 56681566|
|      Fantastic Four|  20th Century Fox|100000000|330579719|
|X-Men: 

## Rank based on revenue-budget ratios
Create a dataframe for the revenue-budget ratios.

In [None]:
# Add a "Ratio" column: worldwide revenue divided by budget, per movie.
ratioDF = budgetDF.withColumn(
    "Ratio",
    budgetDF.Worldwide / budgetDF.Budget,
)

ratioDF.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Distributor: string (nullable = true)
 |-- Budget: string (nullable = true)
 |-- Worldwide: string (nullable = true)
 |-- Ratio: double (nullable = true)



### Rank top 5 movies based on ratios
We solve for: Give the name and distributor of the 5 most profitable marvel movies worldwide based on revenue/budget ratio.

In [None]:
# Title, distributor and ratio of every movie, highest ratio first.
bestMoviesRatioDF = (
    ratioDF
    .select("Title", "Distributor", "Ratio")
    .orderBy("Ratio", ascending=False)
)

# Display the 5 movies with the highest revenue/budget ratio.
bestMoviesRatioDF.limit(5).show()

+--------------------+--------------------+------------------+
|               Title|         Distributor|             Ratio|
+--------------------+--------------------+------------------+
|            Deadpool|    20th Century Fox|13.501947913793103|
|Spider-Man: No Wa...|       Sony Pictures|       9.262094295|
|               Venom|       Sony Pictures|        8.55013954|
|   Avengers: Endgame|Walt Disney Studi...| 7.858990348314607|
|      Captain Marvel|Walt Disney Studi...| 7.422860486842105|
+--------------------+--------------------+------------------+



**Conclusions**:
- `Deadpool` seems to be the most profitable movie with a revenue/budget ratio of 13.5. 
- Both Sony Pictures and Walt Disney Studios have highly profitable movies when looking at the revenue/budget ratio.

### Rank best movie per distributor based on ratios
We solve for: Give the name of the most profitable marvel movie worldwide based on revenue/budget ratio for each distributor.

In [None]:
# Highest revenue/budget ratio achieved by each distributor.
bestDistributorRatioDF = (
    ratioDF
    .groupBy("Distributor")
    .agg(F.max("Ratio").alias("MaxRatio"))
    .withColumnRenamed("Distributor", "BestDist")
)

# Join back onto the per-movie ratios to recover which title achieved each
# distributor's maximum, then list the results from highest to lowest ratio.
bestMovieDistributorRatioDF = (
    bestDistributorRatioDF
    .join(
        ratioDF,
        on=(ratioDF["Distributor"] == bestDistributorRatioDF["BestDist"])
        & (ratioDF["Ratio"] == bestDistributorRatioDF["MaxRatio"]),
    )
    .select("Title", "Distributor", "Ratio")
    .orderBy("Ratio", ascending=False)
)

bestMovieDistributorRatioDF.show()

+--------------------+--------------------+------------------+
|               Title|         Distributor|             Ratio|
+--------------------+--------------------+------------------+
|            Deadpool|    20th Century Fox|13.501947913793103|
|Spider-Man: No Wa...|       Sony Pictures|       9.262094295|
|   Avengers: Endgame|Walt Disney Studi...| 7.858990348314607|
|            Iron Man|  Paramount Pictures| 4.179815871428572|
|               Blade|     New Line Cinema|2.9151895555555556|
|                Hulk|  Universal Pictures| 1.790952408759124|
|        The Punisher|     Lionsgate Films|1.6575789393939393|
|     The New Mutants|20th Century Studios|0.7264935223880598|
+--------------------+--------------------+------------------+



**Conclusions**:
- Based on the previous results, we could already see that 20th Century Fox, Sony Pictures and Walt Disney Studios are the best-performing distributors.
- Universal Pictures and Lionsgate Films are the least profitable, while 20th Century Studios only makes losses (even its best movie has a ratio below 1).

## Rank based on revenue-budget delta
Create a dataframe for the revenue-budget deltas.

In [None]:
# Add a "Delta" column: worldwide revenue minus budget, per movie.
deltaDF = budgetDF.withColumn(
    "Delta",
    budgetDF.Worldwide - budgetDF.Budget,
)

deltaDF.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Distributor: string (nullable = true)
 |-- Budget: string (nullable = true)
 |-- Worldwide: string (nullable = true)
 |-- Delta: double (nullable = true)



### Rank top 5 movies based on deltas
We solve for: Give the name and distributor of the 5 most profitable marvel movies worldwide based on revenue/budget delta.

In [None]:
# Title, distributor and delta of every movie, highest delta first.
bestMoviesDeltaDF = (
    deltaDF
    .select("Title", "Distributor", "Delta")
    .orderBy("Delta", ascending=False)
)

# Display the 5 movies with the highest revenue-budget delta.
bestMoviesDeltaDF.limit(5).show()

+--------------------+--------------------+-------------+
|               Title|         Distributor|        Delta|
+--------------------+--------------------+-------------+
|   Avengers: Endgame|Walt Disney Studi...|2.441800564E9|
|Avengers: Infinit...|Walt Disney Studi...|1.732359754E9|
|Spider-Man: No Wa...|       Sony Pictures|1.652418859E9|
|        The Avengers|Walt Disney Studi...|1.298812988E9|
|Avengers: Age of ...|Walt Disney Studi...|1.155403694E9|
+--------------------+--------------------+-------------+



**Conclusions**:
- Avengers movies from Walt Disney Studios are very popular and thus make billions in profit.

### Rank best movie per distributor based on deltas
We solve for: Give the name of the most profitable marvel movie worldwide based on revenue/budget delta for each distributor.

In [None]:
# Highest revenue-budget delta achieved by each distributor.
bestDistributorDeltaDF = (
    deltaDF
    .groupBy("Distributor")
    .agg(F.max("Delta").alias("MaxDelta"))
    .withColumnRenamed("Distributor", "BestDist")
)

# Join back onto the per-movie deltas to recover which title achieved each
# distributor's maximum, then list the results from highest to lowest delta.
bestMovieDistributorDeltaDF = (
    bestDistributorDeltaDF
    .join(
        deltaDF,
        on=(deltaDF["Distributor"] == bestDistributorDeltaDF["BestDist"])
        & (deltaDF["Delta"] == bestDistributorDeltaDF["MaxDelta"]),
    )
    .select("Title", "Distributor", "Delta")
    .orderBy("Delta", ascending=False)
)

bestMovieDistributorDeltaDF.show()

+--------------------+--------------------+-------------+
|               Title|         Distributor|        Delta|
+--------------------+--------------------+-------------+
|   Avengers: Endgame|Walt Disney Studi...|2.441800564E9|
|Spider-Man: No Wa...|       Sony Pictures|1.652418859E9|
|            Deadpool|    20th Century Fox| 7.25112979E8|
|            Iron Man|  Paramount Pictures| 4.45174222E8|
| The Incredible Hulk|  Universal Pictures| 1.13427551E8|
|            Blade II|     New Line Cinema| 1.01010032E8|
|        The Punisher|     Lionsgate Films|  2.1700105E7|
|            Inhumans|  IMAX Entertainment|    2852282.0|
|     The New Mutants|20th Century Studios| -1.8324934E7|
+--------------------+--------------------+-------------+



**Conclusions**:
- Based on previous and current results, we could conclude that Walt Disney Studios and Sony Pictures are the most successful distributors, with the highest revenue/budget ratios and deltas.
- 20th Century Studios loses more than 18 million even on its best-performing movie.

# Save Processed data in BigQuery

In [None]:
# Build the dashboard table holding both profitability metrics for each movie:
# Title, Distributor, Ratio (revenue / budget) and Delta (revenue - budget).
#
# Delta is computed directly on ratioDF rather than by joining the ratio and
# delta frames on 'Title'. Both metrics come from the same source rows, and
# titles are not guaranteed to be unique (e.g. a reboot sharing its name with
# the original), so a join on 'Title' would multiply such rows and pair ratios
# with the wrong deltas. Computing both columns in one pass also avoids an
# unnecessary shuffle join.
mergedDF = (
    ratioDF
    # An inner join on 'Title' never matches null keys, so the previous
    # join-based version implicitly dropped rows without a title; keep that.
    .where(ratioDF["Title"].isNotNull())
    .withColumn("Delta", ratioDF["Worldwide"] - ratioDF["Budget"])
    .select("Title", "Distributor", "Ratio", "Delta")
)
mergedDF.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Distributor: string (nullable = true)
 |-- Ratio: double (nullable = true)
 |-- Delta: double (nullable = true)



In [12]:
# Cloud Storage bucket the BigQuery connector uses for its temporary export data.
bucket = "marvel_temp_onno"  # replace with your own bucket
spark.conf.set('temporaryGcsBucket', bucket)
# Register the GCS connector classes so Hadoop can resolve the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
# Append the merged ratio/delta rows to the BigQuery table.
(
    mergedDF.write
    .mode("append")
    .format('bigquery')
    .option('table', 'gothic-talent-435511-s2.marvel.sequel')
    .save()
)

In [13]:
# Stop the Spark session now that all processing and the BigQuery write are done.
spark.stop()