In [1]:
import pyspark
import re
import pandas as pd

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, from_json, col
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, IntegerType

In [3]:
spark = SparkSession \
    .builder \
    .appName("Spark_data_clean") \
    .config("spark.master", "spark://rayiMac.modem:7077") \
    .config("spark.jars.packages", "com.crealytics:spark-excel_2.12:3.2.2_0.18.0") \
    .config("spark.executor.cores", "1") \
    .config("spark.executor.num", "1") \
    .getOrCreate()

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/ray/.ivy2/cache
The jars for the packages stored in: /Users/ray/.ivy2/jars
com.crealytics#spark-excel_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3eba18e3-5123-4a2d-b1a3-6c32f27e6d73;1.0
	confs: [default]
	found com.crealytics#spark-excel_2.12;3.2.2_0.18.0 in central
	found org.apache.poi#poi;5.2.2 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.commons#commons-collections4;4.4 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found commons-io#commons-io;2.11.0 in central
	found com.zaxxer#SparseBitSet;1.2 in central
	found org.apache.logging.log4j#log4j-api;2.17.2 in central
	found org.apache.poi#poi-ooxml;5.2.2 in central
	found org.apache.poi#poi-ooxml-lite;5.2.2 in central
	found org.apache.xmlbeans#xmlbeans;5.0.3 in central
	found org.apache.commons#commons-compress;1.21 in central
	found com.github.virtuald#curvesapi;1.07 in central
	found com.norbi

In [4]:
# load csv file into dataframe
df_metadata = spark.read.csv("movies_metadata.csv", header=True)

                                                                                

In [5]:
# cache dataframe for further processing
df_metadata.cache()

DataFrame[adult: string, belongs_to_collection: string, budget: string, genres: string, homepage: string, id: string, imdb_id: string, original_language: string, original_title: string, overview: string, popularity: string, poster_path: string, production_companies: string, production_countries: string, release_date: string, revenue: string, runtime: string, spoken_languages: string, status: string, tagline: string, title: string, video: string, vote_average: string, vote_count: string]

## Functions 

In [6]:
def transformation_origin(input_df, input_key, col_name, col_alias, input_schema):
    col_name = str(col_name)
    col_alias = str(col_alias)
    primary_key = str(input_key)
    # select col_name in file, import column as json format and apply complex schema to column 
    input_df = input_df.withColumn(col_name, from_json(col(col_name), input_schema))
    # explode the column with primary key, rename the exploded result
    df_exploded = input_df.select(primary_key, explode(col(col_name)).alias(col_alias))
    return df_exploded

In [7]:
def transformation(input_df, input_key, col_name, input_schema):
    col_name = str(col_name)
    primary_key = str(input_key)
    # select col_name in file, import column as json format and apply complex schema to column 
    input_df = input_df.withColumn(col_name, from_json(col(col_name), input_schema))
    # explode the column with primary key, rename the exploded result
    df_exploded = input_df.select(primary_key, explode(col(col_name)))
    return df_exploded

## For Collection Column 

In [8]:
# belongs to collection column
collection_schema = ArrayType(
    StructType([
        StructField("id",StringType(), True),
        StructField("name",StringType(), True),
        StructField("poster_path",StringType(), True),
        StructField("backdrop_path",StringType(), True),
    ])
)

In [9]:
print(type(collection_schema))

<class 'pyspark.sql.types.ArrayType'>


In [10]:
# id is the primary key used for this dataframe
df_collection_result = transformation (df_metadata, "id", "belongs_to_collection", collection_schema)

In [11]:
df_collection_result.show()

[Stage 1:>                                                          (0 + 1) / 1]

+-----+--------------------+
|   id|                 col|
+-----+--------------------+
|  862|{10194, Toy Story...|
|15602|{119050, Grumpy O...|
|11862|{96871, Father of...|
|  710|{645, James Bond ...|
|21032|{117693, Balto Co...|
| 9273|{3167, Ace Ventur...|
| 8012|{91698, Chili Pal...|
| 9598|{9435, Babe Colle...|
| 9312|{9818, Mortal Kom...|
|10530|{136214, Pocahont...|
|11525|{124935, The Lawn...|
|10634|{43563, Friday Co...|
|  755|{10924, From Dusk...|
|10874|{256377, The Mupp...|
|27793|{91430, The Never...|
| 9737|{14890, Bad Boys ...|
|  414|{120794, Batman C...|
| 5894|{439053, Brooklyn...|
| 8068|{9649, Mexico Tri...|
| 1572|{1570, Die Hard C...|
+-----+--------------------+
only showing top 20 rows



                                                                                

In [12]:
df_collection_columns= df_collection_result.select("id",col("col.id").alias("collection_id"), col("col.name").alias("collection_name"), col("col.poster_path").alias("collection_poster_path"), col("col.backdrop_path").alias("collection_backdrop_path"))

In [13]:
df_collection_columns.cache()

DataFrame[id: string, collection_id: string, collection_name: string, collection_poster_path: string, collection_backdrop_path: string]

In [14]:
df_collection_columns.show(truncate=False)

[Stage 2:>                                                          (0 + 1) / 1]

+-----+-------------+--------------------------------+--------------------------------+--------------------------------+
|id   |collection_id|collection_name                 |collection_poster_path          |collection_backdrop_path        |
+-----+-------------+--------------------------------+--------------------------------+--------------------------------+
|862  |10194        |Toy Story Collection            |/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg|/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg|
|15602|119050       |Grumpy Old Men Collection       |/nLvUdqgPgm3F85NMCii9gVFUcet.jpg|/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg|
|11862|96871        |Father of the Bride Collection  |/nts4iOmNnq7GNicycMJ9pSAn204.jpg|/7qwE57OVZmMJChBpLEbJEmzUydk.jpg|
|710  |645          |James Bond Collection           |/HORpg5CSkmeQlAolx3bKMrKgfi.jpg |/6VcVl48kNKvdXOZfJPdarlUGOsk.jpg|
|21032|117693       |Balto Collection                |/w0ZgH6Lgxt2bQYnf1ss74UvYftm.jpg|/9VM5LiJV0bGb1st1KyHA3cVnO2G.jpg|
|9273 |3167         |Ace Ventura

                                                                                

## For Genres Column

In [15]:
df_metadata.select("genres").show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------+
|genres                                                                                                                       |
+-----------------------------------------------------------------------------------------------------------------------------+
|[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]                             |
|[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]                            |
|[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]                                                             |
|[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]                                |
|[{'id': 35, 'name': 'Comedy'}]                                                                         

In [16]:
# genres column
genres_schema = ArrayType(
    StructType([
        StructField("id",StringType(), True),
        StructField("name",StringType(), True),
    ])
)

In [17]:
# use transformation function
df_genres_result = transformation(df_metadata, "id", "genres", genres_schema)

In [18]:
df_genres_result.show

<bound method DataFrame.show of DataFrame[id: string, col: struct<id:string,name:string>]>

In [19]:
df_genres_columns= df_genres_result.select("id",col("col.id").alias("genres_id"), col("col.name").alias("genres_name"))

In [20]:
df_genres_columns.show()

+-----+---------+-----------+
|   id|genres_id|genres_name|
+-----+---------+-----------+
|  862|       16|  Animation|
|  862|       35|     Comedy|
|  862|    10751|     Family|
| 8844|       12|  Adventure|
| 8844|       14|    Fantasy|
| 8844|    10751|     Family|
|15602|    10749|    Romance|
|15602|       35|     Comedy|
|31357|       35|     Comedy|
|31357|       18|      Drama|
|31357|    10749|    Romance|
|11862|       35|     Comedy|
|  949|       28|     Action|
|  949|       80|      Crime|
|  949|       18|      Drama|
|  949|       53|   Thriller|
|11860|       35|     Comedy|
|11860|    10749|    Romance|
|45325|       28|     Action|
|45325|       12|  Adventure|
+-----+---------+-----------+
only showing top 20 rows



## production companies

In [21]:
df_metadata.select("production_companies").show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|production_companies                                                                                                                                                                                                                                                                                                                 |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[{'name': 'Pixa

In [22]:
# production_companies column
companies_schema = ArrayType(
    StructType([
        StructField("name",StringType(), True),
        StructField("id",StringType(), True),
    ])
)

In [23]:
# use transformation function
df_result = transformation(df_metadata, "id", "production_companies", companies_schema)

In [24]:
df_companies_columns= df_result.select("id",col("col.id").alias("companies_id"), col("col.name").alias("companies_name"))

In [25]:
df_companies_columns.show()

+-----+------------+--------------------+
|   id|companies_id|      companies_name|
+-----+------------+--------------------+
|  862|           3|Pixar Animation S...|
| 8844|         559|    TriStar Pictures|
| 8844|        2550|        Teitler Film|
| 8844|       10201|Interscope Commun...|
|15602|        6194|        Warner Bros.|
|15602|       19464|      Lancaster Gate|
|11862|        5842|Sandollar Product...|
|11862|        9195| Touchstone Pictures|
|  949|         508| Regency Enterprises|
|  949|         675|        Forward Pass|
|  949|        6194|        Warner Bros.|
|11860|           4|  Paramount Pictures|
|11860|         258|Scott Rudin Produ...|
|11860|         932|  Mirage Enterprises|
|11860|        5842|Sandollar Product...|
|11860|       14941|Constellation Ent...|
|11860|       55873|           Worldwide|
|11860|       58079|Mont Blanc Entert...|
|45325|           2|Walt Disney Pictures|
| 9091|          33|  Universal Pictures|
+-----+------------+--------------

                                                                                

## production countries

In [26]:
df_metadata.select("production_countries").show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|production_countries                                                                                                                                                              |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[{'iso_3166_1': 'US', 'name': 'United States of America'}]                                                                                                                        |
|[{'iso_3166_1': 'US', 'name': 'United States of America'}]                                                                                                                        |
|[{'iso_3166_1': 'US', 'name': 'United States of America'}]                                    

In [27]:
# production_companies column
countries_schema = ArrayType(
    StructType([
        StructField("iso_3166_1",StringType(), True),
        StructField("name",StringType(), True),
    ])
)

In [34]:
# use transformation function
df_result = transformation(df_metadata, "id", "production_countries", countries_schema)
df_result.orderBy("id").show()

                                                                                

+--------------------+--------------------+
|                  id|                 col|
+--------------------+--------------------+
|              1978."|     {null, English}|
| Li Tiemei is det...|      {null, 普通话}|
| and insists that...|{US, United State...|
| triompheront-ils...|    {null, Français}|
|                 100|{GB, United Kingdom}|
|               10000|      {CO, Colombia}|
|               10000|        {FR, France}|
|               10000|         {IT, Italy}|
|               10001|     {AU, Australia}|
|              100010|{US, United State...|
|              100017|       {DE, Germany}|
|               10002|{GB, United Kingdom}|
|              100024|{US, United State...|
|              100024|        {CA, Canada}|
|               10003|{US, United State...|
|              100033|         {JP, Japan}|
|               10004|{US, United State...|
|              100042|{US, United State...|
|               10005|{US, United State...|
|              100057|{US, United S

23/09/12 00:59:57 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
23/09/12 00:59:57 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:978)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce

In [30]:
df_cuntries_columns= df_result.select("id",col("col.iso_3166_1"), col("col.name").alias("countires_name"))

In [31]:
df_cuntries_columns.orderBy("id").show(truncate=False)



+----------------------------------------------------------------------------------------------------------+----------+------------------------+
|id                                                                                                        |iso_3166_1|countires_name          |
+----------------------------------------------------------------------------------------------------------+----------+------------------------+
| 1978."                                                                                                   |null      |English                 |
| Li Tiemei is determined to follow the example of her father and carry the revolution through to the end."|null      |普通话                  |
| and insists that they still be married                                                                   |US        |United States of America|
| triompheront-ils de la crise avec panache ?"                                                             |null      |Français      

                                                                                

## Spoken Languages

In [None]:
df_metadata.select("spoken_languages").show(truncate=False)