In [5]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [16]:
# Location of the input JSON file. The PHONE_JSON_PATH environment variable
# overrides it, so the notebook can run on another machine without editing this
# cell; when the variable is unset, the original local path is used unchanged.
json_file_path = os.environ.get(
    "PHONE_JSON_PATH",
    '/Users/nombauser/Desktop/GIT/MyGitRepos/Learn-PySpark/files/phone.json',
)

# Build a session for the application named "Hey"; getOrCreate() hands back an
# already-running session when there is one instead of starting a second.
spark = SparkSession.builder.appName("Hey").getOrCreate()

In [17]:
# Configure a JSON reader with the "multiline" option switched on, then load
# the file at json_file_path into a DataFrame.
json_reader = spark.read.format('json').option("multiline", "true")
df = json_reader.load(json_file_path)

In [18]:
# Print the DataFrame's schema as a tree to inspect its nested column structure.
df.printSchema()

root
 |-- brand_name: string (nullable = true)
 |-- devices: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- imageUrl: string (nullable = true)
 |    |    |-- model_name: string (nullable = true)
 |    |    |-- specifications: struct (nullable = true)
 |    |    |    |-- Battery: struct (nullable = true)
 |    |    |    |    |-- Charging: string (nullable = true)
 |    |    |    |    |-- Music play: string (nullable = true)
 |    |    |    |    |-- Stand-by: string (nullable = true)
 |    |    |    |    |-- Talk time: string (nullable = true)
 |    |    |    |    |-- Type: string (nullable = true)
 |    |    |    |-- Body: struct (nullable = true)
 |    |    |    |    |-- Build: string (nullable = true)
 |    |    |    |    |-- Dimensions: string (nullable = true)
 |    |    |    |    |-- Keyboard: string (nullable = true)
 |    |    |    |    |-- SIM: string (nullable = true)
 |    |    |    |    |-- Weight: string (nullable = true)
 |    |    | 

25/10/11 09:48:19 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 159278 ms exceeds timeout 120000 ms
25/10/11 09:48:19 WARN SparkContext: Killing executors is not supported by current scheduler.
25/10/11 09:52:19 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:301)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage.B

In [None]:
# Project the board-game fields used by the null-handling examples that follow.
# A dotted name such as "credits.developer" selects a nested struct field; the
# resulting column is named after the leaf field ("developer").
#
# The first column is "boardgame" (previously "brand_name"): "boardgame" is the
# column shown in this cell's saved output and the one the later fillna() cells
# refer to.
#
# NOTE(review): none of these fields appear in the phone.json schema printed
# earlier in the notebook, so `df` presumably has to hold the board-game dataset
# for this cell to run -- confirm which file it should be loaded from.
col_list = [
    "boardgame",
    "credits.developer",
    "minimum_age",
    "collection_stats.for_trade",
    "credits.artists",
]
df2 = df.select(*col_list)
df2.show()

+--------------------+--------------------+-----------+---------+--------------------+
|           boardgame|           developer|minimum_age|for_trade|             artists|
+--------------------+--------------------+-----------+---------+--------------------+
|   Brass: Birmingham|                null|         14|      278|[Gavan Brown, Lin...|
|Pandemic Legacy: ...|                null|         13|      509|   [Chris Quilliams]|
|            Ark Nova|                null|         14|      415|[Steffen Bieker, ...|
|          Gloomhaven|   [Marcel Dragomir]|         14|     1210|[Alexandr Elichev...|
|Twilight Imperium...|[Dane Beltrami, J...|         14|      212|   [Scott Schomburg]|
|      Dune: Imperium|       [Paul Dennen]|         14|      492|[Clay Brooks, Bre...|
|   Terraforming Mars|                null|         12|      809|[Isaac Fryxelius,...|
|War of the Ring: ...|                null|         13|      281|[John Howe, Fabio...|
|Star Wars: Rebellion|    [Steven Kimball]|

In [None]:
# Replace NULLs in the array-valued "developer" column with a one-element
# placeholder array. This is expressed with when()/otherwise() rather than a
# call to DataFrame.fillna().
missing_developer = array(lit("missing")).cast(ArrayType(StringType()))  # the .cast() is not necessarily needed here
developer_or_placeholder = (
    when(col('developer').isNull(), missing_developer)
    .otherwise(col("developer"))
)

fill_na = df2.withColumn("developer", developer_or_placeholder)

fill_na.show(8, False)

+---------------------------------+-----------------------------+-----------+---------+---------------------------------------------------------------------------------------+
|boardgame                        |developer                    |minimum_age|for_trade|artists                                                                                |
+---------------------------------+-----------------------------+-----------+---------+---------------------------------------------------------------------------------------+
|Brass: Birmingham                |[missing]                    |14         |278      |[Gavan Brown, Lina Cossette, David Forest, Gui Landgraf, Damien Mammoliti, Matt Tolman]|
|Pandemic Legacy: Season 1        |[missing]                    |13         |509      |[Chris Quilliams]                                                                      |
|Ark Nova                         |[missing]                    |14         |415      |[Steffen Bieker, Loïc Billiau, De

In [14]:
# Fill NULLs column by column: each key names a column and its value is the
# replacement used for NULLs in that column.
replacements = {"boardgame": "Unknown", "minimum_age": 0}
df_fill_columns = df2.fillna(replacements)

df_fill_columns.show(5, False)

+---------------------------------+-----------------------------+-----------+---------+---------------------------------------------------------------------------------------+
|boardgame                        |developer                    |minimum_age|for_trade|artists                                                                                |
+---------------------------------+-----------------------------+-----------+---------+---------------------------------------------------------------------------------------+
|Brass: Birmingham                |null                         |14         |278      |[Gavan Brown, Lina Cossette, David Forest, Gui Landgraf, Damien Mammoliti, Matt Tolman]|
|Pandemic Legacy: Season 1        |null                         |13         |509      |[Chris Quilliams]                                                                      |
|Ark Nova                         |null                         |14         |415      |[Steffen Bieker, Loïc Billiau, De

In [15]:
# Fill NULLs in the "boardgame" column only; the subset argument restricts the
# replacement to the listed columns.
df_fill_single_column = df2.fillna(value="No game", subset=["boardgame"])

df_fill_single_column.show(n=4, truncate=False)

+-------------------------+-----------------+-----------+---------+---------------------------------------------------------------------------------------+
|boardgame                |developer        |minimum_age|for_trade|artists                                                                                |
+-------------------------+-----------------+-----------+---------+---------------------------------------------------------------------------------------+
|Brass: Birmingham        |null             |14         |278      |[Gavan Brown, Lina Cossette, David Forest, Gui Landgraf, Damien Mammoliti, Matt Tolman]|
|Pandemic Legacy: Season 1|null             |13         |509      |[Chris Quilliams]                                                                      |
|Ark Nova                 |null             |14         |415      |[Steffen Bieker, Loïc Billiau, Dennis Lohausen, Christof Tisch]                        |
|Gloomhaven               |[Marcel Dragomir]|14         |1210   