In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active session when one exists; otherwise start one under this app name.
spark = (
    SparkSession.builder
    .appName("Broadcasting Example")
    .getOrCreate()
)

In [4]:
# Keep a handle on the SparkContext behind the session; it is used later to call broadcast().
sc = spark.sparkContext

In [5]:
# Pipe-delimited file; the first row supplies the column names.
df = (
    spark.read
    .option("sep", "|")
    .option("header", True)
    .csv("uspopulation.csv")
)

In [6]:
# Preview the first five rows without truncating cell contents.
df.show(n=5, truncate=False)

+---------+-----------+----------+-------------+-----------+------------------+
|2019_rank|City       |State_Code|2019_estimate|2010_Census|Change            |
+---------+-----------+----------+-------------+-----------+------------------+
|1        |New York[d]|NY        |8336817      |8175133    |0.0198            |
|2        |Los Angeles|CA        |3979576      |3792621    |0.0493            |
|3        |Chicago    |IL        |2693976      |2695598    |−0.06%            |
|4        |Houston[3] |TX        |2320268      |2100263    |0.1048            |
|5        |Phoenix    |AZ        |1680992      |1445632    |0.1628            |
+---------+-----------+----------+-------------+-----------+------------------+
only showing top 5 rows



### Let's create a lookup dictionary that maps each State_Code abbreviation to the full state name,
### and then broadcast it

In [7]:
# State abbreviation -> display name, in the order the entries should be listed.
# NOTE(review): "Illionis" looks like a misspelling of "Illinois", and the casing
# of "OHIO"/"ARIZONA" differs from the other entries; values are kept as-is here.
lookup = {
    "TX": "Texas",
    "NY": "New York",
    "OH": "OHIO",
    "CA": "California",
    "IL": "Illionis",
    "CO": "Colorado",
    "AZ": "ARIZONA",
}

In [8]:
# Echo the dictionary to check its contents before it is broadcast.
lookup

{'TX': 'Texas',
 'NY': 'New York',
 'OH': 'OHIO',
 'CA': 'California',
 'IL': 'Illionis',
 'CO': 'Colorado',
 'AZ': 'ARIZONA'}

In [9]:
# Wrap the lookup dictionary in a broadcast variable created from the SparkContext.
broa_cast = sc.broadcast(value=lookup)

In [10]:
# Sanity check: read one entry back through the broadcast variable's .value.
broa_cast.value["TX"]

'Texas'

In [19]:
from pyspark.sql.functions import udf,col
from pyspark.sql.types import *

In [16]:
def broadval(col_name):
    """Translate a state abbreviation into its full name via the broadcast lookup.

    Args:
        col_name: The value of a single ``State_Code`` cell. The function is
            wrapped as a UDF and applied per row, so despite the parameter
            name this is a cell value, not a column name.

    Returns:
        The matching entry from ``broa_cast.value``, or ``None`` when the code
        is not present in the lookup (this includes a null input).
    """
    # Use .get() rather than [] so a code that is missing from the lookup
    # yields None (a null cell) instead of raising KeyError inside the UDF.
    return broa_cast.value.get(col_name)

In [17]:
# Register the lookup function as a Spark UDF; the string return type is spelled out.
get_full_name = udf(broadval, StringType())

In [21]:
# Add a "state" column by running the lookup UDF over State_Code, then preview it.
(
    df
    .withColumn("state", get_full_name(col("State_Code")))
    .show(n=10, truncate=False)
)

+---------+-----------+----------+-------------+-----------+------------------+----------+
|2019_rank|City       |State_Code|2019_estimate|2010_Census|Change            |state     |
+---------+-----------+----------+-------------+-----------+------------------+----------+
|1        |New York[d]|NY        |8336817      |8175133    |0.0198            |New York  |
|2        |Los Angeles|CA        |3979576      |3792621    |0.0493            |California|
|3        |Chicago    |IL        |2693976      |2695598    |−0.06%            |Illionis  |
|4        |Houston[3] |TX        |2320268      |2100263    |0.1048            |Texas     |
|5        |Phoenix    |AZ        |1680992      |1445632    |0.1628            |ARIZONA   |
|6        |San Antonio|TX        |1547253      |1327407    |0.1656            |Texas     |
|7        |San Diego  |CA        |1423851      |1307402    |0.0891            |California|
|8        |Dallas     |TX        |1343573      |1197816    |0.1217            |Texas     |

In [22]:
# Print the physical plan for the same projection to see how the Python UDF is evaluated.
(
    df
    .withColumn("state", get_full_name(col("State_Code")))
    .explain()
)

== Physical Plan ==
*(1) Project [2019_rank#16, City#17, State_Code#18, 2019_estimate#19, 2010_Census#20, Change#21, pythonUDF0#269 AS state#261]
+- BatchEvalPython [broadval(State_Code#18)], [pythonUDF0#269]
   +- FileScan csv [2019_rank#16,City#17,State_Code#18,2019_estimate#19,2010_Census#20,Change#21] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/atif/PycharmProjects/test/spark_codes/spark_learn/uspopulation.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<2019_rank:string,City:string,State_Code:string,2019_estimate:string,2010_Census:string,Cha...


