In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Reuse the active SparkSession if one exists; otherwise start one named "sparksql".
spark = (
    SparkSession.builder
    .appName("sparksql")
    .getOrCreate()
)

In [4]:
# Display the Spark version this session is running on.
spark.version

'3.5.0'

In [6]:
# Load the CSV: the first row supplies the column names and the column
# types are inferred from the file contents.
csv_reader = spark.read.options(header=True, inferSchema=True)
data = csv_reader.csv("operations_management.csv")

# Preview the first five rows.
data.show(5)

+--------------------+--------------------+-----+---------------+---------+-----+
|         description|            industry|level|           size|line_code|value|
+--------------------+--------------------+-----+---------------+---------+-----+
|Awareness of clim...|               total|    0| 6–19 employees| C0300.01|13080|
|Awareness of clim...|               total|    0|20–49 employees| C0300.01| 3348|
|Awareness of clim...|               total|    0|50–99 employees| C0300.01| 1089|
|Awareness of clim...|               total|    0| 100+ employees| C0300.01| 1023|
|Awareness of clim...|Agriculture, fore...|    1|          total| C0300.01| 2364|
+--------------------+--------------------+-----+---------------+---------+-----+
only showing top 5 rows



In [8]:
# Print each column's name, type and nullability as inferred when the CSV was read.
data.printSchema()

root
 |-- description: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- level: integer (nullable = true)
 |-- size: string (nullable = true)
 |-- line_code: string (nullable = true)
 |-- value: integer (nullable = true)



### Applying transformations to the data (the original DataFrame is immutable; each transformation returns a new DataFrame)

In [17]:
# Build a new DataFrame (`data` itself is left untouched):
#   1. keep only the industry and value columns,
#   2. drop the aggregate "total" rows and anything with value <= 1000,
#   3. sort so the largest values come first.
data_2 = (
    data
    .select("industry", "value")
    .where((F.col("value") > 1000) & (F.col("industry") != 'total'))
    .orderBy(F.desc("value"))
)

In [19]:
# Preview the five highest-value rows of the filtered, sorted result.
data_2.show(5)

+--------------------+-----+
|            industry|value|
+--------------------+-----+
|        Construction| 6030|
|        Construction| 5904|
|        Construction| 5229|
|Accommodation & f...| 5058|
|        Construction| 4965|
+--------------------+-----+
only showing top 5 rows



In [23]:
# Register `data` as the temporary view "tempdata" so it can be queried through spark.sql.
data.createOrReplaceTempView("tempdata")

In [24]:
# Same filter as `data_2`, expressed in SQL against the "tempdata" view.
# The string literal uses single quotes (standard SQL). Double-quoted text is
# only read as a string under Spark's default settings; with
# spark.sql.ansi.enabled and spark.sql.ansi.doubleQuotedIdentifiers both on,
# "total" would be parsed as a column identifier instead.
# There is no ORDER BY, so unlike `data_2` the rows are not sorted by value.
spark.sql("""
SELECT industry, value
FROM tempdata
WHERE value > 1000
  AND industry != 'total'
""").show(5)

+--------------------+-----+
|            industry|value|
+--------------------+-----+
|Agriculture, fore...| 2364|
|         Agriculture| 1683|
|       Manufacturing| 1971|
|        Construction| 2685|
|     Wholesale trade| 1287|
+--------------------+-----+
only showing top 5 rows



### Difference between local and global temporary views

In [None]:
# Local view:  data.createOrReplaceTempView("localview")
# Global view: data.createOrReplaceGlobalTempView("globalview")

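#### Local views are visible only to the Spark session that created them, within a Spark application
#### Global views are visible to multiple Spark sessions within a single Spark application; they are referenced through the `global_temp` database (e.g. `global_temp.globalview`)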