In [7]:
from pyspark.sql import SparkSession

# Get the active Spark session, or start a new one named 'helloSpark' if none exists yet.
spark = SparkSession.builder.appName('helloSpark').getOrCreate()

Further info on Spark sessions:  
https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/spark_session.html

In [8]:
# A bare expression on the last line of a cell is rendered as the cell's output.
spark

In [9]:
# Let's generate some data for analysis: a list of [name, salary] rows.
# A dedicated, seeded generator makes the data identical on every
# "Restart & Run All" and leaves the global `random` state untouched.

import random

SEED = 42
N_ROWS = 500_000

names = ["Alice", "Ben", "Charles", "Daisy"]
start_range = 900   # lowest salary that can be drawn (inclusive)
end_range = 5000    # highest salary that can be drawn (inclusive)

rng = random.Random(SEED)
python_data = [
    [rng.choice(names), rng.randint(start_range, end_range)]
    for _ in range(N_ROWS)
]

In [10]:
# spark.createDataFrame turns a local Python object (list, dict) into a Spark DataFrame.
# Passing an explicit schema string gives nicer column names and means Spark
# does not have to infer the schema from the data.
schema = "name STRING, salary INT"

df = spark.createDataFrame(data=python_data, schema=schema)

In [14]:
# .show() prints the first rows of a DataFrame as a text table (20 rows unless told otherwise)
df.show(n=20)

+-------+------+
|   name|salary|
+-------+------+
|  Alice|  1525|
|Charles|  2539|
|Charles|  1567|
|    Ben|  4256|
|  Alice|  1578|
|Charles|  1958|
|  Daisy|  1523|
|Charles|   910|
|  Daisy|  1232|
|    Ben|  2907|
|Charles|  2945|
|Charles|  3837|
|Charles|  1644|
|Charles|  1381|
|Charles|  2296|
|  Daisy|  2964|
|  Daisy|  3046|
|  Alice|  4287|
|  Alice|  2111|
|  Alice|  4366|
+-------+------+
only showing top 20 rows



In [15]:
# Average salary per name; the aggregated column comes out named "avg(salary)".
df_new = df.groupBy("name").avg("salary")

In [16]:
# Print the per-name averages as a text table.
df_new.show()

+-------+------------------+
|   name|       avg(salary)|
+-------+------------------+
|Charles| 2955.559039107582|
|    Ben| 2953.736564242023|
|  Alice|2948.4433056398784|
|  Daisy| 2945.029207766338|
+-------+------------------+



In [None]:
# Many of the column-level functions live in the pyspark.sql.functions module
import pyspark.sql.functions as F

# Show the raw average next to a rounded copy of it, exposed under the name "average".
df_new.select(
    "name",
    "avg(salary)",
    F.round("avg(salary)").alias("average"),
).show()

The following is for a comparison with the popular Python package `pandas`

In [17]:
import pandas as pd 

# Build the pandas equivalent of the Spark DataFrame from the same Python rows.
pd_df = pd.DataFrame(data=python_data, columns=["name", "salary"])

In [18]:
# A bare expression on the last line of a cell is rendered as the cell's output.
pd_df

Unnamed: 0,name,salary
0,Alice,1525
1,Charles,2539
2,Charles,1567
3,Ben,4256
4,Alice,1578
...,...,...
499995,Daisy,4527
499996,Alice,2042
499997,Daisy,4492
499998,Alice,3929


In [19]:
# Mean salary per name. The column is selected explicitly because the first
# positional parameter of GroupBy.mean() is `numeric_only`, not a column name.
# The double brackets keep the result a DataFrame with a single "salary" column.
pd_df.groupby("name")[["salary"]].mean()

Unnamed: 0_level_0,salary
name,Unnamed: 1_level_1
Alice,2948.443306
Ben,2953.736564
Charles,2955.559039
Daisy,2945.029208


Which of the two approaches (PySpark or pandas) seemed faster?  
Why?

Let's have a quick walkthrough of a few more PySpark methods.  
For a longer (full) list of methods, see:  
https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html

In [50]:
# Load the two CSV files into DataFrames; the first line of each file supplies the column names.
employees_df = spark.read.csv("input/employees.csv", header=True)
departments_df = spark.read.csv("input/departments.csv", header=True)

# Print both tables to check what was loaded.
employees_df.show()
departments_df.show()

+------+-----+----------+------+
|emp_id| name|department|salary|
+------+-----+----------+------+
|     1| John|        HR| 50000|
|     2|Alice|        IT| 60000|
|     3|  Bob|   Finance| 55000|
+------+-----+----------+------+

+----------+-------+
|department|manager|
+----------+-------+
|        HR|   Anna|
|        IT|  David|
|   Finance|  Sarah|
+----------+-------+



In [51]:

# salary arrives from the CSV as a string; cast it to an integer so that
# numeric filtering behaves properly
employees_df = employees_df.withColumn(
    "salary",
    F.col("salary").cast("integer"),
)

employees_df.show()

+------+-----+----------+------+
|emp_id| name|department|salary|
+------+-----+----------+------+
|     1| John|        HR| 50000|
|     2|Alice|        IT| 60000|
|     3|  Bob|   Finance| 55000|
+------+-----+----------+------+



In [52]:

# .filter() keeps only the rows that satisfy a condition (here: salary above 55000).
filtered_df = employees_df.filter(F.col("salary") > 55000)

# .select() picks columns by name.
selected_df = filtered_df.select(
    "emp_id",
    "name",
    "department",
    "salary",
)

# .withColumnRenamed() renames a single column: emp_id -> employee_id.
renamed_df = selected_df.withColumnRenamed("emp_id", "employee_id")

# .selectExpr() is like .select(), but each entry is a SQL expression,
# so a derived column can be computed in place.
expr_df = renamed_df.selectExpr(
    "employee_id",
    "name",
    "department",
    "salary",
    "salary / 12 as monthly_salary",
)

# .join() combines two DataFrames. An inner join on the shared 'department'
# column attaches the matching row from departments_df to each employee.
joined_df = expr_df.join(departments_df, "department", "inner")


In [53]:

# .write saves a DataFrame: here as CSV with a header row, replacing any existing output
# at that path. There are many more writer options, covered in the next classes.
joined_df.write.csv("output/joined_employees", mode="overwrite", header=True)

# NB - Spark evaluates lazily. Transformations (filter, select, join, ...) only describe
# the computation, so chaining many of them costs nothing by itself.
# The work is actually executed when an action needs a result, e.g. .show() or a write.

In [55]:
from pyspark.sql import SparkSession
# Spark's round is imported under an alias so that it does not shadow Python's
# built-in round() for every cell that runs after this one.
from pyspark.sql.functions import col, round as spark_round

# Initialize Spark Session
# NOTE(review): getOrCreate() presumably returns the session created earlier in this
# notebook if it is still active, in which case this appName has no effect — confirm.
spark = SparkSession.builder.appName("ThailandTourismAnalysis").getOrCreate()

# Read Parquet File
df = spark.read.parquet("thailand_domestic_tourism_2019_2023_ver2.parquet")

# Display Schema
df.printSchema()

# Show Sample Data
df.show(5)

# Exploratory Data Analysis
print(f"Total records: {df.count()}")
df.describe().show()

# The data is in long format: one row per (date, province, variable) with its value.
# Pivot it so that every distinct 'variable' becomes its own column, holding the
# summed 'value' for that date and province.
pivot_df = df.groupBy("date", "province_eng").pivot("variable").sum("value")

# Calculate Required Aggregations
# Despite the "percentage" in the column names, both results are plain ratios
# (foreign / all) rounded to 4 decimal places; they are not multiplied by 100.
# The pivoted columns no_tourist_foreign, no_tourist_all, revenue_foreign and
# revenue_all must exist, i.e. they must occur as values of 'variable'.
aggregated_df = pivot_df.withColumn(
    "no_percentage_of_foreign_tourists",
    spark_round(col("no_tourist_foreign") / col("no_tourist_all"), 4)
).withColumn(
    "revenue_percentage_of_foreign_tourists",
    spark_round(col("revenue_foreign") / col("revenue_all"), 4)
).select(
    "date", "province_eng", "no_percentage_of_foreign_tourists", "revenue_percentage_of_foreign_tourists"
)

# Show Aggregated Data
aggregated_df.show(5)

# Write to JSON, replacing any previous output at this path
aggregated_df.write.mode("overwrite").json("output/tourism_aggregated.json")

# Stop Spark Session
# After this call `spark` can no longer be used; a new session has to be created
# before any Spark cell is run again.
spark.stop()

root
 |-- date: timestamp_ntz (nullable = true)
 |-- province_thai: string (nullable = true)
 |-- province_eng: string (nullable = true)
 |-- region_thai: string (nullable = true)
 |-- region_eng: string (nullable = true)
 |-- variable: string (nullable = true)
 |-- value: double (nullable = true)

+-------------------+---------------+--------------------+-----------+----------+------------------+-----+
|               date|  province_thai|        province_eng|region_thai|region_eng|          variable|value|
+-------------------+---------------+--------------------+-----------+----------+------------------+-----+
|2019-01-01 00:00:00|  กรุงเทพมหานคร|             Bangkok|    ภาคกลาง|   central|ratio_tourist_stay|93.37|
|2019-01-01 00:00:00|         ลพบุรี|            Lopburi |    ภาคกลาง|   central|ratio_tourist_stay|61.32|
|2019-01-01 00:00:00|พระนครศรีอยุธยา|Phra Nakhon Si Ay...|    ภาคกลาง|   central|ratio_tourist_stay|73.37|
|2019-01-01 00:00:00|        สระบุรี|           Saraburi |