In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook-wide SparkSession. The parenthesised chain
# replaces backslash continuations, so lines can be added or removed safely.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    # NOTE(review): this key/value pair looks like a placeholder — confirm it is needed.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and restrict its logging to
# errors so the cell outputs stay readable.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/19 14:30:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/19 14:30:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Small in-memory sample: one dict per employee, built from (name, age) pairs.
employees = [
    {"name": person_name, "age": person_age}
    for person_name, person_age in (
        ("John D.", 30),
        ("Alice G.", 25),
        ("Bob T.", 35),
        ("Eve A.", 28),
    )
]

# Let Spark infer the schema from the dicts, then print the resulting table.
df = spark.createDataFrame(employees)
df.show()

                                                                                

+---+--------+
|age|    name|
+---+--------+
| 30| John D.|
| 25|Alice G.|
| 35|  Bob T.|
| 28|  Eve A.|
+---+--------+



In [4]:
# Fetch only the first row. df.first() avoids shipping every row of the
# DataFrame to the driver, which df.collect()[0] does just to keep one Row.
r1 = df.first()
# Row fields can be read as attributes.
r1.name, r1.age

('John D.', 30)

In [5]:
# Group the rows by name, then take the mean of "age" within each group.
# No alias is given, so the result column gets an auto-generated name.
grouped_by_name = df.groupBy('name')
avg_age = grouped_by_name.avg('age')
avg_age.show()



+--------+--------+
|    name|avg(age)|
+--------+--------+
| John D.|    30.0|
|Alice G.|    25.0|
|  Bob T.|    35.0|
|  Eve A.|    28.0|
+--------+--------+



                                                                                

In [6]:
# Number of partitions of the RDD backing `df`.
df.rdd.getNumPartitions()

12

In [7]:
# Number of partitions of the RDD backing the aggregated `avg_age` DataFrame.
avg_age.rdd.getNumPartitions()

1

In [8]:
from pyspark.sql.functions import avg

# Per-name mean age; alias() gives the result column an explicit name
# instead of an auto-generated one.
mean_age = avg('age').alias('average_age')
avg_age = df.groupBy('name').agg(mean_age)

# Print the schema to confirm the aliased column name and its type.
avg_age.printSchema()

root
 |-- name: string (nullable = true)
 |-- average_age: double (nullable = true)



In [9]:
# Print the aggregated rows held in `avg_age`.
avg_age.show()

+--------+-----------+
|    name|average_age|
+--------+-----------+
| John D.|       30.0|
|Alice G.|       25.0|
|  Bob T.|       35.0|
|  Eve A.|       28.0|
+--------+-----------+



In [24]:
# Load people.json through the generic reader API (format + load).
# Note: this rebinds `df`, which held the in-memory employees DataFrame.
json_reader = spark.read.format("json")
df = json_reader.load("people.json")

# Show the schema Spark inferred from the JSON records.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [25]:
# Print the rows; truncate=False asks show() not to shorten long cell values.
df.show(truncate=False)


+----+-------+
|age |name   |
+----+-------+
|NULL|Michael|
|30  |Andy   |
|19  |Justin |
+----+-------+



In [27]:
# Indexing a DataFrame by column name yields a Column expression, not the
# column's data.
df['name']

Column<'name'>

In [28]:
from pyspark.sql import functions as F
# select() with a Column object returns a new DataFrame. The cell displays
# only its schema summary because no action such as show() is called.
df.select(F.col("name"))

DataFrame[name: string]

In [30]:
# Project only the "name" column, referencing it through the DataFrame,
# and print the result.
name_col = df["name"]
df_names = df.select(name_col)
df_names.show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [31]:
# Project only the "age" column, referencing it through the DataFrame,
# and print the result.
age_col = df["age"]
df_age = df.select(age_col)
df_age.show()

+----+
| age|
+----+
|NULL|
|  30|
|  19|
+----+



In [32]:
# Keep only rows with age >= 18. A row whose age is NULL does not satisfy
# the comparison, so it is filtered out as well.
is_adult = df_age["age"] >= 18
df_adults = df_age.where(is_adult)
df_adults.show()

+---+
|age|
+---+
| 30|
| 19|
+---+



In [35]:
# Both per-name examples below share this grouping.
people_by_name = df.groupBy("name")

# Example 1: a single aggregation with an alias
avg_by_name = people_by_name.agg(F.avg("age").alias("average_age"))
avg_by_name.show()

# Example 2: multiple aggregations with aliases
per_name_metrics = [
    F.count("*").alias("count"),
    F.avg("age").alias("avg_age"),
    F.max("age").alias("max_age"),
]
stats_by_name = people_by_name.agg(*per_name_metrics)
stats_by_name.show()

# Example 3: global aggregations (no groupBy) with aliases
# count("*") counts every row, while avg() skips NULL ages, so the two
# metrics are computed over different numbers of rows.
overall_metrics = [
    F.count("*").alias("total_people"),
    F.avg("age").alias("overall_avg_age"),
]
global_stats = df.agg(*overall_metrics)
global_stats.show()

+-------+-----------+
|   name|average_age|
+-------+-----------+
|Michael|       NULL|
|   Andy|       30.0|
| Justin|       19.0|
+-------+-----------+

+-------+-----+-------+-------+
|   name|count|avg_age|max_age|
+-------+-----+-------+-------+
|Michael|    1|   NULL|   NULL|
|   Andy|    1|   30.0|     30|
| Justin|    1|   19.0|     19|
+-------+-----+-------+-------+

+------------+---------------+
|total_people|overall_avg_age|
+------------+---------------+
|           3|           24.5|
+------------+---------------+

+------------+---------------+
|total_people|overall_avg_age|
+------------+---------------+
|           3|           24.5|
+------------+---------------+



In [36]:
# Drop any cached/persisted data held for `df`.
# NOTE(review): no cache()/persist() call on `df` is visible in this view —
# confirm this call is still needed.
df.unpersist()

DataFrame[age: bigint, name: string]

In [39]:
from pyspark.sql import types as T

# Explicit schema for people.csv: a string "name" and an integer "age".
# NOTE(review): file-based readers may report columns as nullable even when
# the schema says nullable=False — confirm with df_csv.printSchema().
name_field = T.StructField("name", T.StringType(), nullable=False)
age_field = T.StructField("age", T.IntegerType(), nullable=False)
my_schema = T.StructType([name_field, age_field])

# header=True: the first line of the file holds column names, not data.
df_csv = spark.read.csv(
    "people.csv",
    schema=my_schema,
    header=True,
)

In [40]:
# Print the rows read from people.csv using the explicit schema.
df_csv.show()

+-------+---+
|   name|age|
+-------+---+
|  Alice| 34|
|    Bob| 45|
|Charlie| 29|
|  Diana| 28|
+-------+---+

