In [1]:
# Nothing needs to be downloaded in this cell: the notebook installs OpenJDK 11
# and the pip `pyspark` package in the following cell and points JAVA_HOME /
# SPARK_HOME at those installs. The OpenJDK 8 package, the Spark 3.1.1 tarball
# and findspark that were previously fetched here were never referenced, so
# they only added download time and a second, conflicting Java/Spark install.
import os

In [2]:
!apt-get update
!apt-get install openjdk-11-jdk -y
!pip install pyspark

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,776 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,750 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:13 https://r2u.stat.illinois.edu/ubuntu jamm

In [3]:
# Tell Spark where Java and the Spark distribution live.
import pyspark

# Location of the OpenJDK 11 package installed with apt-get.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# Derive SPARK_HOME from the installed pyspark package rather than hard-coding
# a ".../python3.11/dist-packages/pyspark" path, which stops being valid as
# soon as the runtime's Python version or site-packages location changes.
os.environ["SPARK_HOME"] = os.path.dirname(pyspark.__file__)

In [4]:
!pip install pyngrok
from pyngrok import ngrok

Collecting pyngrok
  Downloading pyngrok-7.2.11-py3-none-any.whl.metadata (9.4 kB)
Downloading pyngrok-7.2.11-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.11


In [5]:
# Bring the PySpark column functions into the session.
# The wildcard import is kept so every function stays available by bare name;
# the helpers the cells below actually call are also listed explicitly.
# NOTE(review): the wildcard also replaces Python builtins such as sum/min/max
# with their Spark counterparts; prefer explicit imports in new code.
from pyspark.sql.functions import *
from pyspark.sql.functions import col, when

In [6]:
# Start (or reuse) a Spark session that runs locally on all available cores.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [7]:

# Build a small demo DataFrame: one (name, age, gender) tuple per person.
columns = ["name", "age", "gender"]
data = [
    ("Alice", 25, "F"),
    ("Bob", 30, "M"),
    ("Charlie", 35, "M"),
    ("Diana", 40, "F"),
]

df = spark.createDataFrame(data, schema=columns)
df.show()


+-------+---+------+
|   name|age|gender|
+-------+---+------+
|  Alice| 25|     F|
|    Bob| 30|     M|
|Charlie| 35|     M|
|  Diana| 40|     F|
+-------+---+------+



# The basics

# Conditions with & (AND)

In [8]:
# Keep only the people who are over 30 AND female.
# Each condition is a Column expression; & combines them row by row.
over_30 = col("age") > 30
is_female = col("gender") == "F"

df_filtered = df.filter(over_30 & is_female)
df_filtered.show()


+-----+---+------+
| name|age|gender|
+-----+---+------+
|Diana| 40|     F|
+-----+---+------+



# Conditions with | (OR)

In [10]:
# Keep the people who are over 30 OR female (either condition is enough).
over_30_or_female = (col("age") > 30) | (col("gender") == "F")

df_filtered = df.filter(over_30_or_female)
df_filtered.show()


+-------+---+------+
|   name|age|gender|
+-------+---+------+
|  Alice| 25|     F|
|Charlie| 35|     M|
|  Diana| 40|     F|
+-------+---+------+



# NOT Condition with ~

In [11]:
# ~ negates a condition: keep every row whose gender is NOT "F".
is_female = col("gender") == "F"

df_filtered = df.filter(~is_female)
df_filtered.show()


+-------+---+------+
|   name|age|gender|
+-------+---+------+
|    Bob| 30|     M|
|Charlie| 35|     M|
+-------+---+------+



# Chained Conditions


In [12]:
# Rebuild the demo DataFrame with a salary column added:
# one (name, age, gender, salary) tuple per person.
columns = ["name", "age", "gender", "salary"]
data = [
    ("Alice", 25, "F", 60000),
    ("Bob", 30, "M", 48000),
    ("Charlie", 35, "M", 70000),
    ("Diana", 40, "F", 52000),
    ("Evan", 28, "M", 35000),
    ("Fiona", 32, "F", 75000),
]

df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+---+------+------+
|   name|age|gender|salary|
+-------+---+------+------+
|  Alice| 25|     F| 60000|
|    Bob| 30|     M| 48000|
|Charlie| 35|     M| 70000|
|  Diana| 40|     F| 52000|
|   Evan| 28|     M| 35000|
|  Fiona| 32|     F| 75000|
+-------+---+------+------+



# Show everyone who is over 30 years old with a salary above 50,000, and also include any females who are under 30.


In [15]:

# Two groups are combined with OR:
#   1. people over 30 whose salary is above 50,000
#   2. females under 30
over_30_high_salary = (col("age") > 30) & (col("salary") > 50000)
female_under_30 = (col("gender") == "F") & (col("age") < 30)

df_filtered = df.filter(over_30_high_salary | female_under_30)
df_filtered.show()


+-------+---+------+------+
|   name|age|gender|salary|
+-------+---+------+------+
|  Alice| 25|     F| 60000|
|Charlie| 35|     M| 70000|
|  Diana| 40|     F| 52000|
|  Fiona| 32|     F| 75000|
+-------+---+------+------+



# Show me anyone whose name starts with 'A' or 'F' and whose salary is above 50,000


In [16]:
# Name must start with "A" or "F", AND salary must be above 50,000.
name_starts_with_a_or_f = col("name").startswith("A") | col("name").startswith("F")
salary_above_50k = col("salary") > 50000

df_filtered = df.filter(name_starts_with_a_or_f & salary_above_50k)
df_filtered.show()


+-----+---+------+------+
| name|age|gender|salary|
+-----+---+------+------+
|Alice| 25|     F| 60000|
|Fiona| 32|     F| 75000|
+-----+---+------+------+



# Keep only rows where NOT (salary < 50,000 OR age < 30), i.e. salary is at least 50,000 and age is at least 30

In [21]:


# Build the condition to exclude (salary below 50,000 OR age below 30),
# then negate the whole thing with ~ so only rows matching neither remain.
low_salary_or_under_30 = (col("salary") < 50000) | (col("age") < 30)

df_filtered = df.filter(~low_salary_or_under_30)
df_filtered.show()


+-------+---+------+------+
|   name|age|gender|salary|
+-------+---+------+------+
|Charlie| 35|     M| 70000|
|  Diana| 40|     F| 52000|
|  Fiona| 32|     F| 75000|
+-------+---+------+------+



# Add a new column `high_earner` that is "Yes" if salary > 60,000 and "No" otherwise, then filter on it

In [17]:
# Label each row "Yes" when salary is strictly above 60,000 and "No" otherwise,
# then keep only the rows labelled "Yes".
high_earner_label = when(col("salary") > 60000, "Yes").otherwise("No")
df2 = df.withColumn("high_earner", high_earner_label)

df_filtered = df2.filter(col("high_earner") == "Yes")
df_filtered.show()


+-------+---+------+------+-----------+
|   name|age|gender|salary|high_earner|
+-------+---+------+------+-----------+
|Charlie| 35|     M| 70000|        Yes|
|  Fiona| 32|     F| 75000|        Yes|
+-------+---+------+------+-----------+



# Create a new column that categorizes age group, then filter

In [20]:
# Bucket every person by age, then keep only the "Young" bucket.
# when() clauses are evaluated in order; between() is inclusive on both ends,
# so it matches ages from 30 through 40.
age_group_label = (
    when(col("age") < 30, "Young")
    .when(col("age").between(30, 40), "Middle-aged")
    .otherwise("Senior")
)
df2 = df.withColumn("age_group", age_group_label)

df_filtered = df2.filter(col("age_group") == "Young")
df_filtered.show()


+-----+---+------+------+---------+
| name|age|gender|salary|age_group|
+-----+---+------+------+---------+
|Alice| 25|     F| 60000|    Young|
| Evan| 28|     M| 35000|    Young|
+-----+---+------+------+---------+

