In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark
import os

In [None]:
!apt-get update
!apt-get install openjdk-11-jdk -y
!pip install pyspark

In [None]:
import pyspark

# Java runtime installed by the apt-get step above (Debian/Ubuntu amd64 layout).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# Point SPARK_HOME at wherever pip actually installed PySpark, instead of
# hard-coding a python3.11 dist-packages path that breaks as soon as the
# runtime's Python version changes.
os.environ["SPARK_HOME"] = os.path.dirname(pyspark.__file__)

In [None]:
!pip install pyngrok
from pyngrok import ngrok

In [None]:
# Bring in only the PySpark column helpers this notebook uses.
# A wildcard import of pyspark.sql.functions would shadow Python built-ins
# such as sum, min, max, round and abs with Spark's column versions.
from pyspark.sql.functions import col, when

In [None]:
# Start (or reuse) a Spark session running locally with the "local[*]" master.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:

# Build a small demo DataFrame: one (name, age, gender) tuple per person.
columns = ["name", "age", "gender"]
data = [
    ("Alice", 25, "F"),
    ("Bob", 30, "M"),
    ("Charlie", 35, "M"),
    ("Diana", 40, "F"),
]

df = spark.createDataFrame(data, schema=columns)
df.show()


# The basics

# Conditions with & (AND)

In [None]:
# Keep rows where BOTH conditions hold: age above 30 and gender "F".
# Each comparison yields a Column expression; & combines them.
over_30 = col("age") > 30
is_female = col("gender") == "F"

df_filtered = df.filter(over_30 & is_female)
df_filtered.show()


# Conditions with | (OR)

In [None]:
# Keep rows where AT LEAST ONE condition holds: age above 30 or gender "F".
over_30 = col("age") > 30
is_female = col("gender") == "F"

df_filtered = df.filter(over_30 | is_female)
df_filtered.show()


# NOT Condition with ~

In [None]:
# Negate a condition with ~ : keep every row whose gender is not "F".
is_female = col("gender") == "F"

df_filtered = df.filter(~is_female)
df_filtered.show()


# Chained Conditions


In [None]:
# Rebuild the demo DataFrame with an extra salary column
# (this rebinds data, columns and df from the earlier example).
columns = ["name", "age", "gender", "salary"]
data = [
    ("Alice", 25, "F", 60000),
    ("Bob", 30, "M", 48000),
    ("Charlie", 35, "M", 70000),
    ("Diana", 40, "F", 52000),
    ("Evan", 28, "M", 35000),
    ("Fiona", 32, "F", 75000),
]

df = spark.createDataFrame(data, schema=columns)
df.show()

# Show me everyone who is over 30 years old with a salary > 50,000; also include any females who are under 30.


In [None]:

# Two alternative groups, joined with OR:
#   1) older than 30 AND earning more than 50,000
#   2) female AND younger than 30
over_30_well_paid = (col("age") > 30) & (col("salary") > 50000)
female_under_30 = (col("gender") == "F") & (col("age") < 30)

df_filtered = df.filter(over_30_well_paid | female_under_30)
df_filtered.show()


# Show me anyone whose name starts with 'A' or 'F' and whose salary is above 50,000


In [None]:
# Name begins with "A" or "F", and salary is above 50,000.
# The OR is built first so the AND applies to the combined name test.
name_starts_a_or_f = col("name").startswith("A") | col("name").startswith("F")
salary_above_50k = col("salary") > 50000

df_filtered = df.filter(name_starts_a_or_f & salary_above_50k)
df_filtered.show()


# Filter rows where it is NOT true that (salary < 50,000 OR age < 30), i.e. salary >= 50,000 AND age >= 30

In [None]:


# ~ negates the whole OR: a row is kept only when NEITHER
# "salary below 50,000" NOR "age below 30" is true.
salary_below_50k = col("salary") < 50000
under_30 = col("age") < 30

df_filtered = df.filter(~(salary_below_50k | under_30))
df_filtered.show()


# Add a new column high_earner that is Yes if salary > 60,000 else No, then filter

In [None]:
# Derive a "Yes"/"No" flag from salary, then keep only the "Yes" rows.
high_earner_flag = when(col("salary") > 60000, "Yes").otherwise("No")
df2 = df.withColumn("high_earner", high_earner_flag)

df_filtered = df2.filter(col("high_earner") == "Yes")
df_filtered.show()


# Create a new column that categorizes age group, then filter

In [None]:
# Bucket each person by age, then keep only the "Young" bucket.
# between() is inclusive on both ends, i.e. 30 <= age <= 40.
age = col("age")
age_group = (
    when(age < 30, "Young")
    .when(age.between(30, 40), "Middle-aged")
    .otherwise("Senior")
)
df2 = df.withColumn("age_group", age_group)

df_filtered = df2.filter(col("age_group") == "Young")
df_filtered.show()
