In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark
import os

In [None]:
!apt-get update
!apt-get install openjdk-11-jdk -y
!pip install pyspark

In [None]:

import os

import pyspark

# Point Spark at the Java 11 JDK installed through apt.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# Derive SPARK_HOME from wherever the pyspark package is actually installed instead of
# hard-coding a python3.11 dist-packages path, which breaks as soon as the runtime's
# Python version (or install prefix) changes.
os.environ["SPARK_HOME"] = os.path.dirname(pyspark.__file__)

In [None]:
!pip install pyngrok
from pyngrok import ngrok

In [None]:
# Bring the PySpark SQL functions into the session.
# NOTE: the star import replaces Python built-ins of the same name (e.g. `sum`) with
# Spark's Column-based versions; the null-count cell below calls sum(...) on Column
# expressions and therefore relies on this shadowing.
from pyspark.sql.functions import *
# `col` is already provided by the star import above; this line only makes it explicit.
from pyspark.sql.functions import col

In [None]:
# Start (or reuse) the PySpark session for this notebook.
from pyspark.sql import SparkSession

# master("local[*]") runs Spark locally; getOrCreate() returns an already-running
# session if there is one, so re-running this cell is harmless.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)


•	Reading and Writing Data

In [None]:


# Build a tiny two-column DataFrame by hand; the (1, "foo") row appears twice on
# purpose so the duplicate-detection cell later has something to find.
df = spark.createDataFrame(
    data=[(1, "foo"), (2, "bar"), (1, "foo")],
    schema=["id", "value"],
)
df.show()

In [None]:
# Load original.csv, treating its first line as the column header.
# No schema is supplied and inferSchema is not enabled, so the CSV reader yields
# every column as a string.
mydata = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("original.csv")
)

In [None]:
# Load people.json into a Spark DataFrame and preview it.
# (This cell was generated from the prompt "import json file".)

# NOTE(review): the json module is not used in this cell — spark.read.json reads the file itself.
import json
df2 = spark.read.json('people.json')
df2.show()




Data Checking, Cleaning & Validation

In [None]:
# Print the column names, types and nullability of the CSV-backed DataFrame as a tree.
mydata.printSchema()

In [None]:
# Exploratory functions
# describe() is lazy: it only returns a new DataFrame, so ending the cell with it
# displays just the DataFrame's repr. Call show() to actually print the
# count / mean / stddev / min / max table.
df.describe().show()



In [None]:
# summary() reports describe()'s statistics plus the 25% / 50% / 75% percentiles.
# Like describe() it is lazy and returns a DataFrame, so show() is required to
# render the table instead of the DataFrame's repr.
df.summary().show()


In [None]:
# Display the column names of the CSV-backed DataFrame as a Python list.
mydata.columns


In [None]:
# Display (column name, type name) pairs for the JSON-backed DataFrame.
df2.dtypes

In [None]:
# Null checking & missing-value analysis
import functools
import operator

from pyspark.sql import functions as F

# Per-column null counts. The Spark functions are referenced through the module alias
# so this cell does not depend on a star import having shadowed the built-in sum().
null_counts = mydata.select([
    F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in mydata.columns
])
null_counts.show()

# Rows that contain a null in at least one column. The previous expression,
# `mydata.col.isNull()`, looked up a column literally named "col" and raised
# AttributeError when the data has no such column.
has_any_null = functools.reduce(
    operator.or_, [F.col(c).isNull() for c in mydata.columns]
)
mydata.filter(has_any_null).show()

# na.drop() and na.fill() return new DataFrames and leave `mydata` untouched, so the
# results are kept under their own names and displayed instead of being discarded.

# Drop rows with any null
mydata_no_nulls = mydata.na.drop()
mydata_no_nulls.show()

# Replace nulls with 0. A numeric fill value is only applied to numeric columns;
# string columns are left unchanged.
mydata_zero_filled = mydata.na.fill(0)
mydata_zero_filled.show()




In [None]:
# Replace nulls only in selected columns: here just `age` gets 0, every other
# column keeps its nulls. The result is a new DataFrame; df2 itself is unchanged.
df_filled = df2.na.fill(0, subset=["age"])
df_filled.show()

In [None]:
# Duplicate detection
# Group on every column so identical rows fall into the same group, count each
# group, and list only the rows that occur more than once.
(
    df.groupBy(df.columns)
    .count()
    .where("count > 1")
    .show()
)

# De-duplicated rows, displayed straight from the expression.
df.drop_duplicates().show()

# The same de-duplication again, this time kept as df_drop for later use.
df_drop = df.drop_duplicates()
df_drop.show()

Data Mining & Feature Engineering

In [None]:
# Filtering and conditions
# DataFrame.filter() (alias: where()) keeps only the rows for which a boolean Column
# expression is true. A bare `filter()` call with no arguments raises TypeError, so
# this cell shows a minimal working example instead: rows whose id is greater than 1.
df.filter(col("id") > 1).show()



Conditions can be chained using `&` (AND), `|` (OR), and `~` (NOT).

In [None]:

from pyspark.sql.functions import col


In [None]:
# Three sample people as (id, name, age) rows.
data = [(1, "Alice", 29), (2, "Bob", 35), (3, "Eve", 25)]
df = spark.createDataFrame(data, schema=["id", "name", "age"])

# OR condition: keep a row when the name is "Eve" or the age is above 30.
# Each side is parenthesised because | binds more tightly than == and >.
filtered_df = df.where((col("name") == "Eve") | (col("age") > 30))



In [None]:
# Print the rows that passed the OR filter built in the previous cell.
filtered_df.show()

In PySpark, conditions are chained with the bitwise operators:
- `&` for AND
- `|` for OR
- `~` for NOT

These are used inside `filter()` or `where()`, and each individual condition must be wrapped in parentheses, because the bitwise operators bind more tightly than comparisons such as `>` or `==`.


In [None]:

from pyspark.sql.functions import col



In [None]:
# Five sample people as (id, name, age) rows.
data = [
    (1, "Alice", 29), (2, "Bob", 35), (3, "Charlie", 30),
    (4, "Diana", 40), (5, "Eve", 25),
]

# Turn the rows into a DataFrame with named columns and preview it.
df = spark.createDataFrame(data, schema=["id", "name", "age"])
df.show()


In [None]:
# Chained filters with &, | and ~
# 1) AND: people older than 30, excluding anyone named Diana.
filtered_df = df.where((col("name") != "Diana") & (col("age") > 30))
filtered_df.show()

# 2) OR combined with NOT: keep a row when the AND condition above holds, or when
#    the age is NOT below 30. Every row matching the first clause also has an age
#    that is not below 30, so the ~ clause alone decides which rows are kept.
filtered_df = df.where(
    (~(col("age") < 30)) | ((col("age") > 30) & (col("name") != "Diana"))
)
filtered_df.show()