In [2]:
# PySpark Basics Tutorial for Students

# 1. Setup & SparkSession
# Every later cell talks to Spark through this single `spark` handle.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [3]:
# 2. Create DataFrame
# Build a small in-memory DataFrame. The None entries show up as NULL in the
# table, which gives the missing-value examples further down something to fix.
columns = ["Name", "Age", "Gender"]
data = [
    ("Ali", 25, "M"),
    ("Sara", 30, "F"),
    ("John", 22, "M"),
    ("Emma", None, "F"),
    ("Mike", 28, None),
    ("Lily", None, "F"),
    ("Tom", 35, "M"),
    (None, 40, "M"),
    ("Anna", 27, None),
    ("David", None, None),
]
df = spark.createDataFrame(data, schema=columns)
df.show()

+-----+----+------+
| Name| Age|Gender|
+-----+----+------+
|  Ali|  25|     M|
| Sara|  30|     F|
| John|  22|     M|
| Emma|NULL|     F|
| Mike|  28|  NULL|
| Lily|NULL|     F|
|  Tom|  35|     M|
| NULL|  40|     M|
| Anna|  27|  NULL|
|David|NULL|  NULL|
+-----+----+------+



In [4]:
# 3. Select Columns
# Keep only the Name and Age columns, then print the result.
name_and_age = df.select(df["Name"], df["Age"])
name_and_age.show()

+-----+----+
| Name| Age|
+-----+----+
|  Ali|  25|
| Sara|  30|
| John|  22|
| Emma|NULL|
| Mike|  28|
| Lily|NULL|
|  Tom|  35|
| NULL|  40|
| Anna|  27|
|David|NULL|
+-----+----+



In [10]:
# 4. Filter Rows
from pyspark.sql.functions import col

# Each condition is a Column expression; combine them with & (AND) and | (OR).
# The parentheses around each comparison are required because & and | bind
# tighter than the comparison operators in Python.
older_than_25 = col("Age") > 25
over_20_and_female = (col("Age") > 20) & (col("Gender") == "F")
under_30_or_male = (col("Age") < 30) | (col("Gender") == "M")

df.filter(older_than_25).show()        # Age > 25
df.filter(over_20_and_female).show()   # Age > 20 AND Gender == F
df.filter(under_30_or_male).show()     # Age < 30 OR Gender == M


+----+------------+------+
|Name|Age_original|Gender|
+----+------------+------+
|Sara|          30|     F|
|Mike|          28|  NULL|
| Tom|          35|     M|
|NULL|          40|     M|
|Anna|          27|  NULL|
+----+------------+------+

+----+------------+------+
|Name|Age_original|Gender|
+----+------------+------+
|Sara|          30|     F|
+----+------------+------+

+----+------------+------+
|Name|Age_original|Gender|
+----+------------+------+
| Ali|          25|     M|
|John|          22|     M|
|Mike|          28|  NULL|
| Tom|          35|     M|
|NULL|          40|     M|
|Anna|          27|  NULL|
+----+------------+------+



In [6]:
# 5. Add, Rename, Drop Columns
# This cell rebinds `df`, so after its first run the "Age" column no longer
# exists under that name. The guard keeps the cell safe to re-run: without it,
# a second execution fails because col("Age") cannot be resolved any more.
if "Age" in df.columns:
    df = (
        df.withColumn("Age_plus_5", col("Age") + 5)   # add a derived column
        .withColumnRenamed("Age", "Age_original")     # rename an existing column
        .drop("Age_plus_5")                           # drop the derived column again
    )
df.show()

+-----+------------+------+
| Name|Age_original|Gender|
+-----+------------+------+
|  Ali|          25|     M|
| Sara|          30|     F|
| John|          22|     M|
| Emma|        NULL|     F|
| Mike|          28|  NULL|
| Lily|        NULL|     F|
|  Tom|          35|     M|
| NULL|          40|     M|
| Anna|          27|  NULL|
|David|        NULL|  NULL|
+-----+------------+------+



In [13]:
# Count the NULLs in every column: isNull() gives a boolean per row, the cast
# turns it into 0/1, and the aggregate sum adds those up per column.
#
# `sum` here has to be Spark's aggregate function, not Python's builtin (the
# builtin tries to iterate over the Column and raises TypeError). It is
# imported under an alias inside this cell so the cell works on a fresh kernel
# and the builtin `sum` stays untouched.
from pyspark.sql.functions import col, sum as spark_sum

missing_values_count = df.select(
    [spark_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]
)
missing_values_count.show()

+----+------------+------+
|Name|Age_original|Gender|
+----+------------+------+
|   1|           3|     3|
+----+------------+------+



In [21]:
# 6. Handle Missing Values
# Every step below starts from `df` (or from the previous step's result) and
# stores its result under a new name, so `df` itself is never modified here.

# Spark's aggregate `sum` is imported under an alias: Python's builtin `sum`
# cannot aggregate a Column (it raises TypeError), and aliasing avoids
# shadowing the builtin for the rest of the notebook.
from pyspark.sql.functions import col, mean, count, when, sum as spark_sum


# 6-1) Check missing values: one NULL count per column
missing_values_count = df.select(
    [spark_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]
)
missing_values_count.show()


# 6-2) Drop rows with any missing values
df_drop = df.dropna()
print("After dropping missing rows:")
df_drop.show()

# 6-3) Fill missing numeric values with mean
# The mean is computed over the non-NULL ages only and comes back as a float.
# Age_original holds whole numbers, so the filled cells show the truncated
# value (29) rather than the printed mean (29.57...).
mean_age = df.select(mean("Age_original")).first()[0]
print("Mean of Age:", mean_age)
df_fill_numeric = df.fillna({"Age_original": mean_age})
print("After filling numeric missing values with mean:")
df_fill_numeric.show()

# 6-4) Fill missing categorical values with a suitable value
# Here we can use 'Unknown' for missing names and gender
df_fill_all = df_fill_numeric.fillna({"Name": "Unknown", "Gender": "Unknown"})
print("After filling categorical missing values:")
df_fill_all.show()

# 6-5) Alternative simple method: fill all missing values with constants
# The dict maps each column name to the constant used for its NULLs.
df_fill_const = df.fillna({"Name": "NoName", "Age_original": 0, "Gender": "U"})
print("After filling all missing with constants:")
df_fill_const.show()


+----+------------+------+
|Name|Age_original|Gender|
+----+------------+------+
|   1|           3|     3|
+----+------------+------+

After dropping missing rows:
+----+------------+------+
|Name|Age_original|Gender|
+----+------------+------+
| Ali|          25|     M|
|Sara|          30|     F|
|John|          22|     M|
| Tom|          35|     M|
+----+------------+------+

Mean of Age: 29.571428571428573
After filling numeric missing values with mean:
+-----+------------+------+
| Name|Age_original|Gender|
+-----+------------+------+
|  Ali|          25|     M|
| Sara|          30|     F|
| John|          22|     M|
| Emma|          29|     F|
| Mike|          28|  NULL|
| Lily|          29|     F|
|  Tom|          35|     M|
| NULL|          40|     M|
| Anna|          27|  NULL|
|David|          29|  NULL|
+-----+------------+------+

After filling categorical missing values:
+-------+------------+-------+
|   Name|Age_original| Gender|
+-------+------------+-------+
|    Ali| 

In [24]:
# 7. Sort
# orderBy() is controlled by the `ascending` keyword; it has no `descending`
# keyword, and an unrecognised keyword is silently ignored. The previous
# `descending=False` therefore had no effect and the rows were sorted
# ascending only because that is the default.
# Use ascending=False for a descending sort. In ascending order the NULL ages
# are listed first.
df.orderBy("Age_original", ascending=True).show()

+-----+------------+------+
| Name|Age_original|Gender|
+-----+------------+------+
| Lily|        NULL|     F|
| Emma|        NULL|     F|
|David|        NULL|  NULL|
| John|          22|     M|
|  Ali|          25|     M|
| Anna|          27|  NULL|
| Mike|          28|  NULL|
| Sara|          30|     F|
|  Tom|          35|     M|
| NULL|          40|     M|
+-----+------------+------+



In [33]:
# 8. GroupBy & Aggregate
# NOTE: from this cell onwards the names sum, min and max in the notebook
# refer to the Spark column functions imported here, not the Python builtins.
from pyspark.sql.functions import avg, sum, min, max, count, countDistinct

columns2 = ["Subject", "Name", "Score"]
data2 = [
    ("Math", "Ali", 80),
    ("Math", "Sara", 90),
    ("English", "Ali", 85),
    ("English", "Sara", 95),
    ("Math", "Mike", 70),
    ("English", "Emma", 88),
]
df2 = spark.createDataFrame(data2, schema=columns2)

# GroupBy with an agg dictionary of the form {column name: aggregate name}
agg_dict = {"Score": "avg"}    # "avg" can be replaced with min, max, count
scores_by_subject = df2.groupBy("Subject")
scores_by_subject.agg(agg_dict).show()


+-------+-----------------+
|Subject|       avg(Score)|
+-------+-----------------+
|   Math|             80.0|
|English|89.33333333333333|
+-------+-----------------+



In [40]:
# 9. Joins
# Two small tables that share the "Name" column but not all of its values:
# Sara appears only in df1, John only in df2.
df1 = spark.createDataFrame(
    [("Ali", 25), ("Sara", 30), ("Mike", 28)],
    schema=["Name", "Age"],
)
df1.show()
df2 = spark.createDataFrame(
    [("Ali", "M"), ("John", "M"), ("Mike", "M")],
    schema=["Name", "Gender"],
)
df2.show()

# Run the same join on "Name" once per join type:
#   inner - only names present in both tables
#   left  - every row of df1, NULL Gender where df2 has no match
#   right - every row of df2, NULL Age where df1 has no match
for heading, join_type in (
    ("Inner Join:", "inner"),
    ("Left Join:", "left"),
    ("Right Join:", "right"),
):
    print(heading)
    df1.join(df2, on="Name", how=join_type).show()



+----+---+
|Name|Age|
+----+---+
| Ali| 25|
|Sara| 30|
|Mike| 28|
+----+---+

+----+------+
|Name|Gender|
+----+------+
| Ali|     M|
|John|     M|
|Mike|     M|
+----+------+

Inner Join:
+----+---+------+
|Name|Age|Gender|
+----+---+------+
| Ali| 25|     M|
|Mike| 28|     M|
+----+---+------+

Left Join:
+----+---+------+
|Name|Age|Gender|
+----+---+------+
| Ali| 25|     M|
|Mike| 28|     M|
|Sara| 30|  NULL|
+----+---+------+

Right Join:
+----+----+------+
|Name| Age|Gender|
+----+----+------+
| Ali|  25|     M|
|John|NULL|     M|
|Mike|  28|     M|
+----+----+------+



# **Exercise**

# 10. Exercise for Students
# 1. Manually create a CSV file with the columns CustomerID, Product, Quantity, Price, then read the file into a DataFrame
# 2. Filter customers who bought more than 5 items
# 3. Add a column "TotalCost" = Quantity * Price
# 4. Handle missing Quantity and Price values by filling them with the average, and with a constant
# 5. Group by Product and calculate avg, sum, min, max, count of TotalCost
# 6. Join (inner, left, and  right) with another CSV of Product details (ProductID, Category)