In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, max, min, round, count

# Step 1: Obtain the Spark session for this notebook.
# getOrCreate() hands back an already-running session if one exists,
# so re-running this cell does not start a second one.
spark = (
    SparkSession.builder
    .appName("StudentsAnalytics")
    .getOrCreate()
)

In [None]:
# Step 2: Load students.csv into a DataFrame.
# header=True      -> the first line supplies the column names
# inferSchema=True -> Spark derives column types from the data
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("students.csv")
)

In [None]:
# 1. Preview the first 5 rows of the loaded DataFrame
print("=== First 5 rows ===")
df.show(n=5)

In [None]:
# 2. Print schema
# Lists each column of df with its data type and nullability. When df comes
# from the CSV load with inferSchema=True, these are the types Spark inferred
# from the data, so this is the place to confirm the mark columns are numeric.
print("=== Schema ===")
df.printSchema()

In [None]:
# 3. Count total rows
# count() is a Spark action: it runs a job over the data and returns the
# number of rows as a Python int, which print() then displays.
print("Total rows:", df.count())

In [None]:
# 4. Show summary statistics
# describe() with no arguments reports count, mean, stddev, min and max for
# every numeric AND string column, so text columns also appear in the table
# (their mean/stddev are shown as null).
print("=== Summary Statistics ===")
df.describe().show()

In [None]:
# 5. List students scoring at least 80 in math (id, name and math mark only);
#    at most 10 matching rows are displayed.
print("=== Students with math >= 80 ===")
(
    df
    .where(col("math") >= 80)
    .select("id", "name", "math")
    .show(n=10)
)

In [None]:
# 6. Mean mark per subject across all students, rounded to 2 decimals.
#    Note: round and avg are the pyspark.sql.functions versions imported at
#    the top of the notebook (round shadows the Python builtin).
print("=== Average marks per subject ===")
df.select(
    *[
        round(avg(subject), 2).alias(f"avg_{subject}")
        for subject in ("math", "science", "english")
    ]
).show()

In [None]:
# 7. Derive a per-student "average" column: the mean of the math, science and
#    english marks, rounded to 2 decimals. round here is
#    pyspark.sql.functions.round (imported at the top), not the builtin.
df_with_avg = df.withColumn(
    "average",
    round(
        (col("math") + col("science") + col("english")) / 3,
        2,
    ),
)
print("=== Dataset with 'average' column ===")
df_with_avg.show(n=5)

In [None]:
# 8. Find topper(s): every student whose average equals the highest average.
#    Sorting by average and taking limit(1) would show only one row even when
#    several students tie for first place, and which of the tied rows is
#    returned is not guaranteed. Instead, compute the maximum average once and
#    keep all rows that match it.
print("=== Topper ===")
# Single-row DataFrame holding the best average; max is
# pyspark.sql.functions.max (imported at the top), not the builtin.
top_average = df_with_avg.agg(max("average").alias("top_average"))
# left_semi keeps only df_with_avg's columns (in their original order) for the
# rows that have a match, so the output has the same shape as before.
df_with_avg.join(
    top_average,
    df_with_avg["average"] == top_average["top_average"],
    "left_semi",
).show()

In [None]:
# 9. Per-gender means: one row per distinct gender value with the mean of each
#    subject and of the derived "average" column, all rounded to 2 decimals.
print("=== Average marks by gender ===")
(
    df_with_avg
    .groupBy("gender")
    .agg(
        round(avg("math"), 2).alias("avg_math"),
        round(avg("science"), 2).alias("avg_science"),
        round(avg("english"), 2).alias("avg_english"),
        round(avg("average"), 2).alias("overall_avg"),
    )
    .show()
)

In [None]:
# 10. Lowest and highest mark for each subject, as a single-row table.
#     min and max are the pyspark.sql.functions aggregates imported at the
#     top of the notebook (they shadow the Python builtins).
print("=== Min & Max of each subject ===")
df.agg(
    min("math").alias("min_math"),
    max("math").alias("max_math"),
    min("science").alias("min_science"),
    max("science").alias("max_science"),
    min("english").alias("min_english"),
    max("english").alias("max_english"),
).show()

In [None]:
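# Stop Spark session
# (the call below is left commented out; uncomment it to stop the session
#  once all cells above have been run)
# spark.stop()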