#1. PySpark Setup & Initialization
##Exercise 1.1 – Setup Spark:

In [82]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("BotCampus Intermediate Session").master("local[*]").getOrCreate()

##Exercise 1.2 – Load starter data:

In [83]:
data = [("Ananya", "Bangalore", 24),
("Ravi", "Hyderabad", 28),
("Kavya", "Delhi", 22),
("Meena", "Chennai", 25)]
columns = ["name", "city", "age"]
df = spark.createDataFrame(data, columns)
df.show()

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Ananya|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
+------+---------+---+



#2. RDDs & Transformations
##Exercise 2.1 – Create RDD from feedback:

In [84]:
feedback = spark.sparkContext.parallelize([
"Ravi from Bangalore loved the mobile app",
"Meena from Delhi reported poor response time",
"Ajay from Pune liked the delivery speed",
"Ananya from Hyderabad had an issue with UI",
"Rohit from Mumbai gave positive feedback"
])

In [85]:
# Count total number of words.
word_count=feedback.flatMap(lambda line: line.split()).count()
print("Total number of words:", word_count)

Total number of words: 35


In [86]:
# Find top 3 most common words.
word_count = (feedback.flatMap(lambda line: line.split( )).map(lambda w: (w.lower(), 1)).reduceByKey(lambda a, b: a + b))
top3 = word_count.takeOrdered(3, key=lambda x: -x[1])
print("Top 3 xommon words: ", top3)

Top 3 xommon words:  [('from', 5), ('the', 2), ('loved', 1)]


In [87]:
# Remove stop words ( from , with , the , etc.).
stop_words = {"from", "with", "the", "an", "and", "had", "gave"}
filtered_words = feedback.flatMap(lambda line: line.lower().split()).filter(lambda w: w not in stop_words)
print(filtered_words.collect())

['ravi', 'bangalore', 'loved', 'mobile', 'app', 'meena', 'delhi', 'reported', 'poor', 'response', 'time', 'ajay', 'pune', 'liked', 'delivery', 'speed', 'ananya', 'hyderabad', 'issue', 'ui', 'rohit', 'mumbai', 'positive', 'feedback']


In [88]:
# Create a dictionary of word → count.
word_dict = dict(word_count.collect())
print(word_dict)

{'from': 5, 'loved': 1, 'app': 1, 'poor': 1, 'response': 1, 'liked': 1, 'speed': 1, 'ananya': 1, 'an': 1, 'issue': 1, 'with': 1, 'rohit': 1, 'mumbai': 1, 'positive': 1, 'feedback': 1, 'ravi': 1, 'bangalore': 1, 'the': 2, 'mobile': 1, 'meena': 1, 'delhi': 1, 'reported': 1, 'time': 1, 'ajay': 1, 'pune': 1, 'delivery': 1, 'hyderabad': 1, 'had': 1, 'ui': 1, 'gave': 1}


#3. DataFrames – Transformations
##Exercise 3.1 – Create exam_scores DataFrame:

In [89]:
scores = [
("Ravi", "Math", 88),
("Ananya", "Science", 92),
("Kavya", "English", 79),
("Ravi", "English", 67),
("Neha", "Math", 94),
("Meena", "Science", 85)
]
columns = ["name", "subject", "score"]
df_scores = spark.createDataFrame(scores, columns)

In [90]:
# Add grade column ( >=90 → A, 80-89 → B, 70-79 → C, else D).
from pyspark.sql.functions import when
df_scores = df_scores.withColumn("grade", when(df_scores.score >= 90, "A").when((df_scores.score >= 80) & (df_scores.score < 90), "B")
.when((df_scores.score >= 70) & (df_scores.score < 80), "C").otherwise("D"))
df_scores.show()

+------+-------+-----+-----+
|  name|subject|score|grade|
+------+-------+-----+-----+
|  Ravi|   Math|   88|    B|
|Ananya|Science|   92|    A|
| Kavya|English|   79|    C|
|  Ravi|English|   67|    D|
|  Neha|   Math|   94|    A|
| Meena|Science|   85|    B|
+------+-------+-----+-----+



In [91]:
# Group by subject, find average score.
from pyspark.sql.functions import avg
df_scores.groupBy("subject").avg("score").show()

+-------+----------+
|subject|avg(score)|
+-------+----------+
|Science|      88.5|
|   Math|      91.0|
|English|      73.0|
+-------+----------+



In [92]:
# Use when and otherwise to classify subject difficulty ( Math/Science = Difficult).
df_scores = df_scores.withColumn("difficulty", when(df_scores.subject.isin("Math", "Science"), "Difficult").otherwise("Easy"))
df_scores.show()

+------+-------+-----+-----+----------+
|  name|subject|score|grade|difficulty|
+------+-------+-----+-----+----------+
|  Ravi|   Math|   88|    B| Difficult|
|Ananya|Science|   92|    A| Difficult|
| Kavya|English|   79|    C|      Easy|
|  Ravi|English|   67|    D|      Easy|
|  Neha|   Math|   94|    A| Difficult|
| Meena|Science|   85|    B| Difficult|
+------+-------+-----+-----+----------+



In [93]:
# Rank students per subject using Window function.
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
window_spec = Window.partitionBy("subject").orderBy(df_scores.score.desc())
df_scores = df_scores.withColumn("rank", rank().over(window_spec))
df_scores.show()

+------+-------+-----+-----+----------+----+
|  name|subject|score|grade|difficulty|rank|
+------+-------+-----+-----+----------+----+
| Kavya|English|   79|    C|      Easy|   1|
|  Ravi|English|   67|    D|      Easy|   2|
|  Neha|   Math|   94|    A| Difficult|   1|
|  Ravi|   Math|   88|    B| Difficult|   2|
|Ananya|Science|   92|    A| Difficult|   1|
| Meena|Science|   85|    B| Difficult|   2|
+------+-------+-----+-----+----------+----+



In [94]:
# Apply UDF to format names (e.g., make all uppercase).
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
def upper_name(name):
    return name.upper()
upper_udf = udf(upper_name, StringType())
df_scores = df_scores.withColumn("name", upper_udf(df_scores.name))
df_scores.show()

+------+-------+-----+-----+----------+----+
|  name|subject|score|grade|difficulty|rank|
+------+-------+-----+-----+----------+----+
| KAVYA|English|   79|    C|      Easy|   1|
|  RAVI|English|   67|    D|      Easy|   2|
|  NEHA|   Math|   94|    A| Difficult|   1|
|  RAVI|   Math|   88|    B| Difficult|   2|
|ANANYA|Science|   92|    A| Difficult|   1|
| MEENA|Science|   85|    B| Difficult|   2|
+------+-------+-----+-----+----------+----+



#4. Ingest CSV & JSON – Save to Parquet
##Dataset 1: CSV file: students.csv

In [95]:
students_data = """id,name,department,city,salary
1,Amit,IT,Bangalore,78000
2,Kavya,HR,Chennai,62000
3,Arjun,Finance,Hyderabad,55000"""

with open('students.csv', 'w') as f:
  f.write(students_data)



In [96]:
# Load CSV with header
df_csv = spark.read.option("header", True).csv("students.csv", inferSchema=True)
df_csv.printSchema()
df_csv.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: integer (nullable = true)

+---+-----+----------+---------+------+
| id| name|department|     city|salary|
+---+-----+----------+---------+------+
|  1| Amit|        IT|Bangalore| 78000|
|  2|Kavya|        HR|  Chennai| 62000|
|  3|Arjun|   Finance|Hyderabad| 55000|
+---+-----+----------+---------+------+



In [97]:
json_data = """[
  {
    "id": 101,
    "name": "Sneha",
    "address": {
      "city": "Mumbai",
      "pincode": 400001
    },
    "skills": ["Python", "Spark"]
  }
]
"""
with open('employee_nested.json', 'w') as f:
  f.write(json_data)

In [98]:
# Load nested JSON
df_json = spark.read.option("multiline", True).json("employee_nested.json")
df_json.printSchema()
df_json.show()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- pincode: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----------------+---+-----+---------------+
|         address| id| name|         skills|
+----------------+---+-----+---------------+
|{Mumbai, 400001}|101|Sneha|[Python, Spark]|
+----------------+---+-----+---------------+



In [99]:
# Flatten the JSON (use explode , select , alias ).
from pyspark.sql.functions import explode, col
df_flat = df_json.select("id","name",col("address.city").alias("city"),col("address.pincode").alias("pincode"),explode("skills").alias("skill"))
df_flat.show()

+---+-----+------+-------+------+
| id| name|  city|pincode| skill|
+---+-----+------+-------+------+
|101|Sneha|Mumbai| 400001|Python|
|101|Sneha|Mumbai| 400001| Spark|
+---+-----+------+-------+------+



In [100]:
# Convert both to Parquet and write to /tmp/output .
df_csv.write.mode("overwrite").parquet("/tmp/output/students_parquet")
df_flat.write.mode("overwrite").parquet("/tmp/output/employees_parquet")

#5. Spark SQL – Temp Views & Queries
##Exercise 5.1 Create view from exam scores and run:

In [101]:
df_scores.createOrReplaceTempView("exam_scores")

In [102]:
# a) Top scorer per subject
spark.sql("select subject , max(score) as top_score from exam_scores group by subject order by top_score Desc ").show()

+-------+---------+
|subject|top_score|
+-------+---------+
|   Math|       94|
|Science|       92|
|English|       79|
+-------+---------+



In [103]:
# b) Count of students per grade
spark.sql("select grade , count(*) as no_of_students from exam_scores group by grade order by grade ").show()

+-----+--------------+
|grade|no_of_students|
+-----+--------------+
|    A|             2|
|    B|             2|
|    C|             1|
|    D|             1|
+-----+--------------+



In [104]:
# c) Students with multiple subjects
spark.sql("select name , count(distinct subject) as no_of_subjects from exam_scores group by name having no_of_subjects > 1 ").show()

+----+--------------+
|name|no_of_subjects|
+----+--------------+
|RAVI|             2|
+----+--------------+



In [105]:
# d) Subjects with average score above 85
spark.sql("select subject , avg(score) as avg_score from exam_scores group by subject having avg_score > 85 ").show()

+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     88.5|
|   Math|     91.0|
+-------+---------+



##Exercise 5.2 Create another DataFrame attendance(name, days_present) and:

In [106]:
attendance_data = [
    ("RAVI", 18),
    ("ANANYA", 22),
    ("KAVYA", 20),
    ("NEHA", 19),
    ("MEENA", 23)
]
columns = ["name", "days_present"]
df_attendance = spark.createDataFrame(attendance_data, columns)

In [107]:
# Join with scores
df_joined = df_scores.join(df_attendance, on="name", how="left")
df_joined.show()

+------+-------+-----+-----+----------+----+------------+
|  name|subject|score|grade|difficulty|rank|days_present|
+------+-------+-----+-----+----------+----+------------+
|  NEHA|   Math|   94|    A| Difficult|   1|          19|
| MEENA|Science|   85|    B| Difficult|   2|          23|
|  RAVI|English|   67|    D|      Easy|   2|          18|
|  RAVI|   Math|   88|    B| Difficult|   2|          18|
|ANANYA|Science|   92|    A| Difficult|   1|          22|
| KAVYA|English|   79|    C|      Easy|   1|          20|
+------+-------+-----+-----+----------+----+------------+



In [108]:
# Calculate attendance-adjusted grade:
# If days_present < 20 → downgrade grade by one level
df_adjust = df_joined.withColumn("adj_grade", when((df_joined.days_present < 20) & (df_joined.grade == "A"), "B")
.when((df_joined.days_present < 20) & (df_joined.grade == "B"), "C").when((df_joined.days_present < 20) & (df_joined.grade == "C"), "D")
.otherwise(df_joined.grade))

df_adjust.show()


+------+-------+-----+-----+----------+----+------------+---------+
|  name|subject|score|grade|difficulty|rank|days_present|adj_grade|
+------+-------+-----+-----+----------+----+------------+---------+
|  NEHA|   Math|   94|    A| Difficult|   1|          19|        B|
| MEENA|Science|   85|    B| Difficult|   2|          23|        B|
|  RAVI|English|   67|    D|      Easy|   2|          18|        D|
|  RAVI|   Math|   88|    B| Difficult|   2|          18|        C|
|ANANYA|Science|   92|    A| Difficult|   1|          22|        A|
| KAVYA|English|   79|    C|      Easy|   1|          20|        C|
+------+-------+-----+-----+----------+----+------------+---------+



#6. Partitioned Load (Full + Incremental)

In [109]:
# Initial Load:
df_scores.write.partitionBy("subject").parquet("/tmp/scores/", mode = 'overwrite')
incremental = [("Meena", "Math", 93)]
columns = ["name", "subject", "score"]
df_inc = spark.createDataFrame(incremental, columns)
df_inc = df_inc.withColumn("grade", when(df_inc.score >= 90, "A").when((df_inc.score >= 80) & (df_inc.score < 90), "B")
.when((df_inc.score >= 70) & (df_inc.score < 80), "C").otherwise("D"))
df_inc.write.mode("append").partitionBy("subject").parquet("/tmp/scores/")

In [110]:
# List all folders inside /tmp/scores/
df_all = spark.read.parquet("/tmp/scores/")
df_all.show()

+------+-----+-----+----------+----+-------+
|  name|score|grade|difficulty|rank|subject|
+------+-----+-----+----------+----+-------+
|ANANYA|   92|    A| Difficult|   1|Science|
| MEENA|   85|    B| Difficult|   2|Science|
|  NEHA|   94|    A| Difficult|   1|   Math|
|  RAVI|   88|    B| Difficult|   2|   Math|
| KAVYA|   79|    C|      Easy|   1|English|
|  RAVI|   67|    D|      Easy|   2|English|
| Meena|   93|    A|      NULL|NULL|   Math|
+------+-----+-----+----------+----+-------+



In [111]:
# Read only Math partition and display all entries.
df_math = spark.read.parquet("/tmp/scores/subject=Math")
df_math.show()

+-----+-----+-----+----------+----+
| name|score|grade|difficulty|rank|
+-----+-----+-----+----------+----+
| NEHA|   94|    A| Difficult|   1|
| RAVI|   88|    B| Difficult|   2|
|Meena|   93|    A|      NULL|NULL|
+-----+-----+-----+----------+----+



#7. ETL: Clean, Transform, Load

In [112]:
data = """emp_id,name,dept,salary,bonus
1,Arjun,IT,78000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,55000,3000"""
with open('employee_data.csv','w') as f:
  f.write(data)

In [113]:
# Load data with header.
df = spark.read.option("header", True).csv("employee_data.csv", inferSchema=True)
df.show()


+------+-----+-------+------+-----+
|emp_id| name|   dept|salary|bonus|
+------+-----+-------+------+-----+
|     1|Arjun|     IT| 78000| 5000|
|     2|Kavya|     HR| 62000| NULL|
|     3|Sneha|Finance| 55000| 3000|
+------+-----+-------+------+-----+



In [114]:
# Fill missing bonus with 2000.
df = df.fillna({"bonus" : 2000})
df.show()

+------+-----+-------+------+-----+
|emp_id| name|   dept|salary|bonus|
+------+-----+-------+------+-----+
|     1|Arjun|     IT| 78000| 5000|
|     2|Kavya|     HR| 62000| 2000|
|     3|Sneha|Finance| 55000| 3000|
+------+-----+-------+------+-----+



In [115]:
# Calculate total_ctc = salary + bonus .
from pyspark.sql.functions import col
df = df.withColumn('total_ctc', col('salary') + col('bonus'))
df.show()

+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 78000| 5000|    83000|
|     2|Kavya|     HR| 62000| 2000|    64000|
|     3|Sneha|Finance| 55000| 3000|    58000|
+------+-----+-------+------+-----+---------+



In [116]:
# Filter where total_ctc > 60,000.
df_filter = df.filter(col("total_ctc") > 60000)
df_filter.show()

+------+-----+----+------+-----+---------+
|emp_id| name|dept|salary|bonus|total_ctc|
+------+-----+----+------+-----+---------+
|     1|Arjun|  IT| 78000| 5000|    83000|
|     2|Kavya|  HR| 62000| 2000|    64000|
+------+-----+----+------+-----+---------+



In [117]:
# Save final DataFrame to Parquet and JSON.
# i am saving the filterd data
df_filter.write.mode("overwrite").parquet("/tmp/final_employees_parquet")
df_filter.write.mode("overwrite").json("/tmp/final_employees_json")