#1. PySpark Setup & Initialization

Exercise 1.1 – Setup Spark:

In [1]:
from pyspark.sql import SparkSession
# Create the notebook-wide SparkSession, or reuse one that already exists.
# The parenthesised chain replaces fragile backslash line continuations.
spark = (
    SparkSession.builder
    .appName("BotCampus Intermediate Session")
    .master("local[*]")
    .getOrCreate()
)

Exercise 1.2 – Load starter data:

In [2]:
# Starter dataset: one (name, city, age) tuple per person.
columns = ["name", "city", "age"]
data = [
    ("Ananya", "Bangalore", 24),
    ("Ravi", "Hyderabad", 28),
    ("Kavya", "Delhi", 22),
    ("Meena", "Chennai", 25),
]

# Column types are inferred from the Python values; only the names are given.
df = spark.createDataFrame(data, schema=columns)
df.show()

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Ananya|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
+------+---------+---+



#2. RDDs & Transformations


Exercise 2.1 – Create RDD from feedback:

In [4]:
# Raw customer feedback, one free-text sentence per element.
feedback_lines = [
    "Ravi from Bangalore loved the mobile app",
    "Meena from Delhi reported poor response time",
    "Ajay from Pune liked the delivery speed",
    "Ananya from Hyderabad had an issue with UI",
    "Rohit from Mumbai gave positive feedback",
]

# Distribute the sentences as an RDD so the word-level tasks can run on Spark.
feedback = spark.sparkContext.parallelize(feedback_lines)

Tasks:
Count total number of words.


In [7]:
# Split every sentence on whitespace, flatten into one RDD of tokens,
# then count the tokens.
all_words = feedback.flatMap(lambda sentence: sentence.split())
word_count = all_words.count()
print(word_count)

35


Find top 3 most common words.


In [6]:
# Case-insensitive word frequencies: (lower-cased word, 1) pairs summed per word.
word_pairs = feedback.flatMap(lambda sentence: sentence.split()).map(
    lambda token: (token.lower(), 1)
)
word_totals = word_pairs.reduceByKey(lambda left, right: left + right)

# Negating the count makes takeOrdered return the largest counts first.
# Only the count is in the sort key, so this code does not pin down which of
# several equally frequent words fills a tied slot.
top_3_words = word_totals.takeOrdered(3, key=lambda pair: -pair[1])
print(top_3_words)


[('from', 5), ('the', 2), ('loved', 1)]


Remove stop words (from, with, the, etc.).


In [9]:
# Words carrying no signal for the feedback analysis; all lower-case because
# tokens are lower-cased before the membership test.
stop_words = {"from", "with", "the", "an", "and", "had", "of", "a", "to"}

# Tokenise, normalise case, then drop the stop words.
lowercased_words = feedback.flatMap(lambda sentence: sentence.split()).map(
    lambda token: token.lower()
)
filtered_words = lowercased_words.filter(lambda token: token not in stop_words)



Create a dictionary of word → count

In [10]:
# Sum a 1 per occurrence for each remaining word, then pull the result back
# to the driver as a plain {word: count} dict.
filtered_pairs = filtered_words.map(lambda token: (token, 1))
word_count_dict = filtered_pairs.reduceByKey(
    lambda left, right: left + right
).collectAsMap()

print(word_count_dict)


{'loved': 1, 'app': 1, 'poor': 1, 'response': 1, 'liked': 1, 'speed': 1, 'ananya': 1, 'issue': 1, 'rohit': 1, 'mumbai': 1, 'positive': 1, 'feedback': 1, 'ravi': 1, 'bangalore': 1, 'mobile': 1, 'meena': 1, 'delhi': 1, 'reported': 1, 'time': 1, 'ajay': 1, 'pune': 1, 'delivery': 1, 'hyderabad': 1, 'ui': 1, 'gave': 1}


#3. DataFrames – Transformations

Exercise 3.1 – Create exam_scores DataFrame:

In [11]:
# Exam results: one (name, subject, score) row per student per subject.
# A student can appear more than once (Ravi sat both Math and English).
columns = ["name", "subject", "score"]
scores = [
    ("Ravi", "Math", 88),
    ("Ananya", "Science", 92),
    ("Kavya", "English", 79),
    ("Ravi", "English", 67),
    ("Neha", "Math", 94),
    ("Meena", "Science", 85),
]
df_scores = spark.createDataFrame(scores, schema=columns)

Tasks:
Add a grade column (>=90 → A, 80-89 → B, 70-79 → C, else D).


In [14]:
from pyspark.sql.functions import when,col
# Letter grade from the numeric score. The branches are evaluated top-down,
# so each threshold only sees scores that failed the ones above it.
score_col = col("score")
grade_expr = (
    when(score_col >= 90, "A")
    .when(score_col >= 80, "B")
    .when(score_col >= 70, "C")
    .otherwise("D")
)

df_with_grade = df_scores.withColumn("grade", grade_expr)
df_with_grade.show()


+------+-------+-----+-----+
|  name|subject|score|grade|
+------+-------+-----+-----+
|  Ravi|   Math|   88|    B|
|Ananya|Science|   92|    A|
| Kavya|English|   79|    C|
|  Ravi|English|   67|    D|
|  Neha|   Math|   94|    A|
| Meena|Science|   85|    B|
+------+-------+-----+-----+



Group by subject, find average score.


In [13]:
# Mean score per subject. The aggregate column comes back named "avg(score)",
# so rename it to something usable downstream.
scores_by_subject = df_scores.groupBy("subject")
df_avg_subject = scores_by_subject.avg("score").withColumnRenamed(
    "avg(score)", "avg_score"
)
df_avg_subject.show()


+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     88.5|
|   Math|     91.0|
|English|     73.0|
+-------+---------+



Use `when` and `otherwise` to classify subject difficulty (Math/Science = Difficult).


In [15]:
# Math and Science are labelled "Difficult"; every other subject is "Easy".
is_difficult_subject = col("subject").isin("Math", "Science")
difficulty_expr = when(is_difficult_subject, "Difficult").otherwise("Easy")

df_with_difficulty = df_with_grade.withColumn("difficulty", difficulty_expr)
df_with_difficulty.show()

+------+-------+-----+-----+----------+
|  name|subject|score|grade|difficulty|
+------+-------+-----+-----+----------+
|  Ravi|   Math|   88|    B| Difficult|
|Ananya|Science|   92|    A| Difficult|
| Kavya|English|   79|    C|      Easy|
|  Ravi|English|   67|    D|      Easy|
|  Neha|   Math|   94|    A| Difficult|
| Meena|Science|   85|    B| Difficult|
+------+-------+-----+-----+----------+



Rank students per subject using Window function.


In [16]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
# Rank within each subject, highest score first.
highest_score_first = col("score").desc()
window_spec = Window.partitionBy("subject").orderBy(highest_score_first)

rank_in_subject = rank().over(window_spec)
df_ranked = df_with_difficulty.withColumn("rank", rank_in_subject)
df_ranked.show()


+------+-------+-----+-----+----------+----+
|  name|subject|score|grade|difficulty|rank|
+------+-------+-----+-----+----------+----+
| Kavya|English|   79|    C|      Easy|   1|
|  Ravi|English|   67|    D|      Easy|   2|
|  Neha|   Math|   94|    A| Difficult|   1|
|  Ravi|   Math|   88|    B| Difficult|   2|
|Ananya|Science|   92|    A| Difficult|   1|
| Meena|Science|   85|    B| Difficult|   2|
+------+-------+-----+-----+----------+----+



Apply UDF to format names (e.g., make all uppercase).

In [19]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
def format_name(name):
    """Return ``name`` converted to upper case.

    Used as the body of a Spark UDF, where a null column value arrives as
    ``None``. ``None`` is passed through unchanged so a single null name
    yields a null result instead of raising ``AttributeError`` and failing
    the whole job.

    Args:
        name: The name to format, or ``None``.

    Returns:
        The upper-cased name, or ``None`` when ``name`` is ``None``.
    """
    if name is None:
        return None
    return name.upper()

# Wrap the Python formatter as a Spark UDF that returns a string column.
uppercase_udf = udf(format_name, returnType=StringType())

# Add the formatted name, then show it in place of the raw name column.
formatted_name_col = uppercase_udf(col("name"))
df_final = df_ranked.withColumn("formatted_name", formatted_name_col)

report_columns = ["formatted_name", "subject", "score", "grade", "difficulty", "rank"]
df_final.select(*report_columns).show()


+--------------+-------+-----+-----+----------+----+
|formatted_name|subject|score|grade|difficulty|rank|
+--------------+-------+-----+-----+----------+----+
|         KAVYA|English|   79|    C|      Easy|   1|
|          RAVI|English|   67|    D|      Easy|   2|
|          NEHA|   Math|   94|    A| Difficult|   1|
|          RAVI|   Math|   88|    B| Difficult|   2|
|        ANANYA|Science|   92|    A| Difficult|   1|
|         MEENA|Science|   85|    B| Difficult|   2|
+--------------+-------+-----+-----+----------+----+



#4. Ingest CSV & JSON – Save to Parquet


Dataset 1: CSV file: students.csv

Dataset 2: JSON file employee_nested.json

In [20]:
from google.colab import files
# Colab upload prompt. The recorded output shows students.csv and
# employee_nested.json being saved here; the following cells read both files
# by those relative names.
uploaded = files.upload()

Saving students.csv to students.csv
Saving employee_nested.json to employee_nested.json


In [21]:
# Read the uploaded CSV: the first row supplies the column names and Spark
# samples the data to infer column types instead of reading all as strings.
students_df = spark.read.csv("students.csv", header=True, inferSchema=True)

students_df.printSchema()
students_df.show()


root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: integer (nullable = true)

+---+-----+----------+---------+------+
| id| name|department|     city|salary|
+---+-----+----------+---------+------+
|  1| Amit|        IT|Bangalore| 78000|
|  2|Kavya|        HR|  Chennai| 62000|
|  3|Arjun|   Finance|Hyderabad| 55000|
+---+-----+----------+---------+------+



In [22]:
# The file holds a JSON document spread over several lines, so multiline
# parsing is enabled rather than the one-record-per-line default.
multiline_reader = spark.read.option("multiline", True)
employee_df = multiline_reader.json("employee_nested.json")

employee_df.printSchema()
# truncate=False keeps the nested struct and array values fully visible.
employee_df.show(truncate=False)


root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- pincode: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----------------+---+-----+---------------+
|address         |id |name |skills         |
+----------------+---+-----+---------------+
|{Mumbai, 400001}|101|Sneha|[Python, Spark]|
+----------------+---+-----+---------------+



In [23]:
from pyspark.sql.functions import col, explode, explode_outer

# Flatten the nested employee record:
#   * address.city / address.pincode are lifted out of the struct;
#   * skills (an array) becomes one output row per skill.
# explode_outer is used instead of explode so that an employee whose skills
# array is null or empty is kept (with skill = null) instead of being
# silently dropped from the flattened dataset written to Parquet.
employee_flat = employee_df.select(
    col("id"),
    col("name"),
    col("address.city").alias("city"),
    col("address.pincode").alias("pincode"),
    explode_outer(col("skills")).alias("skill"),
)
employee_flat.show()


+---+-----+------+-------+------+
| id| name|  city|pincode| skill|
+---+-----+------+-------+------+
|101|Sneha|Mumbai| 400001|Python|
|101|Sneha|Mumbai| 400001| Spark|
+---+-----+------+-------+------+



In [24]:
# Persist the students table as Parquet; "overwrite" replaces any earlier
# output so the cell can be re-run safely.
students_df.write.parquet("/tmp/output/students_parquet", mode="overwrite")


In [25]:
# Persist the flattened employee rows as Parquet; "overwrite" replaces any
# earlier output so the cell can be re-run safely.
employee_flat.write.parquet("/tmp/output/employees_parquet", mode="overwrite")


#5. Spark SQL – Temp Views & Queries

Exercise 5.1 Create view from exam scores and run:
-- a) Top scorer per subject
-- b) Count of students per grade
-- c) Students with multiple subjects
-- d) Subjects with average score above 85

In [26]:
# Register the graded exam scores as the temp view "exam_scores" so the
# spark.sql queries in this section can select from it by name.
df_with_grade.createOrReplaceTempView("exam_scores")


In [30]:
# a) Top scorer per subject: rank rows within each subject by score
# (highest first) and keep rank 1. Tied top scores would all be returned.
top_scorer_sql = """
select subject, name, score
from (
  select *, rank() over (partition by subject order by score desc) as rnk
  from exam_scores
)
where rnk = 1
"""
spark.sql(top_scorer_sql).show()



+-------+------+-----+
|subject|  name|score|
+-------+------+-----+
|English| Kavya|   79|
|   Math|  Neha|   94|
|Science|Ananya|   92|
+-------+------+-----+



In [31]:
# b) Number of exam_scores rows per grade. A student with several subjects
# contributes one row per subject.
grade_count_sql = """
select grade, count(*) as student_count
from exam_scores
group by grade
"""
spark.sql(grade_count_sql).show()


+-----+-------------+
|grade|student_count|
+-----+-------------+
|    B|            2|
|    C|            1|
|    A|            2|
|    D|            1|
+-----+-------------+



In [32]:
# c) Students appearing with more than one subject.
multi_subject_sql = """
select name, count(subject) as subject_count
from exam_scores
group by name
having count (subject) > 1
"""
spark.sql(multi_subject_sql).show()


+----+-------------+
|name|subject_count|
+----+-------------+
|Ravi|            2|
+----+-------------+



In [33]:
# d) Subjects whose average score is strictly above 85.
high_avg_subject_sql = """
select subject, AVG(score) as avg_score
from exam_scores
group by subject
having AVG(score) > 85
"""
spark.sql(high_avg_subject_sql).show()


+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     88.5|
|   Math|     91.0|
+-------+---------+



Exercise 5.2 Create another DataFrame attendance(name, days_present) and:

Join with scores
Calculate attendance-adjusted grade:
If days_present < 20 → downgrade grade by one level

In [34]:
# Attendance: one (name, days_present) row per student.
attendance_columns = ["name", "days_present"]
attendance_data = [
    ("Ravi", 18),
    ("Ananya", 22),
    ("Kavya", 25),
    ("Neha", 19),
    ("Meena", 23),
]
attendance_df = spark.createDataFrame(attendance_data, schema=attendance_columns)


In [39]:
# Left join keeps every graded exam row even if a student has no attendance
# record (days_present would then be null).
joined_df = df_with_grade.join(attendance_df, "name", "left")
joined_df.show()


+------+-------+-----+-----+------------+
|  name|subject|score|grade|days_present|
+------+-------+-----+-----+------------+
|Ananya|Science|   92|    A|          22|
|  Ravi|   Math|   88|    B|          18|
| Kavya|English|   79|    C|          25|
|  Ravi|English|   67|    D|          18|
|  Neha|   Math|   94|    A|          19|
| Meena|Science|   85|    B|          23|
+------+-------+-----+-----+------------+



In [40]:
from pyspark.sql.functions import col, when
# Attendance-adjusted grade: fewer than 20 days present lowers the grade by
# one level (A→B, B→C, C→D, D→F); otherwise the grade is kept unchanged.
grade_col = col("grade")
one_level_down = (
    when(grade_col == "A", "B")
    .when(grade_col == "B", "C")
    .when(grade_col == "C", "D")
    .when(grade_col == "D", "F")
    .otherwise(grade_col)
)
low_attendance = col("days_present") < 20

adjusted_df = joined_df.withColumn(
    "adj_grade",
    when(low_attendance, one_level_down).otherwise(grade_col),
)

adjusted_columns = ["name", "subject", "score", "grade", "days_present", "adj_grade"]
adjusted_df.select(*adjusted_columns).show()

+------+-------+-----+-----+------------+---------+
|  name|subject|score|grade|days_present|adj_grade|
+------+-------+-----+-----+------------+---------+
|Ananya|Science|   92|    A|          22|        A|
|  Ravi|   Math|   88|    B|          18|        C|
| Kavya|English|   79|    C|          25|        C|
|  Ravi|English|   67|    D|          18|        F|
|  Neha|   Math|   94|    A|          19|        B|
| Meena|Science|   85|    B|          23|        B|
+------+-------+-----+-----+------------+---------+



#6. Partitioned Load (Full + Incremental)

In [41]:
# Full load: write all scores, one sub-directory per subject.
# mode("overwrite") is required for re-runs. Without an explicit mode Spark
# uses error-if-exists, so a second execution of this cell (for example
# Restart & Run All) fails because /tmp/scores/ is already there. Overwriting
# also resets the data before the incremental append in the next cell.
df_scores.write.mode("overwrite").partitionBy("subject").parquet("/tmp/scores/")

In [42]:
# Incremental load: append one new score to the existing partitioned dataset.
incremental = [("Meena", "Math", 93)]

# Reuse df_scores' own schema rather than the global `columns` list. That name
# is bound to different column sets in different cells, so depending on
# execution order it could label these values with the wrong columns. The
# shared schema also guarantees appended rows match the full-load types.
df_inc = spark.createDataFrame(incremental, df_scores.schema)

# Note: append is not idempotent; re-running only this cell adds the row again.
df_inc.write.mode("append").partitionBy("subject").parquet("/tmp/scores/")

In [None]:
import os

# List the partition folders Spark created under the output root.
# os.listdir alone also returns Spark's marker files (such as _SUCCESS and
# its checksum file), which are not partitions. Keep only directories, and
# sort them so the printed order is stable between runs.
scores_root = "/tmp/scores/"
partitions = sorted(
    entry
    for entry in os.listdir(scores_root)
    if os.path.isdir(os.path.join(scores_root, entry))
)
print("Partitions folders inside /tmp/scores/:")
for folder in partitions:
    print(folder)


In [44]:
# Read only the Math partition by pointing at its directory. As the recorded
# output shows, the result has just name and score: the subject value lives
# in the directory name, not in the files.
math_partition_path = "/tmp/scores/subject=Math"
math_df = spark.read.parquet(math_partition_path)
math_df.show()


+-----+-----+
| name|score|
+-----+-----+
|Meena|   93|
| Ravi|   88|
| Neha|   94|
+-----+-----+



#7. ETL: Clean, Transform, Load

In [45]:
from google.colab import files
# Colab upload prompt. The recorded output shows raw_data.csv being saved
# here; the next cell reads it by that relative name.
uploaded = files.upload()

Saving raw_data.csv to raw_data.csv


In [46]:
# Extract: load the raw employee CSV with a header row and inferred types.
# This rebinds `df`, which earlier in the notebook held the starter
# name/city/age DataFrame.
df = spark.read.csv("raw_data.csv", header=True, inferSchema=True)
df.show()


+------+-----+-------+------+-----+
|emp_id| name|   dept|salary|bonus|
+------+-----+-------+------+-----+
|     1|Arjun|     IT| 78000| 5000|
|     2|Kavya|     HR| 62000| NULL|
|     3|Sneha|Finance| 55000| 3000|
+------+-----+-------+------+-----+



In [48]:
# Clean: employees with no recorded bonus get a default of 2000.
# Only the bonus column is touched.
bonus_defaults = {'bonus': 2000}
df_filled = df.fillna(bonus_defaults)
df_filled.show()


+------+-----+-------+------+-----+
|emp_id| name|   dept|salary|bonus|
+------+-----+-------+------+-----+
|     1|Arjun|     IT| 78000| 5000|
|     2|Kavya|     HR| 62000| 2000|
|     3|Sneha|Finance| 55000| 3000|
+------+-----+-------+------+-----+



In [49]:
# Transform: total cost-to-company is salary plus the (already filled) bonus.
total_ctc_expr = df_filled["salary"] + df_filled["bonus"]
df_ctc = df_filled.withColumn("total_ctc", total_ctc_expr)
df_ctc.show()


+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 78000| 5000|    83000|
|     2|Kavya|     HR| 62000| 2000|    64000|
|     3|Sneha|Finance| 55000| 3000|    58000|
+------+-----+-------+------+-----+---------+



In [50]:
# Keep only employees whose total CTC is strictly above the threshold.
min_total_ctc = 60000
df_filtered = df_ctc.filter(df_ctc["total_ctc"] > min_total_ctc)
df_filtered.show()

+------+-----+----+------+-----+---------+
|emp_id| name|dept|salary|bonus|total_ctc|
+------+-----+----+------+-----+---------+
|     1|Arjun|  IT| 78000| 5000|    83000|
|     2|Kavya|  HR| 62000| 2000|    64000|
+------+-----+----+------+-----+---------+



In [51]:
# Load: write the filtered employees in both Parquet and JSON form.
# "overwrite" replaces earlier output so the cell can be re-run safely.
df_filtered.write.parquet("/tmp/employee_parquet", mode="overwrite")
df_filtered.write.json("/tmp/employee_json", mode="overwrite")
