# Module 1: Setup & SparkSession Initialization

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) a SparkSession running in local mode
# with one worker thread per available core ("local[*]").
spark = (
    SparkSession.builder
    .appName("BotCampus PySpark Practice")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Sample people records, one tuple per person in (name, city, age) order.
columns = ["name", "city", "age"]
data = [
    ("Anjali", "Bangalore", 24),
    ("Ravi", "Hyderabad", 28),
    ("Kavya", "Delhi", 22),
    ("Meena", "Chennai", 25),
    ("Arjun", "Mumbai", 30),
]

# Column types are inferred from the Python values; only the names are supplied.
df = spark.createDataFrame(data, schema=columns)
df.show()
df.printSchema()


+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Anjali|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
| Arjun|   Mumbai| 30|
+------+---------+---+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)



In [4]:
# Expose the DataFrame's rows as an RDD of Row objects.
rdd = df.rdd
# Collect through the RDD handle so the conversion above is actually exercised
# (previously `rdd` was assigned but the DataFrame itself was collected).
print(rdd.collect())


[Row(name='Anjali', city='Bangalore', age=24), Row(name='Ravi', city='Hyderabad', age=28), Row(name='Kavya', city='Delhi', age=22), Row(name='Meena', city='Chennai', age=25), Row(name='Arjun', city='Mumbai', age=30)]


In [5]:
# Reduce every Row to a plain (name, age) tuple and bring the result to the driver.
(
    df.rdd
    .map(lambda person: (person["name"], person["age"]))
    .collect()
)



[('Anjali', 24), ('Ravi', 28), ('Kavya', 22), ('Meena', 25), ('Arjun', 30)]

# Module 2: RDDs & Transformations

In [None]:
# Free-text customer feedback, distributed as an RDD with one sentence per record.
feedback_lines = [
    "Ravi from Bangalore loved the delivery",
    "Meena from Hyderabad had a late order",
    "Ajay from Pune liked the service",
    "Anjali from Delhi faced UI issues",
    "Rohit from Mumbai gave positive feedback",
]
feedback = spark.sparkContext.parallelize(feedback_lines)

# Lower-case each sentence and split on whitespace; flatMap emits one record per word.
words = feedback.flatMap(lambda sentence: sentence.lower().split())
words.collect()


In [None]:
# Common filler words that carry no signal for the word count.
stop_words = {"a", "an", "and", "from", "had", "in", "of", "the", "to"}

# Keep only the tokens that are not stop words.
filtered_words = words.filter(lambda token: token not in stop_words)
filtered_words.collect()


In [None]:
# Classic word count: pair every token with 1, then sum the ones per token.
word_counts = (
    filtered_words
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)
word_counts.collect()


In [None]:
# takeOrdered returns the smallest keys first, so negate the count
# to get the three most frequent words.
top_3 = word_counts.takeOrdered(
    3,
    key=lambda word_and_count: -word_and_count[1],
)
print(top_3)


# Module 3: DataFrames & Transformation (With Joins)

In [17]:
# Student marks, one tuple per student in (name, section, marks) order.
columns1 = ["name", "section", "marks"]
students = [
    ("Amit", "10-A", 89),
    ("Kavya", "10-B", 92),
    ("Anjali", "10-A", 78),
    ("Rohit", "10-B", 85),
    ("Sneha", "10-C", 80),
]

# Attendance per student in (name, days_present) order; `name` is the join key.
columns2 = ["name", "days_present"]
attendance = [
    ("Amit", 24),
    ("Kavya", 22),
    ("Anjali", 20),
    ("Rohit", 25),
    ("Sneha", 19),
]

df_students = spark.createDataFrame(students, schema=columns1)
df_attendance = spark.createDataFrame(attendance, schema=columns2)

In [52]:
# Inner join on the shared `name` column; joining by column name keeps a single
# `name` column in the result.
df_joined = df_students.join(df_attendance, "name", "inner")
df_joined.show()


+------+-------+-----+------------+
|  name|section|marks|days_present|
+------+-------+-----+------------+
|  Amit|   10-A|   89|          24|
|Anjali|   10-A|   78|          20|
| Kavya|   10-B|   92|          22|
| Rohit|   10-B|   85|          25|
| Sneha|   10-C|   80|          19|
+------+-------+-----+------------+



In [53]:
from pyspark.sql.functions import col, when

# Share of days attended. The divisor 25 is presumably the total number of
# class days in the period — TODO confirm and lift into a named constant.
attendance_rate = col("days_present") / 25
df_joined = df_joined.withColumn("attendance_rate", attendance_rate)
df_joined.show()


+------+-------+-----+------------+---------------+
|  name|section|marks|days_present|attendance_rate|
+------+-------+-----+------------+---------------+
|  Amit|   10-A|   89|          24|           0.96|
|Anjali|   10-A|   78|          20|            0.8|
| Kavya|   10-B|   92|          22|           0.88|
| Rohit|   10-B|   85|          25|            1.0|
| Sneha|   10-C|   80|          19|           0.76|
+------+-------+-----+------------+---------------+



In [54]:
# Grade bands: above 90 -> "A", 80 to 90 inclusive -> "B", everything else -> "C".
marks_col = col("marks")
grade_expr = (
    when(marks_col > 90, "A")
    .when(marks_col.between(80, 90), "B")  # between() is inclusive on both ends
    .otherwise("C")
)
df_joined = df_joined.withColumn("grade", grade_expr)
df_joined.show()


+------+-------+-----+------------+---------------+-----+
|  name|section|marks|days_present|attendance_rate|grade|
+------+-------+-----+------------+---------------+-----+
|  Amit|   10-A|   89|          24|           0.96|    B|
|Anjali|   10-A|   78|          20|            0.8|    C|
| Kavya|   10-B|   92|          22|           0.88|    A|
| Rohit|   10-B|   85|          25|            1.0|    B|
| Sneha|   10-C|   80|          19|           0.76|    B|
+------+-------+-----+------------+---------------+-----+



In [56]:
# Students graded A or B whose attendance rate is below 0.8.
# Keep the filtered DataFrame in `final_df` and display it as a separate step:
# `.show()` returns None, so chaining it onto the assignment would store None.
final_df = df_joined.filter(
    ((col("grade") == "A") | (col("grade") == "B"))
    & (col("attendance_rate") < 0.8)
)
final_df.show()

+-----+-------+-----+------------+---------------+-----+
| name|section|marks|days_present|attendance_rate|grade|
+-----+-------+-----+------------+---------------+-----+
|Sneha|   10-C|   80|          19|           0.76|    B|
+-----+-------+-----+------------+---------------+-----+



# Module 4: Ingest CSV & JSON, Save to Parquet

In [58]:
# Colab-only step: upload the input files into the runtime's working directory.
# The following cells read "employees.csv" and "profile.json" by relative path.
from google.colab import files
uploaded = files.upload()

Saving profile.json to profile.json


In [59]:
# Load the employees CSV: the first row holds the column names and the column
# types are inferred from the data.
csv_df = spark.read.csv("employees.csv", header=True, inferSchema=True)
csv_df.show()


+------+-----+-------+---------+------+
|emp_id| name|   dept|     city|salary|
+------+-----+-------+---------+------+
|   101| Anil|     IT|Bangalore| 80000|
|   102|Kiran|     HR|   Mumbai| 65000|
|   103|Deepa|Finance|  Chennai| 72000|
+------+-----+-------+---------+------+



In [62]:
# The profile document spans several lines, so enable multiline parsing
# instead of the default one-JSON-object-per-line mode.
json_df = (
    spark.read
    .option("multiline", True)
    .json("profile.json")
)
json_df.printSchema()
json_df.show(truncate=False)


root
 |-- contact: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- email: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+------------------------------+---+-------+--------------------+
|contact                       |id |name   |skills              |
+------------------------------+---+-------+--------------------+
|{Hyderabad, nandi@example.com}|201|Nandini|[Python, Spark, SQL]|
+------------------------------+---+-------+--------------------+



In [63]:
from pyspark.sql.functions import col, explode

# Flatten the nested profile: lift the `contact` struct fields to top-level
# columns and emit one output row per entry of the `skills` array.
flat_json_df = json_df.select(
    "id",
    "name",
    col("contact.email").alias("email"),
    col("contact.city").alias("city"),
    explode("skills").alias("skill"),
)
flat_json_df.show(truncate=False)


+---+-------+-----------------+---------+------+
|id |name   |email            |city     |skill |
+---+-------+-----------------+---------+------+
|201|Nandini|nandi@example.com|Hyderabad|Python|
|201|Nandini|nandi@example.com|Hyderabad|Spark |
|201|Nandini|nandi@example.com|Hyderabad|SQL   |
+---+-------+-----------------+---------+------+



In [64]:
# Persist both datasets as Parquet, partitioned by city; "overwrite" replaces
# any earlier output so the cell can be re-run safely.
(
    csv_df.write
    .mode("overwrite")
    .partitionBy("city")
    .parquet("/tmp/employees_parquet/")
)
(
    flat_json_df.write
    .mode("overwrite")
    .partitionBy("city")
    .parquet("/tmp/profile_parquet/")
)


# Module 5: Spark SQL with Temp Views

In [65]:
# Register the joined student DataFrame as the temp view "students_view" so the
# SQL cells below can query it; an existing view of that name is replaced.
df_joined.createOrReplaceTempView("students_view")


In [66]:
# Average marks per section, rounded to two decimals.
avg_marks_sql = """
select section, round(avg(marks), 2) as avg_marks
from students_view
group by section
"""
spark.sql(avg_marks_sql).show()

+-------+---------+
|section|avg_marks|
+-------+---------+
|   10-C|     80.0|
|   10-A|     83.5|
|   10-B|     88.5|
+-------+---------+



In [67]:
# Top scorer(s) in each section: a correlated subquery finds the section's
# maximum marks and the outer query keeps the rows that match it.
top_scorer_sql = """
select section, name, marks
from students_view s1
where marks = (
  select max(marks)
  from students_view s2
  where s1.section = s2.section
)
"""
spark.sql(top_scorer_sql).show()

+-------+-----+-----+
|section| name|marks|
+-------+-----+-----+
|   10-A| Amit|   89|
|   10-B|Kavya|   92|
|   10-C|Sneha|   80|
+-------+-----+-----+



In [68]:
# Number of students in each grade band.
grade_count_sql = """
select grade, count(*) as grade_count
from students_view
group by grade
"""
spark.sql(grade_count_sql).show()

+-----+-----------+
|grade|grade_count|
+-----+-----------+
|    B|          3|
|    C|          1|
|    A|          1|
+-----+-----------+



In [69]:
# Students whose marks exceed the overall average across all sections.
above_average_sql = """
select name, marks
from students_view
where marks > (
  select avg(marks)
  from students_view
)
"""
spark.sql(above_average_sql).show()

+-----+-----+
| name|marks|
+-----+-----+
| Amit|   89|
|Kavya|   92|
|Rohit|   85|
+-----+-----+



In [70]:
# Attendance-weighted score (marks * attendance_rate), highest first.
adjusted_score_sql = """
select name, section, marks, attendance_rate,
round(marks * attendance_rate, 2) as adjusted_score
from students_view
order by adjusted_score desc
"""
spark.sql(adjusted_score_sql).show()

+------+-------+-----+---------------+--------------+
|  name|section|marks|attendance_rate|adjusted_score|
+------+-------+-----+---------------+--------------+
|  Amit|   10-A|   89|           0.96|         85.44|
| Rohit|   10-B|   85|            1.0|          85.0|
| Kavya|   10-B|   92|           0.88|         80.96|
|Anjali|   10-A|   78|            0.8|          62.4|
| Sneha|   10-C|   80|           0.76|          60.8|
+------+-------+-----+---------------+--------------+



# Module 6: Partitioned Data & Incremental Loading

In [71]:
# Base load of the student data, one sub-directory per section.
# Use "overwrite" explicitly: the default save mode raises an error when the
# target path already exists, which made this cell fail on every re-run
# (e.g. Restart & Run All). Overwriting also resets the dataset so the
# incremental append in the next cell always starts from the same baseline.
(
    df_students.write
    .mode("overwrite")
    .partitionBy("section")
    .parquet("output/students/")
)

In [72]:
# One new student arriving after the base load.
incremental = [("Tejas", "10-A", 91)]
df_inc = spark.createDataFrame(incremental, schema=["name", "section", "marks"])

# Append into the existing partitioned dataset; the row lands in section=10-A
# without touching the files already written.
(
    df_inc.write
    .mode("append")
    .partitionBy("section")
    .parquet("output/students/")
)

In [73]:
import os

base_path = "output/students/"

# Walk the partitioned output and list every file under each directory.
# The loop names deliberately avoid `files`: that name is bound to the
# `google.colab.files` module imported earlier in this notebook, and rebinding
# it to a list here would break any later `files.upload()` call.
for root, _subdirs, filenames in os.walk(base_path):
    print(f"Directory: {root}")
    for filename in filenames:
        print(f"  {filename}")


Directory: output/students/
  ._SUCCESS.crc
  _SUCCESS
Directory: output/students/section=10-A
  part-00001-6c7e818f-4697-4606-bb7a-78d89c4147bd.c000.snappy.parquet
  .part-00001-6c7e818f-4697-4606-bb7a-78d89c4147bd.c000.snappy.parquet.crc
  .part-00000-1a3e0cd1-6f6b-4988-8b2c-0c4ed86a8a86.c000.snappy.parquet.crc
  part-00001-1a3e0cd1-6f6b-4988-8b2c-0c4ed86a8a86.c000.snappy.parquet
  .part-00001-1a3e0cd1-6f6b-4988-8b2c-0c4ed86a8a86.c000.snappy.parquet.crc
  part-00000-1a3e0cd1-6f6b-4988-8b2c-0c4ed86a8a86.c000.snappy.parquet
Directory: output/students/section=10-B
  .part-00000-1a3e0cd1-6f6b-4988-8b2c-0c4ed86a8a86.c000.snappy.parquet.crc
  part-00001-1a3e0cd1-6f6b-4988-8b2c-0c4ed86a8a86.c000.snappy.parquet
  .part-00001-1a3e0cd1-6f6b-4988-8b2c-0c4ed86a8a86.c000.snappy.parquet.crc
  part-00000-1a3e0cd1-6f6b-4988-8b2c-0c4ed86a8a86.c000.snappy.parquet
Directory: output/students/section=10-C
  part-00001-1a3e0cd1-6f6b-4988-8b2c-0c4ed86a8a86.c000.snappy.parquet
  .part-00001-1a3e0cd1-6f6b-49

In [74]:
# Read only the section=10-A partition directory. Because the partition
# directory is read directly, the result has just the `name` and `marks`
# columns; the `section` value exists only in the directory name.
partition_10a_path = "output/students/section=10-A/"
df_10A = spark.read.parquet(partition_10a_path)
df_10A.show()

+------+-----+
|  name|marks|
+------+-----+
|Anjali|   78|
| Tejas|   91|
|  Amit|   89|
+------+-----+



In [75]:
# Row count of the 10-A partition after the incremental append
# (base rows for that section plus the appended row).
count_after = df_10A.count()
print(f"Number of students in section 10-A after incremental load: {count_after}")


Number of students in section 10-A after incremental load: 3


# Module 7: ETL Pipeline – End to End

In [76]:
# Colab-only step: upload the ETL input file; the next cell reads
# "employee.csv" from the working directory.
# The import is repeated here, which also re-binds `files` to the Colab module
# after the directory-listing cell above reused that name as a loop variable.
from google.colab import files
uploaded = files.upload()

Saving employee.csv to employee.csv


In [77]:
# Extract: load the raw employee CSV with a header row and inferred types.
# NOTE(review): this rebinds `df`, which held the people DataFrame from Module 1.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("employee.csv")
)
df.show()

+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| NULL|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| NULL|
+------+------+-------+------+-----+



In [78]:
# Transform, step 1: give employees with no recorded bonus a default of 2000.
bonus_defaults = {"bonus": 2000}
df_filled = df.na.fill(bonus_defaults)
df_filled.show()

+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| 2000|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| 2000|
+------+------+-------+------+-----+



In [81]:
# Transform, step 2: total cost-to-company = salary + bonus. The bonus nulls
# were filled in the previous step, so the sum is never null because of them.
total_ctc = df_filled["salary"] + df_filled["bonus"]
df_ctc = df_filled.withColumn("total_ctc", total_ctc)
df_ctc.show()


+------+------+-------+------+-----+---------+
|emp_id|  name|   dept|salary|bonus|total_ctc|
+------+------+-------+------+-----+---------+
|     1| Arjun|     IT| 75000| 5000|    80000|
|     2| Kavya|     HR| 62000| 2000|    64000|
|     3| Sneha|Finance| 68000| 4000|    72000|
|     4|Ramesh|  Sales| 58000| 2000|    60000|
+------+------+-------+------+-----+---------+



In [82]:
# Transform, step 3: keep only employees whose total CTC is above 65000.
df_filtered = df_ctc.where(df_ctc["total_ctc"] > 65000)
df_filtered.show()

+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 75000| 5000|    80000|
|     3|Sneha|Finance| 68000| 4000|    72000|
+------+-----+-------+------+-----+---------+



In [83]:
# Load: write the filtered employees in two formats. "overwrite" keeps the
# cell safe to re-run.
# 1) Line-delimited JSON.
(
    df_filtered.write
    .mode("overwrite")
    .json("/tmp/output/employee_json/")
)
# 2) Parquet, partitioned by department.
(
    df_filtered.write
    .mode("overwrite")
    .partitionBy("dept")
    .parquet("/tmp/output/employee_parquet/")
)
