#Module 1: Setup & SparkSession Initialization

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("BotCampus PySpark Practice").master("local[*]").getOrCreate()

In [2]:
data = [
    ("Anjali", "Bangalore", 24),
    ("Ravi", "Hyderabad", 28),
    ("Kavya", "Delhi", 22),
    ("Meena", "Chennai", 25),
    ("Arjun", "Mumbai", 30)
]
columns = ["name", "city", "age"]
df = spark.createDataFrame(data, columns)
df.show()

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Anjali|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
| Arjun|   Mumbai| 30|
+------+---------+---+



In [5]:
# Show schema, explain data types, and convert to RDD.
df.printSchema()
df.dtypes

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)



[('name', 'string'), ('city', 'string'), ('age', 'bigint')]

In [6]:
rdd = df.rdd

In [7]:
# Print .collect() and df.rdd.map() output.
print(df.collect())

[Row(name='Anjali', city='Bangalore', age=24), Row(name='Ravi', city='Hyderabad', age=28), Row(name='Kavya', city='Delhi', age=22), Row(name='Meena', city='Chennai', age=25), Row(name='Arjun', city='Mumbai', age=30)]


In [8]:
mapped_rdd = df.rdd.map(lambda row: (row.name, row.age + 1))
print(mapped_rdd.collect())

[('Anjali', 25), ('Ravi', 29), ('Kavya', 23), ('Meena', 26), ('Arjun', 31)]


#Module 2: RDDs & Transformations

In [9]:
feedback = spark.sparkContext.parallelize([
"Ravi from Bangalore loved the delivery",
"Meena from Hyderabad had a late order",
"Ajay from Pune liked the service",
"Anjali from Delhi faced UI issues",
"Rohit from Mumbai gave positive feedback"
])

In [15]:
# Split each line into words ( flatMap ).
words = feedback.flatMap(lambda line: line.split())
words.collect()

['Ravi',
 'from',
 'Bangalore',
 'loved',
 'the',
 'delivery',
 'Meena',
 'from',
 'Hyderabad',
 'had',
 'a',
 'late',
 'order',
 'Ajay',
 'from',
 'Pune',
 'liked',
 'the',
 'service',
 'Anjali',
 'from',
 'Delhi',
 'faced',
 'UI',
 'issues',
 'Rohit',
 'from',
 'Mumbai',
 'gave',
 'positive',
 'feedback']

In [16]:
# Remove stop words ( from , the , etc.).
stop_words = {"from", "with", "the", "an", "and", "had", "gave"}
filtered_words = feedback.flatMap(lambda line: line.lower().split()).filter(lambda w: w not in stop_words)
print(filtered_words.collect())

['ravi', 'bangalore', 'loved', 'delivery', 'meena', 'hyderabad', 'a', 'late', 'order', 'ajay', 'pune', 'liked', 'service', 'anjali', 'delhi', 'faced', 'ui', 'issues', 'rohit', 'mumbai', 'positive', 'feedback']


In [17]:
# Count each word frequency using reduceByKey
word_count = words.map(lambda w: (w.lower(), 1)).reduceByKey(lambda a, b: a + b)
word_count.collect()

[('from', 5),
 ('loved', 1),
 ('liked', 1),
 ('service', 1),
 ('anjali', 1),
 ('faced', 1),
 ('issues', 1),
 ('rohit', 1),
 ('mumbai', 1),
 ('positive', 1),
 ('feedback', 1),
 ('ravi', 1),
 ('bangalore', 1),
 ('the', 2),
 ('delivery', 1),
 ('meena', 1),
 ('hyderabad', 1),
 ('had', 1),
 ('a', 1),
 ('late', 1),
 ('order', 1),
 ('ajay', 1),
 ('pune', 1),
 ('delhi', 1),
 ('ui', 1),
 ('gave', 1)]

In [18]:
# Find top 3 most frequent non-stop words.
top3 = word_count.takeOrdered(3, key=lambda x: -x[1])
print("Top 3 common words: ", top3)

Top 3 common words:  [('from', 5), ('the', 2), ('loved', 1)]


#Module 3: DataFrames & Transformation (With Joins)

In [19]:
from pyspark.sql.functions import col, when

students = [
    ("Amit", "10-A", 89),
    ("Kavya", "10-B", 92),
    ("Anjali", "10-A", 78),
    ("Rohit", "10-B", 85),
    ("Sneha", "10-C", 80)
]
column1 = ["name", "section", "marks"]

attendance = [
    ("Amit", 24),
    ("Kavya", 22),
    ("Anjali", 20),
    ("Rohit", 25),
    ("Sneha", 19)
]
column2 = ["name", "days_present"]

df_students = spark.createDataFrame(students, column1)
df_attendance = spark.createDataFrame(attendance, column2)

In [20]:
# Join both DataFrames on name .
df_joined = df_students.join(df_attendance, on="name")
df_joined.show()

+------+-------+-----+------------+
|  name|section|marks|days_present|
+------+-------+-----+------------+
|  Amit|   10-A|   89|          24|
|Anjali|   10-A|   78|          20|
| Kavya|   10-B|   92|          22|
| Rohit|   10-B|   85|          25|
| Sneha|   10-C|   80|          19|
+------+-------+-----+------------+



In [21]:
# Create a new column: attendance_rate = days_present / 25 .
df_attendance_rate = df_joined.withColumn('attendance_rate',col('days_present')/25)
df_attendance_rate.show()

+------+-------+-----+------------+---------------+
|  name|section|marks|days_present|attendance_rate|
+------+-------+-----+------------+---------------+
|  Amit|   10-A|   89|          24|           0.96|
|Anjali|   10-A|   78|          20|            0.8|
| Kavya|   10-B|   92|          22|           0.88|
| Rohit|   10-B|   85|          25|            1.0|
| Sneha|   10-C|   80|          19|           0.76|
+------+-------+-----+------------+---------------+



In [22]:
# Grade students using when :
# A: >90, B: 80–90, C: <80.
df_grade = df_attendance_rate.withColumn('grade',when(col("marks") > 90, "A").when((col("marks") >= 80) & (col("marks") <= 90), "B").otherwise("C"))
df_grade.show()

+------+-------+-----+------------+---------------+-----+
|  name|section|marks|days_present|attendance_rate|grade|
+------+-------+-----+------------+---------------+-----+
|  Amit|   10-A|   89|          24|           0.96|    B|
|Anjali|   10-A|   78|          20|            0.8|    C|
| Kavya|   10-B|   92|          22|           0.88|    A|
| Rohit|   10-B|   85|          25|            1.0|    B|
| Sneha|   10-C|   80|          19|           0.76|    B|
+------+-------+-----+------------+---------------+-----+



In [28]:
# Filter students with good grades but poor attendance (<80%).
df_filter = df_grade.filter(((col("grade") == "A") | (col("grade") == "B")) & (col("attendance_rate") < 0.8))
df_filter.show()

+-----+-------+-----+------------+---------------+-----+
| name|section|marks|days_present|attendance_rate|grade|
+-----+-------+-----+------------+---------------+-----+
|Sneha|   10-C|   80|          19|           0.76|    B|
+-----+-------+-----+------------+---------------+-----+



#Module 4: Ingest CSV & JSON, Save to Parquet

In [29]:
data = """emp_id,name,dept,city,salary
101,Anil,IT,Bangalore,80000
102,Kiran,HR,Mumbai,65000
103,Deepa,Finance,Chennai,72000"""
with open('emp.csv','w') as f:
  f.write(data)

In [36]:
json_data = """{
"id": 201,
"name": "Nandini",
"contact": {
"email": "nandi@example.com",
"city": "Hyderabad"
},
"skills": ["Python", "Spark", "SQL"]
}"""
with open('employee.json','w') as f:
  f.write(json_data)

In [31]:
# Read both formats into DataFrames.
emp_df = spark.read.option("header", "true").option("inferSchema", "true").csv("emp.csv")
emp_df.show()

+------+-----+-------+---------+------+
|emp_id| name|   dept|     city|salary|
+------+-----+-------+---------+------+
|   101| Anil|     IT|Bangalore| 80000|
|   102|Kiran|     HR|   Mumbai| 65000|
|   103|Deepa|Finance|  Chennai| 72000|
+------+-----+-------+---------+------+



In [37]:
json_df = spark.read.option("multiline", "true").json("employee.json")
json_df.show()

+--------------------+---+-------+--------------------+
|             contact| id|   name|              skills|
+--------------------+---+-------+--------------------+
|{Hyderabad, nandi...|201|Nandini|[Python, Spark, SQL]|
+--------------------+---+-------+--------------------+



In [38]:
# Flatten nested JSON using select , col , alias , explode .
from pyspark.sql.functions import col, explode
flat_df = json_df.select(col("id"),col("name"),col("contact.email").alias("email"),col("contact.city").alias("city"),explode(col("skills")).alias("skill"))
flat_df.show()

+---+-------+-----------------+---------+------+
| id|   name|            email|     city| skill|
+---+-------+-----------------+---------+------+
|201|Nandini|nandi@example.com|Hyderabad|Python|
|201|Nandini|nandi@example.com|Hyderabad| Spark|
|201|Nandini|nandi@example.com|Hyderabad|   SQL|
+---+-------+-----------------+---------+------+



In [39]:
# Save both as Parquet files partitioned by city.
emp_df.write.mode("overwrite").partitionBy("city").parquet("output/emp_parquet/")
flat_df.write.mode("overwrite").partitionBy("city").parquet("output/json_parquet/")

#Module 5: Spark SQL with Temp Views

In [40]:
# Register the students DataFrame as students_view .
df_grade.createOrReplaceTempView('students_view')

In [42]:
# a) Average marks per section
spark.sql("select section, avg(marks) as avg_mark from students_view group by section order by section").show()

+-------+--------+
|section|avg_mark|
+-------+--------+
|   10-A|    83.5|
|   10-B|    88.5|
|   10-C|    80.0|
+-------+--------+



In [44]:
# b) Top scorer in each section
spark.sql("select section, name, marks from students_view where (section, marks) in (select section , max(marks) from students_view group by section)").show()

+-------+-----+-----+
|section| name|marks|
+-------+-----+-----+
|   10-A| Amit|   89|
|   10-B|Kavya|   92|
|   10-C|Sneha|   80|
+-------+-----+-----+



In [45]:
# c) Count of students in each grade category
spark.sql("select grade, count(*) as students_count from students_view group by grade order by grade").show()

+-----+--------------+
|grade|students_count|
+-----+--------------+
|    A|             1|
|    B|             3|
|    C|             1|
+-----+--------------+



In [47]:
# d) Students with marks above class average
spark.sql("select * from students_view where marks > (select avg(marks) from students_view)").show()

+-----+-------+-----+------------+---------------+-----+
| name|section|marks|days_present|attendance_rate|grade|
+-----+-------+-----+------------+---------------+-----+
| Amit|   10-A|   89|          24|           0.96|    B|
|Kavya|   10-B|   92|          22|           0.88|    A|
|Rohit|   10-B|   85|          25|            1.0|    B|
+-----+-------+-----+------------+---------------+-----+



In [48]:
# e) Attendance-adjusted performance
spark.sql("select * , round(marks * attendance_rate, 2) as adjusted_score from students_view ").show()

+------+-------+-----+------------+---------------+-----+--------------+
|  name|section|marks|days_present|attendance_rate|grade|adjusted_score|
+------+-------+-----+------------+---------------+-----+--------------+
|  Amit|   10-A|   89|          24|           0.96|    B|         85.44|
|Anjali|   10-A|   78|          20|            0.8|    C|          62.4|
| Kavya|   10-B|   92|          22|           0.88|    A|         80.96|
| Rohit|   10-B|   85|          25|            1.0|    B|          85.0|
| Sneha|   10-C|   80|          19|           0.76|    B|          60.8|
+------+-------+-----+------------+---------------+-----+--------------+



#Module 6: Partitioned Data & Incremental Loading

In [49]:
df_students.write.partitionBy("section").parquet("output/students/")

In [50]:
incremental = [("Tejas", "10-A", 91)]
df_inc = spark.createDataFrame(incremental, ["name", "section", "marks"])
df_inc.write.mode("append").partitionBy("section").parquet("output/students/")

In [51]:
# List files in output/students/ using Python.
df_all = spark.read.parquet("output/students/")
df_all.show()

+------+-----+-------+
|  name|marks|section|
+------+-----+-------+
|Anjali|   78|   10-A|
| Tejas|   91|   10-A|
| Rohit|   85|   10-B|
| Kavya|   92|   10-B|
| Sneha|   80|   10-C|
|  Amit|   89|   10-A|
+------+-----+-------+



In [54]:
# Read only partition 10-A and list students.
df_section = spark.read.parquet("output/students/section=10-A")
df_section.show()

+------+-----+
|  name|marks|
+------+-----+
|Anjali|   78|
| Tejas|   91|
|  Amit|   89|
+------+-----+



In [57]:
# Compare before/after counts for section 10-A .
before_count = df_students.filter("section = '10-A'").count()
after_count = df_all.filter("section = '10-A'").count()
print(f"Before: {before_count} students in 10-A")
print(f"After: {after_count} students in 10-A")


Before: 2 students in 10-A
After: 3 students in 10-A


#Module 7: ETL Pipeline – End to End

In [58]:
data2 = """emp_id,name,dept,salary,bonus
1,Arjun,IT,75000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,68000,4000,
4,Ramesh,Sales,58000"""
with open('emp.csv','w') as f:
  f.write(data2)

In [60]:
# Load CSV with inferred schema.
df = spark.read.option("header", True).csv("emp.csv", inferSchema=True)
df.show()

+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| NULL|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| NULL|
+------+------+-------+------+-----+



In [61]:
# Fill null bonuses with 2000 .
df = df.fillna({'bonus': 2000})
df.show()

+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| 2000|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| 2000|
+------+------+-------+------+-----+



In [62]:
# Create total_ctc = salary + bonus .
df = df.withColumn('total_ctc', col('salary') + col('bonus'))
df.show()

+------+------+-------+------+-----+---------+
|emp_id|  name|   dept|salary|bonus|total_ctc|
+------+------+-------+------+-----+---------+
|     1| Arjun|     IT| 75000| 5000|    80000|
|     2| Kavya|     HR| 62000| 2000|    64000|
|     3| Sneha|Finance| 68000| 4000|    72000|
|     4|Ramesh|  Sales| 58000| 2000|    60000|
+------+------+-------+------+-----+---------+



In [63]:
# Filter employees with total_ctc > 65000 .
df_filter = df.filter(col("total_ctc") > 65000)
df_filter.show()

+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 75000| 5000|    80000|
|     3|Sneha|Finance| 68000| 4000|    72000|
+------+-----+-------+------+-----+---------+



In [65]:
# Save result in:
# JSON format.
df_filter.write.mode("overwrite").json("output/emp_json")
# Parquet format partitioned by department.
df_filter.write.mode("overwrite").partitionBy("dept").parquet("output/emp_parquet")