#Module 1: Setup & SparkSession Initialization
Tasks:
* Install and configure PySpark in your local system or Colab.
* Initialize Spark with:

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
.appName("BotCampus PySpark Practice") \
.master("local[*]") \
.getOrCreate()

* Create a DataFrame from:

In [3]:
data = [
("Anjali", "Bangalore", 24),
("Ravi", "Hyderabad", 28),
("Kavya", "Delhi", 22),
("Meena", "Chennai", 25),
("Arjun", "Mumbai", 30)
]
columns = ["name", "city", "age"]

* Show schema, explain data types, and convert to RDD.

In [6]:
df = spark.createDataFrame(data, columns)
df.printSchema()

rdd = df.rdd
print(rdd.collect())

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)

[Row(name='Anjali', city='Bangalore', age=24), Row(name='Ravi', city='Hyderabad', age=28), Row(name='Kavya', city='Delhi', age=22), Row(name='Meena', city='Chennai', age=25), Row(name='Arjun', city='Mumbai', age=30)]


* Print .collect() and df.rdd.map() output.

In [7]:

names_upper = df.rdd.map(lambda row: row.name.upper())
print(names_upper.collect())

['ANJALI', 'RAVI', 'KAVYA', 'MEENA', 'ARJUN']


# Module 2: RDDs & Transformations

* Scenario: You received app feedback from users in free-text.

In [8]:
feedback = spark.sparkContext.parallelize([
"Ravi from Bangalore loved the delivery",
"Meena from Hyderabad had a late order",
"Ajay from Pune liked the service",
"Anjali from Delhi faced UI issues",
"Rohit from Mumbai gave positive feedback"
])

Tasks:
1. Split each line into words ( flatMap ).
2. Remove stop words ( from , the , etc.).
3. Count each word frequency using reduceByKey .
4. Find top 3 most frequent non-stop words.

In [9]:
#1
words = feedback.flatMap(lambda line: line.lower().split())


In [10]:
#2
stop_words = {"from", "the", "a", "had", "in", "with", "and"}

filtered_words = words.filter(lambda word: word not in stop_words)


In [11]:
#3
word_pairs = filtered_words.map(lambda word: (word, 1))
word_counts = word_pairs.reduceByKey(lambda a, b: a + b)


In [12]:
#4
top_3 = word_counts.takeOrdered(3, key=lambda x: -x[1])

print("Top 3 most frequent non-stop words:")
for word, count in top_3:
    print(f"{word}: {count}")


Top 3 most frequent non-stop words:
loved: 1
liked: 1
service: 1


#Module 3: DataFrames & Transformation (With Joins)
DataFrames:


In [13]:
students = [
("Amit", "10-A", 89),

("Kavya", "10-B", 92),
("Anjali", "10-A", 78),
("Rohit", "10-B", 85),
("Sneha", "10-C", 80)
]
columns = ["name", "section", "marks"]
attendance = [
("Amit", 24),
("Kavya", 22),
("Anjali", 20),
("Rohit", 25),
("Sneha", 19)
]
columns2 = ["name", "days_present"]

Tasks:
1. Join both DataFrames on name .
2. Create a new column: attendance_rate = days_present / 25 .
3. Grade students using when :
A: >90, B: 80â€“90, C: <80.
4. Filter students with good grades but poor attendance (<80%).


In [15]:
#1.
df_students = spark.createDataFrame(students, columns)
df_attendance = spark.createDataFrame(attendance, columns2)
df_joined = df_students.join(df_attendance, on="name")

In [17]:
#2.
from pyspark.sql.functions import col

df_with_attendance = df_joined.withColumn(
    "attendance_rate", col("days_present") / 25
)

In [21]:
#3
from pyspark.sql.functions import when, col

df_graded = df_with_attendance.withColumn(
    "grade",
    when(col("marks") > 90, "A")
    .when((col("marks") >= 80) & (col("marks") <= 90), "B")
    .otherwise("C")
)

In [20]:
#4.
filtered = df_graded.filter(
    (col("grade").isin("A", "B")) & (col("attendance_rate") < 0.8)
)

filtered.show()


+-----+-------+-----+------------+---------------+-----+
| name|section|marks|days_present|attendance_rate|grade|
+-----+-------+-----+------------+---------------+-----+
|Sneha|   10-C|   80|          19|           0.76|    B|
+-----+-------+-----+------------+---------------+-----+



#Module 4: Ingest CSV & JSON, Save to Parquet
Tasks:


In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode

spark = SparkSession.builder \
    .appName("Ingest CSV & JSON and Save Parquet") \
    .master("local[*]") \
    .getOrCreate()


In [25]:
from google.colab import files
uploaded = files.upload()


Saving employees.csv to employees (1).csv
Saving profile.json to profile.json


* read both formats into DataFrames.

In [26]:
df_csv = spark.read.option("header", True).csv("/content/employees.csv", inferSchema=True)
df_csv.show()


+------+-----+-------+---------+------+
|emp_id| name|   dept|     city|salary|
+------+-----+-------+---------+------+
|   101| Anil|     IT|Bangalore| 80000|
|   102|Kiran|     HR|   Mumbai| 65000|
|   103|Deepa|Finance|  Chennai| 72000|
+------+-----+-------+---------+------+



In [27]:
df_json = spark.read.option("multiLine", True).json("/content/profile.json")
df_json.printSchema()
df_json.show()


root
 |-- contact: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- email: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+---+-------+--------------------+
|             contact| id|   name|              skills|
+--------------------+---+-------+--------------------+
|{Hyderabad, nandi...|201|Nandini|[Python, Spark, SQL]|
+--------------------+---+-------+--------------------+



* Flatten nested JSON using select , col , alias , explode .

In [28]:
from pyspark.sql.functions import explode

# Flatten the nested structure and explode the skills array
df_flat = df_json.select(
    col("id"),
    col("name"),
    col("contact.city").alias("city"),
    col("contact.email").alias("email"),
    explode(col("skills")).alias("skill")
)

df_flat.show(truncate=False)


+---+-------+---------+-----------------+------+
|id |name   |city     |email            |skill |
+---+-------+---------+-----------------+------+
|201|Nandini|Hyderabad|nandi@example.com|Python|
|201|Nandini|Hyderabad|nandi@example.com|Spark |
|201|Nandini|Hyderabad|nandi@example.com|SQL   |
+---+-------+---------+-----------------+------+



* Save Both as Parquet Partitioned by City

In [29]:
# Save CSV DataFrame to Parquet partitioned by city
df_csv.write.mode("overwrite").partitionBy("city").parquet("/content/output/csv_partitioned")

# Save Flattened JSON DataFrame to Parquet partitioned by city
df_flat.write.mode("overwrite").partitionBy("city").parquet("/content/output/json_partitioned")


In [30]:
import os

print("CSV Output Partitions:", os.listdir("/content/output/csv_partitioned"))
print("JSON Output Partitions:", os.listdir("/content/output/json_partitioned"))


CSV Output Partitions: ['._SUCCESS.crc', 'city=Mumbai', 'city=Bangalore', '_SUCCESS', 'city=Chennai']
JSON Output Partitions: ['._SUCCESS.crc', '_SUCCESS', 'city=Hyderabad']


#Module 5: Spark SQL with Temp Views
ðŸ”¹ Step 1: Setup Spark & Create DataFrames
python
Copy
Edit


Tasks:
* Register the students DataFrame as students_view .

In [31]:
from pyspark.sql import SparkSession

# Initialize Spark
spark = SparkSession.builder \
    .appName("SparkSQL_TempViews") \
    .getOrCreate()

# Students Data
students = [
    ("Amit", "10-A", 89),
    ("Kavya", "10-B", 92),
    ("Anjali", "10-A", 78),
    ("Rohit", "10-B", 85),
    ("Sneha", "10-C", 80)
]
students_columns = ["name", "section", "marks"]

attendance = [
    ("Amit", 24),
    ("Kavya", 22),
    ("Anjali", 20),
    ("Rohit", 25),
    ("Sneha", 19)
]
attendance_columns = ["name", "days_present"]

# Create DataFrames
df_students = spark.createDataFrame(students, students_columns)
df_attendance = spark.createDataFrame(attendance, attendance_columns)

# Join both DataFrames
df_combined = df_students.join(df_attendance, on="name")

# Register as Temp View
df_combined.createOrReplaceTempView("students_view")



Write and run the following queries:
* a) Average marks per section
* b) Top scorer in each section
* c) Count of students in each grade category
* d) Students with marks above class average
* e) Attendance-adjusted performance

In [32]:
# a) Average marks per section
spark.sql("""
SELECT section, ROUND(AVG(marks), 2) AS avg_marks
FROM students_view
GROUP BY section
""").show()


+-------+---------+
|section|avg_marks|
+-------+---------+
|   10-C|     80.0|
|   10-A|     83.5|
|   10-B|     88.5|
+-------+---------+



In [33]:
# b)
spark.sql("""
SELECT section, name, marks
FROM (
    SELECT section, name, marks,
           RANK() OVER (PARTITION BY section ORDER BY marks DESC) as rnk
    FROM students_view
) WHERE rnk = 1
""").show()


+-------+-----+-----+
|section| name|marks|
+-------+-----+-----+
|   10-A| Amit|   89|
|   10-B|Kavya|   92|
|   10-C|Sneha|   80|
+-------+-----+-----+



In [34]:
# c)
spark.sql("""
SELECT
  CASE
    WHEN marks > 90 THEN 'A'
    WHEN marks BETWEEN 80 AND 90 THEN 'B'
    ELSE 'C'
  END AS grade,
  COUNT(*) AS count
FROM students_view
GROUP BY grade
""").show()


+-----+-----+
|grade|count|
+-----+-----+
|    B|    3|
|    C|    1|
|    A|    1|
+-----+-----+



In [35]:
# d)
spark.sql("""
SELECT name, section, marks
FROM students_view
WHERE marks > (SELECT AVG(marks) FROM students_view)
""").show()


+-----+-------+-----+
| name|section|marks|
+-----+-------+-----+
| Amit|   10-A|   89|
|Kavya|   10-B|   92|
|Rohit|   10-B|   85|
+-----+-------+-----+



In [36]:
# e)
spark.sql("""
SELECT name, section, marks, days_present,
       ROUND((marks * (days_present / 25)), 2) AS adj_performance
FROM students_view
""").show()

+------+-------+-----+------------+---------------+
|  name|section|marks|days_present|adj_performance|
+------+-------+-----+------------+---------------+
|  Amit|   10-A|   89|          24|          85.44|
|Anjali|   10-A|   78|          20|           62.4|
| Kavya|   10-B|   92|          22|          80.96|
| Rohit|   10-B|   85|          25|           85.0|
| Sneha|   10-C|   80|          19|           60.8|
+------+-------+-----+------------+---------------+



#Module 6: Partitioned Data & Incremental Loading
Step 1: Full Load

In [38]:
df_students.write.mode("overwrite").partitionBy("section").parquet("output/students/")

incremental load

In [39]:
incremental = [("Tejas", "10-A", 91)]
df_inc = spark.createDataFrame(incremental, ["name", "section", "marks"])
df_inc.write.mode("append").partitionBy("section").parquet("output/students/")

Tasks:
* List files in output/students/ using Python.
* Read only partition 10-A and list students.
* Compare before/after counts for section 10-A .

In [40]:
import os

files = os.listdir("output/students/")
for f in files:
    print(f)


._SUCCESS.crc
section=10-A
section=10-B
_SUCCESS
section=10-C


In [41]:
df_10a = spark.read.parquet("output/students/section=10-A")
df_10a.show()


+------+-----+
|  name|marks|
+------+-----+
|Anjali|   78|
| Tejas|   91|
|  Amit|   89|
+------+-----+



In [42]:
# After full load
df_full = spark.read.parquet("output/students/")
count_before = df_full.filter(df_full.section == "10-A").count()

# Append incremental and read again
df_after = spark.read.parquet("output/students/")
count_after = df_after.filter(df_after.section == "10-A").count()

print(f"Before Increment: {count_before}")
print(f"After Increment: {count_after}")


Before Increment: 3
After Increment: 3


# Module 7: ETL Pipeline â€“ End to End
  

  tasks
  1. Load CSV with inferred schema.
2. Fill null bonuses with 2000 .
3. Create total_ctc = salary + bonus .
4. Filter employees with total_ctc > 65000 .
5. Save result in:

a) JSON format.
b) Parquet format partitioned by department.

In [43]:
#1
df_raw = spark.read.option("header", True).option("inferSchema", True).csv("employees.csv")
df_raw.show()


+------+-----+-------+---------+------+
|emp_id| name|   dept|     city|salary|
+------+-----+-------+---------+------+
|   101| Anil|     IT|Bangalore| 80000|
|   102|Kiran|     HR|   Mumbai| 65000|
|   103|Deepa|Finance|  Chennai| 72000|
+------+-----+-------+---------+------+



In [45]:
#2
from pyspark.sql.functions import lit

df_filled = df_raw.withColumn("bonus", lit(2000))
df_filled.show()

+------+-----+-------+---------+------+-----+
|emp_id| name|   dept|     city|salary|bonus|
+------+-----+-------+---------+------+-----+
|   101| Anil|     IT|Bangalore| 80000| 2000|
|   102|Kiran|     HR|   Mumbai| 65000| 2000|
|   103|Deepa|Finance|  Chennai| 72000| 2000|
+------+-----+-------+---------+------+-----+



In [46]:
#3
from pyspark.sql.functions import col

df_ctc = df_filled.withColumn("total_ctc", col("salary") + col("bonus"))


In [47]:
#4
df_filtered = df_ctc.filter(col("total_ctc") > 65000)
df_filtered.show()


+------+-----+-------+---------+------+-----+---------+
|emp_id| name|   dept|     city|salary|bonus|total_ctc|
+------+-----+-------+---------+------+-----+---------+
|   101| Anil|     IT|Bangalore| 80000| 2000|    82000|
|   102|Kiran|     HR|   Mumbai| 65000| 2000|    67000|
|   103|Deepa|Finance|  Chennai| 72000| 2000|    74000|
+------+-----+-------+---------+------+-----+---------+



In [50]:
#5
#a)
df_filtered.write.mode("overwrite").json("output/etl_employees_json/")


In [51]:
#5
#b)
df_filtered.write.mode("overwrite").partitionBy("dept").parquet("output/etl_employees_parquet/")


In [52]:
import os

print("JSON Output Files:")
for root, dirs, files in os.walk("output/etl_employees_json/"):
    for file in files:
        print(os.path.join(root, file))

print("\nParquet Output Files (Partitioned by dept):")
for root, dirs, files in os.walk("output/etl_employees_parquet/"):
    for file in files:
        print(os.path.join(root, file))


JSON Output Files:
output/etl_employees_json/._SUCCESS.crc
output/etl_employees_json/.part-00000-053cdeb0-0972-441e-9754-bdc1f2810cd4-c000.json.crc
output/etl_employees_json/part-00000-053cdeb0-0972-441e-9754-bdc1f2810cd4-c000.json
output/etl_employees_json/_SUCCESS

Parquet Output Files (Partitioned by dept):
output/etl_employees_parquet/._SUCCESS.crc
output/etl_employees_parquet/_SUCCESS
output/etl_employees_parquet/dept=HR/.part-00000-75c38bbe-578a-4ebe-b0ca-e5d65fb95ff8.c000.snappy.parquet.crc
output/etl_employees_parquet/dept=HR/part-00000-75c38bbe-578a-4ebe-b0ca-e5d65fb95ff8.c000.snappy.parquet
output/etl_employees_parquet/dept=IT/.part-00000-75c38bbe-578a-4ebe-b0ca-e5d65fb95ff8.c000.snappy.parquet.crc
output/etl_employees_parquet/dept=IT/part-00000-75c38bbe-578a-4ebe-b0ca-e5d65fb95ff8.c000.snappy.parquet
output/etl_employees_parquet/dept=Finance/.part-00000-75c38bbe-578a-4ebe-b0ca-e5d65fb95ff8.c000.snappy.parquet.crc
output/etl_employees_parquet/dept=Finance/part-00000-75c38bbe-