### Module 1: Setup & SparkSession Initialization

In [1]:
# Creating Spark Session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("BotCampus PySpark Practice").master("local[*]").getOrCreate()

In [2]:
# Creating DataFrame
data = [
("Anjali", "Bangalore", 24),
("Ravi", "Hyderabad", 28),
("Kavya", "Delhi", 22),
("Meena", "Chennai", 25),
("Arjun", "Mumbai", 30)
]
columns = ["name", "city", "age"]
df = spark.createDataFrame(data,columns)
df.show()

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Anjali|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
| Arjun|   Mumbai| 30|
+------+---------+---+



In [3]:
# Schema
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)



In [4]:
# Datatypes
print(df.dtypes)

[('name', 'string'), ('city', 'string'), ('age', 'bigint')]


In [5]:
# Converting to DataFrame
rdd = df.rdd
print(rdd.collect())

[Row(name='Anjali', city='Bangalore', age=24), Row(name='Ravi', city='Hyderabad', age=28), Row(name='Kavya', city='Delhi', age=22), Row(name='Meena', city='Chennai', age=25), Row(name='Arjun', city='Mumbai', age=30)]


In [6]:
mapped_rdd = df.rdd.map(lambda row: (row.name.upper(), row.age))
print("df.rdd.map() output:")
for item in mapped_rdd.collect():
    print(item)


df.rdd.map() output:
('ANJALI', 24)
('RAVI', 28)
('KAVYA', 22)
('MEENA', 25)
('ARJUN', 30)


###  Module 2: RDDs & Transformations

In [7]:
# Feedback RDD
feedback = spark.sparkContext.parallelize([
"Ravi from Bangalore loved the delivery",
"Meena from Hyderabad had a late order",
"Ajay from Pune liked the service",
"Anjali from Delhi faced UI issues",
"Rohit from Mumbai gave positive feedback"
])

In [8]:
# Split each line into words (flatMap )
split_words = (feedback.flatMap(lambda line: line.split()))
split_words.collect()

['Ravi',
 'from',
 'Bangalore',
 'loved',
 'the',
 'delivery',
 'Meena',
 'from',
 'Hyderabad',
 'had',
 'a',
 'late',
 'order',
 'Ajay',
 'from',
 'Pune',
 'liked',
 'the',
 'service',
 'Anjali',
 'from',
 'Delhi',
 'faced',
 'UI',
 'issues',
 'Rohit',
 'from',
 'Mumbai',
 'gave',
 'positive',
 'feedback']

In [9]:
# Remove Stop Words
# Creating a list of stop words
stop_words = {"from", "is", "with", "the", "a","an", "of", "and", "on", "to", "in" }
no_stop_words = (feedback.flatMap(lambda line: line.split())
                .map(lambda word : word.lower())
                .filter(lambda word: word not in stop_words))
no_stop_words.collect()

['ravi',
 'bangalore',
 'loved',
 'delivery',
 'meena',
 'hyderabad',
 'had',
 'late',
 'order',
 'ajay',
 'pune',
 'liked',
 'service',
 'anjali',
 'delhi',
 'faced',
 'ui',
 'issues',
 'rohit',
 'mumbai',
 'gave',
 'positive',
 'feedback']

In [10]:
# Count each word frequency using ReduceByKey
word_count =(feedback.flatMap(lambda feedback: feedback.split())
             .map(lambda w: (w.lower(), 1))
             .reduceByKey(lambda a,b: a+b))
word_count.collect()

[('from', 5),
 ('loved', 1),
 ('liked', 1),
 ('service', 1),
 ('anjali', 1),
 ('faced', 1),
 ('issues', 1),
 ('rohit', 1),
 ('mumbai', 1),
 ('positive', 1),
 ('feedback', 1),
 ('ravi', 1),
 ('bangalore', 1),
 ('the', 2),
 ('delivery', 1),
 ('meena', 1),
 ('hyderabad', 1),
 ('had', 1),
 ('a', 1),
 ('late', 1),
 ('order', 1),
 ('ajay', 1),
 ('pune', 1),
 ('delhi', 1),
 ('ui', 1),
 ('gave', 1)]

In [11]:
# Find top 3 most frequent non-stop words
counts = no_stop_words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

top_3 = counts.takeOrdered(3, key=lambda x: -x[1])
print(top_3)

[('loved', 1), ('liked', 1), ('service', 1)]


### Module 3: DataFrames & Transformation (With Joins)


In [12]:
# Creating DataFrames students and attendance
students = [
("Amit", "10-A", 89),
("Kavya", "10-B", 92),
("Anjali", "10-A", 78),
("Rohit", "10-B", 85),
("Sneha", "10-C", 80)
]
columns = ["name", "section", "marks"]

attendance = [
("Amit", 24),
("Kavya", 22),
("Anjali", 20),
("Rohit", 25),
("Sneha", 19)
]
columns2 = ["name", "days_present"]

In [13]:
df_students = spark.createDataFrame(students, columns)
df_students.show()

+------+-------+-----+
|  name|section|marks|
+------+-------+-----+
|  Amit|   10-A|   89|
| Kavya|   10-B|   92|
|Anjali|   10-A|   78|
| Rohit|   10-B|   85|
| Sneha|   10-C|   80|
+------+-------+-----+



In [14]:
df_attendance = spark.createDataFrame(attendance, columns2)
df_attendance.show()

+------+------------+
|  name|days_present|
+------+------------+
|  Amit|          24|
| Kavya|          22|
|Anjali|          20|
| Rohit|          25|
| Sneha|          19|
+------+------------+



In [15]:
# Join both DataFrames on name
df_info = df_students.join(df_attendance, on="name", how="inner")
df_info.show()

+------+-------+-----+------------+
|  name|section|marks|days_present|
+------+-------+-----+------------+
|  Amit|   10-A|   89|          24|
|Anjali|   10-A|   78|          20|
| Kavya|   10-B|   92|          22|
| Rohit|   10-B|   85|          25|
| Sneha|   10-C|   80|          19|
+------+-------+-----+------------+



In [16]:
# Create new column: attendance_rate = days_present / 25
from pyspark.sql.functions import col
df_attendance_rate = df_info.withColumn("attendance_rate", col("days_present")/25)
df_attendance_rate.show()


+------+-------+-----+------------+---------------+
|  name|section|marks|days_present|attendance_rate|
+------+-------+-----+------------+---------------+
|  Amit|   10-A|   89|          24|           0.96|
|Anjali|   10-A|   78|          20|            0.8|
| Kavya|   10-B|   92|          22|           0.88|
| Rohit|   10-B|   85|          25|            1.0|
| Sneha|   10-C|   80|          19|           0.76|
+------+-------+-----+------------+---------------+



In [17]:
# Grade students using when and otherwise A: >90, B: 80–90, C: <80.
from pyspark.sql.functions import when
df_grade = df_info.withColumn("grade",
                              when(col("marks") > 90, "A")
                            .when((col("marks") >= 80 ) & (col("marks")<=90),"B")
                            .otherwise("C"))

df_grade.show()

+------+-------+-----+------------+-----+
|  name|section|marks|days_present|grade|
+------+-------+-----+------------+-----+
|  Amit|   10-A|   89|          24|    B|
|Anjali|   10-A|   78|          20|    C|
| Kavya|   10-B|   92|          22|    A|
| Rohit|   10-B|   85|          25|    B|
| Sneha|   10-C|   80|          19|    B|
+------+-------+-----+------------+-----+



In [18]:
# Filter students with good grades but poor attendance (<80%)

df_final = df_grade.join(df_attendance_rate.select("name", "attendance_rate"), on="name", how="inner")
df_filtered = df_final.filter(
    ((col("grade") == "A") | (col("grade") == "B")) & (col("attendance_rate") < 0.8)
)
df_filtered.show()

+-----+-------+-----+------------+-----+---------------+
| name|section|marks|days_present|grade|attendance_rate|
+-----+-------+-----+------------+-----+---------------+
|Sneha|   10-C|   80|          19|    B|           0.76|
+-----+-------+-----+------------+-----+---------------+



In [19]:
df_final.show()

+------+-------+-----+------------+-----+---------------+
|  name|section|marks|days_present|grade|attendance_rate|
+------+-------+-----+------------+-----+---------------+
|  Amit|   10-A|   89|          24|    B|           0.96|
|Anjali|   10-A|   78|          20|    C|            0.8|
| Kavya|   10-B|   92|          22|    A|           0.88|
| Rohit|   10-B|   85|          25|    B|            1.0|
| Sneha|   10-C|   80|          19|    B|           0.76|
+------+-------+-----+------------+-----+---------------+



### Module 4: Ingest CSV & JSON, Save to Parquet

In [20]:
# Ingesting csv and json
data_1 = """
 emp_id,name,dept,city,salary
101,Anil,IT,Bangalore,80000
102,Kiran,HR,Mumbai,65000
103,Deepa,Finance,Chennai,72000
"""

with open('employees.csv', 'w') as file:
  file.write(data_1)



In [21]:
import json
data_2 = {

 "id": 201,
"name": "Nandini",
"contact": {
"email": "nandi@example.com",
"city": "Hyderabad"
},
"skills": ["Python", "Spark", "SQL"]
 }

with open('employees.json', 'w') as file:
  json.dump(data_2, file, indent=4)


In [22]:
#  Read both formats into DataFrames
df_csv = spark.read.csv('employees.csv', header=True, inferSchema=True)

df_csv.show()

df_json = spark.read.option("multiline", True).json('employees.json')
df_json.show()

+-------+-----+-------+---------+------+
| emp_id| name|   dept|     city|salary|
+-------+-----+-------+---------+------+
|    101| Anil|     IT|Bangalore| 80000|
|    102|Kiran|     HR|   Mumbai| 65000|
|    103|Deepa|Finance|  Chennai| 72000|
+-------+-----+-------+---------+------+

+--------------------+---+-------+--------------------+
|             contact| id|   name|              skills|
+--------------------+---+-------+--------------------+
|{Hyderabad, nandi...|201|Nandini|[Python, Spark, SQL]|
+--------------------+---+-------+--------------------+



In [23]:
# Flatten nested JSON using select ,col ,alias ,explode
from pyspark.sql.functions import explode

flattened = df_json.select(
    col('id'),
    col('name'),
    col('contact.email').alias('email'),
    col('contact.city').alias('city'),
    explode(col('skills')).alias('skills')
)

flattened.show()

+---+-------+-----------------+---------+------+
| id|   name|            email|     city|skills|
+---+-------+-----------------+---------+------+
|201|Nandini|nandi@example.com|Hyderabad|Python|
|201|Nandini|nandi@example.com|Hyderabad| Spark|
|201|Nandini|nandi@example.com|Hyderabad|   SQL|
+---+-------+-----------------+---------+------+



In [24]:
# Save both as Parquet files partitioned by city
df_csv.write.mode("overwrite").partitionBy("city").parquet("output/employees_csv")
flattened.write.mode("overwrite").partitionBy("city").parquet("output/employees_json")


### Module 5: Spark SQL with Temp Views


In [25]:
df_final.show()

+------+-------+-----+------------+-----+---------------+
|  name|section|marks|days_present|grade|attendance_rate|
+------+-------+-----+------------+-----+---------------+
|  Amit|   10-A|   89|          24|    B|           0.96|
|Anjali|   10-A|   78|          20|    C|            0.8|
| Kavya|   10-B|   92|          22|    A|           0.88|
| Rohit|   10-B|   85|          25|    B|            1.0|
| Sneha|   10-C|   80|          19|    B|           0.76|
+------+-------+-----+------------+-----+---------------+



In [26]:
# Register the students DataFrame as students_view .
df_final.createOrReplaceTempView('students_view')

In [27]:
# Average marks per section
spark.sql("select section, avg(marks) from students_view group by section").show()

+-------+----------+
|section|avg(marks)|
+-------+----------+
|   10-C|      80.0|
|   10-A|      83.5|
|   10-B|      88.5|
+-------+----------+



In [28]:
# Top scorer in each section
spark.sql("select section, name, marks from (select *, ROW_NUMBER() over(PARTITION BY section ORDER BY marks desc)as rnk from students_view) where rnk =1").show()

+-------+-----+-----+
|section| name|marks|
+-------+-----+-----+
|   10-A| Amit|   89|
|   10-B|Kavya|   92|
|   10-C|Sneha|   80|
+-------+-----+-----+



In [29]:
# Count of students in each grade category
spark.sql("select grade, count(*) from students_view group by grade").show()

+-----+--------+
|grade|count(1)|
+-----+--------+
|    B|       3|
|    C|       1|
|    A|       1|
+-----+--------+



In [30]:
# Students with marks above class average
spark.sql("select * from students_view where marks > (select avg(marks) from students_view)").show()

+-----+-------+-----+------------+-----+---------------+
| name|section|marks|days_present|grade|attendance_rate|
+-----+-------+-----+------------+-----+---------------+
| Amit|   10-A|   89|          24|    B|           0.96|
|Kavya|   10-B|   92|          22|    A|           0.88|
|Rohit|   10-B|   85|          25|    B|            1.0|
+-----+-------+-----+------------+-----+---------------+



In [34]:
# Attendance-adjusted performance
spark.sql("""
    SELECT
        name,
        section,
        marks,
        attendance_rate,
        ROUND(marks * (attendance_rate / 100), 2) AS adjusted_performance
    FROM students_view
""").show()

+------+-------+-----+---------------+--------------------+
|  name|section|marks|attendance_rate|adjusted_performance|
+------+-------+-----+---------------+--------------------+
|  Amit|   10-A|   89|           0.96|                0.85|
|Anjali|   10-A|   78|            0.8|                0.62|
| Kavya|   10-B|   92|           0.88|                0.81|
| Rohit|   10-B|   85|            1.0|                0.85|
| Sneha|   10-C|   80|           0.76|                0.61|
+------+-------+-----+---------------+--------------------+



### Module 6: Partitioned Data & Incremental Loading

In [40]:
# full load
df_final.write.partitionBy("section").parquet("output/students/")

AnalysisException: [PATH_ALREADY_EXISTS] Path file:/content/output/students already exists. Set mode as "overwrite" to overwrite the existing path.

In [41]:
# Incremental Load
incremental = [("Tejas", "10-A", 91)]
df_inc = spark.createDataFrame(incremental, ["name", "section", "marks"])
df_inc.write.mode("append").partitionBy("section").parquet("output/students/")

In [42]:
# List files in output/students/ using Python

!ls output/students/


'section=10-A'	'section=10-B'	'section=10-C'	 _SUCCESS


In [43]:
df_all = spark.read.parquet('output/students/')
df_all.show()

+------+-----+-------+
|  name|marks|section|
+------+-----+-------+
|Anjali|   78|   10-A|
| Tejas|   91|   10-A|
| Tejas|   91|   10-A|
| Rohit|   85|   10-B|
| Kavya|   92|   10-B|
| Sneha|   80|   10-C|
|  Amit|   89|   10-A|
+------+-----+-------+



In [45]:
# Read only partition 10-A and list students
df_10a = spark.read.parquet('output/students/section=10-A')
df_10a.show()


+------+-----+
|  name|marks|
+------+-----+
|Anjali|   78|
| Tejas|   91|
| Tejas|   91|
|  Amit|   89|
+------+-----+



In [46]:
# Compare before/after counts for section 10-A
df_before = spark.read.parquet('output/students/')
count_before = df_before.filter("section = '10-A'").count()

df_after = spark.read.parquet('output/students/')
count_after = df_after.filter("section = '10-A'").count()

print(f"Section 10-A count before: {count_before}")
print(f"Section 10-A count after : {count_after}")


Section 10-A count before: 4
Section 10-A count after : 4


### Module 7: ETL Pipeline - End to End

In [48]:
# csv
csv_data = """
emp_id,name,dept,salary,bonus
1,Arjun,IT,75000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,68000,4000
4,Ramesh,Sales,58000,
"""
with open('infer.csv', 'w') as file:
  file.write(csv_data)


In [49]:
# Creating df
df= spark.read.csv('infer.csv', header=True, inferSchema=True)
df.show()

+------+------+-------+------+------+
|emp_id|  name|   dept|salary|bonus |
+------+------+-------+------+------+
|     1| Arjun|     IT| 75000| 5000 |
|     2| Kavya|     HR| 62000|      |
|     3| Sneha|Finance| 68000| 4000 |
|     4|Ramesh|  Sales| 58000|  NULL|
+------+------+-------+------+------+



In [51]:
# Fill null
df_filled = df.fillna({"bonus ": 2000})
df_filled.show()

+------+------+-------+------+------+
|emp_id|  name|   dept|salary|bonus |
+------+------+-------+------+------+
|     1| Arjun|     IT| 75000| 5000 |
|     2| Kavya|     HR| 62000|      |
|     3| Sneha|Finance| 68000| 4000 |
|     4|Ramesh|  Sales| 58000|  2000|
+------+------+-------+------+------+



In [53]:
# TotalCTC
from pyspark.sql.functions import col

df_ctc = df_filled.withColumn("total_ctc", col("salary") + col("bonus "))
df_ctc.show()


+------+------+-------+------+------+---------+
|emp_id|  name|   dept|salary|bonus |total_ctc|
+------+------+-------+------+------+---------+
|     1| Arjun|     IT| 75000| 5000 |  80000.0|
|     2| Kavya|     HR| 62000|      |     NULL|
|     3| Sneha|Finance| 68000| 4000 |  72000.0|
|     4|Ramesh|  Sales| 58000|  2000|  60000.0|
+------+------+-------+------+------+---------+



In [54]:
# Filter employees with TotalCTC > 65000
df_filtered = df_ctc.filter(col("total_ctc") > 65000)
df_filtered.show()


+------+-----+-------+------+------+---------+
|emp_id| name|   dept|salary|bonus |total_ctc|
+------+-----+-------+------+------+---------+
|     1|Arjun|     IT| 75000| 5000 |  80000.0|
|     3|Sneha|Finance| 68000| 4000 |  72000.0|
+------+-----+-------+------+------+---------+



In [55]:
# Save result in json
df_filtered.write.mode("overwrite").json("output/etl/employees_json")


In [56]:
# Save result in parquet
df_filtered.write.mode("overwrite").partitionBy("dept").parquet("output/etl/employees_parquet")
