###1. PySpark Setup & Initialization

In [2]:
# Creating SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName("BotCampus Intermediate Session") \
.master("local[*]") \
.getOrCreate()

spark

In [3]:
# Loading Data and creating spark df
data = [("Ananya", "Bangalore", 24),
("Ravi", "Hyderabad", 28),
("Kavya", "Delhi", 22),
("Meena", "Chennai", 25)]
columns = ["name", "city", "age"]
df = spark.createDataFrame(data, columns)
df.show()

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Ananya|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
+------+---------+---+



###2. RDDs & Transformations

In [4]:
# Creating RDD from feedback
feedback = spark.sparkContext.parallelize([
"Ravi from Bangalore loved the mobile app",
"Meena from Delhi reported poor response time",
"Ajay from Pune liked the delivery speed",
"Ananya from Hyderabad had an issue with UI",
"Rohit from Mumbai gave positive feedback"
])

In [6]:
feedback.collect()

['Ravi from Bangalore loved the mobile app',
 'Meena from Delhi reported poor response time',
 'Ajay from Pune liked the delivery speed',
 'Ananya from Hyderabad had an issue with UI',
 'Rohit from Mumbai gave positive feedback']

In [7]:
# Count total number of words
word_count = (feedback.flatMap(lambda feedback: feedback.split())
              .map(lambda w: (w.lower(), 1))
               .reduceByKey(lambda a,b: a+b))
word_count.collect()

[('from', 5),
 ('loved', 1),
 ('app', 1),
 ('poor', 1),
 ('response', 1),
 ('liked', 1),
 ('speed', 1),
 ('ananya', 1),
 ('an', 1),
 ('issue', 1),
 ('with', 1),
 ('rohit', 1),
 ('mumbai', 1),
 ('positive', 1),
 ('feedback', 1),
 ('ravi', 1),
 ('bangalore', 1),
 ('the', 2),
 ('mobile', 1),
 ('meena', 1),
 ('delhi', 1),
 ('reported', 1),
 ('time', 1),
 ('ajay', 1),
 ('pune', 1),
 ('delivery', 1),
 ('hyderabad', 1),
 ('had', 1),
 ('ui', 1),
 ('gave', 1)]

In [9]:
# Find top 3 most common words
# takeOrdered() returns a list
common_words = word_count.takeOrdered(3, key=lambda x: -x[1])
print(common_words)

[('from', 5), ('the', 2), ('loved', 1)]


In [11]:
# Remove Stop Words
# Creating a list of stop words
stop_words = {"from", "is", "with", "the", "a","an", "of", "and", "on", "to", "in" }
no_stop_words = (feedback.flatMap(lambda line: line.split())
                .map(lambda word : word.lower())
                .filter(lambda word: word not in stop_words))
no_stop_words.collect()

['ravi',
 'bangalore',
 'loved',
 'mobile',
 'app',
 'meena',
 'delhi',
 'reported',
 'poor',
 'response',
 'time',
 'ajay',
 'pune',
 'liked',
 'delivery',
 'speed',
 'ananya',
 'hyderabad',
 'had',
 'issue',
 'ui',
 'rohit',
 'mumbai',
 'gave',
 'positive',
 'feedback']

In [12]:
# Create dictionary of word -> count
# collectAsMap() is used to create dictionary
word_count.collectAsMap()

{'from': 5,
 'loved': 1,
 'app': 1,
 'poor': 1,
 'response': 1,
 'liked': 1,
 'speed': 1,
 'ananya': 1,
 'an': 1,
 'issue': 1,
 'with': 1,
 'rohit': 1,
 'mumbai': 1,
 'positive': 1,
 'feedback': 1,
 'ravi': 1,
 'bangalore': 1,
 'the': 2,
 'mobile': 1,
 'meena': 1,
 'delhi': 1,
 'reported': 1,
 'time': 1,
 'ajay': 1,
 'pune': 1,
 'delivery': 1,
 'hyderabad': 1,
 'had': 1,
 'ui': 1,
 'gave': 1}

### 3. DataFrames - Transformations

In [13]:
# Creating exam_scores DataFrame
scores = [
("Ravi", "Math", 88),
("Ananya", "Science", 92),
("Kavya", "English", 79),
("Ravi", "English", 67),
("Neha", "Math", 94),
("Meena", "Science", 85)
]
columns = ["name", "subject", "score"]
df_scores = spark.createDataFrame(scores, columns)

In [15]:
df_scores.show()

+------+-------+-----+
|  name|subject|score|
+------+-------+-----+
|  Ravi|   Math|   88|
|Ananya|Science|   92|
| Kavya|English|   79|
|  Ravi|English|   67|
|  Neha|   Math|   94|
| Meena|Science|   85|
+------+-------+-----+



In [16]:
# Add grade column ( >=90 → A, 80-89 → B, 70-79 → C, elsve D)
from pyspark.sql.functions import when, col, avg
df_grade = df_scores.withColumn("Grade",
           when(col("score") >= 90, "A")
          .when((col("score") >= 80) & (col("score") < 90), "B")
          .when((col("score") >= 70) & (col("score") < 80), "C")
          .otherwise("D"))
df_grade.show()


+------+-------+-----+-----+
|  name|subject|score|Grade|
+------+-------+-----+-----+
|  Ravi|   Math|   88|    B|
|Ananya|Science|   92|    A|
| Kavya|English|   79|    C|
|  Ravi|English|   67|    D|
|  Neha|   Math|   94|    A|
| Meena|Science|   85|    B|
+------+-------+-----+-----+



In [17]:
# Group by subject, find average score
avg_score = df_scores.groupBy("subject").agg(avg("score").alias("avg_score"))
avg_score.show()

+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     88.5|
|   Math|     91.0|
|English|     73.0|
+-------+---------+



In [19]:
# Use when and otherwise to classify subject difficulty ( Math/Science =Difficult).
difficult = df_scores.withColumn("Status", when((col("subject") == "Math") | (col("subject") == "Science"), "Difficult")
                                  .otherwise("Easy"))
difficult.show()

+------+-------+-----+---------+
|  name|subject|score|   Status|
+------+-------+-----+---------+
|  Ravi|   Math|   88|Difficult|
|Ananya|Science|   92|Difficult|
| Kavya|English|   79|     Easy|
|  Ravi|English|   67|     Easy|
|  Neha|   Math|   94|Difficult|
| Meena|Science|   85|Difficult|
+------+-------+-----+---------+



In [20]:
# Rank students per subject using Window function.
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

win = Window.partitionBy("subject").orderBy(col("score").desc())
score_rank = df_scores.withColumn("rank", rank().over(win))
score_rank.show()


+------+-------+-----+----+
|  name|subject|score|rank|
+------+-------+-----+----+
| Kavya|English|   79|   1|
|  Ravi|English|   67|   2|
|  Neha|   Math|   94|   1|
|  Ravi|   Math|   88|   2|
|Ananya|Science|   92|   1|
| Meena|Science|   85|   2|
+------+-------+-----+----+



In [21]:
# Apply UDF to format names (e.g., make all uppercase)
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# User defined function
def u_name(name):
  return name.upper()

u_name_udf = udf(u_name, StringType())

format_df = df_scores.withColumn("formatted_name", u_name_udf(col("name")))
format_df.show()

+------+-------+-----+--------------+
|  name|subject|score|formatted_name|
+------+-------+-----+--------------+
|  Ravi|   Math|   88|          RAVI|
|Ananya|Science|   92|        ANANYA|
| Kavya|English|   79|         KAVYA|
|  Ravi|English|   67|          RAVI|
|  Neha|   Math|   94|          NEHA|
| Meena|Science|   85|         MEENA|
+------+-------+-----+--------------+



### 4. Ingest CSV & JSON = Save to Parquet

In [23]:
# Dataset 1
data = """
id,name,department,city,salary
1,Amit,IT,Bangalore,78000
2,Kavya,HR,Chennai,62000
3,Arjun,Finance,Hyderabad,55000
"""
with open('students.csv', 'w')as file:
  file.write(data)

In [24]:
# Dataset 2
import json
data_2 = [
  {
    "id": 101,
    "name": "Sneha",
    "address": {
    "city": "Mumbai",
    "pincode": 400001
  },
    "skills": ["Python", "Spark"]
  }
]

with open('employees_nested.json', 'w') as f:
  json.dump( data_2, f, indent=4)

In [28]:
# Load both datasets into pySpark
students_df = spark.read.csv("students.csv",header=True, inferSchema=True)
students_df.show()

employees_df = spark.read.option("multiline", "true").json("employees_nested.json")
employees_df.show()

+---+-----+----------+---------+------+
| id| name|department|     city|salary|
+---+-----+----------+---------+------+
|  1| Amit|        IT|Bangalore| 78000|
|  2|Kavya|        HR|  Chennai| 62000|
|  3|Arjun|   Finance|Hyderabad| 55000|
+---+-----+----------+---------+------+

+----------------+---+-----+---------------+
|         address| id| name|         skills|
+----------------+---+-----+---------------+
|{Mumbai, 400001}|101|Sneha|[Python, Spark]|
+----------------+---+-----+---------------+



In [29]:
# Print schema and infer nested structure
students_df.printSchema()
employees_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: integer (nullable = true)

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- pincode: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [30]:
# Flatten the JSON (use explode , select , alias )
from pyspark.sql.functions import explode

flat_df = employees_df.select(col("id"), col("name"), col("address.city").alias("city"), col("address.pincode").alias("pincode"), explode(col("skills")).alias("Skill"))
flat_df.show()

+---+-----+------+-------+------+
| id| name|  city|pincode| Skill|
+---+-----+------+-------+------+
|101|Sneha|Mumbai| 400001|Python|
|101|Sneha|Mumbai| 400001| Spark|
+---+-----+------+-------+------+



In [31]:
# Convert both to Parquet and write to /tmp/output
students_df.write.mode("overwrite").parquet("/tmp/output/students_parquet")
employees_df.write.mode("overwrite").parquet("/tmp/output/employees_parquet")

In [33]:
# Downloading files
!zip -r /tmp/output_parquet.zip /tmp/output

  adding: tmp/output/ (stored 0%)
  adding: tmp/output/students_parquet/ (stored 0%)
  adding: tmp/output/students_parquet/._SUCCESS.crc (stored 0%)
  adding: tmp/output/students_parquet/part-00000-64136b2b-7efd-4a40-a985-a0295683abb8-c000.snappy.parquet (deflated 48%)
  adding: tmp/output/students_parquet/_SUCCESS (stored 0%)
  adding: tmp/output/students_parquet/.part-00000-64136b2b-7efd-4a40-a985-a0295683abb8-c000.snappy.parquet.crc (stored 0%)
  adding: tmp/output/employees_parquet/ (stored 0%)
  adding: tmp/output/employees_parquet/._SUCCESS.crc (stored 0%)
  adding: tmp/output/employees_parquet/.part-00000-a9c0a368-b850-4524-b349-708a7bee35e9-c000.snappy.parquet.crc (stored 0%)
  adding: tmp/output/employees_parquet/_SUCCESS (stored 0%)
  adding: tmp/output/employees_parquet/part-00000-a9c0a368-b850-4524-b349-708a7bee35e9-c000.snappy.parquet (deflated 53%)


In [34]:
from google.colab import files
files.download("/tmp/output_parquet.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### 5. Spark SQL - Temp Views & Queries

In [35]:
data = [
    ("Kavya", "English", 79, "B"),
    ("Ravi", "English", 67, "C"),
    ("Neha", "Math", 94, "A"),
    ("Ravi", "Math", 88, "A"),
    ("Ananya", "Science", 92, "A"),
    ("Meena", "Science", 85, "B"),
    ("Neha", "English", 88, "A"),
    ("Ravi", "Science", 91, "A"),
]
columns = ["name", "subject", "score", "grade"]

df_stuents = spark.createDataFrame(data, columns)
df_stuents.createOrReplaceTempView("exam_scores")

In [36]:
spark.sql("select * from exam_scores").show()

+------+-------+-----+-----+
|  name|subject|score|grade|
+------+-------+-----+-----+
| Kavya|English|   79|    B|
|  Ravi|English|   67|    C|
|  Neha|   Math|   94|    A|
|  Ravi|   Math|   88|    A|
|Ananya|Science|   92|    A|
| Meena|Science|   85|    B|
|  Neha|English|   88|    A|
|  Ravi|Science|   91|    A|
+------+-------+-----+-----+



In [37]:
# Top scorer per subject
spark.sql("Select subject, name, score from(select *, RANK() OVER(PARTITION BY subject ORDER BY score DESC)as rnk from exam_scores)where rnk=1").show()

+-------+------+-----+
|subject|  name|score|
+-------+------+-----+
|English|  Neha|   88|
|   Math|  Neha|   94|
|Science|Ananya|   92|
+-------+------+-----+



In [38]:
# Count of students per grade
spark.sql("Select grade, count(*) as student_count from exam_scores group by grade").show()

+-----+-------------+
|grade|student_count|
+-----+-------------+
|    B|            2|
|    C|            1|
|    A|            5|
+-----+-------------+



In [39]:
# Students with multiple subjects
spark.sql("select name, count(DISTINCT subject)as subject_count from exam_scores group by name having subject_count >1").show()

+----+-------------+
|name|subject_count|
+----+-------------+
|Ravi|            3|
|Neha|            2|
+----+-------------+



In [40]:
# Subjects with average score above 85
spark.sql("select subject, avg(score)as AvgScore from exam_scores group by subject having AvgScore > 85").show()

+-------+-----------------+
|subject|         AvgScore|
+-------+-----------------+
|   Math|             91.0|
|Science|89.33333333333333|
+-------+-----------------+



In [41]:
# Create another DataFrame attendance(name, days_present) and:
attendance_data = [
    ("Kavya", 42),
    ("Ravi", 35),
    ("Neha", 48),
    ("Ananya", 50),
    ("Meena", 38)
]

attendance_columns = ["name", "days_present"]

df_attendance = spark.createDataFrame(attendance_data, attendance_columns)
df_attendance.show()


+------+------------+
|  name|days_present|
+------+------------+
| Kavya|          42|
|  Ravi|          35|
|  Neha|          48|
|Ananya|          50|
| Meena|          38|
+------+------------+



In [42]:
# Joining attendance df with students df
df_join = df_stuents.join(df_attendance, on="name", how="inner")
df_join.show()

+------+-------+-----+-----+------------+
|  name|subject|score|grade|days_present|
+------+-------+-----+-----+------------+
|Ananya|Science|   92|    A|          50|
| Kavya|English|   79|    B|          42|
| Meena|Science|   85|    B|          38|
|  Neha|   Math|   94|    A|          48|
|  Neha|English|   88|    A|          48|
|  Ravi|English|   67|    C|          35|
|  Ravi|   Math|   88|    A|          35|
|  Ravi|Science|   91|    A|          35|
+------+-------+-----+-----+------------+



In [44]:
# Calculating Attendance

def downgrade(grade, days_present):
    if days_present < 20:
        downgrade_map = {"A": "B", "B": "C", "C": "D", "D": "D"}
        return downgrade_map.get(grade, grade)
    return grade

downgrade_udf = udf(downgrade, StringType())

df_adjusted = df_join.withColumn("adjusted_grade", downgrade_udf(col("grade"), col("days_present")))

df_adjusted.show()

+------+-------+-----+-----+------------+--------------+
|  name|subject|score|grade|days_present|adjusted_grade|
+------+-------+-----+-----+------------+--------------+
|Ananya|Science|   92|    A|          50|             A|
| Kavya|English|   79|    B|          42|             B|
| Meena|Science|   85|    B|          38|             B|
|  Neha|   Math|   94|    A|          48|             A|
|  Neha|English|   88|    A|          48|             A|
|  Ravi|English|   67|    C|          35|             C|
|  Ravi|   Math|   88|    A|          35|             A|
|  Ravi|Science|   91|    A|          35|             A|
+------+-------+-----+-----+------------+--------------+



###6. Partitioned Load (Full + Incremental)

In [51]:
# Initial load
df_scores.write.partitionBy("subject").parquet("/tmp/scores/")

AnalysisException: [PATH_ALREADY_EXISTS] Path file:/tmp/scores already exists. Set mode as "overwrite" to overwrite the existing path.

In [54]:
# Incremental load
incremental = [("Meena", "Math", 93)]
df_inc = spark.createDataFrame(incremental, ["name", "subject", "score"])
df_inc.write.mode("append").partitionBy("subject").parquet("/tmp/scores/")

In [55]:
!ls /tmp/scores/

'subject=English'  'subject=Math'  'subject=Science'   _SUCCESS


In [56]:
df_math = spark.read.parquet("/tmp/scores/subject=Math")
df_math.show()

+-----+-----+
| name|score|
+-----+-----+
|Meena|   93|
|Meena|   93|
| Neha|   94|
| Ravi|   88|
+-----+-----+



### ETL: Clean, Transform, Load

In [57]:
raw = """
emp_id,name,dept,salary,bonus
1,Arjun,IT,78000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,55000,3000
"""
with open ('etl.csv', 'w') as file:
  file.write(raw)

In [58]:
# Load data with header
etl_df = spark.read.csv('etl.csv', header=True, inferSchema=True)
etl_df.show()

+------+-----+-------+------+-----+
|emp_id| name|   dept|salary|bonus|
+------+-----+-------+------+-----+
|     1|Arjun|     IT| 78000| 5000|
|     2|Kavya|     HR| 62000| NULL|
|     3|Sneha|Finance| 55000| 3000|
+------+-----+-------+------+-----+



In [60]:
# Fill missing bonus with 2000.
filled_df = etl_df.fillna({'bonus': 2000})
filled_df.show()

+------+-----+-------+------+-----+
|emp_id| name|   dept|salary|bonus|
+------+-----+-------+------+-----+
|     1|Arjun|     IT| 78000| 5000|
|     2|Kavya|     HR| 62000| 2000|
|     3|Sneha|Finance| 55000| 3000|
+------+-----+-------+------+-----+



In [61]:
# Calculate total_ctc = salary + bonus

df_ctc = filled_df.withColumn("TotalCTC", col("salary") + col("bonus"))
df_ctc.show()

+------+-----+-------+------+-----+--------+
|emp_id| name|   dept|salary|bonus|TotalCTC|
+------+-----+-------+------+-----+--------+
|     1|Arjun|     IT| 78000| 5000|   83000|
|     2|Kavya|     HR| 62000| 2000|   64000|
|     3|Sneha|Finance| 55000| 3000|   58000|
+------+-----+-------+------+-----+--------+



In [62]:
# Filter where total_ctc > 60000
filtered_df = df_ctc.filter(col("TotalCTC") > 60000)
filtered_df.show()

+------+-----+----+------+-----+--------+
|emp_id| name|dept|salary|bonus|TotalCTC|
+------+-----+----+------+-----+--------+
|     1|Arjun|  IT| 78000| 5000|   83000|
|     2|Kavya|  HR| 62000| 2000|   64000|
+------+-----+----+------+-----+--------+



In [64]:
# Save to Parquet and JSON
filtered_df.write.mode("overwrite").parquet("/tmp/final_employees_parquet")
filtered_df.write.mode("overwrite").json("/tmp/final_employees_json")
