### Bronze Layer

In [0]:
# Load the "Career_Guidance" table into a Spark DataFrame.
df = spark.table("Career_Guidance")

In [0]:
from pyspark.sql.functions import col, count

### Silver Layer

In [0]:
# 3.1: Drop fully identical rows and report the row count before and after.
original_count = df.count()

# distinct() compares every column, same as dropDuplicates() with no subset.
df = df.distinct()

dedup_count = df.count()
print(f"Original rows: {original_count}, After dedup: {dedup_count}")

Original rows: 1000, After dedup: 1000


In [0]:
# 3.2: Enforce numeric types — Salary_LPA becomes double, ID becomes int.
df = (
    df
    .withColumn("Salary_LPA", col("Salary_LPA").cast("double"))
    .withColumn("ID", col("ID").cast("int"))
)

In [0]:
from pyspark.sql.functions import col, trim

# Columns that are cast to string and stripped of leading/trailing whitespace.
string_cols = ["Sector", "Stream", "Course", "Qualification", "Career", "Demand_2025"]

for col_name in string_cols:
    # Replace each column in place with its trimmed string form.
    trimmed = trim(col(col_name).cast("string"))
    df = df.withColumn(col_name, trimmed)

# Preview the first five rows after trimming.
display(df.limit(5))

ID,Sector,Stream,Course,Qualification,Career,Salary_LPA,Demand_2025
1,AI & ML,Commerce,B.Com,10+2 + Bachelor's,Prompt Engineer (AI & ML),9.6,High
2,Engineering,Commerce,BHM,10+2 + Diploma,Electrical Engineer (Engineering),11.1,Stable
3,Green Energy,Science (PCB),B.Sc Nursing,10+2 + NEET,Renewable Energy Engineer (Green Energy),12.6,Evergreen
4,Science & Research,Any Stream,LLB,Bachelor's + UPSC,Biotechnologist (Science & Research),4.0,Emerging
5,Hospitality & Travel,Science (PCM),B.Tech Civil,10+2 + JEE,Hotel Manager (Hospitality & Travel),11.2,High


In [0]:
# Preview the first five rows of the preprocessed data
# (deduplicated, type-cast and whitespace-trimmed by the cells above).
display(df.limit(5))

ID,Sector,Stream,Course,Qualification,Career,Salary_LPA,Demand_2025
1,AI & ML,Commerce,B.Com,10+2 + Bachelor's,Prompt Engineer (AI & ML),9.6,High
2,Engineering,Commerce,BHM,10+2 + Diploma,Electrical Engineer (Engineering),11.1,Stable
3,Green Energy,Science (PCB),B.Sc Nursing,10+2 + NEET,Renewable Energy Engineer (Green Energy),12.6,Evergreen
4,Science & Research,Any Stream,LLB,Bachelor's + UPSC,Biotechnologist (Science & Research),4.0,Emerging
5,Hospitality & Travel,Science (PCM),B.Tech Civil,10+2 + JEE,Hotel Manager (Hospitality & Travel),11.2,High


### Gold Layer

In [0]:
# 4.2: Demand distribution — number of rows per Demand_2025 category, largest first.
# The sort uses Column.desc() on the already-imported `col`: the bare `desc`
# function is not imported by any visible cell, so calling it raises NameError
# on a fresh kernel (Restart & Run All).
demand_dist = (
    df.groupBy("Demand_2025")
    .agg(count("*").alias("Count"))
    .orderBy(col("Count").desc())
)
display(demand_dist)

Demand_2025,Count
Emerging,205
Evergreen,204
Growing,201
High,196
Stable,194


Databricks visualization. Run in Databricks to view.

In [0]:
# 4.3: Top careers by salary — the ten rows with the highest Salary_LPA.
# The sort uses Column.desc() on the already-imported `col`: the bare `desc`
# function is not imported by any visible cell, so calling it raises NameError
# on a fresh kernel (Restart & Run All).
top_careers = df.orderBy(col("Salary_LPA").desc()).limit(10)
display(top_careers)

ID,Sector,Stream,Course,Qualification,Career,Salary_LPA,Demand_2025
23,FinTech,Science (PCM/CS),B.Tech Aerospace,10+2 + JEE,Blockchain Specialist (FinTech),25.0,Stable
608,Cybersecurity,Science (PCM/CS),B.Sc Data Science,Bachelor's + Cert,Security Analyst (Cybersecurity) Variant 520,24.9,High
896,Government & Defense,Science (PCM/CS),B.Sc Data Science,10+2 + Diploma,Metaverse Designer (Government),24.9,Emerging
95,Law,Science (PCB),BDS,10+2 + NEET,Judge (Law),24.9,Stable
180,Education,Science (PCM),B.Tech CSE,10+2 + JEE,Professor (Education) Variant 106,24.9,Emerging
80,Engineering,Science (PCM/CS),B.Sc Data Science,10+2 + Diploma,Aerospace Engineer (Engineering),24.9,High
346,Engineering,Science (PCM/CS),B.Tech CSE,10+2 + JEE,Aerospace Engineer (Engineering) Variant 264,24.8,Emerging
207,Media & Design,Science (PCM/CS),B.Tech Aerospace,10+2 + JEE,Journalist (Media & Design) Variant 129,24.8,Evergreen
546,Government & Defense,Any Stream,LLB,Bachelor's + UPSC,Policy Maker (Government & Defense) Variant 460,24.8,Stable
163,Gig Economy,Science (PCM/CS),B.Tech Aerospace,10+2 + JEE,Platform Developer (Gig Economy) Variant 90,24.7,Growing


In [0]:
# 4.4: Register as temp table for SQL analytics
# Expose the current DataFrame to Spark SQL under the view name "careers"
# (replaces any existing temp view with that name).
df.createOrReplaceTempView("careers")
# Per-Stream row count and mean Salary_LPA, ordered by highest average salary first.
sql_query = """
SELECT Stream, COUNT(*) as Count, AVG(Salary_LPA) as Avg_Salary
FROM careers
GROUP BY Stream
ORDER BY Avg_Salary DESC
"""
# Run the query and show the aggregated result.
sql_result = spark.sql(sql_query)
display(sql_result)

Stream,Count,Avg_Salary
Science (PCM/CS),119,14.615126050420166
Any Stream,191,14.290575916230368
Science (PCM),257,13.859922178988338
Arts,81,13.670370370370378
Commerce,117,13.607692307692307
Science (PCB),235,13.58042553191489


Databricks visualization. Run in Databricks to view.

### Big Data Analytics

In [0]:
from pyspark.ml.feature import StringIndexer

# One StringIndexer per categorical column; each writes to "<column>_index".
# The loop variable is deliberately not named `col`, so it cannot be confused
# with pyspark's col() function used elsewhere in the notebook.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index")
    for name in ("Sector", "Stream", "Course", "Qualification", "Demand_2025")
]

In [0]:
from pyspark.ml.feature import OneHotEncoder

# One OneHotEncoder per categorical column: reads "<column>_index" and
# writes "<column>_ohe".
encoders = [
    OneHotEncoder(inputCol=f"{name}_index", outputCol=f"{name}_ohe")
    for name in ("Sector", "Stream", "Course", "Qualification", "Demand_2025")
]

In [0]:
# 5.3: Combine Salary_LPA and the one-hot encoded categorical columns into a
# single "features" vector column (for potential ML).
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=[
        "Salary_LPA",
        *(
            f"{name}_ohe"
            for name in ("Sector", "Stream", "Course", "Qualification", "Demand_2025")
        ),
    ],
    outputCol="features",
)

In [0]:
# NOTE(review): this cell repeats the VectorAssembler definition from the
# previous cell; running it only rebinds `assembler` to an equivalent object.
# Consider deleting one of the two cells.
from pyspark.ml.feature import VectorAssembler

# Inputs: Salary_LPA plus the "<column>_ohe" vectors; output column: "features".
assembler = VectorAssembler(
    inputCols=["Salary_LPA"] + [
        col + "_ohe"
        for col in [
            "Sector",
            "Stream",
            "Course",
            "Qualification",
            "Demand_2025"
        ]
    ],
    outputCol="features"
)

In [0]:
# Build the indexers -> encoders -> assembler pipeline, fit it on df, and
# produce df_features with the engineered columns added.
# (This cell computes the features; it does not display them.)
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[*indexers, *encoders, assembler])
model = pipeline.fit(df)
df_features = model.transform(df)

### **Platinum Layer**

> ### A student can filter on any column according to their preferences and see the matching results

In [0]:
# Show the preprocessed DataFrame `df` (not the feature-engineered `df_features`).
display(df)

ID,Sector,Stream,Course,Qualification,Career,Salary_LPA,Demand_2025
1,AI & ML,Commerce,B.Com,10+2 + Bachelor's,Prompt Engineer (AI & ML),9.6,High
2,Engineering,Commerce,BHM,10+2 + Diploma,Electrical Engineer (Engineering),11.1,Stable
3,Green Energy,Science (PCB),B.Sc Nursing,10+2 + NEET,Renewable Energy Engineer (Green Energy),12.6,Evergreen
4,Science & Research,Any Stream,LLB,Bachelor's + UPSC,Biotechnologist (Science & Research),4.0,Emerging
5,Hospitality & Travel,Science (PCM),B.Tech Civil,10+2 + JEE,Hotel Manager (Hospitality & Travel),11.2,High
6,Gig Economy,Any Stream,LLB,10+2 + Diploma,Gig Worker Coordinator (Gig Economy),11.7,Stable
7,Hospitality & Travel,Science (PCB),B.Sc Nursing,10+2 + NEET,Travel Agent (Hospitality & Travel),13.2,Evergreen
8,Cybersecurity,Science (PCM),B.Sc Data Science,10+2 + Diploma,Incident Responder (Cybersecurity),9.8,Evergreen
9,AI & ML,Science (PCM/CS),B.Tech Aerospace,10+2 + JEE,Prompt Engineer (AI & ML) Variant 1,19.2,Evergreen
10,Engineering,Science (PCB),B.Sc Nursing,10+2 + NEET,EV Designer (Engineering),21.6,Evergreen


Databricks visualization. Run in Databricks to view.