# Run this file in Databricks

Introduction to Databricks
●	Install Databricks Community Edition and register for an account.
●	Explore the features available in Databricks Community Edition.

Q1. Consider the student online education dataset and perform the following operations on it.
●	Apply basic transformation functions to it to gain insights into the data.
●	Perform preprocessing transformations on the relevant features:
1.	Normalization
2.	Standardization
3.	Encoding
4.	Feature engineering
●	Create a SQL table from it and perform the following SQL transformations:
○	Filter students based on adaptability level
○	List students based on financial condition
○	Count students gender-wise and also age-group-wise


STEP 1: Import required libraries

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, avg, regexp_replace, expr # Import necessary functions
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler, StringIndexer, OneHotEncoder


STEP 2: Create Spark session

In [0]:
# Name the application on the builder, then obtain the session from it
# (getOrCreate hands back an already-active session when there is one).
session_builder = SparkSession.builder.appName("StudentOnlineEducation")
spark = session_builder.getOrCreate()

STEP 3: Load Dataset

In [0]:
# Load the student dataset, tidy two column names and turn the textual
# 'Age' ranges (e.g. '21-25') into a numeric column.
from pyspark.sql.functions import coalesce, lit, regexp_extract

# Path of the CSV to load; adjust it to where the file lives in your workspace.
file_path = "/Volumes/workspace/default/student/students_adaptability_level_online_education.csv"

# Read the CSV with a header row and let Spark infer the column types.
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Peek at the raw data and its inferred schema.
df.show(5)
df.printSchema()

# Replace the spaces in these two column names with underscores.
df = (
    df.withColumnRenamed("Financial Condition", "Financial_Condition")
      .withColumnRenamed("Adaptivity Level", "Adaptivity_Level")
)

# Take the first run of digits in 'Age' as the numeric age, so '21-25'
# becomes 21 (the lower bound of the range).
# - The column API with a raw-string regex is used instead of a SQL snippet in
#   expr(), so the pattern does not depend on SQL string-literal escaping.
# - regexp_extract returns '' when nothing matches; that is mapped to NULL
#   *before* the cast, because casting '' to BIGINT raises an error when ANSI
#   mode is enabled instead of yielding NULL.
first_number = regexp_extract(col("Age").cast("string"), r"(\d+)", 1)
age_cleaned = when(first_number != "", first_number).cast("bigint")

# Mean of the parsed ages; avg() skips NULLs, so no explicit filter is needed.
mean_age_row = df.agg(avg(age_cleaned)).collect()
# Defensive fallback: use 0 when no age could be parsed at all.
mean_age = mean_age_row[0][0] if mean_age_row and mean_age_row[0][0] is not None else 0

# Overwrite 'Age' with the parsed value, imputing unparseable/missing entries
# with the mean. Both sides are doubles so the column type is unambiguous.
df = df.withColumn(
    "Age",
    coalesce(age_cleaned.cast("double"), lit(float(mean_age))),
)

print("Schema after cleaning and imputation:")
df.printSchema()
df.show(5)

+------+-----+---------------+----------------+----------+--------+-------------+-------------------+-------------+------------+--------------+--------+------+----------------+
|Gender|  Age|Education Level|Institution Type|IT Student|Location|Load-shedding|Financial Condition|Internet Type|Network Type|Class Duration|Self Lms|Device|Adaptivity Level|
+------+-----+---------------+----------------+----------+--------+-------------+-------------------+-------------+------------+--------------+--------+------+----------------+
|   Boy|21-25|     University|  Non Government|        No|     Yes|          Low|                Mid|         Wifi|          4G|           3-6|      No|   Tab|        Moderate|
|  Girl|21-25|     University|  Non Government|        No|     Yes|         High|                Mid|  Mobile Data|          4G|           1-3|     Yes|Mobile|        Moderate|
|  Girl|16-20|        College|      Government|        No|     Yes|          Low|                Mid|         Wifi|

STEP 4: Basic Transformations / Insights

In [0]:
# Quick look at the columns used most in the rest of the analysis.
insight_columns = ["Age", "Gender", "Financial_Condition", "Adaptivity_Level"]
df.select(*insight_columns).show(5)

# Frequency tables: students per gender, then per adaptivity level.
for grouping_column in ("Gender", "Adaptivity_Level"):
    df.groupBy(grouping_column).count().show()

# Five rows with the highest Age values.
df.sort(col("Age").desc()).show(5)

# Derive an Age_Group label from Age:
#   below 20 -> "Teenager", 20 to 25 inclusive -> "Young Adult", else "Adult".
age_col = col("Age")
age_group_label = (
    when(age_col < 20, "Teenager")
    .when(age_col.between(20, 25), "Young Adult")
    .otherwise("Adult")
)
df = df.withColumn("Age_Group", age_group_label)
df.select("Age", "Age_Group").show(5)

+----+------+-------------------+----------------+
| Age|Gender|Financial_Condition|Adaptivity_Level|
+----+------+-------------------+----------------+
|21.0|   Boy|                Mid|        Moderate|
|21.0|  Girl|                Mid|        Moderate|
|16.0|  Girl|                Mid|        Moderate|
|11.0|  Girl|                Mid|        Moderate|
|16.0|  Girl|               Poor|             Low|
+----+------+-------------------+----------------+
only showing top 5 rows
+------+-----+
|Gender|count|
+------+-----+
|  Girl|  542|
|   Boy|  663|
+------+-----+

+----------------+-----+
|Adaptivity_Level|count|
+----------------+-----+
|            High|  100|
|             Low|  480|
|        Moderate|  625|
+----------------+-----+

+------+----+---------------+----------------+----------+--------+-------------+-------------------+-------------+------------+--------------+--------+--------+----------------+---------+
|Gender| Age|Education Level|Institution Type|IT Student|Locat

 STEP 5: Preprocessing Transformations

In [0]:
# -------------------------------------------------------------
# STEP 5: DATA PREPROCESSING
# Adds Age_Normalized, Age_Standardized, Gender_Encoded, Age_Group and
# Experience_Score columns to df and displays a selection of columns.
# -------------------------------------------------------------
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

# --- Ensure 'Age' is numeric and fill missing values with the mean ---
df = df.withColumn("Age", F.col("Age").cast(DoubleType()))
avg_age = df.select(F.mean("Age")).first()[0]
df = df.withColumn(
    "Age",
    F.coalesce(F.col("Age"), F.lit(avg_age).cast(DoubleType())),
)

# --- Collect every statistic needed below in a single Spark job ---
# (one aggregation instead of four separate passes over the data).
age_stats = df.agg(
    F.min("Age").alias("min_age"),
    F.max("Age").alias("max_age"),
    F.mean("Age").alias("mean_age"),
    F.stddev("Age").alias("std_age"),
).first()
min_age, max_age, mean_age, std_age = age_stats

# -------------------------------------------------------------
# 1. NORMALIZATION (Min-Max Scaling to [0, 1])
# -------------------------------------------------------------
# The range is None when there is no data and 0 when every age is identical;
# both cases would otherwise fail in Python or divide by zero in Spark
# (NULL result, or an error when ANSI mode is enabled).
age_range = None if min_age is None or max_age is None else max_age - min_age
if age_range:
    age_normalized = (F.col("Age") - min_age) / age_range
else:
    # Constant column: use the midpoint of [0, 1], the same convention
    # Spark ML's MinMaxScaler applies when max == min.
    age_normalized = F.when(F.col("Age").isNotNull(), F.lit(0.5))
df = df.withColumn("Age_Normalized", age_normalized)

# -------------------------------------------------------------
# 2. STANDARDIZATION (Z-Score)
# -------------------------------------------------------------
# std_age is None or 0 when the spread is undefined or zero; every value then
# equals the mean, so its z-score is defined here as 0.
if std_age:
    age_standardized = (F.col("Age") - mean_age) / std_age
else:
    age_standardized = F.when(F.col("Age").isNotNull(), F.lit(0.0))
df = df.withColumn("Age_Standardized", age_standardized)

# -------------------------------------------------------------
# 3. ENCODING (Manual Label Encoding for Gender)
# -------------------------------------------------------------
# Boy/Male -> 1, Girl/Female -> 0, anything else -> NULL.
df = df.withColumn(
    "Gender_Encoded",
    F.when(F.col("Gender").isin("Boy", "Male"), 1)
     .when(F.col("Gender").isin("Girl", "Female"), 0)
     .otherwise(None)
)

# -------------------------------------------------------------
# 4. FEATURE ENGINEERING
# -------------------------------------------------------------
# Age_Group: below 15 -> School, 15 up to (not including) 20 -> College,
# otherwise University. withColumn replaces an existing Age_Group column.
df = df.withColumn(
    "Age_Group",
    F.when(F.col("Age") < 15, "School Student")
     .when((F.col("Age") >= 15) & (F.col("Age") < 20), "College Student")
     .otherwise("University Student")
)

# Map Adaptivity_Level to a numeric score for modeling: High=3, Moderate=2,
# Low=1. Missing or unexpected levels become NULL (as for Gender_Encoded)
# rather than being silently scored as "Low".
df = df.withColumn(
    "Experience_Score",
    F.when(F.col("Adaptivity_Level") == "High", 3)
     .when(F.col("Adaptivity_Level") == "Moderate", 2)
     .when(F.col("Adaptivity_Level") == "Low", 1)
     .otherwise(None)
)

# -------------------------------------------------------------
# 5. DISPLAY FINAL OUTPUT
# -------------------------------------------------------------
# display() is the Databricks notebook renderer for DataFrames.
display(df.select(
    "Gender",
    "Age",
    "Age_Normalized",
    "Age_Standardized",
    "Gender_Encoded",
    "Education Level",
    "Institution Type",
    "Financial_Condition",
    "Age_Group",
    "Adaptivity_Level",
    "Experience_Score"
))


Gender,Age,Age_Normalized,Age_Standardized,Gender_Encoded,Education Level,Institution Type,Financial_Condition,Age_Group,Adaptivity_Level,Experience_Score
Boy,21.0,0.8,0.9195931033624382,1,University,Non Government,Mid,University Student,Moderate,2
Girl,21.0,0.8,0.9195931033624382,0,University,Non Government,Mid,University Student,Moderate,2
Girl,16.0,0.6,0.1241087605399415,0,College,Government,Mid,College Student,Moderate,2
Girl,11.0,0.4,-0.6713755822825551,0,School,Non Government,Mid,School Student,Moderate,2
Girl,16.0,0.6,0.1241087605399415,0,School,Non Government,Poor,College Student,Low,1
Boy,11.0,0.4,-0.6713755822825551,1,School,Non Government,Poor,School Student,Low,1
Boy,11.0,0.4,-0.6713755822825551,1,School,Non Government,Mid,School Student,Low,1
Boy,11.0,0.4,-0.6713755822825551,1,School,Non Government,Mid,School Student,Moderate,2
Boy,16.0,0.6,0.1241087605399415,1,College,Government,Mid,College Student,Low,1
Boy,11.0,0.4,-0.6713755822825551,1,School,Non Government,Mid,School Student,Moderate,2


STEP 6: SQL Transformations

In [0]:
# Step 6: SQL Transformations
# Each query result is bound to a name first and then printed.

# Expose the DataFrame to Spark SQL under the view name "students".
df.createOrReplaceTempView("students")

# Sanity check: first few rows fetched through SQL.
preview_rows = spark.sql("SELECT * FROM students LIMIT 5")
preview_rows.show()

# Query 1: students whose Adaptivity_Level is 'High', ordered by Age
# (first 10 rows, untruncated).
high_adaptivity_students = spark.sql("""
SELECT Gender,
       Age,
       `Education Level` AS Education_Level,
       `Institution Type` AS Institution_Type,
       Financial_Condition,
       Adaptivity_Level
FROM students
WHERE Adaptivity_Level = 'High'
ORDER BY Age
""")
high_adaptivity_students.show(10, truncate=False)

# Query 2: all students ordered by Financial_Condition
# (first 10 rows, untruncated).
students_by_finance = spark.sql("""
SELECT Gender,
       Age,
       `Education Level` AS Education_Level,
       `Institution Type` AS Institution_Type,
       Financial_Condition,
       Adaptivity_Level
FROM students
ORDER BY Financial_Condition
""")
students_by_finance.show(10, truncate=False)

# Query 3: head-count for every Gender / Age_Group combination.
gender_age_counts = spark.sql("""
SELECT Gender,
       Age_Group,
       COUNT(*) AS Total_Students
FROM students
GROUP BY Gender, Age_Group
ORDER BY Gender, Age_Group
""")
gender_age_counts.show()

# Query 4: mean Experience_Score (rounded to 2 decimals) per
# Financial_Condition, highest average first.
avg_score_by_finance = spark.sql("""
SELECT Financial_Condition,
       ROUND(AVG(Experience_Score), 2) AS Avg_Experience_Score
FROM students
GROUP BY Financial_Condition
ORDER BY Avg_Experience_Score DESC
""")
avg_score_by_finance.show()


+------+----+---------------+----------------+----------+--------+-------------+-------------------+-------------+------------+--------------+--------+------+----------------+------------------+--------------+-------------------+--------------+----------------+
|Gender| Age|Education Level|Institution Type|IT Student|Location|Load-shedding|Financial_Condition|Internet Type|Network Type|Class Duration|Self Lms|Device|Adaptivity_Level|         Age_Group|Age_Normalized|   Age_Standardized|Gender_Encoded|Experience_Score|
+------+----+---------------+----------------+----------+--------+-------------+-------------------+-------------+------------+--------------+--------+------+----------------+------------------+--------------+-------------------+--------------+----------------+
|   Boy|21.0|     University|  Non Government|        No|     Yes|          Low|                Mid|         Wifi|          4G|           3-6|      No|   Tab|        Moderate|University Student|           0.8| 0.91