In [1]:
# Import Required Libraries
import os
import pandas as pd
from datetime import datetime, timedelta
import findspark

findspark.init()



# PySpark imports: record in `pyspark_available` whether the import succeeded,
# so later cells can check the flag before touching Spark.
try:
    from pyspark.sql import SparkSession
except ImportError:
    pyspark_available = False
    print("PySpark not found. Please install with: pip install pyspark")
else:
    pyspark_available = True
    print("PySpark is available!")




PySpark is available!


In [2]:
if not pyspark_available:
    print("Cannot proceed without PySpark. Please install PySpark first.")
else:
    # Custom session configuration, applied option by option to the builder.
    session_options = {
        "spark.sql.adaptive.enabled": "true",
        "spark.sql.adaptive.coalescePartitions.enabled": "true",
        "spark.driver.memory": "8g",
        "spark.executor.memory": "4g",
    }
    session_builder = SparkSession.builder.appName("PySpark")
    for option_name, option_value in session_options.items():
        session_builder = session_builder.config(option_name, option_value)
    spark = session_builder.getOrCreate()

    # Only warnings and above are logged, to keep the cell output readable.
    spark_context = spark.sparkContext
    spark_context.setLogLevel("WARN")

    # Summary of the session that was just created (or reused).
    print("✓ SparkSession created successfully!")
    print(f"Spark Version: {spark.version}")
    print(f"Application Name: {spark_context.appName}")
    print(f"Master: {spark_context.master}")
    print(f"Default Parallelism: {spark_context.defaultParallelism}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/27 16:25:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


✓ SparkSession created successfully!
Spark Version: 3.5.0
Application Name: PySpark
Master: local[*]
Default Parallelism: 4


In [None]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.stat import ChiSquareTest


# Load the cleaned dataset from the silver layer.
cleaned_data = spark.read.parquet("./data/silver/silverResult")

# Numeric predictors: each one is correlated against Exam_Score below.
numeric_fields = [
    "Hours_Studied",
    "Attendance",
    "Sleep_Hours",
    "Previous_Scores",
    "Tutoring_Sessions",
    "Physical_Activity",
]

# Categorical predictors: rows with a null in any of them are dropped below.
categorical_fields = [
    "Parental_Involvement",
    "Access_to_Resources",
    "Extracurricular_Activities",
    "Motivation_Level",
    "Internet_Access",
    "Family_Income",
    "Teacher_Quality",
    "School_Type",
    "Peer_Influence",
    "Learning_Disabilities",
    "Parental_Education_Level",
    "Distance_from_Home",
    "Gender",
]

# Keep only rows where every predictor and the target are present.
required_columns = numeric_fields + categorical_fields + ["Exam_Score"]
cleaned_data = cleaned_data.dropna(subset=required_columns)

# Cast the numeric predictors and the target to double in one projection.
cleaned_data = cleaned_data.withColumns(
    {name: col(name).cast("double") for name in numeric_fields + ["Exam_Score"]}
)

# Correlation of each numeric predictor with Exam_Score.
correlations = {
    name: cleaned_data.stat.corr(name, "Exam_Score") for name in numeric_fields
}

print("Correlation for the numerical fields")
for field_name, corr_value in correlations.items():
    print(f"{field_name}: {corr_value}")
print("")
# ANOVA eta-squared (SS_between / SS_total) of every categorical field against
# Exam_Score. Only the columns needed are collected to the driver as pandas.
pdf = cleaned_data.select(categorical_fields + ["Exam_Score"]).toPandas()

# Both quantities depend only on Exam_Score, so they are computed once instead
# of once per categorical field.
overall_mean = pdf["Exam_Score"].mean()
ss_total = ((pdf["Exam_Score"] - overall_mean) ** 2).sum()

eta_squared_results = {}

for cat in categorical_fields:
    if ss_total == 0:
        # Empty data or a constant target: there is no variance to explain,
        # so report 0.0 rather than propagating a 0/0 NaN into the ranking.
        eta_squared_results[cat] = 0.0
        continue

    # Size and mean of Exam_Score within each category level.
    group_stats = pdf.groupby(cat)["Exam_Score"].agg(["count", "mean"])
    ss_between = (group_stats["count"] * (group_stats["mean"] - overall_mean) ** 2).sum()

    # Keep full precision here: the values are ranked against unrounded r²
    # values later, and rounding first would create artificial ties. Rounding
    # is applied only where the numbers are displayed or stored.
    eta_squared_results[cat] = float(ss_between / ss_total)

# One mapping for both measures: r² for the numeric fields, then eta² for the
# categorical fields.
combined_variance = {name: r ** 2 for name, r in correlations.items()}
combined_variance.update(eta_squared_results)

# r² and eta² are not the same statistic, but they are treated as roughly
# comparable here, so all fields are ranked together from highest to lowest.
ranked_features = sorted(combined_variance, key=combined_variance.get, reverse=True)
sorted_variance = {name: combined_variance[name] for name in ranked_features}

# Print the ranking and collect the same values (rounded to 3 decimals) as rows.
print("Variance of fields:")
rows = []
for feature, var_exp in sorted_variance.items():
    print(f"{feature}: {var_exp:.3f}")
    rows.append(Row(Feature=feature, Variance=round(float(var_exp), 3)))

variance_df = spark.createDataFrame(rows)
variance_df.show()


########################## Prediction #######################
# Pack the numeric predictors into a single "features" vector column.
assembler = VectorAssembler(inputCols=numeric_fields, outputCol="features")
data = assembler.transform(cleaned_data).select("features", "Exam_Score")

# 80/20 split with a fixed seed; the training side is capped at 6000 rows.
splits = data.randomSplit([0.8, 0.2], seed=42)
train_data = splits[0].limit(6000)
test_data = splits[1]

# Fit a linear regression of Exam_Score on the assembled features.
lr = LinearRegression(featuresCol="features", labelCol="Exam_Score")
model = lr.fit(train_data)

# One coefficient per numeric predictor, followed by the intercept.
print("Coefficients for the numerical fields")
for field_name, weight in zip(numeric_fields, model.coefficients):
    print(f"{field_name}: {weight}")
print(f"Intercept: {model.intercept}")

# Score the held-out rows and preview actual vs. predicted values.
predictions = model.transform(test_data)
predictions.select("Exam_Score", "prediction").show(10)

# Root-mean-squared error on the held-out rows.
evaluator = RegressionEvaluator(
    labelCol="Exam_Score",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}")

# Size and partitioning of the assembled dataset.
row_count = data.count()
partition_count = data.rdd.getNumPartitions()
print(f"Number of rows: {row_count}")
print(f"Number of partitions: {partition_count}")

# Load the students that have no exam score and assemble the same feature
# vector the model was trained on.
cleaned_data_without_exam = spark.read.parquet("./data/silver/silverResultForStudentsWithoutExam")
cleaned_data_without_exam = assembler.transform(cleaned_data_without_exam)

# Predict an exam score for each of those students.
predicted_df = model.transform(cleaned_data_without_exam) \
                    .withColumnRenamed("prediction", "Predicted_Exam_Score")
predicted_df.select("Student_ID", "Predicted_Exam_Score").show(10)

# Store student info together with the predicted exam score.
# The assembled feature column is an ML vector, which the CSV data source
# cannot serialize, so it is dropped before writing. Its values are still
# present in the individual numeric columns it was assembled from.
predicted_df.drop(assembler.getOutputCol()) \
    .coalesce(1).write.mode("overwrite").option("header", "true") \
    .csv("./data/gold/gold_student_predictions")

# Store the per-feature variance ranking held in variance_df (one row per
# feature, numeric and categorical alike).
variance_df.coalesce(1).write.mode("overwrite").option("header", "true").csv("./data/gold/gold_feature_variance")

# Release the Spark session; the session-creation cell must be re-run before
# any Spark cell can be executed again.
spark.stop()


                                                                                

Correlation for the numerical fields
Hours_Studied: 0.4418645699008704
Attendance: 0.5817189471046391
Sleep_Hours: -0.017758159802785355
Previous_Scores: 0.17227893475096628
Tutoring_Sessions: 0.15402576160955206
Physical_Activity: 0.025237375089293388

Variance of fields:
Attendance: 0.338
Hours_Studied: 0.195
Previous_Scores: 0.030
Access_to_Resources: 0.028
Parental_Involvement: 0.025
Tutoring_Sessions: 0.024
Parental_Education_Level: 0.011
Peer_Influence: 0.010
Family_Income: 0.009
Motivation_Level: 0.008
Distance_from_Home: 0.008
Learning_Disabilities: 0.007
Teacher_Quality: 0.006
Extracurricular_Activities: 0.004
Internet_Access: 0.003
Physical_Activity: 0.001
Sleep_Hours: 0.000
School_Type: 0.000
Gender: 0.000
+--------------------+--------+
|             Feature|Variance|
+--------------------+--------+
|          Attendance|   0.338|
|       Hours_Studied|   0.195|
|     Previous_Scores|    0.03|
| Access_to_Resources|   0.028|
|Parental_Involvement|   0.025|
|   Tutoring_Sess

25/11/27 16:26:00 WARN Instrumentation: [88ceae76] regParam is zero, which might cause numerical instability and overfitting.
25/11/27 16:26:00 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/11/27 16:26:01 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Coefficients for the numerical fields
Hours_Studied: 0.29206437145547837
Attendance: 0.19874336767816414
Sleep_Hours: -0.029951364368375175
Previous_Scores: 0.047634604855014126
Tutoring_Sessions: 0.46894348479899794
Physical_Activity: 0.12917150729624682
Intercept: 41.07062426373279
+----------+-----------------+
|Exam_Score|       prediction|
+----------+-----------------+
|      65.0|65.97730584928655|
|      62.0|60.61761918684618|
|      63.0|64.41548958884157|
|      61.0|60.13833170843973|
|      64.0|66.48930298079254|
|      61.0|61.32130055919798|
|      63.0|63.09438335629221|
|      63.0| 63.8381955877407|
|      63.0|61.77880786151544|
|      66.0|65.40692208923832|
+----------+-----------------+
only showing top 10 rows

RMSE: 2.447822507503709
Number of rows: 6378
Number of partitions: 4
+----------+--------------------+
|Student_ID|Predicted_Exam_Score|
+----------+--------------------+
|    7774.0|   68.65705301605867|
|    7776.0|   67.63975379803064|
|    7786.0|   7