In [7]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import importlib.util
import os
from pyspark.sql.functions import broadcast
from pyspark.mllib.evaluation import MulticlassMetrics


# Load preprocessing module
spec = importlib.util.spec_from_file_location("utils", '../scripts/preprocessing.py')
preprocess = importlib.util.module_from_spec(spec)
spec.loader.exec_module(preprocess)
print(preprocess.test())
# Standard library imports
import os
import sys
import pwd
import time
import json
import re
import warnings
import base64 as b64
from contextlib import closing
from urllib.parse import urlparse
from random import randrange
from itertools import chain

# Third-party imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import trino
from trino.dbapi import connect
from trino.auth import BasicAuthentication, JWTAuthentication
import seaborn as sns

# PySpark core
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.types import DoubleType, IntegerType, DateType

# PySpark SQL functions
from pyspark.sql import functions as F
from pyspark.sql.functions import rank, col, avg, date_format, count, year,expr, coalesce, lit, to_timestamp,unix_timestamp, hour, month, when,create_map, monotonically_increasing_id

# PySpark ML
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder,VectorAssembler, StandardScaler,IndexToString
from pyspark.ml.classification import LogisticRegression,GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.functions import vector_to_array
#setup spark session and trino
username = pwd.getpwuid(os.getuid()).pw_name
hadoopFS=os.getenv('HADOOP_FS', None)
groupName = 'U1'
print(os.getenv('SPARK_HOME'))
print(f"hadoopFSs={hadoopFS}")
print(f"username={username}")
print(f"group={groupName}")

spark = SparkSession\
            .builder\
            .appName(pwd.getpwuid(os.getuid()).pw_name)\
            .config('spark.ui.port', randrange(4040, 4440, 5))\
            .config("spark.executorEnv.PYTHONPATH", ":".join(sys.path)) \
            .config('spark.jars', f'{hadoopFS}/data/com-490/jars/iceberg-spark-runtime-3.5_2.13-1.6.1.jar')\
            .config('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions')\
            .config('spark.sql.catalog.iceberg', 'org.apache.iceberg.spark.SparkCatalog')\
            .config('spark.sql.catalog.iceberg.type', 'hadoop')\
            .config('spark.sql.catalog.iceberg.warehouse', f'{hadoopFS}/data/com-490/iceberg/')\
            .config('spark.sql.catalog.spark_catalog', 'org.apache.iceberg.spark.SparkSessionCatalog')\
            .config('spark.sql.catalog.spark_catalog.type', 'hadoop')\
            .config('spark.sql.catalog.spark_catalog.warehouse', f'{hadoopFS}/user/{username}/assignment-3/warehouse')\
            .config("spark.sql.warehouse.dir", f'{hadoopFS}/user/{username}/assignment-3/spark/warehouse')\
            .config('spark.eventLog.gcMetrics.youngGenerationGarbageCollectors', 'G1 Young Generation')\
            .config("spark.executor.memory", "6g")\
            .config("spark.executor.cores", "4")\
            .config("spark.executor.instances", "4")\
            .master('yarn')\
            .getOrCreate()

warnings.simplefilter(action='ignore', category=UserWarning)
warnings.filterwarnings("ignore", category=UserWarning, message="pandas only supports SQLAlchemy connectable .*")



def getUsername():
    payload = os.environ.get('EPFL_COM490_TOKEN').split('.')[1]
    payload=payload+'=' * (4 - len(payload) % 4)
    obj = json.loads(b64.urlsafe_b64decode(payload))
    if (time.time() > int(obj.get('exp')) - 3600):
        raise Exception('Your credentials have expired, please restart your Jupyter Hub server:'
                        'File>Hub Control Panel, Stop My Server, Start My Server.')
    time_left = int((obj.get('exp') - time.time())/3600)
    return obj.get('sub'), time_left

username, validity_h = getUsername()
hadoopFS = os.environ.get('HADOOP_FS')
namespace = 'iceberg.' + username
sharedNS = 'iceberg.com490_iceberg'

if not re.search('[A-Z][0-9]', groupName):
    raise Exception('Invalid group name {groupName}')

print(f"you are: {username}")
print(f"credentials validity: {validity_h} hours left.")
print(f"shared namespace is: {sharedNS}")
print(f"your namespace is: {namespace}")
print(f"your group is: {groupName}")

trinoAuth = JWTAuthentication(os.environ.get('EPFL_COM490_TOKEN'))
trinoUrl  = urlparse(os.environ.get('TRINO_URL'))
Query=[]

print(f"Warehouse URL: {trinoUrl.scheme}://{trinoUrl.hostname}:{trinoUrl.port}/")

conn = connect(
    host=trinoUrl.hostname,
    port=trinoUrl.port,
    auth=trinoAuth,
    http_scheme=trinoUrl.scheme,
    verify=True
)

print('Connected!')

Imported successfully!
/opt/spark
hadoopFSs=hdfs://iccluster059.iccluster.epfl.ch:9000
username=omanovic
group=U1
you are: omanovic
credentials validity: 160 hours left.
shared namespace is: iceberg.com490_iceberg
your namespace is: iceberg.omanovic
your group is: U1
Warehouse URL: https://iccluster028.iccluster.epfl.ch:8443/
Connected!


25/05/29 22:07:21 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Load model

In [3]:
full_preds = spark.read.parquet(f"{hadoopFS}/user/com-490/group/U1/multiclass_model.parquet") # previous model

                                                                                

In [None]:
full_preds.printSchema()

# Class-wise precision - convert to RDD

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import create_map, lit, col
from itertools import chain
from pyspark.mllib.evaluation import MulticlassMetrics

spark = SparkSession.builder.appName("DelayCategoryMetrics").getOrCreate()

label_list = ["small", "medium", "big"]
entries = list(chain.from_iterable([(lbl, float(idx)) 
                                    for idx, lbl in enumerate(label_list)]))
label_map = create_map(*[lit(x) for x in entries])

eval_df = full_preds \
    .withColumn("label_idx", label_map[col("delay_category")]) \
    .withColumn("pred_idx",  label_map[col("predicted_delay_cat")])

pred_label_rdd = eval_df.select("pred_idx", "label_idx") \
    .rdd.map(lambda row: (row.pred_idx, row.label_idx))

metrics = MulticlassMetrics(pred_label_rdd)

print("Class     Precision  Recall  F1-score")
for idx, lbl in enumerate(label_list):
    p = metrics.precision(float(idx))
    r = metrics.recall(float(idx))
    f1 = 2*p*r/(p+r) if (p + r) > 0 else 0.0
    print(f"{lbl:6s}    {p:9.3f}  {r:6.3f}   {f1:8.3f}")

wp = metrics.weightedPrecision
wr = metrics.weightedRecall
wf1 = 2*wp*wr/(wp+wr) if (wp + wr) > 0 else 0.0
acc = metrics.accuracy

print("\nWeighted Precision: {:.3f}".format(wp))
print("Weighted Recall:    {:.3f}".format(wr))
print("Weighted F1-score:  {:.3f}".format(wf1))
print("Overall Accuracy:   {:.3f}".format(acc))

spark.stop()


25/05/29 22:08:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Class     Precision  Recall  F1-score


                                                                                

small         0.928   0.724      0.813
medium        0.161   0.525      0.247
big           0.130   0.022      0.037

Weighted Precision: 0.840
Weighted Recall:    0.692
Weighted F1-score:  0.759
Overall Accuracy:   0.692


# Metrics overall:

```
Class     Precision  Recall  F1-score
                                                                                
small         0.928   0.724      0.813

medium        0.161   0.525      0.247

big           0.130   0.022      0.037

Weighted Precision: 0.840
Weighted Recall:    0.692
Weighted F1-score:  0.759
Overall Accuracy:   0.692

```

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import when, col

label_to_idx = {lbl: i for i,lbl in enumerate(label_ix.labels)}
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol=None,  
    labelCol="label",
    metricName="areaUnderROC"
)

aucs = {}
for cls in ["small","medium","big"]:
    idx = label_to_idx[cls]
    prob_col = f"prob_{cls}"
    one_vs_rest = full_preds.withColumn(
        "isPos",
        when(col("label") == idx, 1.0).otherwise(0.0)
    )
    evaluator.setRawPredictionCol(prob_col).setLabelCol("isPos")
    auc = evaluator.evaluate(one_vs_rest)
    aucs[cls] = auc

print("One-vs-rest AUCs:", aucs)


In [None]:
pdf = full_preds.select("delay_category_q","predicted_category_q").toPandas()
cm  = pd.crosstab(pdf.delay_category_q, pdf.predicted_category_q,rownames=['True'], colnames=['Pred'], normalize='index')

plt.figure(figsize=(8,8),dpi=1200)
sns.heatmap(cm, annot=True, fmt=".2f", vmin=0, vmax=1)
plt.title("Confusion Matrix")
plt.ylabel('True Category')
plt.xlabel('Predicted Category')
plt.show()
