In [0]:
spark.sql("CREATE SCHEMA IF NOT EXISTS presentation")

DataFrame[]

In [0]:
df_master = spark.sql("""
    SELECT 
        -- Keys for joining/tracking
        f.icao24,
        f.callsign,
        f.request_time_utc,
        w.observation_time as weather_time,
        
        -- Features for Machine Learning
        f.origin_country,
        COALESCE(f.baro_altitude, 0) as baro_altitude,
        f.velocity,
        f.true_track as plane_heading,
        w.temperature_c,
        w.wind_speed_kmh,
        w.wind_direction,
        w.weather_code,
        
        -- ENGINEERED FEATURE: Headwind/Crosswind Calculation
        -- Im calculating the angle difference between plane and wind
        -- 0 = Tailwind (Wind behind you), 180 = Headwind (Wind in face)
        ABS(f.true_track - w.wind_direction) as wind_offset_angle
        
    FROM silver.flights_parsed f
    INNER JOIN silver.weather_parsed w
    ON 
        -- Spatial Match: 1 decimal place ~= 10km grid
        ROUND(f.latitude, 0) = ROUND(w.latitude, 0)
        AND 
        ROUND(f.longitude, 0) = ROUND(w.longitude, 0)
        
    WHERE 
        -- Temporal Match: Weather must be within 90 minutes (5400 sec)
        ABS(unix_timestamp(f.request_time_utc) - unix_timestamp(w.observation_time)) <= 14400
""")

In [0]:
df_master.display()

icao24,callsign,request_time_utc,weather_time,origin_country,baro_altitude,velocity,plane_heading,temperature_c,wind_speed_kmh,wind_direction,weather_code,wind_offset_angle
7c35f2,KXW,2026-01-18 01:46:28,2026-01-18T03:45:00Z,Australia,457.2,50.05,12.46,21.5,17.7,213,0,200.54
7c6b37,JST885,2026-01-18 01:46:28,2026-01-18T03:45:00Z,Australia,0.0,0.06,351.56,28.5,8.6,165,0,186.56
7c6b3b,JST890,2026-01-18 01:46:28,2026-01-18T03:45:00Z,Australia,0.0,3.09,165.94,20.5,15.8,135,80,30.94
7c7cd2,JST774,2026-01-18 01:46:28,2026-01-18T03:45:00Z,Australia,0.0,1.54,351.56,28.5,8.6,165,0,186.56
7c7cce,JST844,2026-01-18 01:46:28,2026-01-18T03:45:00Z,Australia,3596.64,192.93,275.97,20.5,15.8,135,80,140.97000000000003
7c7ca8,ASTR417,2026-01-18 01:46:28,2026-01-18T03:45:00Z,Australia,944.88,59.24,203.0,28.5,8.6,165,0,38.0
7c7ca0,ASTR406,2026-01-18 01:46:28,2026-01-18T03:45:00Z,Australia,1074.42,50.47,29.96,28.5,8.6,165,0,135.04
7c7ca9,ASTR436,2026-01-18 01:46:28,2026-01-18T03:45:00Z,Australia,624.84,55.24,32.06,28.5,8.6,165,0,132.94
7c7cf7,YYX,2026-01-18 01:46:28,2026-01-18T03:45:00Z,Australia,0.0,5.66,36.56,35.7,7.4,317,2,280.44
7c6c97,,2026-01-18 01:46:28,2026-01-18T03:45:00Z,Australia,0.0,6.43,281.25,20.5,15.8,135,80,146.25


In [0]:
(df_master.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("presentation.flight_weather_master")
)

Ignore the below code, was used when vector assembeler was'nt working on shared compute

In [0]:
# # Train Model (Scikit-Learn)
# # ---------------------------------------------------------
# import pandas as pd
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error
# import numpy as np

# # 1. Load Data
# df_spark = spark.table("presentation.flight_weather_master")

# # Fill nulls to prevent errors
# df_spark = df_spark.na.fill(0, subset=["baro_altitude", "velocity", "temperature_c", "wind_speed_kmh", "wind_offset_angle"])

# # Convert to Pandas (Pull to driver)
# pdf = df_spark.select("baro_altitude", "temperature_c", "wind_speed_kmh", "wind_offset_angle", "velocity").toPandas()

# # 2. Define X and y
# X = pdf[["baro_altitude", "temperature_c", "wind_speed_kmh", "wind_offset_angle"]]
# y = pdf["velocity"]

# # 3. Split & Train
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# model = LinearRegression()
# model.fit(X_train, y_train)

# # 4. Evaluate
# predictions = model.predict(X_test)
# rmse = np.sqrt(mean_squared_error(y_test, predictions))

# print(f"Model Trained! RMSE: {rmse:.2f} m/s")
# print(f"Intercept: {model.intercept_:.2f}")
# print("Coefficients:", dict(zip(X.columns, model.coef_)))

🏃 View run selective-robin-346 at: https://adb-7405610778962489.9.azuredatabricks.net/ml/experiments/3246080758235222/runs/573f43d369b847d9bb42e99f1b35c9c3
🧪 View experiment at: https://adb-7405610778962489.9.azuredatabricks.net/ml/experiments/3246080758235222
Model Trained! RMSE: 32.51 m/s
Intercept: 55.45
Coefficients: {'baro_altitude': 0.029007838817224112, 'temperature_c': -0.44249279720176266, 'wind_speed_kmh': -0.9272080939868992, 'wind_offset_angle': -0.04407098079859996}


In [0]:
# Train Model (Spark MLlib)
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# 1. Prepare Data
# Spark ML requires all input features to be in a single vector column
feature_cols = ["baro_altitude", "temperature_c", "wind_speed_kmh", "wind_offset_angle"]

# Handle Nulls: Spark ML will crash if there are nulls in features
df_gold = spark.table("presentation.flight_weather_master")
df_clean = df_gold.na.fill(0, subset=feature_cols + ["velocity"])

# Create the Vector Assembler
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_ml_ready = assembler.transform(df_clean)

# 2. Split Data (80% Training, 20% Testing)
train_data, test_data = df_ml_ready.randomSplit([0.8, 0.2], seed=42)

# 3. Train the Model
lr = LinearRegression(featuresCol="features", labelCol="velocity")
lr_model = lr.fit(train_data)

# 4. Evaluate
predictions = lr_model.transform(test_data)
evaluator = RegressionEvaluator(labelCol="velocity", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)

print(f"Spark ML Model Trained! RMSE: {rmse:.2f} m/s")
print(f"Intercept: {lr_model.intercept:.2f}")
print("Coefficients:")
for col_name, coef in zip(feature_cols, lr_model.coefficients):
    print(f"  - {col_name}: {coef:.4f}")

🏃 View run sedate-gull-984 at: https://adb-7405610778962489.9.azuredatabricks.net/ml/experiments/3246080758235222/runs/81e961a368594396b50a89c09b03529a
🧪 View experiment at: https://adb-7405610778962489.9.azuredatabricks.net/ml/experiments/3246080758235222
Spark ML Model Trained! RMSE: 33.31 m/s
Intercept: 29.33
Coefficients:
  - baro_altitude: 0.0294
  - temperature_c: 0.3664
  - wind_speed_kmh: -0.4669
  - wind_offset_angle: -0.0527


In [0]:
# Score Data & Save Prediction Table
from pyspark.sql.functions import col, expr

# 1. Run Inference on the FULL dataset
# We reuse the 'df_ml_ready' dataframe from above (or re-transform the master)
full_data_w_features = assembler.transform(df_clean)

# The model adds a column called 'prediction'
df_scored = lr_model.transform(full_data_w_features)

# 2. Calculate Efficiency Score
# Spark SQL math: (Actual / Predicted) * 100
df_final = df_scored.withColumn(
    "efficiency_score", 
    expr("(velocity / prediction) * 100")
)

# Handle potential divide-by-zero or nulls
df_final = df_final.na.fill(0, subset=["efficiency_score"])

# 3. Select final columns and Save
# We select only what the Web App needs
df_presentation = df_final.select(
    "icao24", "callsign", "origin_country", 
    "baro_altitude", "velocity", 
    col("prediction").alias("predicted_velocity"),
    "efficiency_score",
    "request_time_utc"
)

(df_presentation.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("presentation.flight_predictions")
)

print("Enriched Predictions Saved to 'presentation.flight_predictions'")

Enriched Predictions Saved to 'presentation.flight_predictions'
