In [1]:
#CODE CHUNK #1
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# ------------------------------------------------------
# Create Spark Session
# ------------------------------------------------------
spark = SparkSession.builder \
    .appName("WeatherPredictor-MLTraining") \
    .getOrCreate()

# Location of the ML-ready parquet dataset.
# The path is written with forward slashes, so no raw string or backslash
# escaping is needed for this Windows location.
# Set the WEATHER_GOLD_PATH environment variable to point the notebook at a
# different copy of the data without editing this cell; when it is unset the
# original location below is used.
import os
from pathlib import Path
DEFAULT_GOLD_PATH = "A:/Personal Files/Education/Data Science/Weather_Predictor/data/gold/ml_ready_hourly.parquet"
gold_path = Path(os.environ.get("WEATHER_GOLD_PATH", DEFAULT_GOLD_PATH))

print("Loading ML-ready dataset...")
df = spark.read.parquet(str(gold_path))

# Quick sanity check of the columns / types and the first few rows.
df.printSchema()
df.show(5)


Loading ML-ready dataset...
root
 |-- time: timestamp_ntz (nullable = true)
 |-- temperature_2m: double (nullable = true)
 |-- relative_humidity_2m: long (nullable = true)
 |-- dew_point_2m: double (nullable = true)
 |-- precipitation_probability: long (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- cloud_cover: long (nullable = true)
 |-- surface_pressure: double (nullable = true)
 |-- wind_speed_10m: double (nullable = true)
 |-- wind_gusts_10m: double (nullable = true)
 |-- wind_direction_10m: long (nullable = true)
 |-- temp_lag_1h: double (nullable = true)
 |-- humidity_lag_1h: double (nullable = true)
 |-- wind_lag_1h: double (nullable = true)
 |-- temp_lag_2h: double (nullable = true)
 |-- humidity_lag_2h: double (nullable = true)
 |-- wind_lag_2h: double (nullable = true)
 |-- temp_lag_3h: double (nullable = true)
 |-- humidity_lag_3h: double (nullable = true)
 |-- wind_lag_3h: double (nullable = true)
 |-- temp_change_1h: double (nullable = true)
 |-- humid

In [2]:
#CODE CHUNK #2
# Column the classifiers are trained to predict.
label_col = "rain_next_hour"

# Columns that must not be used as model inputs: the label itself plus
# "time", "date" and "rain_next_3h" (presumably an alternative target -
# confirm against the dataset builder).
exclude = ["time", "date", "rain_next_hour", "rain_next_3h"]

# Every remaining column becomes a feature, kept in the DataFrame's own order.
feature_cols = list(filter(lambda name: name not in exclude, df.columns))

print("Using features:", feature_cols, sep="\n")


Using features:
['temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'precipitation_probability', 'precipitation', 'cloud_cover', 'surface_pressure', 'wind_speed_10m', 'wind_gusts_10m', 'wind_direction_10m', 'temp_lag_1h', 'humidity_lag_1h', 'wind_lag_1h', 'temp_lag_2h', 'humidity_lag_2h', 'wind_lag_2h', 'temp_lag_3h', 'humidity_lag_3h', 'wind_lag_3h', 'temp_change_1h', 'humidity_change_1h', 'wind_change_1h']


In [3]:
#CODE CHUNK #3
# Drop rows with missing values only in the columns the model actually uses.
# A bare dropna() would also discard rows whose only null sits in a column
# that is never fed to the model (anything listed in `exclude` other than the
# label), needlessly shrinking the training data.
df = df.dropna(subset=feature_cols + [label_col])


# Pack the individual feature columns into the single vector column that the
# Spark ML estimator reads via featuresCol="features".
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)

# Vectorized dataframe: only the feature vector and the label are kept.
df_ml = assembler.transform(df).select("features", label_col)

# 80/20 random split with a fixed seed.
# NOTE(review): the features include lagged values of hourly observations, so
# a random split can place neighbouring hours in both train and test; a
# chronological split may give a more honest estimate - confirm intent.
train_df, test_df = df_ml.randomSplit([0.8, 0.2], seed=42)

train_df.show(5)


+--------------------+--------------+
|            features|rain_next_hour|
+--------------------+--------------+
|[0.0,89.0,-1.6,0....|             0|
|[0.0,90.0,-1.5,0....|             0|
|[0.3,93.0,-0.7,1....|             0|
|[0.4,91.0,-0.9,1....|             0|
|[0.5,91.0,-0.8,0....|             0|
+--------------------+--------------+
only showing top 5 rows



In [4]:
#CODE CHUNK #4
# Gradient-boosted tree classifier trained on the assembled "features" vector.
# The seed is pinned explicitly, matching the seed=42 used for randomSplit and
# random_state=42 used for the sklearn model elsewhere in this notebook, so a
# Restart & Run All does not depend on the estimator's default seed.
gbt = GBTClassifier(
    labelCol=label_col,
    featuresCol="features",
    maxDepth=5,
    maxIter=50,
    seed=42
)

model = gbt.fit(train_df)

print("Model trained!")


Model trained!


In [5]:
# CODE CHUNK #5
# Score the held-out split with the fitted Spark model.
predictions = model.transform(test_df)

# Show the predicted class and class probabilities next to the true label.
predictions.select("prediction", "probability", label_col).show(10)

# Area under the ROC curve, computed from the model's rawPrediction column.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="rawPrediction",
    labelCol=label_col
)
auc = evaluator.evaluate(predictions)

print("Spark GBT AUC:", auc)


+----------+--------------------+--------------+
|prediction|         probability|rain_next_hour|
+----------+--------------------+--------------+
|       0.0|[0.97847911444165...|             0|
|       0.0|[0.97847911444165...|             0|
|       0.0|[0.97847911444165...|             0|
|       0.0|[0.97847911444165...|             0|
|       0.0|[0.97847911444165...|             0|
|       0.0|[0.97847911444165...|             0|
|       0.0|[0.97847911444165...|             0|
|       1.0|[0.02152088555834...|             1|
|       0.0|[0.97847911444165...|             0|
|       0.0|[0.97847911444165...|             0|
+----------+--------------------+--------------+
only showing top 10 rows

Spark GBT AUC: 1.0


In [6]:
# CODE CHUNK #6
# --------------------------------------------------
# Export ML-ready data for sklearn / web deployment
# --------------------------------------------------

# Collect the feature columns followed by the label into a pandas DataFrame
# on the driver.
pdf = df.select(*feature_cols, label_col).toPandas()

print("Exported dataset shape:", pdf.shape)
# Left as the last expression so the notebook renders the first rows.
pdf.head()


Exported dataset shape: (189, 23)


Unnamed: 0,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation_probability,precipitation,cloud_cover,surface_pressure,wind_speed_10m,wind_gusts_10m,wind_direction_10m,...,temp_lag_2h,humidity_lag_2h,wind_lag_2h,temp_lag_3h,humidity_lag_3h,wind_lag_3h,temp_change_1h,humidity_change_1h,wind_change_1h,rain_next_hour
0,4.5,91,3.2,0,0.0,48,988.5,1.84,3.9,299,...,5.4,89.0,2.4,6.5,93.0,2.0,-0.8,-2.0,0.24,0
1,3.6,87,1.7,0,0.0,97,988.4,1.25,3.5,299,...,5.3,93.0,1.6,5.4,89.0,2.4,-0.9,-4.0,-0.59,0
2,3.2,91,1.9,0,0.0,100,988.1,1.43,2.1,25,...,4.5,91.0,1.84,5.3,93.0,1.6,-0.4,4.0,0.18,0
3,4.0,97,3.6,0,0.0,100,988.8,2.1,3.7,65,...,3.6,87.0,1.25,4.5,91.0,1.84,0.8,6.0,0.67,0
4,5.0,97,4.6,0,0.0,100,989.9,2.24,7.4,63,...,3.2,91.0,1.43,3.6,87.0,1.25,1.0,0.0,0.14,0


In [8]:
# CODE CHUNK #7
# --------------------------------------------------
# Train deployable sklearn model
# --------------------------------------------------

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Feature matrix and target vector taken from the exported pandas frame.
X, y = pdf[feature_cols], pdf[label_col]

# Stratified 80/20 split: both parts keep the label proportions of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# fit() returns the estimator itself, so construction and training are chained.
sk_model = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=3,
    learning_rate=0.05,
    random_state=42
).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second class in
# sk_model.classes_ (the positive class when the labels are 0/1).
preds = sk_model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, preds)

print("Sklearn AUC:", auc)


Sklearn AUC: 0.9739583333333333


In [9]:
# CODE CHUNK #8
# --------------------------------------------------
# Save model for website / API
# --------------------------------------------------

from joblib import dump
from pathlib import Path

# Output folder, resolved relative to the current working directory.
model_dir = Path("../models")
model_dir.mkdir(exist_ok=True)

model_path = model_dir.joinpath("rain_next_hour_model.joblib")

# Persist the estimator together with the ordered feature list so a consumer
# can build its input columns in the same order used for training.
dump(dict(model=sk_model, features=feature_cols), model_path)

print("Model saved to:", model_path.resolve())


Model saved to: A:\Personal Files\Education\Data Science\Weather_Predictor\models\rain_next_hour_model.joblib
