In [0]:
gold_country_df = spark.table("gold_climate_country_indicators")


In [0]:
ml_df = gold_country_df.select(
    "year",
    "avg_yearly_temperature",
    "historical_avg_temperature",
    "temperature_anomaly",
    "high_climate_risk"
)


In [0]:
ml_df = ml_df.dropna()


In [0]:
train_df, test_df = ml_df.randomSplit([0.8, 0.2], seed=42)


In [0]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=[
        "year",
        "avg_yearly_temperature",
        "historical_avg_temperature",
        "temperature_anomaly"
    ],
    outputCol="features"
)

train_data = assembler.transform(train_df)
test_data = assembler.transform(test_df)


In [0]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(
    featuresCol="features",
    labelCol="high_climate_risk"
)

lr_model = lr.fit(train_data)


In [0]:
predictions = lr_model.transform(test_data)


In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(
    labelCol="high_climate_risk",
    metricName="areaUnderROC"
)

auc = evaluator.evaluate(predictions)
auc


0.9894846200095516

In [0]:
predictions.groupBy(
    "high_climate_risk", "prediction"
).count().show()


+-----------------+----------+-----+
|high_climate_risk|prediction|count|
+-----------------+----------+-----+
|                0|       1.0|  129|
|                1|       0.0|  197|
|                1|       1.0| 1066|
|                0|       0.0| 7850|
+-----------------+----------+-----+



In [0]:
lr_model.coefficients


DenseVector([0.0086, 0.1387, 0.0695, 11.3299])

In [0]:
import mlflow
import mlflow.spark

mlflow.set_experiment("/Shared/climate_risk_country_model")


2026/01/29 07:48:27 INFO mlflow.tracking.fluent: Experiment with name '/Shared/climate_risk_country_model' does not exist. Creating a new experiment.


<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/1930267809943715', creation_time=1769672907629, experiment_id='1930267809943715', last_update_time=1769672907629, lifecycle_stage='active', name='/Shared/climate_risk_country_model', tags={'mlflow.experiment.sourceName': '/Shared/climate_risk_country_model',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'abhayshinde6754@gmail.com',
 'mlflow.ownerId': '73261402331391'}>

In [0]:
import os

os.environ["MLFLOW_DFS_TMP"] = "/Volumes/workspace/default/climate_raw/mlflow_tmp"


In [0]:
import mlflow
import mlflow.spark
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

mlflow.set_experiment("/Shared/climate_risk_country_model")

with mlflow.start_run():

    lr = LogisticRegression(
        featuresCol="features",
        labelCol="high_climate_risk"
    )

    lr_model = lr.fit(train_data)

    predictions = lr_model.transform(test_data)

    evaluator = BinaryClassificationEvaluator(
        labelCol="high_climate_risk",
        metricName="areaUnderROC"
    )

    auc = evaluator.evaluate(predictions)

    mlflow.log_metric("auc", auc)

    mlflow.spark.log_model(
        lr_model,
        artifact_path="logistic_regression_model"
    )




In [0]:
gold_country_df = spark.table("gold_climate_country_indicators")


In [0]:
ml_df = gold_country_df.select(
    "year",
    "avg_yearly_temperature",
    "historical_avg_temperature",
    "temperature_anomaly",
    "high_climate_risk"
).dropna()


In [0]:
train_df, test_df = ml_df.randomSplit([0.8, 0.2], seed=42)


In [0]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=[
        "year",
        "avg_yearly_temperature",
        "historical_avg_temperature",
        "temperature_anomaly"
    ],
    outputCol="features"
)

train_data = assembler.transform(train_df)
test_data = assembler.transform(test_df)


In [0]:
import mlflow.spark

model_uri = "models:/logistic_regression_model/latest"
lr_model = mlflow.spark.load_model(model_uri)


[0;31m---------------------------------------------------------------------------[0m
[0;31mMlflowException[0m                           Traceback (most recent call last)
File [0;32m<command-7647533210658436>, line 4[0m
[1;32m      1[0m [38;5;28;01mimport[39;00m [38;5;21;01mmlflow[39;00m[38;5;21;01m.[39;00m[38;5;21;01mspark[39;00m
[1;32m      3[0m model_uri [38;5;241m=[39m [38;5;124m"[39m[38;5;124mmodels:/logistic_regression_model/latest[39m[38;5;124m"[39m
[0;32m----> 4[0m lr_model [38;5;241m=[39m mlflow[38;5;241m.[39mspark[38;5;241m.[39mload_model(model_uri)

File [0;32m/databricks/python/lib/python3.12/site-packages/mlflow/spark/__init__.py:961[0m, in [0;36mload_model[0;34m(model_uri, dfs_tmpdir, dst_path)[0m
[1;32m    957[0m [38;5;66;03m# This MUST be called prior to appending the model flavor to `model_uri` in order[39;00m
[1;32m    958[0m [38;5;66;03m# for `artifact_path` to take on the correct value for model loading via mlflowdbfs.[3

In [0]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(
    featuresCol="features",
    labelCol="high_climate_risk"
)
lr_model = lr.fit(train_data)


In [0]:
predictions = lr_model.transform(test_data)


In [0]:
prediction_df = predictions.select(
    "year",
    "avg_yearly_temperature",
    "historical_avg_temperature",
    "temperature_anomaly",
    "high_climate_risk",
    "prediction",
    "probability"
)


In [0]:
(
    prediction_df
        .write
        .format("delta")
        .mode("overwrite")
        .saveAsTable("gold_country_climate_predictions")
)


In [0]:
spark.sql(
    "SELECT * FROM gold_country_climate_predictions LIMIT 5"
).show(truncate=False)


+----+----------------------+--------------------------+-------------------+-----------------+----------+-----------+
|year|avg_yearly_temperature|historical_avg_temperature|temperature_anomaly|high_climate_risk|prediction|probability|
+----+----------------------+--------------------------+-------------------+-----------------+----------+-----------+
|1743|1.3230000000000002    |4.611730053342408         |-3.288730053342408 |0                |0.0       |[1.0,0.0]  |
|1743|2.4819999999999998    |6.1774884661218925        |-3.6954884661218927|0                |0.0       |[1.0,0.0]  |
|1743|3.572                 |6.96089656395415          |-3.38889656395415  |0                |0.0       |[1.0,0.0]  |
|1743|5.096                 |8.992271734195894         |-3.896271734195894 |0                |0.0       |[1.0,0.0]  |
|1743|5.431                 |10.33786160764953         |-4.906861607649531 |0                |0.0       |[1.0,0.0]  |
+----+----------------------+--------------------------+