In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName("MLflow").getOrCreate()

24/08/20 09:57:50 WARN Utils: Your hostname, Zipcoders-MacBook-Pro-2.local resolves to a loopback address: 127.0.0.1; using 192.168.3.169 instead (on interface en0)
24/08/20 09:57:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/20 09:57:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/20 09:57:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [7]:

filePath = "/Users/qian/Desktop/Zipcode/Python/LearningSparkV2/databricks-datasets/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet/"
airbnbDF = spark.read.parquet(filePath)
(trainDF, testDF) = airbnbDF.randomSplit([.8, .2], seed=42)
categoricalCols = [field for (field, dataType) in trainDF.dtypes
if dataType == "string"]
indexOutputCols = [x + "Index" for x in categoricalCols]
stringIndexer = StringIndexer(inputCols=categoricalCols,
outputCols=indexOutputCols,
handleInvalid="skip")
numericCols = [field for (field, dataType) in trainDF.dtypes
if ((dataType == "double") & (field != "price"))]
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs,
outputCol="features")
rf = RandomForestRegressor(labelCol="price", maxBins=40, maxDepth=5,
numTrees=100, seed=42)
pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])

In [9]:
import mlflow
import mlflow.spark
import pandas as pd

In [15]:
with mlflow.start_run(run_name="random-forest") as run:
    # Log params: num_trees and max_depth
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())

    # Log model
    pipelineModel = pipeline.fit(trainDF)
    mlflow.spark.log_model(pipelineModel, "model")

    # Log metrics: RMSE and R2
    predDF = pipelineModel.transform(testDF)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="price")
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})
    
    # Log artifact: feature importance scores
    rfModel = pipelineModel.stages[-1]
    pandasDF = (pd.DataFrame(list(zip(vecAssembler.getInputCols(),rfModel.featureImportances)),columns=["feature", "importance"]).sort_values(by="importance", ascending=False))
    
    # First write to local filesystem, then tell MLflow where to find that file
    pandasDF.to_csv("feature-importance.csv", index=False)
    mlflow.log_artifact("feature-importance.csv")



In [13]:
mlflow.end_run()

In [16]:
from mlflow.tracking import MlflowClient
client = MlflowClient()
runs = client.search_runs(run.info.experiment_id,order_by=["attributes.start_time desc"],max_results=1)
run_id = runs[0].info.run_id
runs[0].data.metrics

{'r2': 0.22794251914574226, 'rmse': 211.5096898777315}

In [18]:
mlflow.run("https://github.com/databricks/LearningSparkV2/#mlflow-project-example",parameters={"max_depth": 5, "num_trees": 100})

2024/08/20 10:11:50 INFO mlflow.projects.utils: === Fetching project from https://github.com/databricks/LearningSparkV2/#mlflow-project-example into /var/folders/zm/d0465cgn1zvdzj96xmp0pxww0000gp/T/tmp43p2dgwn ===
2024/08/20 10:12:00 INFO mlflow.projects.utils: Fetched 'master' branch
2024/08/20 10:12:02 INFO mlflow.utils.conda: === Creating conda environment mlflow-a91eb9b529409372aa4585b19c73952959a7a296 ===


Channels:
 - conda-forge
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... failed



PackagesNotFoundError: The following packages are not available from current channels:

  - python=3.7*
  - pip=19.0.3*
  - pandas=0.24*

Current channels:

  - https://conda.anaconda.org/conda-forge/osx-arm64
  - https://repo.anaconda.com/pkgs/main/osx-arm64
  - https://repo.anaconda.com/pkgs/r/osx-arm64

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.




ShellCommandException: Non-zero exit code: 1
Command: ['conda', 'env', 'create', '-n', 'mlflow-a91eb9b529409372aa4585b19c73952959a7a296', '--file', '/var/folders/zm/d0465cgn1zvdzj96xmp0pxww0000gp/T/tmp43p2dgwn/mlflow-project-example/conda.yaml', '--quiet']

In [20]:
# Load saved model with MLflow
import mlflow.spark
pipelineModel = mlflow.spark.load_model(f"runs:/{run_id}/model")

# Generate predictions
inputDF = spark.read.parquet("/Users/qian/Desktop/Zipcode/Python/LearningSparkV2/databricks-datasets/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet/")

predDF = pipelineModel.transform(inputDF)

2024/08/20 10:17:55 INFO mlflow.spark: 'runs:/14bc0371c5074b6dae3e1ca166927820/model' resolved as 'file:///Users/qian/Desktop/ZipCode/Python/SparkML/mlruns/0/14bc0371c5074b6dae3e1ca166927820/artifacts/model'
2024/08/20 10:17:55 INFO mlflow.spark: URI 'runs:/14bc0371c5074b6dae3e1ca166927820/model/sparkml' does not point to the current DFS.
2024/08/20 10:17:55 INFO mlflow.spark: File 'runs:/14bc0371c5074b6dae3e1ca166927820/model/sparkml' not found on DFS. Will attempt to upload the file.


In [None]:
!git add .
!git commit -m '12'
!git 