# Spark and sklearn models

---

S.Yu. Papulin (papulin_bmstu@mail.ru)

### Contents

- [Prediction for DataFrame](#Prediction-for-DataFrame)
    - [Using UDF](#Using-UDF)
    - [Using Pandas UDF](#Using-Pandas-UDF)
- [Model Selection](#Model-Selection)

## Prediction for DataFrame

Sklearn related packages

In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn import datasets
from sklearn.linear_model import LinearRegression

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error

Spark related packages

In [None]:
import os
import sys

# Export the Spark installation directory and the Python interpreter paths
# (PYSPARK_PYTHON / PYSPARK_DRIVER_PYTHON) before pyspark is imported
os.environ.update({
    "SPARK_HOME": "/home/ubuntu/BigData/spark",
    "PYSPARK_PYTHON": "/home/ubuntu/ML/anaconda3/bin/python",
    "PYSPARK_DRIVER_PYTHON": "/home/ubuntu/ML/anaconda3/bin/python",
})

spark_home = os.environ.get("SPARK_HOME")
# Prepend Spark's bundled Python sources to the import path:
# the py4j archive ends up first, followed by the pyspark package directory
sys.path[:0] = [
    os.path.join(spark_home, "python/lib/py4j-0.10.7-src.zip"),
    os.path.join(spark_home, "python"),
]

In [None]:
import pyspark
from pyspark.sql import SparkSession

In [None]:
from pyspark.sql.types import (
    FloatType,
    ArrayType,
    StringType
)
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, pandas_udf

### Loading dataset

In [None]:
# Seed passed to train_test_split so that the train/test split is reproducible
RANDOM_STATE = 12345

In [None]:
# Load the California housing dataset; the returned object exposes
# the `data`, `target`, `feature_names` and `DESCR` entries used below
housing = datasets.fetch_california_housing()

In [None]:
# Print the textual description shipped with the dataset
print(housing["DESCR"])

In [None]:
# Features: the first six columns of the data matrix; target: taken as is
X, y = housing.data[:, :6], housing.target

In [None]:
# Standardize every feature column (subtract its mean, divide by its
# standard deviation), then store the result as float32.
# Note: the statistics are computed over the whole dataset, i.e. before
# the train/test split performed later in this notebook.
X = (X - X.mean(axis=0)) / X.std(axis=0)
X = X.astype("float32")

### Training model

In [None]:
# Divide data into a training set (70%) and a testing set (30%)
# Note: The testing set will be used as unseen data in a Spark DataFrame
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

In [None]:
# Train the linear model
# Note: Accuracy doesn't matter in this case;
# we just need a fitted model to demonstrate
# how to use it with Spark DataFrames
model = LinearRegression()
model.fit(X_train, y_train)

### Starting Spark Session

In [None]:
# Configure the Spark application: its name and a local master ("local[*]")
conf = (
    pyspark.SparkConf()
    .setAppName("sklearnApp")
    .setMaster("local[*]")
)

In [None]:
# Build the Spark session from the configuration above
# (getOrCreate returns an already running session if there is one)
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [None]:
# Display the session object as the cell output
spark

### Creating Spark RDD

In [None]:
# TODO: use rdd.mapPartitions(predict_price)

### Creating Spark DataFrame

In [None]:
# Column names for the Pandas DataFrame: the first six feature names,
# matching the six feature columns kept in X
CLMNS = housing.feature_names[:6]
CLMNS

In [None]:
# Create a local Pandas DataFrame from the held-out test features
pdf = pd.DataFrame(data=X_test, columns=CLMNS)
pdf.head()

In [None]:
# Create a Spark DataFrame from the Pandas one and show its first 5 rows
df = spark.createDataFrame(pdf)
df.show(5)

### Using UDF

In [None]:
# Broadcast the fitted model; the UDFs read it back via model_broadcast.value
model_broadcast = spark.sparkContext.broadcast(model)

In [None]:
@udf(returnType=FloatType())
def predict_price(*row):
    """Predict the price for a single record.

    Args:
        *row: Feature values of one record, in the order the columns
            were passed to the UDF.

    Returns:
        The prediction of the broadcast model as a Python float.
    """
    model = model_broadcast.value
    # predict() returns a one-element array for this single-row batch.
    # Index the element explicitly: calling float() on an array with
    # ndim > 0 is deprecated since NumPy 1.25.
    return float(model.predict([row])[0])

In [None]:
# Create a new column with predictions by applying the UDF
# to the feature columns, then show the first 5 rows
df_predicted = df.withColumn("MEDV_PREDICT", predict_price(*CLMNS))
df_predicted.show(5)

In [None]:
# Take a sample of the predictions (fraction=0.1), convert it to
# a local Pandas DataFrame and preview its first rows
(
    df_predicted
    .sample(fraction=0.1)
    .toPandas()
    .head()
)

### Using Pandas UDF

In [None]:
# !~/ML/anaconda3/bin/pip install "pyarrow>=0.9.0"

In [None]:
@pandas_udf("float", F.PandasUDFType.SCALAR)
def predict_price(*cols):
    """
    Predict prices for a whole batch of rows at once.

    Each argument is a Pandas Series holding one feature column of
    the batch. The Series are joined column-wise into a single frame,
    which is passed to the broadcast model. The predictions are
    returned as a Pandas Series.

    Raises an Exception if the broadcast value is not
    a LinearRegression instance.
    """
    import pandas as pd

    # Join the column Series side by side; ignore_index=True replaces
    # the original column names with positional labels
    batch = pd.concat(cols, axis=1, ignore_index=True)

    estimator = model_broadcast.value
    if isinstance(estimator, LinearRegression):
        return pd.Series(estimator.predict(batch))
    raise Exception("Not model.")

In [None]:
# Create the prediction column again, this time with the Pandas UDF,
# and show the first 5 rows
df_predicted = df.withColumn("MEDV_PREDICT", predict_price(*CLMNS))
df_predicted.show(5)

## Model Selection

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import KFold

In [None]:
# 5-fold splitter, later handed to cross_val_score as its `cv` argument
kfolds = KFold(n_splits=5)

In [None]:
# Bundle the training data and the CV splitter.
# Note: train() unpacks this dict with cross_val_score(model, **data),
# so the keys must match cross_val_score's keyword arguments.
data = {
    "X": X_train,
    "y": y_train,
    "cv": kfolds
}

# Broadcast the bundle; train() reads it back via data_broadcast.value
data_broadcast = spark.sparkContext.broadcast(data)

In [None]:
# Hyperparameter values for the ExtraTreesRegressor built in train():
# n_estimators in {50, 100, 150, 200, 250}, max_depth in {5, 10, 15, 20}
params_grid = {
    "n_estimators": np.arange(50, 300, 50), 
    "max_depth": np.arange(5, 21, 5),
}

# Expand the grid into every parameter combination and list them
grid = ParameterGrid(params_grid)
list(grid)

In [None]:
# Create an RDD of parameter sets with 4 partitions
rdd_grid = spark.sparkContext.parallelize(grid, 4)

# Show how the parameter sets are distributed across the partitions
rdd_grid.glom().collect()

In [None]:
def train(params_set):
    """
    Cross-validate one model per parameter set of a partition.

    For every parameter set in `params_set`, an ExtraTreesRegressor is
    built from it and scored with cross_val_score, using the broadcast
    data dict as keyword arguments.

    Yields (mean cross-validation score, parameter set) tuples.
    """
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.model_selection import cross_val_score

    # Read the broadcast dict once; it is reused for every candidate
    cv_kwargs = data_broadcast.value
    for candidate in params_set:
        scores = cross_val_score(ExtraTreesRegressor(**candidate), **cv_kwargs)
        yield scores.mean(), candidate

In [None]:
# Run model selection: apply train() to each partition of parameter sets
# and collect the (mean score, parameters) results
(
    rdd_grid
    .mapPartitions(train)
    .collect()
)

### Stopping Spark Session

In [None]:
# Stop the Spark session created above
spark.stop()

## References

- [PySpark Usage Guide for Pandas with Apache Arrow](https://spark.apache.org/docs/2.4.7/sql-pyspark-pandas-with-arrow.html)
- [Introducing Pandas UDF for PySpark](https://databricks.com/blog/2017/10/30/introducing-vectorized-udfs-for-pyspark.html)
- [pyspark.sql.functions.pandas_udf](https://spark.apache.org/docs/2.4.7/api/python/pyspark.sql.html#pyspark.sql.functions.pandas_udf)