# Spark + sklearn

---

S.Yu. Papulin (papulin_bmstu@mail.ru)

### Contents

- [Prediction for DataFrame](#Prediction-for-DataFrame)
    - [Using UDF](#Using-UDF)
    - [Using Pandas UDF](#Using-Pandas-UDF)
- [Model Selection](#Model-Selection)

## Prediction for DataFrame

Sklearn related packages

In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn import datasets
from sklearn.linear_model import LinearRegression

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error

Spark related packages

In [None]:
import glob
import os
import sys

# Point this session at a specific Spark installation and Python interpreter.
os.environ["SPARK_HOME"] = "/home/ubuntu/BigData/spark"
os.environ["PYSPARK_PYTHON"] = "/home/ubuntu/ML/anaconda3/bin/python"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/home/ubuntu/ML/anaconda3/bin/python"

spark_home = os.environ.get("SPARK_HOME")
sys.path.insert(0, os.path.join(spark_home, "python"))

# The py4j archive name carries a version number that changes between Spark
# releases, so look it up instead of hard-coding it. If nothing is found,
# fall back to the previously hard-coded 0.10.7 path.
py4j_archives = sorted(
    glob.glob(os.path.join(spark_home, "python", "lib", "py4j-*-src.zip"))
)
py4j_path = (
    py4j_archives[-1]
    if py4j_archives
    else os.path.join(spark_home, "python/lib/py4j-0.10.7-src.zip")
)
sys.path.insert(0, py4j_path)

In [None]:
import pyspark
from pyspark.sql import SparkSession

In [None]:
from pyspark.sql.types import (
    FloatType,
    ArrayType,
    StringType
)
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, pandas_udf

### Loading dataset

In [None]:
# Seed passed as `random_state` to train_test_split so the splits are repeatable
RANDOM_STATE = 12345

In [None]:
# Load the source data (California housing dataset)
housing = datasets.fetch_california_housing()

In [None]:
print(housing["DESCR"])

In [None]:
# Keep only the first six feature columns; the target is taken as-is
X = housing.data[:,:6]
y = housing.target

In [None]:
# Column labels: the six selected feature names followed by the target label "MEDV"
CLMS = housing.feature_names[:6] + ["MEDV"]
CLMS

In [None]:
# Feature normalization: subtract the per-column mean, divide by the
# per-column standard deviation, and store the result as float32.
# NOTE(review): the mean/std are computed on the full data set, i.e. before the
# train/test split performed later — confirm this is intended.
X = ((X - X.mean(axis=0)) / X.std(axis=0)).astype("float32")

### Training model

In [None]:
# Split the data into training and test sets (30% of the rows go to the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

In [None]:
# Fit a linear regression model on the training split
model = LinearRegression()
model.fit(X_train, y_train)
# Optional check (disabled): mean absolute error on the test split
# mae_linear = mean_absolute_error(y_test, model.predict(X_test))
# print("MAE =", mae_linear)

### Starting Spark Session

In [None]:
# Spark configuration: application name and a local master
conf = (
    pyspark.SparkConf()
    .setAppName("bostonApp")
    .setMaster("local[*]")
)

In [None]:
# Create the session from the configuration above, or reuse an existing one
spark = (
    SparkSession.builder
    .appName("bostonApp")
    .config(conf=conf)
    .getOrCreate()
)

In [None]:
spark

### Creating Spark DataFrame

In [None]:
# Column labels: the six selected feature names followed by the target label "MEDV"
CLMS = housing.feature_names[:6] + ["MEDV"]
# pandas frame with the test features and the true target appended as the last column
pdf = pd.DataFrame(data=np.c_[X_test, y_test], columns=CLMS)
pdf.head()

In [None]:
# Convert the pandas frame into a Spark DataFrame and preview the first 5 rows
df = spark.createDataFrame(pdf)
df.show(5)

In [None]:
# Broadcast the fitted model; the UDFs read it back through `model_broadcast.value`
model_broadcast = spark.sparkContext.broadcast(model)

### Using UDF

In [None]:
@udf(returnType=FloatType())
def predict_price(*row):
    """Predict the target value for a single row with the broadcast model.

    Args:
        *row: the feature values of one row, one positional argument per
            feature column passed to the UDF.

    Returns:
        The model's prediction for that row as a Python float.
    """
    model = model_broadcast.value
    # predict() returns a one-element array for a single sample. Take the
    # element explicitly: converting a 1-D array directly with float() is
    # deprecated in recent NumPy releases.
    return float(model.predict([row])[0])

In [None]:
# Feature columns: every label except the target, wrapped as Spark Column objects
CLMNS_ = list(CLMS)
CLMNS_.remove("MEDV")
clmns = list(map(F.col, CLMNS_))

In [None]:
# Add a prediction column computed by the row-wise `predict_price` UDF
df_predicted = df.withColumn("MEDV_PREDICT", predict_price(*clmns))
df_predicted.show(5)

In [None]:
# Bring the Spark result back as a pandas DataFrame and preview it
pdf_predicted = df_predicted.toPandas()
pdf_predicted.head()

In [None]:
# mae_linear = mean_absolute_error(pdf_predicted["MEDV"], pdf_predicted["MEDV_PREDICT"])
# print("MAE =", mae_linear)

### Using Pandas UDF

In [None]:
#  ~/ML/anaconda3/bin/pip install "pyarrow>=0.9.0"

In [None]:
@pandas_udf("float", F.PandasUDFType.SCALAR)
def predict_price_(*cols):
    """Predict target values for a batch of rows with the broadcast model.

    Args:
        *cols: one pandas Series per feature column passed to the UDF.

    Returns:
        A pandas Series with one prediction per input row.

    Raises:
        TypeError: if the broadcast value is not a ``LinearRegression``.
    """
    import pandas as pd
    model = model_broadcast.value
    # Fail before doing any work if the broadcast does not hold the expected model.
    if not isinstance(model, LinearRegression):
        raise TypeError("Not model.")
    # Place the feature Series side by side: one row per sample, one column per feature.
    pdf = pd.concat(cols, axis=1, ignore_index=True)
    return pd.Series(model.predict(pdf))

In [None]:
# Feature columns: every label except the target.
CLMNS_ = CLMS.copy()
CLMNS_.remove("MEDV")
# The columns are passed to the Pandas UDF by name (plain strings).
clmns = list(CLMNS_)
clmns

In [None]:
# Add a prediction column computed by the `predict_price_` Pandas UDF
df_predicted = df.withColumn("MEDV_PREDICT", predict_price_(*clmns))
df_predicted.show(5)

## Model Selection

In [None]:
from sklearn.model_selection import ParameterGrid

In [None]:
from sklearn.linear_model import LinearRegression, Ridge

In [None]:
# Broadcast the training data as an (X, y) tuple; `train` reads it via `data_broadcast.value`
data_broadcast = spark.sparkContext.broadcast((X_train, y_train))

In [None]:
# Hyperparameter values to combine; the keys are passed as keyword arguments
# to ExtraTreesRegressor inside `train`.
params_grid = {
    "n_estimators": np.arange(50, 300, 50), 
    "max_depth": np.arange(5, 21, 5),
}

# Expand the dictionary into every parameter combination and display them
grid = ParameterGrid(params_grid)
list(grid)

In [None]:
# Distribute the parameter combinations over 4 partitions
rdd_grid = spark.sparkContext.parallelize(grid, 4)
# glom() groups the elements of each partition into a list, which shows how they were distributed
rdd_grid.glom().collect()

In [None]:
def train(params_set):
    """Fit and score one ExtraTreesRegressor per parameter set of a partition.

    Intended for use with ``RDD.mapPartitions``. The broadcast (X, y) data is
    split once per call into fitting and validation parts, and every
    parameter set is evaluated on the same split.

    Args:
        params_set: iterable of dicts with keyword arguments for
            ``ExtraTreesRegressor``.

    Yields:
        Tuples ``(score, params)``, where ``score`` is the value returned by
        ``model.score`` on the validation part and ``params`` is the
        unmodified input dict.
    """
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.model_selection import train_test_split
    X, y = data_broadcast.value
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)
    for params in params_set:
        # Seed the estimator so repeated runs give the same scores. A
        # random_state supplied in the grid takes precedence over the default.
        model = ExtraTreesRegressor(**{"random_state": RANDOM_STATE, **params}).fit(X_train, y_train)
        yield model.score(X_test, y_test), params

In [None]:
# Evaluate every partition's parameter sets and gather the (score, params) pairs
rdd_grid.mapPartitions(train).collect()

### Stopping Spark Session

In [None]:
spark.stop()

## References

In [None]:
# https://spark.apache.org/docs/2.4.7/api/python/pyspark.sql.html#pyspark.sql.functions.pandas_udf
# https://spark.apache.org/docs/2.4.7/sql-pyspark-pandas-with-arrow.html
# https://databricks.com/blog/2017/10/30/introducing-vectorized-udfs-for-pyspark.html