In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler
import tempfile

In [2]:
# Build the Spark configuration and register the spark-iforest jar under
# 'spark.jars'. The last expression returns the conf, which is why the cell
# echoes a SparkConf repr.
conf = SparkConf()
conf.set(key='spark.jars', value='spark-iforest-2.4.0.jar')

<pyspark.conf.SparkConf at 0x7f5b1c28ac90>

In [3]:
# Create (or reuse) the Spark session for this example, configured with the
# conf built above so the spark-iforest jar setting is applied.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("IForestExample")
    .getOrCreate()
)

In [4]:
from pyspark_iforest.ml.iforest import IForest, IForestModel

In [5]:
# Scratch directory for the save/load cells further down: the estimator and
# the fitted model each get their own sub-path inside it.
temp_path = tempfile.mkdtemp()
iforest_path, model_path = (
    "/".join((temp_path, leaf)) for leaf in ("iforest", "iforest_model")
)

In [6]:
# Same data as in https://gist.github.com/mkaranasou/7aa1f3a28258330679dcab4277c42419
# so that the results can be compared.
data = [
    {'feature1': 1., 'feature2': 0., 'feature3': 0.3, 'feature4': 0.01},
    {'feature1': 10., 'feature2': 3., 'feature3': 0.9, 'feature4': 0.1},
    {'feature1': 101., 'feature2': 13., 'feature3': 0.9, 'feature4': 0.91},
    {'feature1': 111., 'feature2': 11., 'feature3': 1.2, 'feature4': 1.91},
]

# A VectorAssembler gathers the feature columns into one (dense) vector column
# called "features"; the input column names come from the first record's keys.
assembler = VectorAssembler(inputCols=list(data[0]), outputCol="features")

# Build the DataFrame from the records and append the assembled vector column.
df = assembler.transform(spark.createDataFrame(data))
df.show()



+--------+--------+--------+--------+--------------------+
|feature1|feature2|feature3|feature4|            features|
+--------+--------+--------+--------+--------------------+
|     1.0|     0.0|     0.3|    0.01|  [1.0,0.0,0.3,0.01]|
|    10.0|     3.0|     0.9|     0.1|  [10.0,3.0,0.9,0.1]|
|   101.0|    13.0|     0.9|    0.91|[101.0,13.0,0.9,0...|
|   111.0|    11.0|     1.2|    1.91|[111.0,11.0,1.2,1...|
+--------+--------+--------+--------+--------------------+



In [7]:
# Scale the assembled features with a StandardScaler, as is also done in
# https://gist.github.com/mkaranasou/7aa1f3a28258330679dcab4277c42419
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
)

In [23]:
# Build the isolation-forest estimator and seed it in the same statement.
# setSeed() returns the estimator itself, so `iforest` is still the IForest.
# Seeding here (rather than only in a separate cell) means that re-running
# this cell on its own can never leave `iforest` without its seed.
iforest = IForest(contamination=0.3, maxDepth=2).setSeed(42)

In [9]:
# Fix the estimator's random seed. The call returns the estimator, which is
# why the cell echoes its uid.
iforest.setSeed(42)  # for reproducibility

IForest_1823c4e322c5

In [10]:
# Fit the StandardScaler on df (reading its 'features' column) to obtain the
# scaler model that is applied in the next step.
scaler_model = scaler.fit(df)

In [11]:
# Apply the fitted scaler, which adds the 'scaledFeatures' column to df.
# NOTE(review): df is rebound here, so this cell is not idempotent -- once the
# following cell has replaced 'features' with the scaled values, re-running
# this one would scale them a second time.
df = scaler_model.transform(df)

In [12]:
# Overwrite 'features' with the scaled vectors, then discard the now-redundant
# 'scaledFeatures' column so downstream stages read the scaled values from
# 'features'.
df = (
    df
    .withColumn('features', F.col('scaledFeatures'))
    .drop('scaledFeatures')
)

In [13]:
# Fit the isolation forest on df, whose 'features' column now holds the
# scaled vectors.
model = iforest.fit(df)

In [14]:
# Check whether the model has a training summary; a newly trained model does.
print(model.hasSummary)

True


In [15]:
# Show the number of anomalies reported by the training summary.
# NOTE(review): this prints 0 even though contamination=0.3 was requested. The
# score table at the end of the notebook shows two rows tied at the highest
# anomalyScore, so presumably the threshold coincides with that top score and
# nothing lies strictly above it -- confirm against the library.
summary = model.summary
print(summary.numAnomalies)

0


In [16]:
# Score a data frame with the fitted model. Note that `df` here is the same
# frame the model was fitted on, not new data.
transformed = model.transform(df)

In [17]:
# Save the iforest estimator into the path. Going through write().overwrite()
# lets this cell be re-run within the same session; a plain save() raises when
# the target path already exists.
iforest.write().overwrite().save(iforest_path)

In [18]:
# Load the iforest estimator back from the path it was saved to
loaded_iforest = IForest.load(iforest_path)

In [19]:
# Save the fitted model into the model path. Going through write().overwrite()
# lets this cell be re-run within the same session; a plain save() raises when
# the target path already exists.
model.write().overwrite().save(model_path)

In [20]:
# Load the fitted model back from the model path it was saved to
loaded_model = IForestModel.load(model_path)

In [21]:
# A model loaded from disk has no training summary, so this prints False
print(loaded_model.hasSummary)

False


In [22]:
# Use the loaded model to score `df` (the same frame used for fitting) and
# display the anomalyScore / prediction columns it appends
loaded_model.transform(df).show()

+--------+--------+--------+--------+--------------------+-------------------+----------+
|feature1|feature2|feature3|feature4|            features|       anomalyScore|prediction|
+--------+--------+--------+--------+--------------------+-------------------+----------+
|     1.0|     0.0|     0.3|    0.01|[0.01715764009115...| 0.4891981509209881|       0.0|
|    10.0|     3.0|     0.9|     0.1|[0.17157640091152...|0.40116303198069875|       0.0|
|   101.0|    13.0|     0.9|    0.91|[1.73292164920643...|0.41646474695596675|       0.0|
|   111.0|    11.0|     1.2|    1.91|[1.90449805011796...| 0.4891981509209881|       0.0|
+--------+--------+--------+--------+--------------------+-------------------+----------+

