In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler
import tempfile
import pandas as pd

In [2]:
# Spark configuration for this notebook: register the spark-iforest JAR
# (path is relative to the working directory) under 'spark.jars'.
iforest_jar = 'spark-iforest-2.4.0.jar'
conf = SparkConf()
conf.set('spark.jars', iforest_jar)

<pyspark.conf.SparkConf at 0x7fe91e727290>

In [3]:
# Create (or reuse) the SparkSession, applying the configuration built in `conf`.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("IForestExampleForestCover")
    .getOrCreate()
)

In [4]:
from pyspark_iforest.ml.iforest import IForest, IForestModel

In [5]:
# Fresh scratch directory for the estimator/model save-and-load cells.
temp_path = tempfile.mkdtemp()
# Target locations inside the scratch directory (forward-slash separated).
iforest_path = "/".join((temp_path, "iforest"))
model_path = "/".join((temp_path, "iforest_model"))

In [6]:
# Load the ForestCover CSV: the first row is a header, lines starting with '#'
# are skipped as comments, and column types are inferred from the data.
rawData = (
    spark.read.format("csv")
    .option("comment", "#")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("ForestCover.csv")
)
# Row count taken from the loaded data rather than a hard-coded literal, so the
# anomaly percentage computed later stays correct if the input file changes.
dataset_size = rawData.count()

In [7]:
# Display the DataFrame repr (column names and inferred types) to check the loaded schema.
rawData

DataFrame[0: int, 1: int, 2: int, 3: int, 4: int, 5: int, 6: int, 7: int, 8: int, 9: int, 10: int]

In [8]:
# All column names of the loaded frame; the last one is used as the label column.
cols = rawData.columns
labelCol = cols[-1]
cols

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

In [9]:
# Display the name of the column selected as the label (the last column of `cols`).
labelCol

'10'

In [10]:
# Feature columns: every column except the last one (the label column).
# A slice replaces the manual counter loop and leaves no stray loop
# variables (`i`, `col`) behind in the notebook namespace.
used_cols = cols[:-1]
used_cols

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [11]:
# Combine the selected feature columns into a single vector column named "features".
# A copy of `used_cols` is passed so the assembler does not share the list object.
assembler = VectorAssembler(outputCol="features", inputCols=list(used_cols))

In [12]:
# Append the assembled "features" vector column to the raw data and preview it.
df = assembler.transform(rawData)
# Same as a bare show(): first 20 rows, long cell values truncated.
df.show(n=20, truncate=True)

+----+---+---+---+---+----+---+---+---+----+---+--------------------+
|   0|  1|  2|  3|  4|   5|  6|  7|  8|   9| 10|            features|
+----+---+---+---+---+----+---+---+---+----+---+--------------------+
|2804|139|  9|268| 65|3180|234|238|135|6121|  0|[2804.0,139.0,9.0...|
|2785|155| 18|242|118|3090|238|238|122|6211|  0|[2785.0,155.0,18....|
|2579|132|  6|300|-15|  67|230|237|140|6031|  0|[2579.0,132.0,6.0...|
|2886|151| 11|371| 26|5253|234|240|136|4051|  0|[2886.0,151.0,11....|
|2742|134| 22|150| 69|3215|248|224| 92|6091|  0|[2742.0,134.0,22....|
|2880|209| 17|216| 30|4986|206|253|179|4323|  0|[2880.0,209.0,17....|
|2962|148| 16|323| 23|5916|240|236|120|3395|  0|[2962.0,148.0,16....|
|2811|135|  1|212| 30|3670|220|238|154|5643|  0|[2811.0,135.0,1.0...|
|2900| 45| 19|242| 20|5199|221|195|100|4115|  0|[2900.0,45.0,19.0...|
|2570|346|  2|  0|  0| 331|215|235|158|5745|  0|[2570.0,346.0,2.0...|
|2678|128|  5| 95| 23|1660|229|236|141|6546|  0|[2678.0,128.0,5.0...|
|2952|107| 11| 42|  

In [13]:
# Scale the assembled features with a StandardScaler
# (same approach as https://gist.github.com/mkaranasou/7aa1f3a28258330679dcab4277c42419).
# withMean/withStd are written out at their PySpark defaults: values are
# divided by the standard deviation but not centred on the mean.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol='features',
    outputCol='scaledFeatures',
)

In [14]:
# Expected proportion of anomalies in the data set. It is passed to the
# estimator explicitly so that editing this value actually changes the model;
# previously the variable was defined but never handed to IForest.
contamination = 0.1
# NOTE(review): not consumed by any visible cell; kept in case a later cell reads it.
contaminationError = 0.01 * contamination
# Isolation forest estimator: 100 trees, 256 samples per tree, depth capped at 8.
iforest = IForest(numTrees=100, maxSamples=256, maxDepth=8, contamination=contamination)

In [15]:
# Fixed random seed on the estimator, for reproducibility.
seed = 42
iforest.setSeed(seed)

IForest_b843a70af27a

In [16]:
# Fit the StandardScaler on the assembled frame, producing the fitted scaler model.
scaler_model = scaler.fit(df)

In [17]:
# Apply the fitted scaler; this adds the 'scaledFeatures' column.
# NOTE: `df` is rebound here, so re-running this cell on its own does not
# start from the unscaled frame again.
df = scaler_model.transform(df)

In [18]:
# Overwrite 'features' with the scaled vectors and drop the now-redundant
# 'scaledFeatures' column, so later stages keep reading the 'features' column.
scaled_features = F.col('scaledFeatures')
df = df.withColumn('features', scaled_features).drop('scaledFeatures')

In [19]:
# Train the isolation forest on the frame whose 'features' column holds the scaled vectors.
model = iforest.fit(df)

In [20]:
# Check whether the model carries a training summary; a newly trained model does.
print(model.hasSummary)

True


In [21]:
# Show the number of anomalies reported by the model's training summary.
# `summary` is kept as a variable because the next cell reuses it.
summary = model.summary
print(summary.numAnomalies)

28604


In [22]:
# Percentage of the data set that the training summary reports as anomalous.
100 * summary.numAnomalies / dataset_size

9.999720326658464

In [23]:
# Apply the fitted model to `df` (here the same frame it was trained on).
transformed = model.transform(df)
# Uncomment to preview the scored rows:
#transformed.show()

In [24]:
# Save the (unfitted) iforest estimator into the path.
# Going through the writer with overwrite() lets this cell be re-run without
# failing because the target path already exists.
iforest.write().overwrite().save(iforest_path)

In [25]:
# Load the iforest estimator back from the path it was saved to.
loaded_iforest = IForest.load(iforest_path)

In [26]:
# Save the fitted model into the model path.
# Going through the writer with overwrite() lets this cell be re-run without
# failing because the target path already exists.
model.write().overwrite().save(model_path)

In [27]:
# Load the fitted model back from the model path it was saved to.
loaded_model = IForestModel.load(model_path)

In [28]:
# A model restored from disk does not carry the training summary, so this prints False.
print(loaded_model.hasSummary)

False


In [29]:
# Score `df` (the same frame used for training) with the restored model and
# preview the result; the output gains 'anomalyScore' and 'prediction' columns.
loaded_model.transform(df).show()

+----+---+---+---+---+----+---+---+---+----+---+--------------------+-------------------+----------+
|   0|  1|  2|  3|  4|   5|  6|  7|  8|   9| 10|            features|       anomalyScore|prediction|
+----+---+---+---+---+----+---+---+---+----+---+--------------------+-------------------+----------+
|2804|139|  9|268| 65|3180|234|238|135|6121|  0|[14.1625228148368...| 0.4612702105186938|       0.0|
|2785|155| 18|242|118|3090|238|238|122|6211|  0|[14.0665570753639...| 0.4994219279590085|       0.0|
|2579|132|  6|300|-15|  67|230|237|140|6031|  0|[13.0260864263424...|  0.525708556193318|       1.0|
|2886|151| 11|371| 26|5253|234|240|136|4051|  0|[14.5766907430881...|0.44653748057589965|       0.0|
|2742|134| 22|150| 69|3215|248|224| 92|6091|  0|[13.8493714544517...| 0.5082939089654905|       0.0|
|2880|209| 17|216| 30|4986|206|253|179|4323|  0|[14.5463857727282...|0.46563153491524106|       0.0|
|2962|148| 16|323| 23|5916|240|236|120|3395|  0|[14.9605537009795...|0.45170447872848263|  