In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler
import tempfile
import pandas as pd

In [2]:
# Spark configuration: register the spark-iforest JAR through `spark.jars`.
# The path is relative, so the JAR is looked up from the notebook's working
# directory (presumably it provides the JVM side of pyspark_iforest -- confirm).
conf = SparkConf()
conf.set('spark.jars', 'spark-iforest-2.4.0.jar')

<pyspark.conf.SparkConf at 0x7fa3190110d0>

In [3]:
# Create (or reuse) the Spark session, applying the configuration built above.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("IForestExampleSMTP")
    .getOrCreate()
)

In [4]:
from pyspark_iforest.ml.iforest import IForest, IForestModel

In [5]:
# Scratch directory holding the saved estimator and the saved fitted model
# used by the save/load cells later in the notebook.
temp_path = tempfile.mkdtemp()
iforest_path = f"{temp_path}/iforest"
model_path = f"{temp_path}/iforest_model"

In [6]:
# Load the SMTP CSV: lines starting with '#' are skipped, the first remaining
# line is the header, and column types are inferred from the data.
rawData = (
    spark.read.format("csv")
    .option("comment", "#")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("SMTP.csv")
)
# Row count taken from the loaded data instead of a hard-coded 95156, so the
# anomaly-rate calculation further down stays correct if the file changes.
dataset_size = rawData.count()

In [7]:
# Echo the DataFrame to inspect the inferred schema (column names and types).
rawData

DataFrame[0: int, 1: int, 2: int, 3: int, 4: int, 5: int, 6: int, 7: int, 8: int, 9: int]

In [8]:
# Column names as read from the CSV header. The last column is treated as the
# label and is kept out of the feature vector assembled later.
cols = rawData.columns
labelCol = cols[-1]
cols

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [9]:
# Show which column was picked as the label.
labelCol

'9'

In [10]:
# Feature columns: every column except the last one (the label).
# A slice replaces the manual counter loop and avoids leaving the loop
# variables `i` and `col` behind in the notebook namespace.
used_cols = cols[:-1]
used_cols

['0', '1', '2', '3', '4', '5', '6', '7', '8']

In [11]:
# Combine the feature columns into a single vector column named "features".
# list(...) passes a copy, so later changes to used_cols do not affect the
# assembler's configuration.
assembler = VectorAssembler(
    inputCols=list(used_cols),
    outputCol="features"
)

In [12]:
# Append the assembled "features" vector to the raw columns and preview the result.
df = assembler.transform(rawData)
df.show()

+---+---+---+---+---+---+---+---+---+---+--------------------+
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|            features|
+---+---+---+---+---+---+---+---+---+---+--------------------+
| 38|  1| 96|  0| 38| 13| 58| 57|  0|  0|[38.0,1.0,96.0,0....|
| 37|  0| 77|  0| 24| 25| 40| 54| 14|  0|[37.0,0.0,77.0,0....|
| 45|  0| 83|  0| 44|-17| 38| 39|  2|  0|[45.0,0.0,83.0,0....|
| 56| -1| 84|  0| 54|-30| 28| 30|  2|  0|[56.0,-1.0,84.0,0...|
| 55|  5| 77|  0| 54|  0| 23| 23|  0|  0|[55.0,5.0,77.0,0....|
| 37|  0| 79|  0|  8|  9| 43| 72| 28|  0|[37.0,0.0,79.0,0....|
| 81|  4| 84|  0|-20|  0|  4|105|102|  1|[81.0,4.0,84.0,0....|
| 41|  5| 93|  0| 38|-14| 52| 55|  2|  0|[41.0,5.0,93.0,0....|
| 38| -2| 80|  0| 38|  0| 42| 41|  0|  0|[38.0,-2.0,80.0,0...|
| 37| -1| 80| -1| 36|  0| 43| 44|  0|  0|[37.0,-1.0,80.0,-...|
| 50|  0| 86|  6| 50|  0| 36| 37|  0|  0|[50.0,0.0,86.0,6....|
| 37|  0| 77|  2| 36|  4| 40| 41|  0|  0|[37.0,0.0,77.0,2....|
| 45|  0| 82|  0| 44|  0| 37| 38|  0|  0|[45.0,0.0,82.0

In [13]:
# Use a StandardScaler to scale the features (as also done in https://gist.github.com/mkaranasou/7aa1f3a28258330679dcab4277c42419).
# No withMean/withStd arguments are given, so the library defaults apply.
# NOTE(review): the scaled values shown in the last cell's output (38 -> 2.95, 37 -> 2.87)
# keep the same ratio as the inputs, which suggests scaling without centring -- confirm.
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')

In [14]:
# Isolation-forest hyper-parameters.
# `contamination` is the expected fraction of anomalies. It is now passed to
# the estimator explicitly instead of being defined and left unused; the value
# (0.1) is unchanged.
contamination = 0.1
# Not wired into the estimator in this notebook; kept for reference only.
# NOTE(review): presumably meant for the estimator's approximate-quantile
# error setting -- confirm before passing it, since that would change results.
contaminationError = 0.01 * contamination
iforest = IForest(numTrees=100, maxSamples=256, maxDepth=8, contamination=contamination)

In [15]:
# Fix the random seed before fitting. The call is the cell's last expression,
# so the cell echoes the estimator it returns.
iforest.setSeed(42)  # for reproducibility

IForest_ceb202e5666f

In [16]:
# Fit the scaler on the "features" column of df. df is reassigned in later
# cells, so this cell is meant to run before them (top-to-bottom order).
scaler_model = scaler.fit(df)

In [17]:
# Add the "scaledFeatures" column. The input is rebuilt from rawData through
# the assembler rather than taken from the current `df`, so re-running this
# cell always scales the original (unscaled) feature vectors, never vectors
# that a later cell has already replaced with scaled values.
df = scaler_model.transform(assembler.transform(rawData))

In [18]:
# Overwrite "features" with the scaled vectors and drop the helper column, so
# the isolation forest (which reads "features") trains on scaled data.
# This cell needs "scaledFeatures" to be present, so it cannot be re-run on
# its own output.
df = df.withColumn('features', F.col('scaledFeatures')).drop('scaledFeatures')

In [19]:
# Train the isolation forest on df (whose "features" column holds the scaled vectors).
model = iforest.fit(df)

In [20]:
# Check whether the model carries a training summary; a freshly trained model does.
print(model.hasSummary)

True


In [21]:
# Show the number of rows the training summary reports as anomalies.
# `summary` is reused by the anomaly-rate cell below.
summary = model.summary
print(summary.numAnomalies)

4909


In [None]:
# Share of rows flagged as anomalous, in percent.
100 * summary.numAnomalies / dataset_size

In [22]:
# Apply the fitted model to a data frame (here the training frame itself).
# `transformed` is not used again in the cells shown; the preview is left disabled.
transformed = model.transform(df)
#transformed.show()

In [23]:
# Save the (unfitted) iforest estimator to iforest_path inside the temp directory.
iforest.save(iforest_path)

In [24]:
# Load the iforest estimator back from the path it was just saved to.
# `loaded_iforest` is not used again in the cells shown.
loaded_iforest = IForest.load(iforest_path)

In [25]:
# Save the fitted model into the model path
# Persist the trained model under model_path inside the temp directory.
model.save(model_path)

In [26]:
# Load the fitted model back from the path it was just saved to.
loaded_model = IForestModel.load(model_path)

In [27]:
# A model loaded from disk has no training summary (prints False below),
# unlike the freshly trained model checked earlier.
print(loaded_model.hasSummary)

False


In [28]:
# Use the loaded model to score a data frame and preview the result; the
# output gains "anomalyScore" and "prediction" columns.
loaded_model.transform(df).show()

+---+---+---+---+---+---+---+---+---+---+--------------------+-------------------+----------+
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|            features|       anomalyScore|prediction|
+---+---+---+---+---+---+---+---+---+---+--------------------+-------------------+----------+
| 38|  1| 96|  0| 38| 13| 58| 57|  0|  0|[2.95141977229244...| 0.4207576080079343|       0.0|
| 37|  0| 77|  0| 24| 25| 40| 54| 14|  0|[2.87375083091633...|0.41866881708949394|       0.0|
| 45|  0| 83|  0| 44|-17| 38| 39|  2|  0|[3.49510236192526...| 0.3594062769663785|       0.0|
| 56| -1| 84|  0| 54|-30| 28| 30|  2|  0|[4.34946071706255...| 0.4591965517994283|       0.0|
| 55|  5| 77|  0| 54|  0| 23| 23|  0|  0|[4.27179177568643...| 0.4588309379213996|       0.0|
| 37|  0| 79|  0|  8|  9| 43| 72| 28|  0|[2.87375083091633...|0.43590928773042875|       0.0|
| 81|  4| 84|  0|-20|  0|  4|105|102|  1|[6.29118425146548...| 0.6070235793161712|       1.0|
| 41|  5| 93|  0| 38|-14| 52| 55|  2|  0|[3.18442659642079..