This notebook trains an anomalous resource access model. The data used here is File Share Access Events from Windows machines, loaded from a Blob Storage container.
The trained model is then saved to Blob Storage, where it can be used by the Scoring Notebook.

Steps:
   0. One-time: Install the following packages on your cluster by navigating to the 'Clusters' tab on the left
        - sentinel_ai (whl package from GitHub Utilities folder)
        - azure_sentinel_ml_utilities (whl package from GitHub Utilities folder)
        - azure-storage-blob (from PyPI)
        - scikit-surprise (from PyPI)
        - numpy==1.15.0 (from PyPI)
        - pyarrow==0.12.0 (from PyPI)
        
   1. One-time: Store in KeyVault the credentials the notebook needs to access the
        - Storage Account
   2. Ensure the relative paths to Blob Storage are correct.
   3. Run the Notebook to produce the model
   
 One-time: Setting up the Storage Key in KeyVault
    - See: https://docs.databricks.com/spark/latest/data-sources/azure/azure-storage.html#access-azure-blob-storage-directly
 
 Storing and retrieving secrets:
    - Using Azure KeyVault: https://docs.azuredatabricks.net/user-guide/secrets/secret-scopes.html#akv-ss

In [2]:
import datetime as dt

# --- Azure Blob Storage connection settings ---
# Name of the storage account that holds the training data.
storage_account = 'YOUR STORAGE ACCOUNT HERE'
# The account key is read from a Databricks secret scope so it never appears in the notebook.
storage_key = dbutils.secrets.get(scope='NAME HERE', key='KEY NAME HERE')
# Container inside the storage account, and the DBFS mount point it is mounted at.
container = 'CONTAINER NAME HERE'
mount_point_name = 'MOUNT POINT NAME HERE'

# Root path inside the container under which the training blobs are enumerated.
train_base_path = 'PATHNAME HERE'

# Project name; used to build the checkpoint and model output paths.
project = 'PROJECT NAME HERE'

###
### Time range of the events used for training.
### When training periodically, use the range relative to the current time (commented out below)
### instead of the fixed dates.
###
# train_start_time = dt.datetime.now() - dt.timedelta(days=65)
# train_end_time = dt.datetime.now() - dt.timedelta(days=10)

# Fixed range (Dec 1 2018 .. Jan 20 2019). Built with the datetime constructor rather than
# strptime('%b ...'), whose month-name parsing depends on the process locale.
train_start_time = dt.datetime(2018, 12, 1)
train_end_time = dt.datetime(2019, 1, 20)

In [3]:
###
### You can do this one-time in a separate Notebook, so that you don't cause accidental errors in other Notebooks mounting/unmounting the folder
###

# Mount the Storage Container
#    (Refer:- https://docs.databricks.com/spark/latest/data-sources/azure/azure-storage.html#mount-azure-blob-storage-containers-with-dbfs)
_mount_source = "wasbs://{0}@{1}.blob.core.windows.net".format(container, storage_account)

# dbutils.fs.mount raises when the mount point is already in use, which breaks re-running this
# notebook after an interrupted run. Skip the call only when the very same container is already
# mounted at the same mount point; any other conflict still goes to dbutils.fs.mount and fails
# there, exactly as before.
_already_mounted = any(
    m.mountPoint.rstrip('/') == mount_point_name.rstrip('/')
    and m.source.rstrip('/') == _mount_source
    for m in dbutils.fs.mounts()
)

if not _already_mounted:
    dbutils.fs.mount(
        source=_mount_source,
        mount_point=mount_point_name,
        extra_configs={
            "fs.azure.account.key.{0}.blob.core.windows.net".format(storage_account): storage_key
        })

In [4]:
import numpy as np
import pandas as pd

from pyspark.sql import functions as f, types as t
from pyspark.sql.functions import udf

# ML
from sentinel_ai.peer_anomaly import spark_collaborative_filtering as scf 

# spark
from sentinel_ai.utils import sparkutils

#utils
from azure_sentinel_ml_utilities.azure_storage import blob_manager

In [5]:
# Directory under the mount point where Spark checkpoints for this project are written.
# The mount point and the relative path are joined with an explicit '/', so the result is
# correct whether or not mount_point_name ends with a slash (plain concatenation would turn
# '/mnt/data' into '/mnt/datacache/...').
checkpoint_dir = '{0}/cache/{1}/checkpoints'.format(mount_point_name.rstrip('/'), project)
dbutils.fs.mkdirs(checkpoint_dir)
sparkutils.set_checkpointdir(spark, checkpoint_dir)

# Load Dataset

In [7]:
class FileShareDataset:
    """Loads File Share access events from Azure Blob Storage and prepares them for training.

    Raw events are read as tab-separated files with the columns listed in ``get_schema``.
    ``get_dataset`` aggregates them to one row per (Actor, Resource) pair with a ``score``
    equal to the total access count divided by the number of days the pair was active.
    """

    def __init__(self, storage_account, storage_key):
        """Create the blob enumerator and register the storage key with the Spark session.

        :param storage_account: name of the Azure storage account
        :param storage_key: access key for that storage account
        """
        self.storage_account = storage_account
        self.blob_manager = blob_manager(storage_account, storage_key)
        # Spark conf set for spark.read.csv to work
        spark.conf.set(
            "fs.azure.account.key." + storage_account + ".blob.core.windows.net",
            storage_key)

    @staticmethod
    def get_schema():
        """Return the Spark schema of the raw event files (columns in file order)."""
        return t.StructType([
            t.StructField('Timestamp', t.TimestampType()),
            t.StructField('Actor', t.StringType()),
            t.StructField('Resource', t.StringType()),
            t.StructField('categoricalFeatures', t.StringType()),
            t.StructField('count_', t.IntegerType())
        ])

    @staticmethod
    def _make_days_delta():
        """Build the UDF computing ``1.0 + whole days between d1 and d2``.

        The UDF returns null when either timestamp is null (for example when every timestamp
        of a group failed to parse) instead of raising a TypeError inside the Spark job.
        """
        @udf('double')
        def days_delta(d2, d1):
            if d2 is None or d1 is None:
                return None
            # timedelta.days is the whole-day part of the difference; the +1.0 keeps the
            # result at least 1 when both timestamps fall within the same day.
            return 1.0 + (d2 - d1).days

        return days_delta

    def get_raw_df(self, start_time, end_time, container, root, use_schema=True):
        """Read the raw event blobs for a time range into a Spark DataFrame.

        :param start_time: start of the time range passed to the blob enumerator
        :param end_time: end of the time range passed to the blob enumerator
        :param container: blob container holding the events
        :param root: root path inside the container under which blobs are enumerated
        :param use_schema: when true, read with ``get_schema()``; otherwise pass no schema
        :return: DataFrame read from the enumerated tab-separated, header-less files
        """
        blob_names = self.blob_manager.enumerate_blob_names(start_time, end_time, container, root)
        url_prefix = "wasbs://" + container + "@" + self.storage_account + ".blob.core.windows.net/"
        full_blob_names = [url_prefix + bn for bn in blob_names]

        # schema=None is the reader's default, so a single call covers both cases.
        schema = FileShareDataset.get_schema() if use_schema else None
        return spark.read.csv(full_blob_names, schema=schema, sep='\t', header=False)

    def processed_df(self, df):
        """Aggregate raw events to one scored row per (Actor, Resource) pair.

        :param df: raw events with ``Timestamp``, ``Actor``, ``Resource`` and ``count_`` columns
        :return: DataFrame with columns ``tid`` (literal '0'), ``min_timestamp``,
            ``max_timestamp``, ``user``, ``res`` and ``score``, where ``score`` is
            ``sum(count_) / (1 + whole days between first and last access)``
        """
        days_delta = FileShareDataset._make_days_delta()

        per_pair = df.groupBy(
            'Actor',
            'Resource'
        ).agg(
            f.min('Timestamp').alias('min_timestamp'),
            f.max('Timestamp').alias('max_timestamp'),
            f.sum('count_').alias('total_count')
        )

        return per_pair.select(
            f.lit('0').alias('tid'),
            f.col('min_timestamp'),
            f.col('max_timestamp'),
            f.col('Actor').alias('user'),
            f.col('Resource').alias('res'),
            (
                f.col('total_count')
                / days_delta(f.col('max_timestamp'), f.col('min_timestamp'))
            ).alias('score')
        )

    def get_dataset(self, start_time, end_time, container, root):
        """Load, aggregate and cache the dataset for the given time range.

        Parameters are forwarded unchanged to ``get_raw_df``.
        """
        return self.processed_df(self.get_raw_df(start_time, end_time, container, root)).cache()

In [8]:
def getdataset():
    """Build the cached training dataset from the notebook's storage and time-range settings."""
    dataset_loader = FileShareDataset(storage_account, storage_key)
    return dataset_loader.get_dataset(
        train_start_time,
        train_end_time,
        container,
        train_base_path,
    )

In [9]:
# Training dataset: raw events for the training window aggregated to one scored row per
# (user, resource) pair; get_dataset marks the DataFrame as cached.
ptraining = getdataset()

In [10]:
# Sanity check: print one row of the training dataset.
print(ptraining.first())

In [11]:
# Number of distinct tenant ids. processed_df sets 'tid' to the literal '0', so this prints 1
# for any non-empty dataset.
print(ptraining.select('tid').distinct().count())

In [12]:
# Show Spark's describe() summary of the training dataset.
ptraining.describe().show()

In [13]:
# Additional dataset statistics from the sentinel_ai helper.
# NOTE(review): what df_stats reports is defined in sentinel_ai.utils.sparkutils - confirm there.
sparkutils.df_stats(ptraining)

# Build Model

In [15]:
# Configure the access-anomaly estimator from the collaborative-filtering module:
# 'tid' and 'score' are the tenant and score columns produced by FileShareDataset.processed_df.
access_anomaly = scf.AccessAnomaly(
    tenant_colname='tid',
    score_colname='score',
)

# Fit the estimator on the aggregated training dataset.
access_anomaly_model = access_anomaly.fit(ptraining)

# Save Model

In [17]:
# Folder under the mount point where this project's model artifacts are written.
# The mount point's trailing '/' is stripped before joining, so the path has no doubled
# separator ('.../models//<project>/...') and is also correct when mount_point_name has no
# trailing slash.
model_output = '{root}/models/{project}/model_output'.format(
    root=mount_point_name.rstrip('/'), project=project)
print(model_output)

In [18]:
# Persist the trained model under the project's model output folder.
access_anomaly_model_path = '{model_output}/access_anomaly_model'.format(model_output=model_output)
access_anomaly_model.save(access_anomaly_model_path)

In [19]:
# Unmount the blob storage container from DBFS now that the model has been saved.
dbutils.fs.unmount(mount_point_name)