This notebook trains an anomalous resource access model. The data used here is File Share Access Events from Windows machines, loaded from a Blob Storage Container.
The trained model is then saved to Blob Storage, where it can be used by the Scoring Notebook.

Steps:
   0. One-time: Install the following packages on the cluster (refer: https://forums.databricks.com/questions/680/how-to-install-python-package-on-spark-cluster.html)
        - com.microsoft.ml.spark:mmlspark_2.11:1.0.0 from https://mmlspark.azureedge.net
        - azure_sentinel_utilities whl package
        - azure-storage-blob (from PyPi - latest based on Azure SDK v12)
        - plotly (from PyPi)
        
   1. One-time: Set credentials in KeyVault so the notebook can access the:
        - Storage Account
   2. Ensure the settings in the first cell below are filled in.
   3. Run the Notebook to produce the model
   
 One-time: (Setting up Storage Key in KeyVault)
    - (Refer:- https://docs.databricks.com/spark/latest/data-sources/azure/azure-storage.html#access-azure-blob-storage-directly)
 
 Storing and retrieving secrets: 
    - Using Azure KeyVault:- https://docs.azuredatabricks.net/user-guide/secrets/secret-scopes.html#akv-ss

In [2]:
import datetime as dt

# Connection string of the Storage Account, read from the secret scope
storage_conn_str = dbutils.secrets.get(
    scope='YOUR_SCOPE_HERE',
    key='YOUR_KEY_HERE',
)

# Resource Id of your Sentinel workspace, e.g.
#   /subscriptions/<sub_guid>/resourcegroups/<rg_name>/providers/microsoft.operationalinsights/workspaces/<wks_name>
workspaceResourceId = 'YOUR_WORKSPACE_RESOURCE_ID_HERE'
# Name of the mount point used for the storage container (any name)
mount_point_name = 'YOUR_MOUNT_POINT_HERE'
# Name of the project (any name)
project = 'YOUR_PROJECT_HERE'

# Time range for training, given here as fixed 'YYYY-MM-DD HH:MM' timestamps.
#
# When training periodically, specify the range relative to the current time
# instead, for example:
#   train_start_time = dt.datetime.now() - dt.timedelta(days=20)
#   train_end_time = dt.datetime.now() - dt.timedelta(days=10)
train_start_time, train_end_time = (
    dt.datetime.strptime(stamp, '%Y-%m-%d %H:%M')
    for stamp in ('2020-09-03 00:00', '2020-09-04 00:00')
)

print(train_start_time)
print(train_end_time)

In [3]:
import re

# Extract the storage account name and key from the connection string.
# The pattern is a raw string so that the regex escape \w is not interpreted
# as an (invalid) string escape, and the ';' after the key is optional so that
# a connection string ending right after the AccountKey entry is accepted too.
key_pattern = r'DefaultEndpointsProtocol=(\w+);AccountName=(\w+);AccountKey=([^;]+)(?:;|$)'
match = re.match(key_pattern, storage_conn_str)
if match is None:
    # Deliberately do not echo the connection string: it contains the account key.
    raise ValueError(
        'Storage connection string is not in the expected format: '
        'DefaultEndpointsProtocol=<protocol>;AccountName=<name>;AccountKey=<key>;...'
    )
storage_account = match.group(2)
storage_key = match.group(3)

print(storage_account)

container = 'am-securityevent' # This name is fixed for security events

# Root folder of the exported events inside the container
train_base_path = 'WorkspaceResourceId={workspaceResourceId}'.format(workspaceResourceId=workspaceResourceId)
print(train_base_path)

In [4]:
###
### You can do this one-time in a separate Notebook, so that you don't cause accidental errors in other Notebooks mounting/unmounting the folder
###

# Mount the Storage Container
#    (Refer:- https://docs.databricks.com/spark/latest/data-sources/azure/azure-storage.html#mount-azure-blob-storage-containers-with-dbfs)
#
# dbutils.fs.mount() fails when the mount point is already in use (for example
# after an earlier run that stopped before the unmount cell), so the container
# is only mounted when the mount point is not there yet. This keeps the cell
# safe to re-run.
mount_source = "wasbs://" + container + "@" + storage_account + ".blob.core.windows.net"
existing_mounts = [
    mount for mount in dbutils.fs.mounts()
    # compare without trailing '/' so '/mnt/x' and '/mnt/x/' are treated alike
    if mount.mountPoint.rstrip('/') == mount_point_name.rstrip('/')
]
if existing_mounts:
    # Show what is mounted there so a stale or foreign mount is easy to spot.
    print('{0} is already mounted (source: {1}, expected: {2}); skipping mount'.format(
        mount_point_name, existing_mounts[0].source, mount_source))
else:
    dbutils.fs.mount(
        source = mount_source,
        mount_point = mount_point_name,
        extra_configs = {"fs.azure.account.key." + storage_account + ".blob.core.windows.net": storage_key})

In [5]:
from pyspark.sql import functions as f, types as t
from pyspark.sql.functions import udf
from mmlspark.cyber.anomaly.collaborative_filtering import AccessAnomaly
from azure_sentinel_utilities.azure_storage import storage_blob_manager

In [6]:
# Spark checkpoints are kept under '<mount point>cache/<project>/checkpoints'.
# NOTE(review): the mount point is concatenated without a path separator, so
# mount_point_name presumably has to end with '/' -- confirm.
checkpoint_dir = ''.join([mount_point_name, 'cache/{0}/checkpoints'.format(project)])
dbutils.fs.mkdirs(checkpoint_dir)
spark_context = spark.sparkContext
spark_context.setCheckpointDir(checkpoint_dir)

# Load Dataset

In [8]:
#
# This class is used to process 'file share access' related events from Security Events
#
class FileShareDataset:
    """Builds the file share access dataset (EventID 5140) from security events.

    Raw events are read through ``storage_blob_manager`` and reduced to one row
    per (tenant, user, share) carrying an initial likelihood score.
    """

    # NOTE that there are a lot more fields for security events. Below we are
    # picking up only a subset of fields. All of them are read as nullable strings.
    _SECURITY_EVENT_FIELDS = (
        "Account",
        "ShareName",
        "ShareLocalPath",
        "AccountType",
        "Computer",
        "EventID",
        "EventData",
        "NewProcessId",
        "NewProcessName",
        "ParentProcessName",
        "Process",
        "ProcessId",
        "SourceComputerId",
        "SourceSystem",
        "SubjectAccount",
        "SubjectDomainName",
        "SubjectLogonId",
        "SubjectUserName",
        "SubjectUserSid",
        "TargetAccount",
        "TargetDomainName",
        "TargetLogonId",
        "TargetUserName",
        "TargetUserSid",
        "TenantId",
        "TimeCollected",
        "TimeGenerated",
        "TokenElevationType",
    )

    def __init__(self, storage_conn_str):
        """Keep the connection string and create the blob manager used to read events.

        :param storage_conn_str: connection string of the storage account
        """
        self.storage_conn_str = storage_conn_str
        self.storage_blob_manager = storage_blob_manager(storage_conn_str)

    @staticmethod
    def _make_days_delta():
        """Return a Spark UDF computing the inclusive number of days from d1 to d2.

        The UDF returns ``1.0 + (d2 - d1).days`` as a double, and null when
        either input is null, so that a single group without a usable timestamp
        does not fail the whole job with a TypeError.
        """
        @udf('double')
        def days_delta(d2, d1):
            if d1 is None or d2 is None:
                return None
            return 1.0 + (d2 - d1).days
        return days_delta

    @staticmethod
    def _security_event_schema():
        """Return the schema (a subset of the security event fields) used to read events."""
        return t.StructType([
            t.StructField(name = field_name, dataType = t.StringType(), nullable = True)
            for field_name in FileShareDataset._SECURITY_EVENT_FIELDS
        ])

    # Get file share access data from security events
    def get_fs_dataset(self, start_time, end_time, container, root):
        """Return the file share access events (EventID 5140).

        The four arguments are forwarded unchanged to
        ``storage_blob_manager.get_raw_df``.

        :return: DataFrame with columns tenant_id (constant '0'), TimeGenerated,
                 Timestamp (TimeGenerated truncated to the day), user and res
        """
        raw_df = self.storage_blob_manager.get_raw_df(
                                        start_time,
                                        end_time,
                                        container,
                                        root,
                                        FileShareDataset._security_event_schema(),
                                        storage_blob_manager.get_blob_service_client(self.storage_conn_str) )
        # Get FileShare access events
        return raw_df.where(
                    f.col('EventID') == '5140'
                 ).select (
                    f.lit('0').alias('tenant_id'),
                    f.col('TimeGenerated'),
                    f.to_date(f.col('TimeGenerated').cast('timestamp')).cast('timestamp').alias('Timestamp'), # timestamp is set at day 00:00
                    f.col('Account').alias('user'),
                    f.col('ShareName').alias('res'),
                 )

    # group the file share access per day and assign an initial likelihood score
    def get_processed_fs_dataset(self, start_time, end_time, container, root):
        """Return one row per (tenant_id, user, res) with an initial likelihood score.

        The likelihood is the total number of access events divided by the
        inclusive number of days between the first and the last day of access.
        Groups for which no event has a parseable TimeGenerated are dropped,
        because no day span (and so no likelihood) can be computed for them.

        :return: DataFrame with columns tenant_id, min_timestamp, max_timestamp,
                 user, res and likelihood
        """
        dd = FileShareDataset._make_days_delta()

        df_fs = self.get_fs_dataset(start_time, end_time, container, root)

        # group fileshare access events per day
        daily_fs_activity = df_fs.groupBy(
                                'tenant_id',
                                'Timestamp',
                                'user',
                                'res'
                            ).count()

        # collapse the daily rows into one row per (tenant, user, share)
        fs_activity = daily_fs_activity.groupBy(
            'tenant_id',
            'user',
            'res'
        ).agg(
            f.min('Timestamp').alias('min_timestamp'),
            f.max('Timestamp').alias('max_timestamp'),
            f.sum('count').alias('event_count')
        ).where(
            # min is null only when every Timestamp of the group is null
            f.col('min_timestamp').isNotNull()
        )

        # Calculate an initial likelihood score based on count of events
        return fs_activity.select(
            f.col('tenant_id'),
            f.col('min_timestamp'),
            f.col('max_timestamp'),
            f.col('user'),
            f.col('res'),
            (f.col('event_count')/dd(f.col('max_timestamp'), f.col('min_timestamp'))).alias('likelihood')
        )

In [9]:
def getdataset():
    """Load the processed file share access dataset for the training time range.

    Uses the notebook-level settings storage_conn_str, train_start_time,
    train_end_time, container and train_base_path.
    """
    fs_dataset = FileShareDataset(storage_conn_str)
    return fs_dataset.get_processed_fs_dataset(
        train_start_time,
        train_end_time,
        container,
        train_base_path,
    )

In [10]:
# load the training data
# The DataFrame is reused by several actions below (the sanity checks and the
# model fit); cache it so it is not recomputed from the raw events each time.
ptraining = getdataset().cache()

In [11]:
# Peek at one row of the training data as a sanity check
sample_row = ptraining.first()
print(sample_row)

In [12]:
# Number of distinct tenants in the training data
tenant_ids = ptraining.select('tenant_id').distinct()
print(tenant_ids.count())

In [13]:
# Summary statistics of the training data columns
training_summary = ptraining.describe()
training_summary.show()

# Build Model

In [15]:
# Model building: configure the access anomaly estimator with the column names
# of the training data, then fit it
anomaly_params = dict(
    tenantCol='tenant_id',
    userCol='user',
    resCol='res',
    likelihoodCol='likelihood',
    maxIter=100,
)
access_anomaly = AccessAnomaly(**anomaly_params)
access_anomaly_model = access_anomaly.fit(ptraining)

# Save Model

In [17]:
# Models are kept under '<mount point>models/<project>/model_output'.
# NOTE(review): the mount point is concatenated without a path separator, so
# mount_point_name presumably has to end with '/' -- confirm.
models_root = mount_point_name + 'models'
model_output = '{root}/{project}/model_output'.format(root=models_root, project=project)
print(model_output)

In [18]:
# Persist the trained model under the model output folder
model_path = '{model_output}/access_anomaly_model'.format(model_output=model_output)
access_anomaly_model.save(model_path)

In [19]:
# Unmount the blob storage container from the mount point used by this notebook
dbutils.fs.unmount(mount_point_name)