This notebook scores new events with the anomalous resource access model that was saved by the training notebook. It runs on a schedule, loads the model, and scores new events. The data used here is File Share Access Events from Windows machines, loaded from a Blob Storage container. The top-scored results are submitted to Log Analytics.

Steps:
   0. One-time: Install the following packages on your cluster by navigating to the 'Clusters' tab on the left
        - sentinel_ai (whl package from GitHub Utilities folder)
        - azure_sentinel_ml_utilities (whl package from GitHub Utilities folder)
        - azure-storage-blob (from PyPi)
        - scikit-surprise (from PyPi)
        - numpy==1.15.0 (from PyPi)
        - pyarrow==0.12.0 (from PyPi)
        - plotly (from PyPi)
        
   1. One-time: Set credentials in KeyVault so the notebook can access 
        - Storage Account
        - Log Analytics
   2. Ensure the relative paths to Blob Storage are correct.
   3. Set the Notebook to run on a schedule to score and submit results to LA.
   
 One-time: (Setting up Storage Key & Log Analytics Key in KeyVault)
    - (Refer:- https://docs.databricks.com/spark/latest/data-sources/azure/azure-storage.html#access-azure-blob-storage-directly)
     
 Storing and retrieving secrets: 
    - Using Azure KeyVault:- https://docs.azuredatabricks.net/user-guide/secrets/secret-scopes.html#akv-ss

In [2]:
import datetime as dt

# Storage Account information
# The account key is read from a Databricks secret scope instead of being hard-coded here.
storage_account = 'YOUR STORAGE ACCOUNT HERE'
storage_key = dbutils.secrets.get(scope = 'NAME HERE', key = 'KEY NAME HERE')
container = 'CONTAINER NAME HERE'
mount_point_name = 'MOUNT POINT NAME HERE'

# Root path inside the container under which the events to score are enumerated
test_base_path = 'PATHNAME HERE'

# Log Analytics WorkSpace Info (ASI)
workspace_id = 'YOUR LOG ANALYTICS WORKSPACE ID HERE'
# For the shared key, use either the primary or the secondary key of the workspace
workspace_shared_key = dbutils.secrets.get(scope = 'NAME HERE', key = 'KEY NAME HERE')
# Project name (used as a folder name when building model_path below)
project = 'PROJECT NAME HERE'

###
### When this notebook runs on a schedule, replace the fixed dates below with a window
### relative to the current time, as shown in the commented-out lines.
###
# Time range for testing
# test_start_time = dt.datetime.now() - dt.timedelta(hours=1)
# test_end_time = dt.datetime.now()

# Fixed window used for a manual run; both values are naive datetimes at midnight
test_start_time = dt.datetime.strptime('Jan 20 2019', '%b %d %Y') 
test_end_time = dt.datetime.strptime('Jan 24 2019', '%b %d %Y') 

# Folder the saved model is loaded from.
# NOTE(review): 'models/' is appended directly to mount_point_name and the template then adds
# another '/', so mount_point_name presumably has to end with '/' and the result contains '//'.
# Confirm this is the same path the training notebook wrote to before changing it.
model_path = '{root}/{project}/model_output'.format(root=mount_point_name + 'models/', project=project)

In [3]:
###
### You can do this one-time in a separate Notebook, so that you don't cause accidental errors in other Notebooks mounting/unmounting the folder
###

# Mount the Storage Container
#    (Refer:- https://docs.databricks.com/spark/latest/data-sources/azure/azure-storage.html#mount-azure-blob-storage-containers-with-dbfs)
# Mounting an already-mounted path raises an error, which would fail a scheduled run
# whenever an earlier run stopped before reaching the unmount cell. Only mount when the
# mount point is not present yet. Trailing slashes are ignored in the comparison.
_existing_mount_points = [m.mountPoint.rstrip('/') for m in dbutils.fs.mounts()]
if mount_point_name.rstrip('/') not in _existing_mount_points:
  dbutils.fs.mount(
    source = "wasbs://" + container + "@" + storage_account + ".blob.core.windows.net",
    mount_point = mount_point_name,
    extra_configs = {"fs.azure.account.key." + storage_account + ".blob.core.windows.net":storage_key})

In [4]:
import numpy as np
import pandas as pd

from pyspark.sql import functions as f, types as t
from pyspark.sql.functions import udf

# ML
from sentinel_ai.peer_anomaly import spark_collaborative_filtering as scf 

# spark
from sentinel_ai.utils import sparkutils

#utils
from azure_sentinel_ml_utilities.azure_storage import blob_manager
from azure_sentinel_ml_utilities.log_analytics import log_analytics_client

#Load saved model

In [6]:
# Load the saved access-anomaly model from the 'access_anomaly_model' folder under model_path
access_anomaly_model_path = model_path + '/access_anomaly_model'
access_anomaly_model = scf.AccessAnomalyModel.load(spark, access_anomaly_model_path)

# Dataset

In [8]:
class FileShareDataset:
    """Reads tab-separated File Share access events from Blob Storage into a Spark DataFrame."""

    def __init__(self, storage_account, storage_key):
        """Remember the account, create the blob manager and register the key with Spark.

        Args:
            storage_account: name of the Azure storage account.
            storage_key: access key for that account.
        """
        self.storage_account = storage_account
        self.blob_manager = blob_manager(storage_account, storage_key)
        # spark.read.csv can only read wasbs:// paths once the account key is in the Spark conf
        account_key_conf = "fs.azure.account.key." + storage_account + ".blob.core.windows.net"
        spark.conf.set(account_key_conf, storage_key)

    @staticmethod
    def get_schema():
        """Return the column schema applied to the raw event files."""
        columns = [
            ('Timestamp', t.TimestampType()),
            ('Actor', t.StringType()),
            ('Resource', t.StringType()),
            ('categoricalFeatures', t.StringType()),
            ('count_', t.IntegerType()),
        ]
        return t.StructType([t.StructField(name, dtype) for name, dtype in columns])

    def get_raw_df(self, start_time, end_time, container, root, use_schema=True):
        """Read the event blobs selected by the blob manager as one DataFrame.

        Args:
            start_time, end_time, container, root: forwarded unchanged to
                blob_manager.enumerate_blob_names (presumably selects the blobs under
                `root` that fall inside the time window; verify against that helper).
            container: also used to build the wasbs:// URL of every blob.
            use_schema: when true, apply get_schema(); otherwise let Spark read the
                files without an explicit schema.

        Returns:
            A Spark DataFrame read from the files as tab-separated text without a header row.
        """
        url_prefix = "wasbs://" + container + "@" + self.storage_account + ".blob.core.windows.net/"
        blob_names = self.blob_manager.enumerate_blob_names(start_time, end_time, container, root)
        blob_urls = [url_prefix + name for name in blob_names]

        # schema=None is the reader's default, so a single call covers both cases
        schema = FileShareDataset.get_schema() if use_schema else None
        return spark.read.csv(blob_urls, schema=schema, sep='\t', header=False)


In [9]:
def getdataset():
  """Load the events of the test window and expose them under the column names used for scoring.

  Reads the raw File Share events between test_start_time and test_end_time from
  test_base_path in the configured container, adds a constant 'tid' column ('0'),
  renames the remaining columns and caches the result.

  Returns:
    A cached Spark DataFrame with columns tid, timestamp, user, res, categorical_features.
  """
  raw_events = FileShareDataset(storage_account, storage_key).get_raw_df(
    test_start_time, test_end_time, container, test_base_path
  )
  scoring_columns = [
    f.lit('0').alias('tid'),
    f.col('Timestamp').alias('timestamp'),
    f.col('Actor').alias('user'),
    f.col('Resource').alias('res'),
    f.col('categoricalFeatures').alias('categorical_features'),
  ]
  return raw_events.select(*scoring_columns).cache()

In [10]:
# Events of the test window, in the column layout produced by getdataset()
ptesting = getdataset()

In [11]:
# Sanity check: print the first row of the scoring input
print(ptesting.first())

In [12]:
# Sanity check: print summary statistics of the scoring input
ptesting.describe().show()

# Scoring

In [14]:
# Score the test events with the loaded model; later cells read the 'predicted_score' column of the result
pred_df = access_anomaly_model.transform(ptesting)

In [15]:
# Sanity check: show the first scored row
pred_df.first()

In [16]:
# Print summary statistics of the predicted scores
pred_df.select('predicted_score').describe().show()

In [17]:
# report results

In [18]:
# All scored events, ordered from highest to lowest predicted_score
full_res_df = pred_df.orderBy(f.desc('predicted_score'))

In [19]:
# Show the first row of the score-ordered results
full_res_df.first()

In [20]:
# Check score of a simulated anomalous user access

#anomalous_user_access = full_res_df.filter(full_res_df.user.like('Domain_282/User_871048'))
#display(anomalous_user_access)

##Filter out commonly seen users (automation accounts that are known to access File Shares)

In [22]:
# Automation accounts that access many different shares can cause false positives, so remove them
usersToFilter = ['Domain_346/User_870818', 'Domain_348/User_231659']
is_listed_user = full_res_df.user.isin(*usersToFilter)
is_automation_user = f.col('user').endswith('User_255625') # automation user in all domains
filtered_result = full_res_df.filter(~is_listed_user).where(~is_automation_user)

# Row counts before and after filtering
print(full_res_df.count())
print(filtered_result.count())

##Rank top anomalous users

In [24]:
#
# Select a subset of results to send to Log Analytics
#
from pyspark.sql.window import Window

# Number the rows of every (tid, res, user) group, highest score first
w = Window.partitionBy('tid', 'res', 'user').orderBy(f.desc('predicted_score'))

# select values above threshold
results_above_threshold = filtered_result.filter(f.col('predicted_score') > 7.75)

# get distinct resource/user and corresponding timestamp and highest score
ranked_results = results_above_threshold.withColumn('index', f.row_number().over(w))
best_per_access = ranked_results.orderBy(f.desc('predicted_score')).where('index == 1')
results_to_la = best_per_access.select(
  'tid',
  f.col('res').alias('Resource'),
  f.col('user').alias('Actor'),
  'categorical_features',
  'timestamp',
  'predicted_score'
).limit(25)

display(results_to_la)

#Write top anomalous scores to Sentinel

In [26]:
def send_results_to_log_analytics(df_to_la):
  """Serialize the rows of df_to_la as a JSON array and post it to Log Analytics.

  Args:
    df_to_la: Spark DataFrame with the columns timestamp, Actor, Resource and predicted_score.

  Returns:
    The value returned by log_analytics_client.post_data, or the string
    "No json data to send to LA" when df_to_la has no rows to send.
  """
  # The log type is the name of the event that is being submitted.  This will show up under "Custom Logs" as log_type + '_CL'
  log_type = 'AnomalousResourceAccessResult'

  # 'HH' is the 24-hour clock. The previous 'hh' is the 12-hour clock and, without an AM/PM
  # marker, reported afternoon events with the wrong hour.
  # NOTE(review): the literal 'Z' marks the value as UTC, but from_unixtime formats in the
  # Spark session time zone - confirm the cluster runs in UTC.
  timestamp_text = f.from_unixtime(
    f.unix_timestamp(f.col('timestamp')), "y-MM-dd'T'HH:mm:ss.SSS'Z'"
  )

  # Let Spark build each JSON record so quotes and backslashes in Actor/Resource are escaped.
  # Unlike string concatenation, a null field is left out of the record instead of
  # dropping the whole record.
  json_records = df_to_la.select(
    f.to_json(
      f.struct(
        timestamp_text.alias('TimeStamp'),
        f.col('Actor').alias('Actor'),
        f.col('Resource').alias('Resource'),
        f.col('predicted_score').alias('PredictedScore')
      )
    ).alias('json_field')
  )

  # combine json record column to create the array; evaluate the aggregate only once
  json_body = json_records.agg(f.concat_ws(", ", f.collect_list('json_field')).alias('body'))
  body = json_body.first()['body']

  # The aggregate always yields exactly one row, so the emptiness check has to look at the
  # concatenated text itself (an empty string when there were no records).
  if not body:
    return "No json data to send to LA"

  payload = ('[' + body + ']').encode('utf-8')
  return log_analytics_client(workspace_id, workspace_shared_key).post_data(payload, log_type)

# Post the selected rows only when there is at least one of them
count = results_to_la.count()
if count == 0:
  print ('No results to send to LA')
else:
  print ('Results count = ', count)
  result = send_results_to_log_analytics(results_to_la)
  print("Writing to Log Analytics result: ", result)

In [27]:
# Rows without a predicted_score are taken to belong to users that were not in the training set
unscored_rows = full_res_df.where(full_res_df.predicted_score.isNull())
never_seen_users = unscored_rows.select('user').distinct()

print('Count never seen users:', never_seen_users.count())
display(never_seen_users)

In [28]:
def print_ratio(df, thr):
    """Print how many rows of df have a predicted_score above thr, as a count and a percentage.

    Args:
        df: Spark DataFrame with a 'predicted_score' column.
        thr: score threshold; rows strictly above it are counted.
    """
    # Each count() is a Spark job, so run every count once and reuse the numbers
    total = df.count()
    above = df.filter(f.col('predicted_score') > thr).count()
    # An empty DataFrame would otherwise raise ZeroDivisionError
    percent = 100.0 * above / total if total else 0.0
    print('ratio of above {0} items {1}/{2} = {3}%'.format(thr, above, total, percent))
    
# Report the share of scored events above a few increasing thresholds
for score_threshold in (0, 2.5, 5, 7.5):
    print_ratio(full_res_df, score_threshold)

#Display all resource accesses by users with highest anomalous score

In [30]:
# plotly is used to render the Sankey diagram below; `plot` produces the HTML div
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot, offline
print (__version__) # requires version >= 1.9.0

# run plotly in offline mode
offline.init_notebook_mode()

In [31]:
#Find all server accesses of users with high predicted scores
# For display, limit to the top 15 results
results_to_display = results_to_la.orderBy(
                  f.desc('predicted_score')
                ).limit(15)

# Join against the DISTINCT actors only. results_to_display holds one row per
# (user, resource), so an actor with several top results would otherwise duplicate every
# one of their access rows in the join and inflate the per-resource counts below.
top_actors = results_to_display.select('Actor').distinct()
interesting_users = filtered_result.join(top_actors, f.col('user') == f.col('Actor'), "inner")

# Number of access events per (user, resource) for those actors
fileShare_accesses = interesting_users.groupBy(
                          'user', 
                          'res'
                        ).agg(
                          f.count('*').alias('count'),
                        ).select(
                          f.col('user').alias('Actor'),
                          f.col('res').alias('Resource'),
                          'count'
                        )

# get unique users and file shares
high_scores_df = fileShare_accesses.toPandas()
unique_arr = np.append(high_scores_df.Actor.unique(), high_scores_df.Resource.unique())

# One row per node of the diagram; 'index' is the node number the links refer to
unique_df = pd.DataFrame(data = unique_arr, columns = ['name'])
unique_df['index'] = range(0, len(unique_df.index))

# create index for source & target and color for the normal accesses
normal_line_color = 'rgba(211, 211, 211, 0.8)'
anomolous_color = 'red'
x = pd.merge(high_scores_df, unique_df, how='left', left_on='Actor', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'ActorIndex'})
all_access_index_df = pd.merge(x, unique_df, how='left', left_on='Resource', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'ResourceIndex'})
all_access_index_df['color'] = normal_line_color

# Same node lookup for the anomalous accesses; each one is drawn as a red link of weight 1
y = results_to_display.toPandas().drop(['tid', 'categorical_features', 'timestamp', 'predicted_score'], axis=1)
y = pd.merge(y, unique_df, how='left', left_on='Actor', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'ActorIndex'})
high_scores_index_df = pd.merge(y, unique_df, how='left', left_on='Resource', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'ResourceIndex'})
high_scores_index_df['count'] = 1
high_scores_index_df['color'] = anomolous_color

# subtract 1 from the normal count of every pair that also has a red entry, so the
# anomalous access is not drawn twice; pairs left with a count of 0 are dropped
hsi_df = high_scores_index_df[['Actor','Resource', 'count']].rename(columns={'count' : 'hsiCount'})
all_access_updated_count_df = pd.merge(all_access_index_df, hsi_df, how='left', left_on=['Actor', 'Resource'], right_on=['Actor', 'Resource'])
all_access_updated_count_df['count'] = np.where(all_access_updated_count_df['hsiCount']==1, all_access_updated_count_df['count'] - 1, all_access_updated_count_df['count'])
all_access_updated_count_df = all_access_updated_count_df.loc[all_access_updated_count_df['count'] > 0]
all_access_updated_count_df = all_access_updated_count_df[['Actor','Resource', 'count', 'ActorIndex', 'ResourceIndex', 'color']]

# combine the two tables
frames = [all_access_updated_count_df, high_scores_index_df]
display_df = pd.concat(frames)
# display_df.head()
# display_df.head()

In [32]:
# Nodes of the Sankey diagram: one label per entry of unique_df
sankey_node = {
    'pad': 10,
    'thickness': 30,
    'line': {'color': "black", 'width': 0},
    'label': unique_df['name'].dropna(axis=0, how='any'),
}

# Links of the Sankey diagram: actor node -> resource node, weighted by access count
sankey_link = {
    'source': display_df['ActorIndex'].dropna(axis=0, how='any'),
    'target': display_df['ResourceIndex'].dropna(axis=0, how='any'),
    'value': display_df['count'].dropna(axis=0, how='any'),
    'color': display_df['color'].dropna(axis=0, how='any'),
}

data_trace = {
    'type': 'sankey',
    'domain': {'x': [0,1], 'y': [0,1]},
    'orientation': "h",
    'valueformat': ".0f",
    'node': sankey_node,
    'link': sankey_link,
}

layout = {
    'title': "All resources accessed by users with highest anomalous scores",
    'height': 772,
    'font': {'size': 10},
}

fig = {'data': [data_trace], 'layout': layout}

# Render the figure to an HTML div and show it in the notebook
p = plot(fig, output_type='div')

displayHTML(p)

In [33]:
# unmount blob storage
# Releases the mount point created by the mount cell earlier in this notebook
dbutils.fs.unmount(mount_point_name)