This notebook demonstrates the use of the Anomalous Resource Access model in Sentinel. It generates training and testing data, trains the Anomalous Resource Access model, and uses it to score the test data. The top predicted scores are then submitted to the Sentinel workspace.

Steps:
   0. One-time: Install the following packages on the cluster (see https://forums.databricks.com/questions/680/how-to-install-python-package-on-spark-cluster.html)
        - com.microsoft.ml.spark:mmlspark_2.11:1.0.0 from https://mmlspark.azureedge.net
        - azure_sentinel_utilities whl package
        - plotly (from PyPi)
        
   1. One-time: Set credentials in KeyVault so the notebook can access 
        - Log Analytics

 Storing and retrieving secrets: 
    - Using Azure KeyVault: https://docs.azuredatabricks.net/user-guide/secrets/secret-scopes.html#akv-ss

# Initialization

In [3]:
# Connection settings for the Log Analytics workspace of your Sentinel instance.
# Only the workspace ID is written here; as the best security practice the workspace
# key is kept in the KeyVault and read through a secret scope.

# Log Analytics workspace ID (Sentinel)
workspace_id = 'YOUR_WORKSPACE_ID_HERE'

# Shared key of the workspace: either the primary or the secondary key can be used
workspace_shared_key = dbutils.secrets.get(
  scope='YOUR_SCOPE_HERE',
  key='YOUR_KEY_HERE',
)

In [4]:
from mmlspark.cyber.dataset import DataFactory 
from mmlspark.cyber.anomaly.collaborative_filtering import AccessAnomaly

from pyspark.sql import functions as f, types as t
import numpy as np
import pandas as pd

#utils
from azure_sentinel_utilities.log_analytics import log_analytics_client

In [5]:
# Point Spark at a DBFS directory for its checkpoint files.
# NOTE(review): presumably required by the model training further down - confirm,
# and change the path to a location that suits your workspace.
spark.sparkContext.setCheckpointDir('dbfs:/checkpoint_path/')

In [6]:
# Generate random data: one training set and two test sets. Compared to the training
# data, one test set has low anomaly and the other has high anomaly.
# Note that the generated data has a 'score' field that is a seed value for training.
# With real data you will instead have a timestamp, which you need to use to calculate
# a score by aggregating the accesses over a time interval (hourly or daily).
# NOTE(review): the model in this notebook is configured with likelihoodCol='likelihood' -
# confirm which column name the factory actually emits.

factory_settings = dict(
  num_hr_users=25,
  num_hr_resources=50,
  num_fin_users=35,
  num_fin_resources=75,
  num_eng_users=15,
  num_eng_resources=25,
  single_component=True,
)
factory = DataFactory(**factory_settings)

# Training set
training_pdf = factory.create_clustered_training_data(ratio=0.4)
training_df = spark.createDataFrame(training_pdf)

# Low-anomaly test set, derived from the training data
ingroup_data = factory.create_clustered_intra_test_data(training_pdf)
ingroup_df = spark.createDataFrame(ingroup_data)

# High-anomaly test set
outgroup_data = factory.create_clustered_inter_test_data()
outgroup_df = spark.createDataFrame(outgroup_data)

In [7]:
def updateValues(df, use_random_count):
  """Return df with a default tenant ID and an access-count column added.

  Two columns are appended:
    * 'tenant_id' - always the literal 0.
    * 'count_'    - number of accesses (used for displaying the graph):
                    round(1 + rand() * 10) when use_random_count is true,
                    otherwise the literal 1.
  """
  access_count = f.round(1+f.rand()*10) if use_random_count else f.lit(1)
  return df.withColumn('tenant_id', f.lit(0)).withColumn('count_', access_count)


# Add 'tenant_id' and 'count_' to all three data sets. Only the training data gets
# random access counts; both test sets use a count of 1.
training_df = updateValues(training_df, True)
ingroup_df = updateValues(ingroup_df, False)
outgroup_df = updateValues(outgroup_df, False)

In [8]:
# Preview the first rows of the training data
training_df.show()

In [9]:
# Row counts of the training, low-anomaly and high-anomaly data sets, in that order
for dataset in (training_df, ingroup_df, outgroup_df):
  print(dataset.count())

In [10]:
# Summary statistics of the training data columns
training_df.describe().show()

# Training

In [12]:
# Configure the estimator by naming the input columns it should read: tenant, user,
# resource and likelihood.
# NOTE(review): maxIter presumably caps the number of training iterations - confirm
# against the AccessAnomaly documentation.
access_anomaly = AccessAnomaly(
  tenantCol='tenant_id',
  userCol='user',
  resCol='res',
  likelihoodCol='likelihood',
  maxIter=1000
)

In [13]:
# Train the model on the training data set
model = access_anomaly.fit(training_df)

# Testing

In [15]:
# Score the low-anomaly test data set; the result carries an 'anomaly_score' column
# that the following cells aggregate
ingroup_scored_df = model.transform(ingroup_df)

In [16]:
# Min / max / mean / standard deviation of the low-anomaly scores
ingroup_score_stats = [
  stat('anomaly_score').alias(label + '_anomaly_score')
  for label, stat in (('min', f.min), ('max', f.max), ('mean', f.mean), ('stddev', f.stddev))
]
ingroup_scored_df.agg(*ingroup_score_stats).show()

In [17]:
# Score the high-anomaly test data set; the result carries an 'anomaly_score' column
# that the following cells aggregate and rank
outgroup_scored_df = model.transform(outgroup_df)

In [18]:
# Min / max / mean / standard deviation of the high-anomaly scores
outgroup_score_stats = [
  stat('anomaly_score').alias(label + '_anomaly_score')
  for label, stat in (('min', f.min), ('max', f.max), ('mean', f.mean), ('stddev', f.stddev))
]
outgroup_scored_df.agg(*outgroup_score_stats).show()

### Report results

In [20]:
# Sort the high-anomaly results by descending score and cache them, since several
# later cells read this frame
full_res_df = outgroup_scored_df.orderBy(f.desc('anomaly_score')).cache()

In [21]:
# Render the sorted results in the notebook
display(full_res_df)

In [22]:
def print_ratio(df, thr):
    """Print how many rows of df score above thr, as a count and as a percentage.

    Args:
        df: Spark DataFrame with a numeric 'anomaly_score' column.
        thr: Threshold; only rows scoring strictly above it are counted.

    Each count is computed exactly once, because every count() call runs a
    separate Spark job. An empty df is reported as 0.0% instead of raising
    ZeroDivisionError.
    """
    total = df.count()
    above = df.filter(f.col('anomaly_score') > thr).count()
    # guard the division: an empty frame has no meaningful ratio
    percentage = 100.0 * above / total if total else 0.0
    print('ratio of above {0} items {1}/{2} = {3}%'.format(thr, above, total, percentage))
    
# Share of results above a few increasing score thresholds
for threshold in (0, 15.0, 15.5, 15.7):
    print_ratio(full_res_df, threshold)

# Rank anomalous users

In [24]:
#
# Select a subset of results to send to Log Analytics
#
from pyspark.sql.window import Window

# One window per (tenant, user, resource), highest score first, so that row 1 of each
# window is the top-scoring access of that user/resource pair
w = Window.partitionBy('tenant_id', 'user', 'res').orderBy(f.desc('anomaly_score'))

# keep only the scores above the threshold
results_above_threshold = full_res_df.filter(full_res_df['anomaly_score'] > 1.0)

# number the rows inside every window, then order the whole result by score
ranked_results = results_above_threshold.withColumn('index', f.row_number().over(w))
ordered_results = ranked_results.orderBy(f.desc('anomaly_score'))

# distinct user/resource pairs with their highest score, capped at 100 rows
results_to_la = ordered_results.select(
  'tenant_id', 'user', 'res', 'anomaly_score'
).where(
  'index == 1'
).limit(100).cache()

# the generated data has no timestamp of its own, so attach a fake one (current time)
results_to_la = results_to_la.withColumn('timestamp', f.current_timestamp())

display(results_to_la)

# Display all resource accesses by the users with the highest anomaly scores

In [26]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot, offline
# Show the installed plotly version
print (__version__) # requires version >= 1.9.0

# run plotly in offline mode
offline.init_notebook_mode()

In [27]:
# Find all resource accesses of the users with high predicted scores.
# For display, limit to the top 25 results by anomaly score.
results_to_display = results_to_la.orderBy(
                  f.desc('anomaly_score')
                ).limit(25).cache()
# left_semi: every scored row whose user appears among the top results
interesting_records = full_res_df.join(results_to_display, ['user'], 'left_semi')
# left_anti: remove the (user, res) pairs that are themselves among the top results,
# leaving only the other accesses of those users
non_anomalous_records = interesting_records.join(results_to_display, ['user', 'res'], 'left_anti')

# number of rows per (tenant, user, resource)
top_non_anomalous_records = non_anomalous_records.groupBy(
                          'tenant_id',
                          'user', 
                          'res'
                        ).agg(
                          f.count('*').alias('count'),
                        ).select(
                          f.col('tenant_id'),
                          f.col('user'),
                          f.col('res'),
                          'count'
                        )

# pick only a subset of the non-anomalous records for the UI:
# rank each user's resources by access count, most accessed first
w = Window.partitionBy(
                  'tenant_id',
                  'user',
                ).orderBy(
                  f.desc('count')
                )

# keep at most the 5 most accessed resources per user, and 25 rows overall
top_non_anomalous_accesses = top_non_anomalous_records.withColumn(
                  'index', f.row_number().over(w)
                  ).orderBy(
                    f.desc('count')
                  ).select(
                    'tenant_id',
                    f.col('user'),
                    f.col('res'),
                    f.col('count')
                  ).where(
                    'index in (1,2,3,4,5)'
                  ).limit(25)

# add back the anomalous records, each with a count of 1
fileShare_accesses = (top_non_anomalous_accesses
                          .select('user', 'res', 'count')
                          .union(results_to_display.select('user', 'res', f.lit(1).alias('count'))).cache())

In [28]:
# Bring the selected accesses to pandas and collect the unique users followed by the
# unique resources; the position of a name in this list becomes its node index.
# NOTE(review): a string that occurs both as a user and as a resource would appear twice
# in unique_df and duplicate rows in the merges below - confirm the names are disjoint.
high_scores_df = fileShare_accesses.toPandas()
unique_arr = np.append(high_scores_df.user.unique(), high_scores_df.res.unique())

unique_df = pd.DataFrame(data = unique_arr, columns = ['name'])
unique_df['index'] = range(0, len(unique_df.index))

# create index for source & target and color for the normal accesses:
# look up the node index of each row's user (userIndex) and resource (resIndex)
normal_line_color = 'rgba(211, 211, 211, 0.8)'
anomolous_color = 'red'
x = pd.merge(high_scores_df, unique_df, how='left', left_on='user', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'userIndex'})
all_access_index_df = pd.merge(x, unique_df, how='left', left_on='res', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'resIndex'})
all_access_index_df['color'] = normal_line_color

# same index lookup for the anomalous records (results_to_display); these get a
# count of 1 and the anomalous color
y = results_to_display.toPandas().drop(['tenant_id', 'timestamp', 'anomaly_score'], axis=1)
y = pd.merge(y, unique_df, how='left', left_on='user', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'userIndex'})
high_scores_index_df = pd.merge(y, unique_df, how='left', left_on='res', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'resIndex'})
high_scores_index_df['count'] = 1
high_scores_index_df['color'] = anomolous_color

# subtract 1 for the red entries in the all-access table: the anomalous pairs were
# added to it with a count of 1, so they drop to 0 and are filtered out here, which
# leaves each of them represented only by its red row
hsi_df = high_scores_index_df[['user','res', 'count']].rename(columns={'count' : 'hsiCount'})
all_access_updated_count_df = pd.merge(all_access_index_df, hsi_df, how='left', left_on=['user', 'res'], right_on=['user', 'res'])
all_access_updated_count_df['count'] = np.where(all_access_updated_count_df['hsiCount']==1, all_access_updated_count_df['count'] - 1, all_access_updated_count_df['count'])
all_access_updated_count_df = all_access_updated_count_df.loc[all_access_updated_count_df['count'] > 0]
all_access_updated_count_df = all_access_updated_count_df[['user','res', 'count', 'userIndex', 'resIndex', 'color']]

# combine the two tables: normal accesses first, then the anomalous ones
frames = [all_access_updated_count_df, high_scores_index_df]
display_df = pd.concat(frames, sort=True)
# display_df.head()

In [29]:
# Sankey diagram of the selected accesses: the names in unique_df are the nodes, and
# every row of display_df is a link from a user node to a resource node, with the
# access count as its value and the row's color as its color.

# Drop incomplete rows once, across all four link columns. Dropping NaNs from each
# column separately could leave the arrays with different lengths and pair a source
# with the wrong target, value or color.
link_df = display_df.dropna(axis=0, how='any', subset=['userIndex', 'resIndex', 'count', 'color'])

data_trace = dict(
    type='sankey',
    domain = dict(
      x =  [0,1],
      y =  [0,1]
    ),
    orientation = "h",
    valueformat = ".0f",
    node = dict(
      pad = 10,
      thickness = 30,
      line = dict(
        color = "black",
        width = 0
      ),
      label = unique_df['name'].dropna(axis=0, how='any')
    ),
    link = dict(
      source = link_df['userIndex'],
      target = link_df['resIndex'],
      value = link_df['count'],
      color = link_df['color'],
  )
)

layout =  dict(
    title = "All resources accessed by users with highest anomalous scores",
    height = 772,
    font = dict(
      size = 10
    ),    
)

fig = dict(data=[data_trace], layout=layout)

# render the figure to an HTML <div> and show it in the notebook
p = plot(fig, output_type='div')

displayHTML(p)

In [30]:
@f.udf
def escape_str(value):
  """Escape a string so that it can be placed between double quotes in a JSON document.

  Backslashes, double quotes and control characters are escaped following the JSON
  rules; backslashes are doubled, so input containing only backslashes to escape
  gives the same result as a plain backslash replacement. A null input is passed
  through as null instead of failing the Spark job.

  Args:
    value: The string to escape, or None.

  Returns:
    The escaped text without surrounding quotes, or None when value is None.
  """
  import json  # local import: the notebook's import cell does not bring in json

  if value is None:
    return None
  # json.dumps produces a complete JSON string literal; strip its enclosing quotes
  # because the caller supplies its own
  return json.dumps(value, ensure_ascii=False)[1:-1]

def send_results_to_log_analytics(df_to_la):
  """Serialise the result rows as a JSON array and post them to Log Analytics.

  Args:
    df_to_la: Spark DataFrame with 'timestamp', 'user', 'res' and 'anomaly_score'
      columns.

  Returns:
    The value returned by log_analytics_client.post_data, or the string
    "No json data to send to LA" when there are no records to send.
  """
  # The log type is the name of the event that is being submitted.  This will show up under "Custom Logs" as log_type + '_CL'
  log_type = 'AnomalousResourceAccessResult'

  # HH is the 24-hour clock. The 12-hour 'hh' has no AM/PM marker in this pattern, so
  # it would report afternoon times as morning times.
  # NOTE(review): the literal 'Z' labels the value as UTC, while from_unixtime formats
  # in the Spark session time zone - confirm the session runs in UTC.
  timestamp_format = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"
  formatted_timestamp = f.from_unixtime(f.unix_timestamp(f.col("timestamp")), timestamp_format)

  # concatenate columns to form one json record
  json_records = df_to_la.withColumn('json_field', f.concat(f.lit('{'),
                                            f.lit(' \"TimeStamp\": \"'), formatted_timestamp, f.lit('\",'),
                                            f.lit(' \"User\": \"'), escape_str(f.col('user')), f.lit('\",'),
                                            f.lit(' \"Resource\": \"'), escape_str(f.col('res')), f.lit('\",'),
                                            f.lit(' \"AnomalyScore\":'), f.col('anomaly_score'),
                                            f.lit('}')
                                           )
                                         )
  # combine json record column to create the array
  json_body = json_records.agg(f.concat_ws(", ", f.collect_list('json_field')).alias('body'))

  # Fetch the aggregated row once (each first() runs a Spark job) and test the body
  # text itself: the row always has exactly one field, so its length says nothing
  # about whether any record was collected.
  body_row = json_body.first()
  json_payload = body_row['body'] if body_row is not None else None
  if not json_payload:
    return "No json data to send to LA"

  payload = ('[' + json_payload + ']').encode('utf-8')
  return log_analytics_client(workspace_id, workspace_shared_key).post_data(payload, log_type)

# Submit the selected results to Log Analytics, unless there is nothing to submit
count = results_to_la.count()
if count == 0:
  print ('No results to send to LA')
else:
  print ('Results count = ', count)
  result = send_results_to_log_analytics(results_to_la)
  print("Writing to Log Analytics result: ", result)