# HELK: Checking Spark and GraphFrames Integrations
----------------------------------------------------------------------------
## Goals:
* Test whether Jupyter can talk to Spark & GraphFrames
* Test whether Spark & GraphFrames can pull data from Elasticsearch (ES)
* Show the basics of the HELK integrations with advanced analytics libraries

## Check the Spark session via the variable spark

In [1]:
# Evaluate `spark` so the notebook displays it; the name is not defined in this
# notebook, so it is presumably pre-created by the kernel's PySpark startup (confirm).
spark

## Create a Spark RDD on top of Elasticsearch (logs-endpoint-winevent-sysmon-* as source)

In [2]:
# Build an RDD over the Sysmon indices through the ES-Hadoop input format.
# Positional arguments, in order: inputFormatClass, keyClass, valueClass.
es_rdd = sc.newAPIHadoopRDD(
    "org.elasticsearch.hadoop.mr.EsInputFormat",
    "org.apache.hadoop.io.NullWritable",
    "org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf={
        # Index pattern and document type to read from.
        "es.resource": "logs-endpoint-winevent-sysmon-*/doc",
        # Address of the Elasticsearch node to connect to.
        "es.nodes": "172.18.0.2",
    },
)
# Pull a single record to confirm that Spark can read from Elasticsearch.
es_rdd.first()

('1993576943',
 {'@meta': {'sysmon': {'timestamp': '2018-03-02T02:12:06.358Z'}},
  '@timestamp': '2018-03-02T02:12:06.385Z',
  '@version': '1',
  'action': 'ImageLoad',
  'beat': {'hostname': 'DESKTOP-29DJI4T',
   'name': 'DESKTOP-29DJI4T',
   'version': '6.1.2'},
  'event_id': 7,
  'hash_imphash': '23962D464328E40D1669494350F0B838',
  'hash_md5': '0370364D4D8846B6CF316ABBB2EDB083',
  'hash_sha1': '04F79F876D6CA5F941C242A085EE7B506B53B03D',
  'hash_sha256': '6F0FF65C9FB132FBC96AB55AB14D285CFA94B2EB90A24DA6CC72E0FD72C1ABFE',
  'host_name': 'DESKTOP-29DJI4T',
  'image_loaded': 'C:\\Windows\\System32\\user32.dll',
  'image_signature': 'Microsoft Windows',
  'image_signature_status': 'Valid',
  'image_signed': True,
  'level': 'Information',
  'log_name': 'Microsoft-Windows-Sysmon/Operational',
  'opcode': 'Info',
  'process_guid': 'A98268C1-B2F6-5A98-0000-001003F8AA00',
  'process_id': 7060,
  'process_name': 'SearchProtocolHost.exe',
  'process_path': 'C:\\Windows\\System32\\SearchProtoc

## Create a Spark RDD on top of Elasticsearch (logs-endpoint-winevent-security-* as source)

In [3]:
# Build an RDD over the Windows Security indices through the ES-Hadoop input format.
# Positional arguments, in order: inputFormatClass, keyClass, valueClass.
es_rdd = sc.newAPIHadoopRDD(
    "org.elasticsearch.hadoop.mr.EsInputFormat",
    "org.apache.hadoop.io.NullWritable",
    "org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf={
        # Index pattern and document type to read from.
        "es.resource": "logs-endpoint-winevent-security-*/doc",
        # Address of the Elasticsearch node to connect to.
        "es.nodes": "172.18.0.2",
    },
)
# Pull a single record to confirm that Spark can read from Elasticsearch.
es_rdd.first()

('909277456',
 {'@timestamp': '2018-03-02T05:05:48.098Z',
  '@version': '1',
  'activity_id': '{39A12F75-B12B-0000-AA30-A1392BB1D301}',
  'beat': {'hostname': 'DESKTOP-29DJI4T',
   'name': 'DESKTOP-29DJI4T',
   'version': '6.1.2'},
  'event_data': {},
  'event_id': 4624,
  'host_name': 'DESKTOP-29DJI4T',
  'impersonation_level': '%%1833',
  'keywords': ('Audit Success',),
  'level': 'Information',
  'log_name': 'Security',
  'logon_authentication_package': 'Negotiate',
  'logon_elevated_token': '%%1843',
  'logon_key_length': '0',
  'logon_package_name': '-',
  'logon_process_name': 'User32 ',
  'logon_restricted_adminmode': '-',
  'logon_transmitted_services': '-',
  'logon_type': '2',
  'logon_virtual_account': '%%1843',
  'message': 'An account was successfully logged on.\n\nSubject:\n\tSecurity ID:\t\tS-1-5-18\n\tAccount Name:\t\tDESKTOP-29DJI4T$\n\tAccount Domain:\t\tWORKGROUP\n\tLogon ID:\t\t0x3E7\n\nLogon Information:\n\tLogon Type:\t\t2\n\tRestricted Admin Mode:\t-\n\tVirtual A

## Import the GraphFrames package

In [4]:
from graphframes import *

In [5]:
# Smoke test for GraphFrames: build a tiny graph and run two simple queries.
# The explicit import keeps this cell self-contained and replaces the redundant
# mid-cell wildcard import.
from graphframes import GraphFrame

# Create a Vertex DataFrame with unique ID column "id".
# Uses the `spark` session (the entry point used elsewhere in this notebook)
# rather than the legacy `sqlContext`.
v = spark.createDataFrame(
    [
        ("a", "Alice", 34),
        ("b", "Bob", 36),
        ("c", "Charlie", 30),
    ],
    ["id", "name", "age"],
)

# Create an Edge DataFrame with "src" and "dst" columns.
e = spark.createDataFrame(
    [
        ("a", "b", "friend"),
        ("b", "c", "follow"),
        ("c", "b", "follow"),
    ],
    ["src", "dst", "relationship"],
)

# Create a GraphFrame from the vertices and edges.
g = GraphFrame(v, e)

# Query: Get in-degree of each vertex.
g.inDegrees.show()

# Query: Count the number of "follow" connections in the graph.
# Left as the last expression so the notebook displays the count.
g.edges.filter("relationship = 'follow'").count()

+---+--------+
| id|inDegree|
+---+--------+
|  c|       1|
|  b|       2|
+---+--------+



2

## Create a basic SparkSession

In [5]:
# Obtain the SparkSession used by the Spark SQL cells below, configured for
# the Elasticsearch connector.
spark = (
    SparkSession.builder
    .appName("HELK")
    # Have the ES connector read the "tags" field as an array.
    .config("es.read.field.as.array.include", "tags")
    # Elasticsearch node (host:port) the connector should talk to.
    .config("es.nodes", "172.18.0.2:9200")
    .getOrCreate()
)

## Spark SQL Basic query (logs-endpoint-winevent-security-* as source)

In [6]:
# Load the Windows Security event indices as a DataFrame through the
# Elasticsearch Spark SQL data source.
df = (
    spark.read
    .format("org.elasticsearch.spark.sql")
    .load("logs-endpoint-winevent-security-*/doc")
)

In [7]:
# Print the schema of the DataFrame loaded from the security-event indices.
df.printSchema()

root
 |-- @timestamp: timestamp (nullable = true)
 |-- @version: string (nullable = true)
 |-- activity_id: string (nullable = true)
 |-- beat: struct (nullable = true)
 |    |-- hostname: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- version: string (nullable = true)
 |-- dst_port_number: integer (nullable = true)
 |-- event_data: struct (nullable = true)
 |-- event_id: long (nullable = true)
 |-- host_name: string (nullable = true)
 |-- impersonation_level: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- level: string (nullable = true)
 |-- log_name: string (nullable = true)
 |-- logon_authentication_package: string (nullable = true)
 |-- logon_elevated_token: string (nullable = true)
 |-- logon_key_length: string (nullable = true)
 |-- logon_package_name: string (nullable = true)
 |-- logon_privileges_assigned: string (nullable = true)
 |-- logon_process_name: string (nullable = true)
 |-- logon_restricted_adminmode: string (nullab

In [8]:
# Project only the "task" column and print its rows as a text table.
df.select("task").show()

+--------------------+
|                task|
+--------------------+
|User Account Mana...|
|       Special Logon|
|               Logon|
|       Special Logon|
|               Logon|
|               Logon|
|              Logoff|
|              Logoff|
|               Logon|
+--------------------+

