# HELK: Checking Spark and Graphframe Integrations
----------------------------------------------------------------------------
## Goals:
* Test whether Jupyter can talk to Spark & GraphFrames
* Test whether Spark & GraphFrames can pull data from Elasticsearch (ES)
* Show the basics of the HELK integrations with advanced analytics libraries

## Check the SparkContext via the variable `sc`

In [1]:
# Evaluate the bare name so the notebook renders the SparkContext.
# NOTE(review): `sc` is not created anywhere in this notebook; presumably the
# PySpark kernel injects it at startup -- confirm for your deployment.
sc

## Create a Spark RDD on top of Elasticsearch (logs-endpoint-winevent-sysmon-* as source)

In [20]:
# Connector settings for the Sysmon indices: which index pattern/type to read
# and which Elasticsearch node to contact.
sysmon_es_conf = {
    "es.resource": "logs-endpoint-winevent-sysmon-*/doc",
    "es.nodes": "172.18.0.2",
}

# Expose the matching documents as an RDD through the ES-Hadoop input format.
es_rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf=sysmon_es_conf)

# Fetch a single record to confirm that Spark can read from Elasticsearch.
es_rdd.first()

(u'2053487453',
 {u'@meta': {u'log': {u'timestamp': u'2018-02-20T17:16:29.294Z'}},
  u'@timestamp': u'2018-02-20T17:16:29.299Z',
  u'@version': u'1',
  u'action': u'processaccess',
  u'beat': {u'hostname': u'DESKTOP-29DJI4T',
   u'name': u'DESKTOP-29DJI4T',
   u'version': u'6.1.2'},
  u'computer_name': u'DESKTOP-29DJI4T',
  u'event_id': 10,
  u'level': u'Information',
  u'log_name': u'Microsoft-Windows-Sysmon/Operational',
  u'opcode': u'Info',
  u'process': {u'calltrace': u'C:\\WINDOWS\\SYSTEM32\\ntdll.dll+a0344|C:\\WINDOWS\\System32\\KERNELBASE.dll+3dc5d|C:\\ProgramData\\Microsoft\\Windows Defender\\Definition Updates\\{14FF058F-73CB-47DF-835D-8B578620CD35}\\mpengine.dll+ec56d|C:\\ProgramData\\Microsoft\\Windows Defender\\Definition Updates\\{14FF058F-73CB-47DF-835D-8B578620CD35}\\mpengine.dll+ec490|C:\\ProgramData\\Microsoft\\Windows Defender\\Definition Updates\\{14FF058F-73CB-47DF-835D-8B578620CD35}\\mpengine.dll+6a4fdd|C:\\ProgramData\\Microsoft\\Windows Defender\\Definition Upda

## Create a Spark RDD on top of Elasticsearch (logs-endpoint-winevent-security-* as source)

In [19]:
# Connector settings for the Windows Security indices: which index
# pattern/type to read and which Elasticsearch node to contact.
security_es_conf = {
    "es.resource": "logs-endpoint-winevent-security-*/doc",
    "es.nodes": "172.18.0.2",
}

# Expose the matching documents as an RDD through the ES-Hadoop input format.
# Note: this rebinds `es_rdd`, replacing any RDD previously stored under that name.
es_rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf=security_es_conf)

# Fetch a single record to confirm that Spark can read from Elasticsearch.
es_rdd.first()

(u'2852068011',
 {u'@timestamp': u'2018-02-21T00:08:12.345Z',
  u'@version': u'1',
  u'beat': {u'hostname': u'DESKTOP-29DJI4T',
   u'name': u'DESKTOP-29DJI4T',
   u'version': u'6.1.2'},
  u'computer_name': u'DESKTOP-29DJI4T',
  u'event_data': {},
  u'event_id': 4616,
  u'keywords': (u'Audit Success',),
  u'level': u'Information',
  u'log_name': u'Security',
  u'message': u'The system time was changed.\n\nSubject:\n\tSecurity ID:\t\tS-1-5-18\n\tAccount Name:\t\tDESKTOP-29DJI4T$\n\tAccount Domain:\t\tWORKGROUP\n\tLogon ID:\t\t0x3E7\n\nProcess Information:\n\tProcess ID:\t0x834\n\tName:\t\tC:\\Program Files\\VMware\\VMware Tools\\vmtoolsd.exe\n\nPrevious Time:\t\t\u200e2018\u200e-\u200e02\u200e-\u200e20T17:16:32.271066000Z\nNew Time:\t\t\u200e2018\u200e-\u200e02\u200e-\u200e21T00:08:12.117000000Z\n\nThis event is generated when the system time is changed. It is normal for the Windows Time Service, which runs with System privilege, to change the system time on a regular basis. Other system

## Import the GraphFrames package

In [11]:
from graphframes import *

In [5]:
# Import only the class this cell uses, at the top of the cell, so the cell
# runs on its own and it is clear where `GraphFrame` comes from (this replaces
# a wildcard import that sat in the middle of the cell).
from graphframes import GraphFrame

# Vertex DataFrame: one row per vertex, with a unique ID column named "id".
v = sqlContext.createDataFrame([
  ("a", "Alice", 34),
  ("b", "Bob", 36),
  ("c", "Charlie", 30),
], ["id", "name", "age"])

# Edge DataFrame: "src" and "dst" hold the IDs of the vertices each edge connects.
e = sqlContext.createDataFrame([
  ("a", "b", "friend"),
  ("b", "c", "follow"),
  ("c", "b", "follow"),
], ["src", "dst", "relationship"])

# Build the graph from the vertex and edge DataFrames.
g = GraphFrame(v, e)

# Query: in-degree of each vertex. Vertices with no incoming edge ("a" here)
# do not appear in the result.
g.inDegrees.show()

# Query: number of "follow" edges in the graph. As the last expression of the
# cell, the count is rendered as the cell's output.
g.edges.filter("relationship = 'follow'").count()

+---+--------+
| id|inDegree|
+---+--------+
|  c|       1|
|  b|       2|
+---+--------+



2

## Create a basic SparkSession

In [14]:
# Build (or fetch) the SparkSession used by the Spark SQL cells below.
spark = (
    SparkSession.builder
    .appName("HELK")
    # Connector option: treat the "tags" field as an array when reading.
    .config("es.read.field.as.array.include", "tags")
    # Elasticsearch node (host:port) the connector should contact.
    .config("es.nodes", "172.18.0.2:9200")
    .getOrCreate()
)

## Spark SQL Basic query (logs-endpoint-winevent-security-* as source)

In [15]:
# Load the Windows Security event indices as a DataFrame through the
# Elasticsearch Spark SQL data source.
df = (
    spark.read
    .format("org.elasticsearch.spark.sql")
    .load("logs-endpoint-winevent-security-*/doc")
)

In [16]:
# Print the DataFrame's column tree (names, types, nullability) to check how
# the Elasticsearch fields were mapped.
df.printSchema()

root
 |-- @timestamp: timestamp (nullable = true)
 |-- @version: string (nullable = true)
 |-- activity_id: string (nullable = true)
 |-- beat: struct (nullable = true)
 |    |-- hostname: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- version: string (nullable = true)
 |-- computer_name: string (nullable = true)
 |-- destination: struct (nullable = true)
 |    |-- port: struct (nullable = true)
 |    |    |-- number: integer (nullable = true)
 |-- event_data: struct (nullable = true)
 |-- event_id: long (nullable = true)
 |-- impersonationlevel: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- level: string (nullable = true)
 |-- log_name: string (nullable = true)
 |-- logon: struct (nullable = true)
 |    |-- authenticationpackage: string (nullable = true)
 |    |-- elevatedtoken: string (nullable = true)
 |    |-- keylength: string (nullable = true)
 |    |-- packagename: string (nullable = true)
 |    |-- privilegesassigned: string 

In [18]:
# Project only the "task" column and print the result as a text table.
df.select("task").show()

+--------------------+
|                task|
+--------------------+
|Security State Ch...|
|       Special Logon|
|               Logon|
+--------------------+

