# HELK: Checking Spark and GraphFrames Integrations
----------------------------------------------------------------------------
## Goals:
* Test if Jupyter can talk to Spark & GraphFrames
* Test if Spark & GraphFrames can pull data from ES
* Show the basics of the HELK integrations with advanced analytics libraries

## Check the Spark Context via the variable `sc`

In [1]:
# `sc` is not defined anywhere in this notebook; presumably the PySpark kernel
# injects it at start-up (TODO confirm against the kernel configuration).
# Evaluating it as a bare last expression displays the context, so a NameError
# here means Jupyter is not wired to Spark.
sc

## Create a Spark RDD on top of Elasticsearch (logs-endpoint-winevent-sysmon-* as source)

In [2]:
# Expose the Sysmon indices as an RDD through the elasticsearch-hadoop
# MapReduce input format. Each record arrives as a (document id, fields) pair.
es_rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf={
        # index pattern / mapping type to read from
        "es.resource": "logs-endpoint-winevent-sysmon-*/doc",
    },
)

# Pull a single record to prove the round trip to Elasticsearch works.
es_rdd.first()

(u'EyaM12ABGZB0cH7uy-kS',
 {u'@timestamp': u'2018-01-08T20:19:23.195Z',
  u'@version': u'1',
  u'beat': {u'hostname': u'WD-HR001',
   u'name': u'WD-HR001',
   u'version': u'6.0.0'},
  u'computer_name': u'WD-HR001.wardog.com',
  u'event': {u'creationtime': {u'utc': u'2018-01-08 20:19:22.978'}},
  u'event_id': 9,
  u'host': u'WD-HR001',
  u'level': u'Information',
  u'log_name': u'Microsoft-Windows-Sysmon/Operational',
  u'opcode': u'Info',
  u'process': {u'guid': u'{DBA5A4A0-2F96-5A50-0000-00106D560100}',
   u'id': 1428,
   u'name': u'C:\\Windows\\System32\\svchost.exe'},
  u'process_id': 2216,
  u'provider_guid': u'{5770385F-C22A-43E0-BF4C-06F5698FFBD9}',
  u'rawaccess': {u'read': {u'device': u'\\Device\\HarddiskVolume2'}},
  u'record_number': u'1006036',
  u'source_name': u'Microsoft-Windows-Sysmon',
  u'subject': {u'user': {u'domain': u'NT AUTHORITY',
    u'name': u'SYSTEM',
    u'sid': u'S-1-5-18'}},
  u'tags': (u'beats_input_codec_plain_applied', u'_grokparsefailure'),
  u'task': u

## Create a Spark RDD on top of Elasticsearch (logs-endpoint-winevent-security-* as source)

In [3]:
# Same check against the Windows Security indices. Note that this rebinds
# `es_rdd`, so the Sysmon RDD created earlier is no longer reachable by name.
es_rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf={
        # index pattern / mapping type to read from
        "es.resource": "logs-endpoint-winevent-security-*/doc",
    },
)

# Pull a single record to prove the round trip to Elasticsearch works.
es_rdd.first()

(u'tiZ912ABGZB0cH7uLt6P',
 {u'@timestamp': u'2018-01-08T20:31:24.258Z',
  u'@version': u'1',
  u'beat': {u'hostname': u'WD-DC001',
   u'name': u'WD-DC001',
   u'version': u'6.1.1'},
  u'computer_name': u'WD-DC001.wardog.com',
  u'event_data': {},
  u'event_id': 4703,
  u'host': u'WD-DC001',
  u'keywords': (u'Audit Success',),
  u'level': u'Information',
  u'log_name': u'Security',
  u'message': u'A token right was adjusted.\n\nSubject:\n\tSecurity ID:\t\tS-1-5-18\n\tAccount Name:\t\tWD-DC001$\n\tAccount Domain:\t\tWARDOG\n\tLogon ID:\t\t0x3E7\n\nTarget Account:\n\tSecurity ID:\t\tS-1-0-0\n\tAccount Name:\t\tWD-DC001$\n\tAccount Domain:\t\tWARDOG\n\tLogon ID:\t\t0x3E7\n\nProcess Information:\n\tProcess ID:\t\t0xe8\n\tProcess Name:\t\tC:\\Windows\\System32\\svchost.exe\n\nEnabled Privileges:\n\t\t\tSeAssignPrimaryTokenPrivilege\n\t\t\tSeIncreaseQuotaPrivilege\n\t\t\tSeSecurityPrivilege\n\t\t\tSeTakeOwnershipPrivilege\n\t\t\tSeLoadDriverPrivilege\n\t\t\tSeSystemtimePrivilege\n\t\t\tSeBack

## Import the GraphFrames package

In [4]:
from graphframes import *

In [5]:
# Smoke test for GraphFrames: build a three-vertex toy graph and run two
# trivial queries. `GraphFrame` is already in scope from the import in cell
# In [4], so no import is repeated here.

# Vertex DataFrame: one row per vertex, with the unique ID in column "id".
v = sqlContext.createDataFrame(
    [
        ("a", "Alice", 34),
        ("b", "Bob", 36),
        ("c", "Charlie", 30),
    ],
    ["id", "name", "age"],
)

# Edge DataFrame: one row per directed edge, using "src" and "dst" columns.
e = sqlContext.createDataFrame(
    [
        ("a", "b", "friend"),
        ("b", "c", "follow"),
        ("c", "b", "follow"),
    ],
    ["src", "dst", "relationship"],
)

# Combine vertices and edges into a GraphFrame.
g = GraphFrame(v, e)

# Query: in-degree of each vertex (expected: b -> 2, c -> 1; "a" has no
# incoming edge and therefore does not appear).
g.inDegrees.show()

# Query: number of "follow" edges in the graph (expected: 2). As the last
# expression of the cell, the count is rendered as the cell's output.
g.edges.filter("relationship = 'follow'").count()

+---+--------+
| id|inDegree|
+---+--------+
|  c|       1|
|  b|       2|
+---+--------+



2

## Create a basic SparkSession

In [6]:
# Obtain (or reuse) a SparkSession named "HELK". The extra setting asks the
# Elasticsearch connector to read the "tags" field as an array.
spark = (
    SparkSession.builder
    .appName("HELK")
    .config("es.read.field.as.array.include", "tags")
    .getOrCreate()
)

## Spark SQL Basic query (logs-endpoint-winevent-security-* as source)

In [7]:
# Load the Windows Security indices as a DataFrame through the
# elasticsearch-hadoop Spark SQL data source.
df = (
    spark.read
    .format("org.elasticsearch.spark.sql")
    .load("logs-endpoint-winevent-security-*/doc")
)

In [8]:
# Print the schema of the DataFrame as a tree, to check that the index fields
# arrive with sensible types (e.g. @timestamp as timestamp, event_id as long).
df.printSchema()

root
 |-- @timestamp: timestamp (nullable = true)
 |-- @version: string (nullable = true)
 |-- activity_id: string (nullable = true)
 |-- beat: struct (nullable = true)
 |    |-- hostname: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- version: string (nullable = true)
 |-- computer_name: string (nullable = true)
 |-- destination: struct (nullable = true)
 |    |-- hostnameid: string (nullable = true)
 |    |-- ip: string (nullable = true)
 |    |-- port: struct (nullable = true)
 |    |    |-- number: integer (nullable = true)
 |    |-- userid: string (nullable = true)
 |-- event_data: struct (nullable = true)
 |    |-- ActiveProfile: string (nullable = true)
 |-- event_id: long (nullable = true)
 |-- externaldevice: struct (nullable = true)
 |    |-- classid: string (nullable = true)
 |    |-- classname: string (nullable = true)
 |    |-- compatibleids: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- id: string (nullable

In [9]:
# Project a single column to confirm Spark SQL queries run end to end.
# With no arguments, show() prints only the first 20 rows and truncates long
# strings, which is why the values below end in "...".
df.select("task").show()

+--------------------+
|                task|
+--------------------+
|Token Right Adjus...|
|Filtering Platfor...|
|Filtering Platfor...|
|Filtering Platfor...|
|Filtering Platfor...|
|Filtering Platfor...|
|Token Right Adjus...|
|Filtering Platfor...|
|Token Right Adjus...|
|Token Right Adjus...|
|Token Right Adjus...|
|Token Right Adjus...|
|Token Right Adjus...|
|Token Right Adjus...|
|Token Right Adjus...|
|Filtering Platfor...|
|Filtering Platfor...|
|Filtering Platfor...|
|Token Right Adjus...|
|Token Right Adjus...|
+--------------------+
only showing top 20 rows

