In [1]:
# 用于验证使用平台读取zeek解析的log文件并进行处理的demo
# 创建SparkSession会话，并连接es
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
import os

# Create (or reuse) the Hive-enabled SparkSession used by the rest of the notebook.
# The chain is wrapped in parentheses rather than joined with backslashes: the
# previous form ended with a dangling "\" after getOrCreate(), which only parsed
# because the next line happened to be blank.
spark = (
    SparkSession.builder
    .appName("NetFlowReader")
    .enableHiveSupport()
    .getOrCreate()
)

# Partially configured reader for the Elasticsearch data source; later cells call
# es_reader.load(<index>) to obtain a DataFrame.
es_reader = (
    spark.read
    .format("org.elasticsearch.spark.sql")
    .option("inferSchema", "true")
    .option("es.nodes", "elksj-elasticsearch:9200")
)

In [2]:
# 导入zat,将log转换为Dataframe
import zat
from zat import log_to_sparkdf

# Print the library versions in use, one per line.
print(
    'ZAT: {:s}'.format(zat.__version__),
    'Pandas: {:s}'.format(pd.__version__),
    sep='\n',
)

ZAT: 0.4.4
Pandas: 1.0.4


In [15]:
# Load the Zeek log index stored in Elasticsearch (the log is fetched from
# Zeek by Logstash via Kafka and then stored in ES) and preview the rows.
zeeklogDF = es_reader.load(path="zeek-2022.03.21")
zeeklogDF.show()

+--------------------+--------+------------+----+----+-----+----+-----------+-----------+--------------+--------------------+--------+------+----+-----+----+
|          @timestamp|@version|capture_loss|conn| dns|files|http|known_certs|known_hosts|known_services|      loaded_scripts|log_type|notice| ssl|stats|x509|
+--------------------+--------+------------+----+----+-----+----+-----------+-----------+--------------+--------------------+--------+------+----+-----+----+
|2022-03-21 04:08:...|       1|        null|null|null| null|null|       null|       null|          null|[        /usr/loc...|    zeek|  null|null| null|null|
|2022-03-21 04:08:...|       1|        null|null|null| null|null|       null|       null|          null|[        /usr/loc...|    zeek|  null|null| null|null|
|2022-03-21 04:08:...|       1|        null|null|null| null|null|       null|       null|          null|[        /usr/loc...|    zeek|  null|null| null|null|
|2022-03-21 04:08:...|       1|        null|null|nul

In [16]:
# Project the DataFrame down to a chosen subset of fields and preview it.
features = [
    '@timestamp',
    '@version',
    'log_type',
    'loaded_scripts',
]
zeeklogDF.select(*features).show()

+--------------------+--------+--------+--------------------+
|          @timestamp|@version|log_type|      loaded_scripts|
+--------------------+--------+--------+--------------------+
|2022-03-21 04:08:...|       1|    zeek|[        /usr/loc...|
|2022-03-21 04:08:...|       1|    zeek|[        /usr/loc...|
|2022-03-21 04:08:...|       1|    zeek|[        /usr/loc...|
|2022-03-21 04:08:...|       1|    zeek|[        /usr/loc...|
|2022-03-21 04:08:...|       1|    zeek|[          /usr/l...|
|2022-03-21 04:08:...|       1|    zeek|[            /usr...|
|2022-03-21 04:08:...|       1|    zeek|[              /u...|
|2022-03-21 04:08:...|       1|    zeek|[    /usr/local/z...|
|2022-03-21 04:08:...|       1|    zeek|[    /usr/local/z...|
|2022-03-21 04:08:...|       1|    zeek|[  /usr/local/zee...|
|2022-03-21 04:08:...|       1|    zeek|[    /usr/local/z...|
|2022-03-21 04:08:...|       1|    zeek|[  /usr/local/zee...|
|2022-03-21 04:08:...|       1|    zeek|[    /usr/local/z...|
|2022-03

In [17]:
# Convert the selected columns to a pandas DataFrame and show its first rows.
# NOTE(review): toPandas() is documented to collect the data onto the driver --
# confirm the index is small enough for that.
zeekpdDF = zeeklogDF.select(*features).toPandas()
zeekpdDF.head(n=5)

Unnamed: 0,@timestamp,@version,log_type,loaded_scripts
0,2022-03-21 04:08:46.135,1,zeek,( /usr/local/zeek/share/zeek/base/proto...
1,2022-03-21 04:08:46.135,1,zeek,( /usr/local/zeek/share/zeek/base/proto...
2,2022-03-21 04:08:46.136,1,zeek,( /usr/local/zeek/share/zeek/base/proto...
3,2022-03-21 04:08:46.136,1,zeek,( /usr/local/zeek/share/zeek/base/proto...
4,2022-03-21 04:08:46.136,1,zeek,( /usr/local/zeek/share/zeek/base/fil...


In [18]:
# Turn the pandas DataFrame back into a Spark DataFrame and show its first Row.
newDF = spark.createDataFrame(data=zeekpdDF)
newDF.first()

Row(@timestamp=datetime.datetime(2022, 3, 21, 4, 8, 46, 135000), @version='1', log_type='zeek', loaded_scripts=Row(name='        /usr/local/zeek/share/zeek/base/protocols/ssl/main.zeek'))

In [19]:
# Store the Spark DataFrame to Elasticsearch.
# The target resource string is built from the two names below, joined by "/".
newDF.write.format('org.elasticsearch.spark.sql').options(**{
    "es.nodes": "elksj-elasticsearch:9200",
    'es.resource': '%s/%s' % ('simplify_zeeklog', 'newDF'),
}).save()