In [1]:
!pip install neo4j



In [10]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType
from pyspark.sql.functions import from_json
from pyspark.sql.functions import from_csv
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from neo4j import GraphDatabase

In [11]:
# Spark session & context
def init():
    """Return the SparkSession used by this notebook.

    The session is configured with master ``local`` and application name
    ``AndMalware-consumer``, and requests the Spark SQL Kafka package
    through ``spark.jars.packages``.

    Returns:
        The object produced by ``SparkSession.builder...getOrCreate()``.
    """
    kafka_package = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1"

    session_builder = SparkSession.builder.master('local')
    session_builder = session_builder.appName('AndMalware-consumer')
    # Add the Kafka package so the "kafka" stream format is available.
    session_builder = session_builder.config("spark.jars.packages", kafka_package)
    return session_builder.getOrCreate()

In [12]:
# Shared session for the rest of the notebook. Despite the name, `sc` holds
# the SparkSession returned by init(), not a SparkContext.
sc = init()
# Create stream dataframe setting kafka server, topic and offset option
def getReadStream(spark, bootstrap_servers="kafka-server:9092", topic="AndMalwer",
                  starting_offsets="earliest"):
    """Open a Kafka stream and return it with ``key``/``value`` cast to strings.

    The defaults reproduce the values that were previously hard-coded, so
    ``getReadStream(spark)`` behaves exactly as before.

    Args:
        spark: Session object exposing ``readStream`` (the SparkSession).
        bootstrap_servers: Value for the ``kafka.bootstrap.servers`` option.
        topic: Topic passed to the ``subscribe`` option.
        starting_offsets: Value for the ``startingOffsets`` option.

    Returns:
        The loaded streaming DataFrame with its ``key`` and ``value`` columns
        replaced by string casts; every other column is left untouched.
    """
    # NOTE(review): the default topic is spelled "AndMalwer" while the Spark
    # app is named "AndMalware-consumer" - confirm it matches the producer.
    kafka_stream = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", bootstrap_servers)
        .option("startingOffsets", starting_offsets)
        .option("subscribe", topic)
        .load()
    )

    # withColumn with an existing name replaces that column in place, so the
    # remaining source columns pass through unchanged.
    return (
        kafka_stream
        .withColumn("key", kafka_stream["key"].cast(StringType()))
        .withColumn("value", kafka_stream["value"].cast(StringType()))
    )


In [18]:
def foreach_batch_function(df, epoch_id):
    """Write one micro-batch to Neo4j using the ``org.neo4j.spark.DataSource`` format.

    Connection settings are read from the environment so credentials do not
    have to live in the notebook. Each falls back to the value that was
    previously hard-coded:

    * ``NEO4J_URL``      (default ``bolt://neo4j:7687``)
    * ``NEO4J_USERNAME`` (default ``neo4j``)
    * ``NEO4J_PASSWORD`` (default ``neo``)

    Args:
        df: Batch DataFrame to persist. The Cypher query references
            ``event.MaxFD``, so the rows presumably need a ``MaxFD`` column.
        epoch_id: Batch identifier passed by the caller; not used here.

    Returns:
        None.
    """
    neo4j_url = os.environ.get("NEO4J_URL", "bolt://neo4j:7687")
    neo4j_user = os.environ.get("NEO4J_USERNAME", "neo4j")
    neo4j_password = os.environ.get("NEO4J_PASSWORD", "neo")

    # NOTE(review): the query uses CREATE, so every call adds new nodes for the
    # rows it receives - confirm that is intended.
    (
        df.write
        .format("org.neo4j.spark.DataSource")
        .mode("Append")
        .option("url", neo4j_url)
        .option("authentication.type", "basic")
        .option("authentication.basic.username", neo4j_user)
        .option("authentication.basic.password", neo4j_password)
        .option("query", "CREATE (n:Mala {maxFD: event.MaxFD})")
        .save()
    )

In [None]:
# Raw Kafka stream with `key` and `value` already cast to strings.
df1 = getReadStream(sc)

# Ordered layout of the comma-separated message in `value`:
# (column name, Spark type to cast to). The position in this list is the
# index into split(value, ','); None keeps the field as the string from split().
flow_fields = [
    ("Source_IP", None),
    ("Source_Port", IntegerType()),
    ("Destination_IP", None),
    ("Destination_Port", IntegerType()),
    ("Timestamp", TimestampType()),
    ("Flow_Duration", IntegerType()),
    ("Total_Fwd_Packets", IntegerType()),
    ("Total_Bwd_Packets", IntegerType()),
    ("Total_Length_of_Fwd_Packets", IntegerType()),
    ("Total_Length_of_Bwd_Packets", IntegerType()),
    ("Flow_Bytess", IntegerType()),
    ("Flow_Packetss", DoubleType()),
]

# Split every message on commas and name the pieces by position.
df2 = df1.selectExpr(*[
    "split(value,',')[{}] as {}".format(position, field_name)
    for position, (field_name, _) in enumerate(flow_fields)
])

# Apply the casts in a single projection, keeping column names and order.
# The former `.withColumn("Destination_IP", ... .alias("DIP"))` step was
# dropped: withColumn names the column itself, so the alias had no effect.
df3 = df2.select(*[
    df2[field_name] if field_type is None
    else df2[field_name].cast(field_type).alias(field_name)
    for field_name, field_type in flow_fields
])

# Aggregate Flow_Duration over 10-minute windows of the Timestamp column.
# `sum` and `window` here are the pyspark.sql.functions versions brought in
# by the star import, not Python builtins.
# NOTE(review): the result is named "MaxFD" but is computed with sum() -
# confirm whether max() was intended before relying on the value.
wdf = df3.groupBy(window(df3.Timestamp, "10 minutes")).agg(
    sum(col("Flow_Duration")).alias("MaxFD")
)

# Start the streaming query: every 3 seconds hand the batch to
# foreach_batch_function (defined earlier in this notebook), in 'update' mode.
query = (
    wdf.writeStream
    .foreachBatch(foreach_batch_function)
    .outputMode('update')
    .trigger(processingTime='3 seconds')
    .start()
)

# Block this cell until the streaming query terminates.
query.awaitTermination()
