In [2]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import *



#Reading data.

def parsingData(inputPath, inputPath2, spark):
    """Load two headered CSV sources, stack them, and keep the modelling columns.

    Both paths are read with ``header='True'`` and combined with ``union``.
    Five flow-statistics columns are cast to double and given short names;
    the `` Label`` column is carried through uncast as ``Label``.

    Args:
        inputPath: Path (or glob) of the first CSV source.
        inputPath2: Path (or glob) of the second CSV source.
        spark: Object whose ``read`` attribute is used to load the CSV data.

    Returns:
        A DataFrame with the columns ``FD``, `` FIAM``, ``FP``, ``FB``,
        ``TLFP`` (all double) followed by ``Label``.
    """
    # Source CSV header -> short output name. Several source headers really
    # do start with a space, so the keys must be matched exactly.
    # NOTE(review): the leading space in " FIAM" looks accidental; it is kept
    # so the returned schema does not change -- confirm before renaming.
    featureNames = [
        (" Flow Duration", "FD"),
        (" Flow IAT Mean", " FIAM"),
        (" Flow Packets/s", "FP"),
        ("Flow Bytes/s", "FB"),
        ("Total Length of Fwd Packets", "TLFP"),
    ]

    first = spark.read.options(header='True').csv(inputPath)
    second = spark.read.options(header='True').csv(inputPath2)
    # union() pairs columns by position, so both sources are expected to
    # share the same column order.
    combined = first.union(second)

    # Cast and rename in a single projection. Only the columns that are kept
    # get cast, and indexing the DataFrame by name avoids depending on a
    # module-level `col` helper being in scope.
    projected = [
        combined[source].cast("double").alias(target)
        for source, target in featureNames
    ]
    projected.append(combined[" Label"].alias("Label"))

    return combined.select(*projected)

def init():
    """Build (or fetch) the SparkSession used by this notebook.

    The session is configured with the application name 'AndroidMalwareML'
    and the "local" master.

    Returns:
        The SparkSession produced by ``getOrCreate()``.
    """
    builder = SparkSession.builder
    builder = builder.appName('AndroidMalwareML')
    builder = builder.master("local")
    return builder.getOrCreate()

# Start Spark and load the combined Ransomware + Adware records.
# NOTE: `sc` holds the SparkSession returned by init().
sc = init()
data = parsingData(
    "hdfs://namenode:9000/data/Ransomware/*.csv",
    "hdfs://namenode:9000/data/Adware/*.csv",
    sc,
)
data.printSchema()

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="Label", outputCol="indexedLabel").fit(data)

# Every column except the last one is assembled into the feature vector.
# The list is deliberately not named `col`, which would shadow the `col`
# function pulled in by `from pyspark.sql.functions import *`.
featureCols = data.columns[:-1]
vecta = VectorAssembler(inputCols=featureCols, outputCol="features")
dataTemp = vecta.transform(data)

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 5 distinct values are treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features", outputCol="indexedFeatures", maxCategories=5
).fit(dataTemp)

# Split the data into training and test sets (30% held out for testing).
# No seed is passed to randomSplit here.
(trainingData, testData) = dataTemp.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline.
# The assembler is not a stage: it was already applied to build dataTemp.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test accuracy = %g " % (accuracy))
print("Test Error = %g " % (1.0 - accuracy))

# The fitted tree is the third pipeline stage (after the two indexers).
treeModel = model.stages[2]
# summary only
print(treeModel)
treeModel.write().overwrite().save("hdfs://namenode:9000/user/jovyan/model")

root
 |-- FD: double (nullable = true)
 |--  FIAM: double (nullable = true)
 |-- FP: double (nullable = true)
 |-- FB: double (nullable = true)
 |-- TLFP: double (nullable = true)
 |-- Label: string (nullable = true)



                                                                                

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         1.0|[2.0,2.0,1000000....|
|       0.0|         1.0|[3.0,3.0,666666.6...|
|       0.0|         1.0|[3.0,3.0,666666.6...|
|       0.0|         1.0|[4.0,4.0,500000.0...|
|       0.0|         1.0|[4.0,4.0,500000.0...|
+----------+------------+--------------------+
only showing top 5 rows



                                                                                

Test accuracy = 0.644109 
Test Error = 0.355891 
DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d045584555e7, depth=5, numNodes=33, numClasses=2, numFeatures=5


                                                                                