In [1]:
using CSV
using DataFrames
using Random

include("functions.jl");


In [2]:
using ScikitLearn

@sk_import svm:SVC;
@sk_import tree:DecisionTreeClassifier;
@sk_import ensemble:VotingClassifier
@sk_import neighbors: KNeighborsClassifier;


## Read Data

In [4]:
file_path = "datasets/super_simplified_Android_Malware.csv"

data = CSV.File(file_path, header=true) |> DataFrame;

In [5]:
describe(data, :all)

Row,variable,mean,std,min,q25,median,q75,max,sum,nunique,nuniqueall,nmissing,nnonmissing,first,last,eltype
Unnamed: 0_level_1,Symbol,Union…,Union…,Any,Union…,Union…,Union…,Any,Union…,Union…,Int64,Int64,Int64,Any,Any,DataType
1,Flow ID,,,1.31.173.21-10.42.0.151-80-36854-6,,,,8.8.8.8-10.42.0.211-53-3181-17,,3502,3502,0,3557,157.240.0.36-10.42.0.211-443-55364-6,10.42.0.211-10.42.0.1-3890-53-17,String
2,Source IP,,,0.0.0.0,,,,96.6.164.184,,246,246,0,3557,10.42.0.211,10.42.0.211,String15
3,Source Port,39310.9,17860.1,0,34948.0,43520.0,52110.0,65400,139828814,,3013,0,3557,55364,3890,Int64
4,Destination IP,,,1.31.173.21,,,,98.139.225.43,,826,826,0,3557,157.240.0.36,10.42.0.1,String15
5,Destination Port,5390.09,14590.5,0.0,80.0,443.0,443.0,60729.0,1.91725e7,,400,0,3557,443.0,53.0,Float64
6,Protocol,8.26061,4.51586,0.0,6.0,6.0,6.0,17.0,29383.0,,3,0,3557,6.0,17.0,Float64
7,Timestamp,,,04/07/2017 10:08:16,,,,30/06/2017 12:59:10,,3332,3332,0,3557,26/06/2017 12:43:14,13/06/2017 08:39:01,String31
8,Flow Duration,1.11761e7,2.19812e7,2,48777.0,557126.0,1.08321e7,119977227,39753215647,,3442,0,3557,65319091,48681,Int64
9,Total Fwd Packets,7.5932,59.0075,1,1.0,2.0,5.0,3246,27009,,97,0,3557,9,1,Int64
10,Total Backward Packets,11.0748,159.416,0,0.0,1.0,4.0,8452,39393,,117,0,3557,9,1,Int64


In [6]:
import StatsBase: countmap

columns_to_drop = ["Flow ID", " Timestamp"]
columns = names(data)

println("Size of dataframe before dropping columns $(size(data))")
for column in 1:size(data, 2)
    unique_values = countmap(data[:, column])

    if length(unique_values) == 1
        println("Adding column $(columns[column])")
        # println(unique_values)
        push!(columns_to_drop, columns[column])
    end
    
end

select!(data, Not(columns_to_drop))

println("Size of dataframe after dropping columns $(size(data))")

dropmissing!(data)

println("Size of dataframe after dropping nulls $(size(data))")

unique_data = unique(data)

println("Size of dataframe after dropping duplicating rows $(size(data))")

Size of dataframe before dropping columns (3557, 85)
Adding column  Bwd PSH Flags
Adding column  Fwd URG Flags
Adding column  Bwd URG Flags
Adding column  RST Flag Count
Adding column  CWE Flag Count
Adding column  ECE Flag Count
Adding column Fwd Avg Bytes/Bulk
Adding column  Fwd Avg Packets/Bulk
Adding column  Fwd Avg Bulk Rate
Adding column  Bwd Avg Bytes/Bulk
Adding column  Bwd Avg Packets/Bulk
Adding column Bwd Avg Bulk Rate
Size of dataframe after dropping columns (3557, 71)
Size of dataframe after dropping nulls (3557, 71)
Size of dataframe after dropping duplicating rows (3557, 71)


In [7]:
countmap(data[:, :Label])

Dict{String31, Int64} with 4 entries:
  "Benign"              => 237
  "Android_Scareware"   => 1171
  "Android_SMS_Malware" => 674
  "Android_Adware"      => 1475

In [8]:
function ip_to_decimal(ip)
    # Split the IP address into octets
    octets = split(ip, '.')
    # Convert each octet to binary and combine them into a single 32-bit number
    binary = join([string(parse(Int, octet, base=10), base=2, pad=8) for octet in octets])
    decimal = parse(Int, binary, base=2) # Convert binary to decimal
    return decimal
end

source_ips = data[!, :" Source IP"];
destination_ips = data[!, :" Destination IP"];

data[!, :"Source IP Decimal"] = map(ip -> ip_to_decimal(ip), source_ips);
data[!, :"Destination IP Decimal"] = map(ip -> ip_to_decimal(ip), destination_ips);

select!(data, Not([" Source IP", " Destination IP"]));

In [9]:
describe(data)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,Source Port,39310.9,0,43520.0,65400,0,Int64
2,Destination Port,5390.09,0.0,443.0,60729.0,0,Float64
3,Protocol,8.26061,0.0,6.0,17.0,0,Float64
4,Flow Duration,1.11761e7,2,557126.0,119977227,0,Int64
5,Total Fwd Packets,7.5932,1,2.0,3246,0,Int64
6,Total Backward Packets,11.0748,0,1.0,8452,0,Int64
7,Total Length of Fwd Packets,665.05,0.0,31.0,110678.0,0,Float64
8,Total Length of Bwd Packets,12084.5,0.0,17.0,1.22247e7,0,Float64
9,Fwd Packet Length Max,218.112,0.0,31.0,1460.0,0,Float64
10,Fwd Packet Length Min,11.5516,0.0,0.0,1460.0,0,Float64


In [10]:
output_data = data[!, :Label];
select!(data, Not(:Label))
input_data = Matrix(data[!, 1:size(data, 2)]);

In [11]:
size(input_data)

(3557, 70)

## First approach: binary classification

In [12]:
function transform_binary_class(output_data)
    binary_labels = output_data .!= "Benign"
    return binary_labels
end

binary_labels = transform_binary_class(output_data)
@assert binary_labels isa BitVector
@assert input_data isa Matrix

In [13]:
countmap(binary_labels)

Dict{Bool, Int64} with 2 entries:
  0 => 237
  1 => 3320

### Preprocessing

In [14]:
Random.seed!(42)

train_indexes, test_indexes = holdOut(size(input_data, 1), 0.2)

train_input = convert(Array{Float32, 2}, input_data[train_indexes, :])
train_binary_output = binary_labels[train_indexes]

normalizationParameters = calculateMinMaxNormalizationParameters(train_input)

normalizeMinMax!(train_input, normalizationParameters)

test_input = convert(Array{Float32, 2}, input_data[test_indexes, :])
test_binary_output = binary_labels[test_indexes]

normalizeMinMax!(test_input, normalizationParameters)

@assert size(test_input, 1) == size(test_binary_output, 1)
@assert size(train_input, 1) == size(train_binary_output, 1)

In [15]:
Random.seed!(42)

kFolds = 10
crossValidationIndexes = crossvalidation(train_binary_output, kFolds);

In [16]:
function generate_latex_table(metrics::Dict{String, <: Any}, final::Bool)
    
    topology = metrics["topology"]
    accuracy = metrics["accuracy"]
    recall = metrics["recall"]
    specificity = metrics["specificity"]
    f1_score = metrics["f1_score"]
    
    if final
        confusion_matrix = metrics["confusion_matrix"]
        println("$topology & $(round(accuracy*100, digits=2))\\%  & $(round(recall*100, digits=2))\\%  & $(round(specificity*100, digits=2))\\%  & $(round(f1_score*100, digits=2))\\% & $confusion_matrix \\\\")
    else
        std_accuracy = metrics["std_accuracy"]
        std_recall = metrics["std_recall"]
        std_specificity = metrics["std_specificity"]
        std_f1_score = metrics["std_f1_score"]
        println("$topology & $(round(accuracy*100, digits=2))\\% \\textit{($(round(std_accuracy, digits = 2)))} & $(round(recall*100, digits=2))\\% \\textit{($(round(std_recall, digits = 2)))} & $(round(specificity*100, digits=2))\\% \\textit{($(round(std_specificity, digits = 2)))} & $(round(f1_score*100, digits=2))\\% \\textit{($(round(std_f1_score, digits = 2)))} \\\\")
    end
    
end

generate_latex_table (generic function with 1 method)

### kNN

In [17]:
include("functions.jl")
knnParameters = Dict("modelType" => :kNN, "numNeighboors" => 0)

ks = [3 , 5, 7, 10, 15, 20]
for k in ks
    knnParameters["numNeighboors"] = k
    metricsCV = (modelCrossValidation(knnParameters["modelType"], knnParameters, train_input, train_binary_output, crossValidationIndexes))
    metricsCV["topology"] = k

    generate_latex_table(metricsCV, false)
end

println("----------------------------------------------------------------")
for k in ks
    knnParameters["numNeighboors"] = k
    metrics = createAndTrainFinalModel(knnParameters["modelType"], knnParameters, train_input, train_binary_output, test_input, test_binary_output)
    metrics["topology"] = k

    generate_latex_table(metrics, true)
end

3 & 92.79\% \textit{(0.01)} & 98.95\% \textit{(0.01)} & 1.14\% \textit{(0.02)} & 96.26\% \textit{(0.0)} \\
5 & 93.43\% \textit{(0.0)} & 99.7\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.6\% \textit{(0.0)} \\
7 & 93.57\% \textit{(0.0)} & 99.85\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.68\% \textit{(0.0)} \\
10 & 93.67\% \textit{(0.0)} & 99.96\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.73\% \textit{(0.0)} \\
15 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
20 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
----------------------------------------------------------------
3 & 90.03\%  & 97.86\%  & 1.72\%  & 94.74\% & [640 57; 14 1] \\
5 & 91.43\%  & 99.54\%  & 0.0\%  & 95.52\% & [651 58; 3 0] \\
7 & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \\
10 & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \\
15 & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \

### Decision Tree

In [18]:
include("functions.jl")
dtParameters = Dict("modelType" => :DecisionTree, "maxDepth" => 1)

depths = [3, 5, 7, 10, 15, typemax(Int)]
for depth in depths
    dtParameters["maxDepth"] = depth
    metricsCV = (modelCrossValidation(dtParameters["modelType"], dtParameters, train_input, train_binary_output, crossValidationIndexes))
    metricsCV["topology"] = depth

    generate_latex_table(metricsCV, false)

end

println("----------------------------------------------------------------")

for depth in depths
    dtParameters["maxDepth"] = depth
    metrics = createAndTrainFinalModel(dtParameters["modelType"], dtParameters, train_input, train_binary_output, test_input, test_binary_output)
    metrics["topology"] = depth

    generate_latex_table(metrics, true)

end


3 & 93.22\% \textit{(0.01)} & 99.47\% \textit{(0.01)} & 0.0\% \textit{(0.0)} & 96.49\% \textit{(0.0)} \\
5 & 92.9\% \textit{(0.01)} & 99.06\% \textit{(0.01)} & 1.11\% \textit{(0.02)} & 96.32\% \textit{(0.0)} \\
7 & 92.02\% \textit{(0.01)} & 98.09\% \textit{(0.01)} & 1.67\% \textit{(0.03)} & 95.84\% \textit{(0.01)} \\
10 & 90.65\% \textit{(0.01)} & 96.51\% \textit{(0.02)} & 3.37\% \textit{(0.04)} & 95.08\% \textit{(0.01)} \\
15 & 88.19\% \textit{(0.03)} & 93.81\% \textit{(0.03)} & 4.51\% \textit{(0.05)} & 93.69\% \textit{(0.01)} \\
9223372036854775807 & 86.79\% \textit{(0.02)} & 92.24\% \textit{(0.02)} & 5.59\% \textit{(0.06)} & 92.89\% \textit{(0.01)} \\
----------------------------------------------------------------
3 & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \\
5 & 91.71\%  & 99.85\%  & 0.0\%  & 95.68\% & [653 58; 1 0] \\
7 & 89.61\%  & 97.55\%  & 0.0\%  & 94.52\% & [638 58; 16 0] \\
10 & 88.06\%  & 95.72\%  & 1.72\%  & 93.64\% & [626 57; 28 1] \\
15 & 85.25\%  & 92.6

### SVM

In [19]:
include("functions.jl")
svmParameters = Dict("modelType" => :SVM, "C" => 1, "kernel" => "linear", "degree" => 3, "gamma" => "scale")

svms = [
    ("rbf", 0.1),
    ("rbf", 1.0),
    ("rbf", 10.0),
    ("poly", 0.1),
    ("poly", 1.0),
    ("linear", 0.1),
    ("linear", 1.0),
    ("linear", 10.0),
]

for (kernel, C) in svms
    svmParameters["kernel"] = kernel
    svmParameters["C"] = C
    metricsCV = (modelCrossValidation(svmParameters["modelType"], svmParameters, train_input, train_binary_output, crossValidationIndexes))
    metricsCV["topology"] = kernel * string(C)

    generate_latex_table(metricsCV, false)

end

println("----------------------------------------------------------------")

for (kernel, C) in svms
    svmParameters["kernel"] = kernel
    svmParameters["C"] = C
    metrics = createAndTrainFinalModel(svmParameters["modelType"], svmParameters, train_input, train_binary_output, test_input, test_binary_output)
    metrics["topology"] = kernel * string(C)

    generate_latex_table(metrics, true)

end


rbf0.1 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
rbf1.0 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
rbf10.0 & 93.64\% \textit{(0.0)} & 99.93\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.71\% \textit{(0.0)} \\
poly0.1 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
poly1.0 & 93.67\% \textit{(0.0)} & 99.96\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.73\% \textit{(0.0)} \\
linear0.1 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
linear1.0 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
linear10.0 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
----------------------------------------------------------------
rbf0.1 & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58;

### ANN

In [15]:
include("functions.jl")

topologies = [[20], [40], [80], [100], [60, 120], [80, 50], [80, 100], [100, 40]]
annParameters = Dict("modelType" => :ANN, "maxEpochs" => 200,
    "learningRate" => 0.01, "maxEpochsVal" => 30,
    "repetitions" => 30, "validationRatio" => 0.1,
    "transferFunctions" => fill(σ, 2))

for topology in topologies
    annParameters["topology"] = topology
    metricsCV = modelCrossValidation(annParameters["modelType"], annParameters, train_input, train_binary_output, crossValidationIndexes)
    metricsCV["topology"] = topology 

    generate_latex_table(metricsCV, false)
end

for topology in topologies
    annParameters["topology"] = topology
    metrics = createAndTrainFinalModel(annParameters["modelType"], annParameters, train_input, train_binary_output, test_input, test_binary_output)
    metrics["topology"] = topology 

    generate_latex_table(metrics, true)
end

[60, 120] & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
[80, 50] & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
[80, 100] & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
[100, 40] & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
[60, 120] & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \\
[80, 50] & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \\
[80, 100] & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \\
[100, 40] & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \\


In [37]:
include("functions.jl")

annParameters = Dict("modelType" => :ANN, "maxEpochs" => 1000,
    "learningRate" => 0.01, "maxEpochsVal" => 30,
    "repetitions" => 30, "validationRatio" => 0.1,
    "transferFunctions" => fill(σ, 2), "topology" => [100, 40])
metrics = createAndTrainFinalModel(annParameters["modelType"], annParameters, train_input, train_binary_output, test_input, test_binary_output)

(712, 1)
(2845, 1)
(2560, 1)(712, 1)
(712, 1)


Dict{String, Any} with 8 entries:
  "errorRate"                 => 0.0814607
  "f1_score"                  => 0.95754
  "specificity"               => 0.0
  "negative_predictive_value" => 0.0
  "confusion_matrix"          => [654 58; 0 0]
  "accuracy"                  => 0.918539
  "recall"                    => 1.0
  "precision"                 => 0.918539

### Ensembles

In [19]:
@sk_import ensemble:StackingClassifier
@sk_import ensemble:VotingClassifier




PyObject <class 'sklearn.ensemble._voting.VotingClassifier'>

In [20]:
include("functions.jl")

dtParameters = Dict("modelType" => :DecisionTree, "maxDepth" => 5)
knnParameters = Dict("modelType" => :kNN, "numNeighboors" => 3)
svmParameters = Dict("modelType" => :SVM, "kernel" => "rbf", "C" => 10)
Random.seed!(42)

ensemble_types = [:VotingHard, :Stacking]
final_estimators = [dtParameters, knnParameters, svmParameters]

for ensemble_type in ensemble_types
    for final_estimator in final_estimators
        metricsCV = trainClassEnsemble([:DecisionTree, :kNN, :SVM], [dtParameters, knnParameters, svmParameters], (train_input, train_binary_output), crossValidationIndexes; ensembleType = ensemble_type, final_estimator = final_estimator)
        metricsCV["topology"] = final_estimator
        generate_latex_table(metricsCV, false)

    end
end


for ensemble_type in ensemble_types
    for final_estimator in final_estimators
        metrics = createAndTrainFinalEnsemble([:DecisionTree, :kNN, :SVM], [dtParameters, knnParameters, svmParameters], (train_input, train_binary_output), (test_input, test_binary_output); ensembleType = ensemble_type, final_estimator = final_estimator)
        metrics["topology"] = final_estimator
        generate_latex_table(metrics, true)
    end
end


Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 5) & 93.67\% \textit{(0.0)} & 99.96\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.73\% \textit{(0.0)} \\
Dict{String, Any}("modelType" => :kNN, "numNeighboors" => 3) & 93.67\% \textit{(0.0)} & 99.96\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.73\% \textit{(0.0)} \\
Dict{String, Any}("modelType" => :SVM, "C" => 10, "kernel" => "rbf") & 93.67\% \textit{(0.0)} & 99.96\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.73\% \textit{(0.0)} \\
Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 5) & 93.04\% \textit{(0.01)} & 99.21\% \textit{(0.01)} & 1.11\% \textit{(0.02)} & 96.39\% \textit{(0.01)} \\
Dict{String, Any}("modelType" => :kNN, "numNeighboors" => 3) & 92.62\% \textit{(0.01)} & 98.8\% \textit{(0.01)} & 0.56\% \textit{(0.02)} & 96.16\% \textit{(0.01)} \\
Dict{String, Any}("modelType" => :SVM, "C" => 10, "kernel" => "rbf") & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)

## Second Approach

### Data balancing

In [20]:
using MLDataPattern;
Random.seed!(42)
X_bal, y_bal = oversample((input_data', binary_labels), shuffle = true)
X_bal = getobs(X_bal)'
y_bal = getobs(y_bal)

6640-element BitVector:
 1
 1
 1
 0
 0
 0
 1
 1
 1
 1
 1
 0
 0
 ⋮
 1
 0
 0
 0
 0
 1
 1
 1
 0
 1
 1
 0

In [21]:
countmap(y_bal)

Dict{Bool, Int64} with 2 entries:
  0 => 3320
  1 => 3320

In [34]:
Random.seed!(42)

train_indexes, test_indexes = holdOut(size(X_bal, 1), 0.2)

train_input = convert(Array{Float32, 2}, X_bal[train_indexes, :])
train_balanced_binary_output = y_bal[train_indexes]

normalizationParameters = calculateMinMaxNormalizationParameters(train_input)

normalizeMinMax!(train_input, normalizationParameters)

test_input = convert(Array{Float32, 2}, X_bal[test_indexes, :])
test_balanced_binary_output = y_bal[test_indexes]

normalizeMinMax!(test_input, normalizationParameters)

@assert size(test_input, 1) == size(test_balanced_binary_output, 1)
@assert size(train_input, 1) == size(train_balanced_binary_output, 1)

In [22]:
Random.seed!(42)

train_indexes, test_indexes = holdOut(size(X_bal, 1), 0.2)

train_input = convert(Array{Float32, 2}, X_bal[train_indexes, :])
train_balanced_binary_output = y_bal[train_indexes]

normalizationParameters = calculateZeroMeanNormalizationParameters(train_input)

normalizeZeroMean!(train_input, normalizationParameters)

test_input = convert(Array{Float32, 2}, X_bal[test_indexes, :])
test_balanced_binary_output = y_bal[test_indexes]

normalizeZeroMean!(test_input, normalizationParameters)

@assert size(test_input, 1) == size(test_balanced_binary_output, 1)
@assert size(train_input, 1) == size(train_balanced_binary_output, 1)

In [23]:
Random.seed!(42)

kFolds = 10
crossValidationIndexes = crossvalidation(train_balanced_binary_output, kFolds);

### kNN

In [24]:
include("functions.jl")
knnParameters = Dict("modelType" => :kNN, "numNeighboors" => 0)

ks = [3 , 5, 7, 10, 15, 20]
for k in ks
    knnParameters["numNeighboors"] = k
    metricsCV = (modelCrossValidation(knnParameters["modelType"], knnParameters, train_input, train_balanced_binary_output, crossValidationIndexes))
    metricsCV["topology"] = k

    generate_latex_table(metricsCV, false)
end

println("----------------------------------------------------------------")
for k in ks
    knnParameters["numNeighboors"] = k
    metrics = createAndTrainFinalModel(knnParameters["modelType"], knnParameters, train_input, train_balanced_binary_output, test_input, test_balanced_binary_output)
    metrics["topology"] = k

    generate_latex_table(metrics, true)
end

3 & 91.08\% \textit{(0.01)} & 82.21\% \textit{(0.02)} & 100.0\% \textit{(0.0)} & 90.22\% \textit{(0.01)} \\
5 & 87.35\% \textit{(0.02)} & 74.78\% \textit{(0.03)} & 100.0\% \textit{(0.0)} & 85.54\% \textit{(0.02)} \\
7 & 84.19\% \textit{(0.01)} & 68.47\% \textit{(0.03)} & 100.0\% \textit{(0.0)} & 81.25\% \textit{(0.02)} \\
10 & 77.94\% \textit{(0.02)} & 56.01\% \textit{(0.03)} & 100.0\% \textit{(0.0)} & 71.75\% \textit{(0.03)} \\
15 & 71.46\% \textit{(0.01)} & 50.08\% \textit{(0.02)} & 92.98\% \textit{(0.02)} & 63.74\% \textit{(0.02)} \\
20 & 65.38\% \textit{(0.02)} & 47.71\% \textit{(0.02)} & 83.16\% \textit{(0.03)} & 58.01\% \textit{(0.02)} \\
----------------------------------------------------------------
3 & 92.32\%  & 84.45\%  & 100.0\%  & 91.57\% & [554 0; 102 672] \\
5 & 89.38\%  & 78.51\%  & 100.0\%  & 87.96\% & [515 0; 141 672] \\
7 & 86.82\%  & 73.32\%  & 100.0\%  & 84.61\% & [481 0; 175 672] \\
10 & 80.72\%  & 60.98\%  & 100.0\%  & 75.76\% & [400 0; 256 672] \\
15 & 75.53\% 

### Decision Tree

In [25]:
include("functions.jl")
dtParameters = Dict("modelType" => :DecisionTree, "maxDepth" => 1)

depths = [3, 5, 7, 10, 15, nothing]
for depth in depths
    dtParameters["maxDepth"] = depth
    metricsCV = (modelCrossValidation(dtParameters["modelType"], dtParameters, train_input, train_balanced_binary_output, crossValidationIndexes))
    metricsCV["topology"] = depth

    generate_latex_table(metricsCV, false)

end

println("----------------------------------------------------------------")

for depth in depths
    dtParameters["maxDepth"] = depth
    metrics = createAndTrainFinalModel(dtParameters["modelType"], dtParameters, train_input, train_balanced_binary_output, test_input, test_balanced_binary_output)
    metrics["topology"] = depth

    generate_latex_table(metrics, true)

end


3 & 57.83\% \textit{(0.03)} & 37.99\% \textit{(0.12)} & 77.8\% \textit{(0.1)} & 46.36\% \textit{(0.1)} \\
5 & 64.2\% \textit{(0.03)} & 49.59\% \textit{(0.14)} & 78.89\% \textit{(0.12)} & 57.0\% \textit{(0.09)} \\
7 & 71.61\% \textit{(0.03)} & 62.95\% \textit{(0.07)} & 80.33\% \textit{(0.07)} & 68.85\% \textit{(0.04)} \\
10 & 80.27\% \textit{(0.03)} & 66.67\% \textit{(0.07)} & 93.96\% \textit{(0.04)} & 77.03\% \textit{(0.04)} \\
15 & 90.14\% \textit{(0.02)} & 80.6\% \textit{(0.04)} & 99.74\% \textit{(0.0)} & 89.09\% \textit{(0.02)} \\
nothing & 95.09\% \textit{(0.01)} & 90.2\% \textit{(0.02)} & 100.0\% \textit{(0.0)} & 94.84\% \textit{(0.01)} \\
----------------------------------------------------------------
3 & 56.78\%  & 51.68\%  & 61.76\%  & 54.15\% & [339 257; 317 415] \\
5 & 62.42\%  & 42.84\%  & 81.55\%  & 52.97\% & [281 124; 375 548] \\
7 & 70.11\%  & 73.63\%  & 66.67\%  & 70.87\% & [483 224; 173 448] \\
10 & 79.89\%  & 60.67\%  & 98.66\%  & 74.88\% & [398 9; 258 663] \\
15 & 89

### SVM

In [None]:
include("functions.jl")
svmParameters = Dict("modelType" => :SVM, "C" => 1, "kernel" => "linear", "degree" => 3, "gamma" => "scale")

svms = [
    ("rbf", 0.1),
    ("rbf", 1.0),
    ("rbf", 10.0),
    ("poly", 0.1),
    ("poly", 1.0),
    ("linear", 0.1),
    ("linear", 1.0),
    ("linear", 10.0),
]

for (kernel, C) in svms
    svmParameters["kernel"] = kernel
    svmParameters["C"] = C
    metricsCV = (modelCrossValidation(svmParameters["modelType"], svmParameters, train_input, train_balanced_binary_output, crossValidationIndexes))
    metricsCV["topology"] = kernel * " & " * string(C)

    generate_latex_table(metricsCV, false)

end

println("----------------------------------------------------------------")

for (kernel, C) in svms
    svmParameters["kernel"] = kernel
    svmParameters["C"] = C
    metrics = createAndTrainFinalModel(svmParameters["modelType"], svmParameters, train_input, train_balanced_binary_output, test_input, test_balanced_binary_output)
    metrics["topology"] = kernel * " & " * string(C)

    generate_latex_table(metrics, true)

end


rbf & 0.1 & 56.87\% \textit{(0.02)} & 64.72\% \textit{(0.02)} & 48.98\% \textit{(0.04)} & 60.09\% \textit{(0.01)} \\
rbf & 1.0 & 64.1\% \textit{(0.03)} & 59.05\% \textit{(0.04)} & 69.18\% \textit{(0.04)} & 62.22\% \textit{(0.03)} \\
rbf & 10.0 & 72.23\% \textit{(0.02)} & 66.22\% \textit{(0.03)} & 78.28\% \textit{(0.05)} & 70.53\% \textit{(0.02)} \\
poly & 0.1 & 56.64\% \textit{(0.02)} & 92.42\% \textit{(0.02)} & 20.65\% \textit{(0.03)} & 68.14\% \textit{(0.01)} \\
poly & 1.0 & 62.54\% \textit{(0.01)} & 76.54\% \textit{(0.02)} & 48.45\% \textit{(0.04)} & 67.2\% \textit{(0.01)} \\
linear & 0.1 & 57.29\% \textit{(0.02)} & 64.53\% \textit{(0.03)} & 50.0\% \textit{(0.04)} & 60.24\% \textit{(0.02)} \\
linear & 1.0 & 58.0\% \textit{(0.02)} & 64.49\% \textit{(0.02)} & 51.47\% \textit{(0.05)} & 60.64\% \textit{(0.02)} \\
linear & 10.0 & 58.23\% \textit{(0.02)} & 64.72\% \textit{(0.03)} & 51.7\% \textit{(0.05)} & 60.84\% \textit{(0.02)} \\
--------------------------------------------------------

### ANN

In [18]:
include("functions.jl")

topologies = [[20], [40], [80], [100], [60, 120], [80, 50], [80, 100], [100, 40]]
annParameters = Dict("modelType" => :ANN, "maxEpochs" => 200,
    "learningRate" => 0.01, "maxEpochsVal" => 30,
    "repetitions" => 30, "validationRatio" => 0.1,
    "transferFunctions" => fill(σ, 2))

for topology in topologies
    annParameters["topology"] = topology
    metricsCV = modelCrossValidation(annParameters["modelType"], annParameters, train_input, train_balanced_binary_output, crossValidationIndexes)
    metricsCV["topology"] = topology 

    generate_latex_table(metricsCV, false)
end

for topology in topologies
    annParameters["topology"] = topology
    metrics = createAndTrainFinalModel(annParameters["modelType"], annParameters, train_input, train_balanced_binary_output, test_input, test_balanced_binary_output)
    metrics["topology"] = topology 

    generate_latex_table(metrics, true)
end

[20] & 60.11\% \textit{(0.02)} & 59.0\% \textit{(0.03)} & 61.23\% \textit{(0.02)} & 59.34\% \textit{(0.02)} \\
[40] & 57.89\% \textit{(0.01)} & 58.06\% \textit{(0.01)} & 57.71\% \textit{(0.04)} & 56.52\% \textit{(0.01)} \\


In [103]:
metrics = createAndTrainFinalModel(annParameters["modelType"], annParameters, train_input, train_balanced_binary_output, test_input, test_balanced_binary_output)

LoadError: AssertionError: size(trainingInputs, 1) == length(trainingTargets)

### Ensembles

In [115]:
include("functions.jl")

dtParameters = Dict("modelType" => :DecisionTree, "maxDepth" => typemax(Int))
knnParameters = Dict("modelType" => :kNN, "numNeighboors" => 3)
svmParameters = Dict("modelType" => :SVM, "kernel" => "rbf", "C" => 10)
Random.seed!(42)

ensemble_types = [:VotingHard, :Stacking]
final_estimators = [dtParameters, knnParameters, svmParameters]

for ensemble_type in ensemble_types
    for final_estimator in final_estimators
        metricsCV = trainClassEnsemble([:DecisionTree, :kNN, :SVM], [dtParameters, knnParameters, svmParameters], (train_input, train_balanced_binary_output), crossValidationIndexes; ensembleType = ensemble_type, final_estimator = final_estimator)
        metricsCV["topology"] = final_estimator
        generate_latex_table(metricsCV, false)

        if ensemble_type == :VotingHard
            break
        end

    end
end


for ensemble_type in ensemble_types
    for final_estimator in final_estimators
        metrics = createAndTrainFinalEnsemble([:DecisionTree, :kNN, :SVM], [dtParameters, knnParameters, svmParameters], (train_input, train_balanced_binary_output), (test_input, test_balanced_binary_output); ensembleType = ensemble_type, final_estimator = final_estimator)
        metrics["topology"] = final_estimator
        generate_latex_table(metrics, true)

        if ensemble_type == :VotingHard
            break
        end
    end
end


Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 9223372036854775807) & 92.32\% \textit{(0.01)} & 84.69\% \textit{(0.01)} & 100.0\% \textit{(0.0)} & 91.7\% \textit{(0.01)} \\
Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 9223372036854775807) & 95.9\% \textit{(0.01)} & 97.82\% \textit{(0.01)} & 93.96\% \textit{(0.03)} & 96.0\% \textit{(0.01)} \\
Dict{String, Any}("modelType" => :kNN, "numNeighboors" => 3) & 98.19\% \textit{(0.01)} & 97.75\% \textit{(0.01)} & 98.64\% \textit{(0.01)} & 98.19\% \textit{(0.01)} \\
Dict{String, Any}("modelType" => :SVM, "C" => 10, "kernel" => "rbf") & 98.87\% \textit{(0.0)} & 97.75\% \textit{(0.01)} & 100.0\% \textit{(0.0)} & 98.86\% \textit{(0.0)} \\
Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 9223372036854775807) & 93.83\%  & 87.5\%  & 100.0\%  & 93.33\% & [574 0; 82 672] \\
Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 9223372036854775807) & 94.28\%  & 97.87\%  & 90.77\%  & 94.41\% & [642 62; 

## Third approach: multiclass classification

In [39]:
using MLDataPattern;
Random.seed!(42)
X_bal, y_bal = oversample((input_data', output_data), shuffle = true)
X_bal = getobs(X_bal)'
y_bal = getobs(y_bal)

5900-element PooledArrays.PooledVector{String31, UInt32, Vector{UInt32}}:
 "Benign"
 "Android_Adware"
 "Android_Scareware"
 "Android_Scareware"
 "Android_SMS_Malware"
 "Benign"
 "Android_Adware"
 "Android_SMS_Malware"
 "Android_SMS_Malware"
 "Android_Adware"
 "Android_Scareware"
 "Benign"
 "Benign"
 ⋮
 "Benign"
 "Benign"
 "Android_Adware"
 "Android_Adware"
 "Android_SMS_Malware"
 "Android_SMS_Malware"
 "Android_Scareware"
 "Android_Scareware"
 "Android_Scareware"
 "Android_SMS_Malware"
 "Android_Adware"
 "Android_Adware"

In [40]:
countmap(y_bal)

Dict{String31, Int64} with 4 entries:
  "Benign"              => 1475
  "Android_Scareware"   => 1475
  "Android_SMS_Malware" => 1475
  "Android_Adware"      => 1475

In [41]:
Random.seed!(42)

train_indexes, test_indexes = holdOut(size(X_bal, 1), 0.2)

train_input = convert(Array{Float32, 2}, X_bal[train_indexes, :])
train_balanced_output = y_bal[train_indexes]

normalizationParameters = calculateMinMaxNormalizationParameters(train_input)

normalizeMinMax!(train_input, normalizationParameters)

test_input = convert(Array{Float32, 2}, X_bal[test_indexes, :])
test_balanced_output = y_bal[test_indexes]

normalizeMinMax!(test_input, normalizationParameters)

@assert size(test_input, 1) == size(test_balanced_output, 1)
@assert size(train_input, 1) == size(train_balanced_output, 1)

In [42]:
Random.seed!(42)

kFolds = 10
crossValidationIndexes = crossvalidation(train_balanced_output, kFolds);

### kNN

In [17]:
include("functions.jl")
knnParameters = Dict("modelType" => :kNN, "numNeighboors" => 0)

ks = [3 , 5, 7, 10, 15, 20]
for k in ks
    knnParameters["numNeighboors"] = k
    metricsCV = (modelCrossValidation(knnParameters["modelType"], knnParameters, train_input, train_balanced_output, crossValidationIndexes))
    metricsCV["topology"] = k

    generate_latex_table(metricsCV, false)
end

println("----------------------------------------------------------------")
for k in ks
    knnParameters["numNeighboors"] = k
    metrics = createAndTrainFinalModel(knnParameters["modelType"], knnParameters, train_input, train_balanced_output, test_input, test_balanced_output)
    metrics["topology"] = k

    generate_latex_table(metrics, true)
end

3 & 56.4\% \textit{(0.02)} & 56.4\% \textit{(0.02)} & 85.44\% \textit{(0.01)} & 54.53\% \textit{(0.02)} \\
5 & 51.44\% \textit{(0.02)} & 51.44\% \textit{(0.02)} & 83.8\% \textit{(0.01)} & 49.21\% \textit{(0.02)} \\
7 & 47.99\% \textit{(0.03)} & 47.99\% \textit{(0.03)} & 82.65\% \textit{(0.01)} & 46.02\% \textit{(0.03)} \\
10 & 42.48\% \textit{(0.02)} & 42.48\% \textit{(0.02)} & 80.82\% \textit{(0.01)} & 41.2\% \textit{(0.02)} \\
15 & 38.64\% \textit{(0.01)} & 38.64\% \textit{(0.01)} & 79.54\% \textit{(0.0)} & 37.91\% \textit{(0.01)} \\
20 & 38.41\% \textit{(0.02)} & 38.41\% \textit{(0.02)} & 79.47\% \textit{(0.01)} & 37.77\% \textit{(0.02)} \\
----------------------------------------------------------------
3 & 58.47\%  & 58.47\%  & 86.23\%  & 56.74\% & [121 78 64 42; 70 120 55 38; 35 78 152 30; 0 0 0 297] \\
5 & 53.81\%  & 53.81\%  & 84.65\%  & 51.37\% & [85 71 85 64; 55 117 62 49; 38 61 152 44; 4 8 4 281] \\
7 & 51.1\%  & 51.1\%  & 83.74\%  & 48.84\% & [88 64 68 85; 50 115 54 64; 38 

### Decision Tree

In [18]:
include("functions.jl")
dtParameters = Dict("modelType" => :DecisionTree, "maxDepth" => 1)

depths = [3, 5, 7, 10, 15, typemax(Int)]
for depth in depths
    dtParameters["maxDepth"] = depth
    metricsCV = (modelCrossValidation(dtParameters["modelType"], dtParameters, train_input, train_balanced_output, crossValidationIndexes))
    metricsCV["topology"] = depth

    generate_latex_table(metricsCV, false)

end

println("----------------------------------------------------------------")

for depth in depths
    dtParameters["maxDepth"] = depth
    metrics = createAndTrainFinalModel(dtParameters["modelType"], dtParameters, train_input, train_balanced_output, test_input, test_balanced_output)
    metrics["topology"] = depth

    generate_latex_table(metrics, true)

end


3 & 31.44\% \textit{(0.02)} & 31.44\% \textit{(0.02)} & 77.11\% \textit{(0.01)} & 28.49\% \textit{(0.03)} \\
5 & 34.75\% \textit{(0.02)} & 34.75\% \textit{(0.02)} & 78.23\% \textit{(0.01)} & 32.44\% \textit{(0.02)} \\
7 & 39.05\% \textit{(0.03)} & 39.05\% \textit{(0.03)} & 79.65\% \textit{(0.01)} & 37.49\% \textit{(0.03)} \\
10 & 48.71\% \textit{(0.02)} & 48.71\% \textit{(0.02)} & 82.87\% \textit{(0.01)} & 48.08\% \textit{(0.03)} \\
15 & 62.16\% \textit{(0.04)} & 62.16\% \textit{(0.04)} & 87.38\% \textit{(0.01)} & 61.43\% \textit{(0.03)} \\
9223372036854775807 & 69.98\% \textit{(0.02)} & 69.98\% \textit{(0.02)} & 89.99\% \textit{(0.01)} & 68.65\% \textit{(0.02)} \\
----------------------------------------------------------------
3 & 32.29\%  & 32.29\%  & 77.45\%  & 30.71\% & [39 68 77 121; 54 83 45 101; 34 51 100 110; 36 58 44 159] \\
5 & 33.05\%  & 33.05\%  & 77.33\%  & 30.8\% & [95 23 69 118; 108 26 43 106; 70 18 101 106; 84 7 38 168] \\
7 & 40.0\%  & 40.0\%  & 79.72\%  & 39.11\% & [

### SVM

In [19]:
include("functions.jl")
svmParameters = Dict("modelType" => :SVM, "C" => 1, "kernel" => "linear", "degree" => 3, "gamma" => "scale")

svms = [
    ("rbf", 0.1),
    ("rbf", 1.0),
    ("rbf", 10.0),
    ("poly", 0.1),
    ("poly", 1.0),
    ("linear", 0.1),
    ("linear", 1.0),
    ("linear", 10.0),
]

for (kernel, C) in svms
    svmParameters["kernel"] = kernel
    svmParameters["C"] = C
    metricsCV = (modelCrossValidation(svmParameters["modelType"], svmParameters, train_input, train_balanced_output, crossValidationIndexes))
    metricsCV["topology"] = kernel * " & " * string(C)

    generate_latex_table(metricsCV, false)

end

println("----------------------------------------------------------------")

for (kernel, C) in svms
    svmParameters["kernel"] = kernel
    svmParameters["C"] = C
    metrics = createAndTrainFinalModel(svmParameters["modelType"], svmParameters, train_input, train_balanced_output, test_input, test_balanced_output)
    metrics["topology"] = kernel * " & " * string(C)

    generate_latex_table(metrics, true)

end


rbf & 0.1 & 30.93\% \textit{(0.02)} & 30.93\% \textit{(0.02)} & 76.86\% \textit{(0.01)} & 27.12\% \textit{(0.01)} \\
rbf & 1.0 & 33.07\% \textit{(0.02)} & 33.07\% \textit{(0.02)} & 77.6\% \textit{(0.01)} & 30.79\% \textit{(0.02)} \\
rbf & 10.0 & 37.9\% \textit{(0.02)} & 37.9\% \textit{(0.02)} & 79.26\% \textit{(0.01)} & 37.06\% \textit{(0.02)} \\
poly & 0.1 & 31.04\% \textit{(0.01)} & 31.04\% \textit{(0.01)} & 76.84\% \textit{(0.0)} & 27.22\% \textit{(0.01)} \\
poly & 1.0 & 34.87\% \textit{(0.02)} & 34.87\% \textit{(0.02)} & 78.19\% \textit{(0.01)} & 32.8\% \textit{(0.02)} \\
linear & 0.1 & 30.87\% \textit{(0.01)} & 30.87\% \textit{(0.01)} & 76.84\% \textit{(0.0)} & 27.77\% \textit{(0.01)} \\
linear & 1.0 & 31.67\% \textit{(0.01)} & 31.67\% \textit{(0.01)} & 77.12\% \textit{(0.0)} & 28.94\% \textit{(0.01)} \\
linear & 10.0 & 31.93\% \textit{(0.02)} & 31.93\% \textit{(0.02)} & 77.21\% \textit{(0.01)} & 30.06\% \textit{(0.02)} \\
----------------------------------------------------------

### ANN

In [81]:
include("functions.jl")

topologies = [[20], [40], [80], [100], [60, 120], [80, 50], [80, 100], [100, 40]]
annParameters = Dict("modelType" => :ANN, "maxEpochs" => 200,
    "learningRate" => 0.01, "maxEpochsVal" => 30,
    "repetitions" => 30, "validationRatio" => 0.1,
    "transferFunctions" => fill(σ, 2))

for topology in topologies
    annParameters["topology"] = topology
    metricsCV = modelCrossValidation(annParameters["modelType"], annParameters, train_input, train_balanced_output, crossValidationIndexes)
    metricsCV["topology"] = topology 

    generate_latex_table(metricsCV, false)
end

for topology in topologies
    annParameters["topology"] = topology
    metrics = createAndTrainFinalModel(annParameters["modelType"], annParameters, train_input, train_balanced_output, test_input, test_balanced_output)
    metrics["topology"] = topology 

    generate_latex_table(metrics, true)
end

[20] & 74.96\% \textit{(0.0)} & 0.35\% \textit{(0.0)} & 99.84\% \textit{(0.0)} & 0.69\% \textit{(0.0)} \\
[40] & 74.98\% \textit{(0.0)} & 0.5\% \textit{(0.0)} & 99.81\% \textit{(0.0)} & 0.98\% \textit{(0.0)} \\
[80] & 74.96\% \textit{(0.0)} & 0.53\% \textit{(0.0)} & 99.77\% \textit{(0.0)} & 1.04\% \textit{(0.0)} \\
[100] & 74.95\% \textit{(0.0)} & 0.59\% \textit{(0.0)} & 99.73\% \textit{(0.0)} & 1.15\% \textit{(0.0)} \\
[60, 120] & 74.94\% \textit{(0.0)} & 0.61\% \textit{(0.0)} & 99.71\% \textit{(0.0)} & 1.17\% \textit{(0.0)} \\
[80, 50] & 74.98\% \textit{(0.0)} & 0.15\% \textit{(0.0)} & 99.92\% \textit{(0.0)} & 0.3\% \textit{(0.0)} \\
[80, 100] & 74.93\% \textit{(0.0)} & 0.59\% \textit{(0.0)} & 99.71\% \textit{(0.0)} & 1.11\% \textit{(0.0)} \\
[100, 40] & 74.99\% \textit{(0.0)} & 0.15\% \textit{(0.0)} & 99.93\% \textit{(0.0)} & 0.3\% \textit{(0.0)} \\
[20] & 74.85\%  & 0.42\%  & 99.66\%  & 0.84\% & [5 12; 1175 3528] \\
[40] & 74.72\%  & 0.17\%  & 99.58\%  & 0.33\% & [2 15; 1178 3525] 

In [43]:
include("functions.jl")

annParameters = Dict("modelType" => :ANN, "maxEpochs" => 1000,
    "learningRate" => 0.01, "maxEpochsVal" => 30,
    "repetitions" => 30, "validationRatio" => 0.1,
    "transferFunctions" => fill(σ, 2), "topology" => [100, 40])
metrics = createAndTrainFinalModel(annParameters["modelType"], annParameters, train_input, train_balanced_output, test_input, test_balanced_output)

(1180, 4)
(4720, 4)
(4248, 4)(1180, 4)
(1180, 4)


Dict{String, Any} with 8 entries:
  "errorRate"                 => 0.749153
  "f1_score"                  => 0.169678
  "specificity"               => 0.749049
  "negative_predictive_value" => 0.747199
  "confusion_matrix"          => [0 4 113 188; 0 0 85 198; 0 1 133 161; 0 6 128…
  "accuracy"                  => 0.250847
  "recall"                    => 0.250847
  "precision"                 => 0.130224

In [88]:
test_balanced_output

1180-element PooledArrays.PooledVector{String31, UInt32, Vector{UInt32}}:
 "Android_Scareware"
 "Android_Scareware"
 "Android_Adware"
 "Android_Adware"
 "Android_SMS_Malware"
 "Android_Scareware"
 "Benign"
 "Android_Scareware"
 "Android_Adware"
 "Benign"
 "Android_Adware"
 "Android_SMS_Malware"
 "Benign"
 ⋮
 "Android_SMS_Malware"
 "Android_SMS_Malware"
 "Android_Adware"
 "Android_SMS_Malware"
 "Android_Adware"
 "Android_Scareware"
 "Android_Scareware"
 "Android_Adware"
 "Android_Scareware"
 "Benign"
 "Android_Scareware"
 "Android_Scareware"

### Ensembles

In [61]:
include("functions.jl")

dtParameters = Dict("modelType" => :DecisionTree, "maxDepth" => typemax(Int))
knnParameters = Dict("modelType" => :kNN, "numNeighboors" => 3)
svmParameters = Dict("modelType" => :SVM, "kernel" => "rbf", "C" => 10)
Random.seed!(42)

ensemble_types = [:VotingHard, :Stacking]
final_estimators = [dtParameters, knnParameters, svmParameters]

for ensemble_type in ensemble_types
    for final_estimator in final_estimators
        metricsCV = trainClassEnsemble([:DecisionTree, :kNN, :SVM], [dtParameters, knnParameters, svmParameters], (train_input, train_balanced_output), crossValidationIndexes; ensembleType = ensemble_type, final_estimator = final_estimator)
        metricsCV["topology"] = final_estimator
        generate_latex_table(metricsCV, false)

        if ensemble_type == :VotingHard
            break
        end

    end
end


for ensemble_type in ensemble_types
    for final_estimator in final_estimators
        metrics = createAndTrainFinalEnsemble([:DecisionTree, :kNN, :SVM], [dtParameters, knnParameters, svmParameters], (train_input, train_balanced_output), (test_input, test_balanced_output); ensembleType = ensemble_type, final_estimator = final_estimator)
        metrics["topology"] = final_estimator
        generate_latex_table(metrics, true)

        if ensemble_type == :VotingHard
            break
        end
    end
end


Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 9223372036854775807) & 61.82\% \textit{(0.02)} & 61.82\% \textit{(0.02)} & 87.22\% \textit{(0.01)} & 60.26\% \textit{(0.02)} \\
Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 9223372036854775807) & 60.13\% \textit{(0.01)} & 60.13\% \textit{(0.01)} & 86.7\% \textit{(0.0)} & 60.24\% \textit{(0.01)} \\
Dict{String, Any}("modelType" => :kNN, "numNeighboors" => 3) & 63.86\% \textit{(0.02)} & 63.86\% \textit{(0.02)} & 87.92\% \textit{(0.01)} & 63.27\% \textit{(0.02)} \\
Dict{String, Any}("modelType" => :SVM, "C" => 10, "kernel" => "rbf") & 70.83\% \textit{(0.02)} & 70.83\% \textit{(0.02)} & 90.28\% \textit{(0.01)} & 69.92\% \textit{(0.02)} \\
Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 9223372036854775807) & 65.25\%  & 65.25\%  & 88.6\%  & 63.87\% & [113 108 54 30; 52 174 39 18; 8 77 186 24; 0 0 0 297] \\
Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 9223372036854775807) & 64.32\%  

In [62]:
countmap(test_balanced_output)

Dict{String31, Int64} with 4 entries:
  "Benign"              => 297
  "Android_Scareware"   => 305
  "Android_SMS_Malware" => 295
  "Android_Adware"      => 283

In [64]:
@sk_import decomposition:PCA


PyObject <class 'sklearn.decomposition._pca.PCA'>

In [76]:

# pcas = 1:20:4
pca_value = 10
pca = PCA(pca_value)

#Ajust the matrix acording to the train data
fit!(pca, train_input)

#Once it is ajusted it can be used to transform the data
pca_train = pca.transform(train_input)
pca_test = pca.transform(test_input)

@assert (size(train_input)[1],pca_value) == size(pca_train)
@assert (size(test_input)[1],pca_value) == size(pca_test)

In [74]:
size(train_input)[1],2

(4720, 2)

In [80]:
# pcas = [2, 6, 10, 15, 20, 25, 30]

include("functions.jl")
dtParameters = Dict("modelType" => :DecisionTree, "maxDepth" => 1)

depths = [3, 5, 7, 10, 15, typemax(Int)]

pca_value = 5
pca = PCA(pca_value)

#Ajust the matrix acording to the train data
fit!(pca, train_input)

#Once it is ajusted it can be used to transform the data
pca_train = pca.transform(train_input)
pca_test = pca.transform(test_input)

@assert (size(train_input)[1],pca_value) == size(pca_train)
@assert (size(test_input)[1],pca_value) == size(pca_test)


for depth in depths
    dtParameters["maxDepth"] = depth
    metricsCV = (modelCrossValidation(dtParameters["modelType"], dtParameters, pca_train, train_balanced_output, crossValidationIndexes))
    metricsCV["topology"] = depth

    generate_latex_table(metricsCV, false)

end

println("----------------------------------------------------------------")

for depth in depths
    dtParameters["maxDepth"] = depth
    metrics = createAndTrainFinalModel(dtParameters["modelType"], dtParameters, pca_train, train_balanced_output, pca_test, test_balanced_output)
    metrics["topology"] = depth

    generate_latex_table(metrics, true)

end


3 & 28.92\% \textit{(0.02)} & 28.92\% \textit{(0.02)} & 76.25\% \textit{(0.01)} & 23.94\% \textit{(0.02)} \\
5 & 33.41\% \textit{(0.02)} & 33.41\% \textit{(0.02)} & 77.77\% \textit{(0.01)} & 30.01\% \textit{(0.03)} \\
7 & 36.67\% \textit{(0.02)} & 36.67\% \textit{(0.02)} & 78.86\% \textit{(0.01)} & 33.65\% \textit{(0.02)} \\
10 & 44.36\% \textit{(0.02)} & 44.36\% \textit{(0.02)} & 81.45\% \textit{(0.01)} & 42.37\% \textit{(0.02)} \\
15 & 55.23\% \textit{(0.03)} & 55.23\% \textit{(0.03)} & 85.07\% \textit{(0.01)} & 53.35\% \textit{(0.04)} \\
9223372036854775807 & 69.24\% \textit{(0.02)} & 69.24\% \textit{(0.02)} & 89.75\% \textit{(0.01)} & 67.69\% \textit{(0.02)} \\
----------------------------------------------------------------
3 & 28.39\%  & 28.39\%  & 76.29\%  & 24.85\% & [13 70 75 147; 8 102 52 121; 7 70 58 160; 8 55 72 162] \\
5 & 30.68\%  & 30.68\%  & 77.11\%  & 27.84\% & [17 93 103 92; 20 104 73 86; 11 88 90 106; 11 47 88 151] \\
7 & 35.59\%  & 35.59\%  & 78.65\%  & 32.17\% & [2