In [1]:
using CSV
using DataFrames
using Random

include("functions.jl");


In [2]:
using ScikitLearn

@sk_import svm:SVC;
@sk_import tree:DecisionTreeClassifier;
@sk_import ensemble:VotingClassifier
@sk_import neighbors: KNeighborsClassifier;


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mmkl not found, proceeding to installing non-mkl versions of sci-kit learn via Conda
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `conda install -y -c conda-forge 'scikit-learn>=1.2,<1.3'` in root environment


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.





  current version: 4.12.0
  latest version: 23.11.0

Please update conda by running

    $ conda update -n base conda




## Read Data

In [3]:
file_path = "dataset/super_simplified_Android_Malware.csv"

data = CSV.File(file_path, header=true) |> DataFrame;

In [4]:
describe(data, :all)

Row,variable,mean,std,min,q25,median,q75,max,sum,nunique,nuniqueall,nmissing,nnonmissing,first,last,eltype
Unnamed: 0_level_1,Symbol,Union…,Union…,Any,Union…,Union…,Union…,Any,Union…,Union…,Int64,Int64,Int64,Any,Any,DataType
1,Flow ID,,,1.31.173.21-10.42.0.151-80-36854-6,,,,8.8.8.8-10.42.0.211-53-3181-17,,3502,3502,0,3557,157.240.0.36-10.42.0.211-443-55364-6,10.42.0.211-10.42.0.1-3890-53-17,String
2,Source IP,,,0.0.0.0,,,,96.6.164.184,,246,246,0,3557,10.42.0.211,10.42.0.211,String15
3,Source Port,39310.9,17860.1,0,34948.0,43520.0,52110.0,65400,139828814,,3013,0,3557,55364,3890,Int64
4,Destination IP,,,1.31.173.21,,,,98.139.225.43,,826,826,0,3557,157.240.0.36,10.42.0.1,String15
5,Destination Port,5390.09,14590.5,0.0,80.0,443.0,443.0,60729.0,1.91725e7,,400,0,3557,443.0,53.0,Float64
6,Protocol,8.26061,4.51586,0.0,6.0,6.0,6.0,17.0,29383.0,,3,0,3557,6.0,17.0,Float64
7,Timestamp,,,04/07/2017 10:08:16,,,,30/06/2017 12:59:10,,3332,3332,0,3557,26/06/2017 12:43:14,13/06/2017 08:39:01,String31
8,Flow Duration,1.11761e7,2.19812e7,2,48777.0,557126.0,1.08321e7,119977227,39753215647,,3442,0,3557,65319091,48681,Int64
9,Total Fwd Packets,7.5932,59.0075,1,1.0,2.0,5.0,3246,27009,,97,0,3557,9,1,Int64
10,Total Backward Packets,11.0748,159.416,0,0.0,1.0,4.0,8452,39393,,117,0,3557,9,1,Int64


In [5]:
import StatsBase: countmap

columns_to_drop = ["Flow ID", " Timestamp"]
columns = names(data)

println("Size of dataframe before dropping columns $(size(data))")
for column in 1:size(data, 2)
    unique_values = countmap(data[:, column])

    if length(unique_values) == 1
        println("Adding column $(columns[column])")
        # println(unique_values)
        push!(columns_to_drop, columns[column])
    end
    
end

select!(data, Not(columns_to_drop))

println("Size of dataframe after dropping columns $(size(data))")

dropmissing!(data)

println("Size of dataframe after dropping nulls $(size(data))")

unique_data = unique(data)

println("Size of dataframe after dropping duplicating rows $(size(data))")

Size of dataframe before dropping columns (3557, 85)
Adding column  Bwd PSH Flags
Adding column  Fwd URG Flags
Adding column  Bwd URG Flags
Adding column  RST Flag Count
Adding column  CWE Flag Count
Adding column  ECE Flag Count
Adding column Fwd Avg Bytes/Bulk
Adding column  Fwd Avg Packets/Bulk
Adding column  Fwd Avg Bulk Rate
Adding column  Bwd Avg Bytes/Bulk
Adding column  Bwd Avg Packets/Bulk
Adding column Bwd Avg Bulk Rate
Size of dataframe after dropping columns (3557, 71)
Size of dataframe after dropping nulls (3557, 71)
Size of dataframe after dropping duplicating rows (3557, 71)


In [6]:
countmap(data[:, :Label])

Dict{String31, Int64} with 4 entries:
  "Benign"              => 237
  "Android_Scareware"   => 1171
  "Android_SMS_Malware" => 674
  "Android_Adware"      => 1475

In [7]:
function ip_to_decimal(ip)
    # Split the IP address into octets
    octets = split(ip, '.')
    # Convert each octet to binary and combine them into a single 32-bit number
    binary = join([string(parse(Int, octet, base=10), base=2, pad=8) for octet in octets])
    decimal = parse(Int, binary, base=2) # Convert binary to decimal
    return decimal
end

source_ips = data[!, :" Source IP"];
destination_ips = data[!, :" Destination IP"];

data[!, :"Source IP Decimal"] = map(ip -> ip_to_decimal(ip), source_ips);
data[!, :"Destination IP Decimal"] = map(ip -> ip_to_decimal(ip), destination_ips);

select!(data, Not([" Source IP", " Destination IP"]));

In [8]:
describe(data)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,Source Port,39310.9,0,43520.0,65400,0,Int64
2,Destination Port,5390.09,0.0,443.0,60729.0,0,Float64
3,Protocol,8.26061,0.0,6.0,17.0,0,Float64
4,Flow Duration,1.11761e7,2,557126.0,119977227,0,Int64
5,Total Fwd Packets,7.5932,1,2.0,3246,0,Int64
6,Total Backward Packets,11.0748,0,1.0,8452,0,Int64
7,Total Length of Fwd Packets,665.05,0.0,31.0,110678.0,0,Float64
8,Total Length of Bwd Packets,12084.5,0.0,17.0,1.22247e7,0,Float64
9,Fwd Packet Length Max,218.112,0.0,31.0,1460.0,0,Float64
10,Fwd Packet Length Min,11.5516,0.0,0.0,1460.0,0,Float64


In [9]:
output_data = data[!, :Label];
select!(data, Not(:Label))
input_data = Matrix(data[!, 1:size(data, 2)]);

## First approach: binary classification

In [10]:
function transform_binary_class(output_data)
    binary_labels = output_data .!= "Benign"
    return binary_labels
end

binary_labels = transform_binary_class(output_data)
@assert binary_labels isa BitVector
@assert input_data isa Matrix

### Preprocessing

In [12]:
Random.seed!(42)

train_indexes, test_indexes = holdOut(size(input_data, 1), 0.2)

train_input = convert(Array{Float32, 2}, input_data[train_indexes, :])
train_binary_output = binary_labels[train_indexes]

normalizationParameters = calculateMinMaxNormalizationParameters(train_input)

normalizeMinMax!(train_input, normalizationParameters)

test_input = convert(Array{Float32, 2}, input_data[test_indexes, :])
test_binary_output = binary_labels[test_indexes]

normalizeMinMax!(test_input, normalizationParameters)

@assert size(test_input, 1) == size(test_binary_output, 1)
@assert size(train_input, 1) == size(train_binary_output, 1)

In [14]:
Random.seed!(42)

kFolds = 10
crossValidationIndexes = crossvalidation(train_binary_output, kFolds);

In [15]:
function generate_latex_table(metrics::Dict{String, <: Any}, final::Bool)
    
    topology = metrics["topology"]
    accuracy = metrics["accuracy"]
    recall = metrics["recall"]
    specificity = metrics["specificity"]
    f1_score = metrics["f1_score"]
    
    if final
        confusion_matrix = metrics["confusion_matrix"]
        println("$topology & $(round(accuracy*100, digits=2))\\%  & $(round(recall*100, digits=2))\\%  & $(round(specificity*100, digits=2))\\%  & $(round(f1_score*100, digits=2))\\% & $confusion_matrix \\\\")
    else
        std_accuracy = metrics["std_accuracy"]
        std_recall = metrics["std_recall"]
        std_specificity = metrics["std_specificity"]
        std_f1_score = metrics["std_f1_score"]
        println("$topology & $(round(accuracy*100, digits=2))\\% \\textit{($(round(std_accuracy, digits = 2)))} & $(round(recall*100, digits=2))\\% \\textit{($(round(std_recall, digits = 2)))} & $(round(specificity*100, digits=2))\\% \\textit{($(round(std_specificity, digits = 2)))} & $(round(f1_score*100, digits=2))\\% \\textit{($(round(std_f1_score, digits = 2)))} \\\\")
    end
    
end

generate_latex_table (generic function with 1 method)

### kNN

In [16]:
include("functions.jl")
knnParameters = Dict("modelType" => :kNN, "numNeighboors" => 0)

ks = [3 , 5, 7, 10, 15, 20]
for k in ks
    knnParameters["numNeighboors"] = k
    metricsCV = (modelCrossValidation(knnParameters["modelType"], knnParameters, train_input, train_binary_output, crossValidationIndexes))
    metricsCV["topology"] = k

    generate_latex_table(metricsCV, false)
end

println("----------------------------------------------------------------")
for k in ks
    knnParameters["numNeighboors"] = k
    metrics = createAndTrainFinalModel(knnParameters["modelType"], knnParameters, train_input, train_binary_output, test_input, test_binary_output)
    metrics["topology"] = k

    generate_latex_table(metrics, true)
end

3 & 92.79\% \textit{(0.01)} & 98.95\% \textit{(0.01)} & 1.14\% \textit{(0.02)} & 96.26\% \textit{(0.0)} \\
5 & 93.43\% \textit{(0.0)} & 99.7\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.6\% \textit{(0.0)} \\
7 & 93.57\% \textit{(0.0)} & 99.85\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.68\% \textit{(0.0)} \\
10 & 93.67\% \textit{(0.0)} & 99.96\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.73\% \textit{(0.0)} \\
15 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
20 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
----------------------------------------------------------------
3 & 90.03\%  & 97.86\%  & 1.72\%  & 94.74\% & [640 57; 14 1] \\
5 & 91.43\%  & 99.54\%  & 0.0\%  & 95.52\% & [651 58; 3 0] \\
7 & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \\
10 & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \\
15 & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \

### Decision Tree

In [17]:
include("functions.jl")
dtParameters = Dict("modelType" => :DecisionTree, "maxDepth" => 1)

depths = [3, 5, 7, 10, 15, typemax(Int)]
for depth in depths
    dtParameters["maxDepth"] = depth
    metricsCV = (modelCrossValidation(dtParameters["modelType"], dtParameters, train_input, train_binary_output, crossValidationIndexes))
    metricsCV["topology"] = depth

    generate_latex_table(metricsCV, false)

end

println("----------------------------------------------------------------")

for depth in depths
    dtParameters["maxDepth"] = depth
    metrics = createAndTrainFinalModel(dtParameters["modelType"], dtParameters, train_input, train_binary_output, test_input, test_binary_output)
    metrics["topology"] = depth

    generate_latex_table(metrics, true)

end


3 & 93.22\% \textit{(0.01)} & 99.47\% \textit{(0.01)} & 0.0\% \textit{(0.0)} & 96.49\% \textit{(0.0)} \\
5 & 92.9\% \textit{(0.01)} & 99.06\% \textit{(0.01)} & 1.11\% \textit{(0.02)} & 96.32\% \textit{(0.0)} \\
7 & 92.02\% \textit{(0.01)} & 98.09\% \textit{(0.01)} & 1.67\% \textit{(0.03)} & 95.84\% \textit{(0.01)} \\
10 & 90.65\% \textit{(0.01)} & 96.51\% \textit{(0.02)} & 3.37\% \textit{(0.04)} & 95.08\% \textit{(0.01)} \\
15 & 88.02\% \textit{(0.02)} & 93.66\% \textit{(0.03)} & 3.89\% \textit{(0.05)} & 93.59\% \textit{(0.01)} \\
9223372036854775807 & 86.93\% \textit{(0.02)} & 92.39\% \textit{(0.02)} & 5.59\% \textit{(0.06)} & 92.97\% \textit{(0.01)} \\
----------------------------------------------------------------
3 & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \\
5 & 91.71\%  & 99.85\%  & 0.0\%  & 95.68\% & [653 58; 1 0] \\
7 & 89.61\%  & 97.55\%  & 0.0\%  & 94.52\% & [638 58; 16 0] \\
10 & 88.06\%  & 95.72\%  & 1.72\%  & 93.64\% & [626 57; 28 1] \\
15 & 85.25\%  & 92.6

### SVM

In [14]:
include("functions.jl")
svmParameters = Dict("modelType" => :SVM, "C" => 1, "kernel" => "linear", "degree" => 3, "gamma" => "scale")

svms = [
    ("rbf", 0.1),
    ("rbf", 1.0),
    ("rbf", 10.0),
    ("poly", 0.1),
    ("poly", 1.0),
    ("linear", 0.1),
    ("linear", 1.0),
    ("linear", 10.0),
]

for (kernel, C) in svms
    svmParameters["kernel"] = kernel
    svmParameters["C"] = C
    metricsCV = (modelCrossValidation(svmParameters["modelType"], svmParameters, train_input, train_binary_output, crossValidationIndexes))
    metricsCV["topology"] = kernel * string(C)

    generate_latex_table(metricsCV, false)

end

println("----------------------------------------------------------------")

for (kernel, C) in svms
    svmParameters["kernel"] = kernel
    svmParameters["C"] = C
    metrics = createAndTrainFinalModel(svmParameters["modelType"], svmParameters, train_input, train_binary_output, test_input, test_binary_output)
    metrics["topology"] = kernel * string(C)

    generate_latex_table(metrics, true)

end


rbf0.1 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
rbf1.0 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
rbf10.0 & 93.64\% \textit{(0.0)} & 99.93\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.71\% \textit{(0.0)} \\
poly0.1 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
poly1.0 & 93.67\% \textit{(0.0)} & 99.96\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.73\% \textit{(0.0)} \\
linear0.1 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
linear1.0 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
linear10.0 & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
----------------------------------------------------------------
rbf0.1 & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58;

### ANN

In [15]:
include("functions.jl")

#topologies = [[20], [40], [80], [100]]
topologies = [[60, 120], [80, 50], [80, 100], [100, 40]]
annParameters = Dict("modelType" => :ANN, "maxEpochs" => 200,
    "learningRate" => 0.01, "maxEpochsVal" => 30,
    "repetitions" => 30, "validationRatio" => 0.1,
    "transferFunctions" => fill(σ, 2))

for topology in topologies
    annParameters["topology"] = topology
    metricsCV = modelCrossValidation(annParameters["modelType"], annParameters, train_input, train_binary_output, crossValidationIndexes)
    metricsCV["topology"] = topology 

    generate_latex_table(metricsCV, false)
end

for topology in topologies
    annParameters["topology"] = topology
    metrics = createAndTrainFinalModel(annParameters["modelType"], annParameters, train_input, train_binary_output, test_input, test_binary_output)
    metrics["topology"] = topology 

    generate_latex_table(metrics, true)
end

[60, 120] & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
[80, 50] & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
[80, 100] & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
[100, 40] & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)} \\
[60, 120] & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \\
[80, 50] & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \\
[80, 100] & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \\
[100, 40] & 91.85\%  & 100.0\%  & 0.0\%  & 95.75\% & [654 58; 0 0] \\


### Ensembles

In [19]:
@sk_import ensemble:StackingClassifier
@sk_import ensemble:VotingClassifier




PyObject <class 'sklearn.ensemble._voting.VotingClassifier'>

In [20]:
include("functions.jl")

dtParameters = Dict("modelType" => :DecisionTree, "maxDepth" => 5)
knnParameters = Dict("modelType" => :kNN, "numNeighboors" => 3)
svmParameters = Dict("modelType" => :SVM, "kernel" => "rbf", "C" => 10)
Random.seed!(42)

ensemble_types = [:VotingHard, :Stacking]
final_estimators = [dtParameters, knnParameters, svmParameters]

for ensemble_type in ensemble_types
    for final_estimator in final_estimators
        metricsCV = trainClassEnsemble([:DecisionTree, :kNN, :SVM], [dtParameters, knnParameters, svmParameters], (train_input, train_binary_output), crossValidationIndexes; ensembleType = ensemble_type, final_estimator = final_estimator)
        metricsCV["topology"] = final_estimator
        generate_latex_table(metricsCV, false)

    end
end


for ensemble_type in ensemble_types
    for final_estimator in final_estimators
        metrics = createAndTrainFinalEnsemble([:DecisionTree, :kNN, :SVM], [dtParameters, knnParameters, svmParameters], (train_input, train_binary_output), (test_input, test_binary_output); ensembleType = ensemble_type, final_estimator = final_estimator)
        metrics["topology"] = final_estimator
        generate_latex_table(metrics, true)
    end
end


Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 5) & 93.67\% \textit{(0.0)} & 99.96\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.73\% \textit{(0.0)} \\
Dict{String, Any}("modelType" => :kNN, "numNeighboors" => 3) & 93.67\% \textit{(0.0)} & 99.96\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.73\% \textit{(0.0)} \\
Dict{String, Any}("modelType" => :SVM, "C" => 10, "kernel" => "rbf") & 93.67\% \textit{(0.0)} & 99.96\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.73\% \textit{(0.0)} \\
Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 5) & 93.04\% \textit{(0.01)} & 99.21\% \textit{(0.01)} & 1.11\% \textit{(0.02)} & 96.39\% \textit{(0.01)} \\
Dict{String, Any}("modelType" => :kNN, "numNeighboors" => 3) & 92.62\% \textit{(0.01)} & 98.8\% \textit{(0.01)} & 0.56\% \textit{(0.02)} & 96.16\% \textit{(0.01)} \\
Dict{String, Any}("modelType" => :SVM, "C" => 10, "kernel" => "rbf") & 93.71\% \textit{(0.0)} & 100.0\% \textit{(0.0)} & 0.0\% \textit{(0.0)} & 96.75\% \textit{(0.0)

## Second Approach

### Data balancing

In [21]:
using MLDataPattern;
Random.seed!(42)
X_bal, y_bal = oversample((input_data', binary_labels), shuffle = true)
X_bal = getobs(X_bal)'
y_bal = getobs(y_bal)

6640-element BitVector:
 1
 1
 1
 0
 0
 0
 1
 1
 1
 1
 1
 0
 0
 ⋮
 1
 0
 0
 0
 0
 1
 1
 1
 0
 1
 1
 0

In [27]:
countmap(y_bal)

Dict{Bool, Int64} with 2 entries:
  0 => 3320
  1 => 3320

In [24]:
Random.seed!(42)

train_indexes, test_indexes = holdOut(size(X_bal, 1), 0.2)

train_input = convert(Array{Float32, 2}, X_bal[train_indexes, :])
train_balanced_binary_output = y_bal[train_indexes]

normalizationParameters = calculateMinMaxNormalizationParameters(train_input)

normalizeMinMax!(train_input, normalizationParameters)

test_input = convert(Array{Float32, 2}, X_bal[test_indexes, :])
test_balanced_binary_output = y_bal[test_indexes]

normalizeMinMax!(test_input, normalizationParameters)

@assert size(test_input, 1) == size(test_balanced_binary_output, 1)
@assert size(train_input, 1) == size(train_balanced_binary_output, 1)

In [25]:
Random.seed!(42)

kFolds = 10
crossValidationIndexes = crossvalidation(train_balanced_binary_output, kFolds);

### kNN

In [26]:
include("functions.jl")
knnParameters = Dict("modelType" => :kNN, "numNeighboors" => 0)

ks = [3 , 5, 7, 10, 15, 20]
for k in ks
    knnParameters["numNeighboors"] = k
    metricsCV = (modelCrossValidation(knnParameters["modelType"], knnParameters, train_input, train_balanced_binary_output, crossValidationIndexes))
    metricsCV["topology"] = k

    generate_latex_table(metricsCV, false)
end

println("----------------------------------------------------------------")
for k in ks
    knnParameters["numNeighboors"] = k
    metrics = createAndTrainFinalModel(knnParameters["modelType"], knnParameters, train_input, train_balanced_binary_output, test_input, test_balanced_binary_output)
    metrics["topology"] = k

    generate_latex_table(metrics, true)
end

3 & 90.98\% \textit{(0.01)} & 82.02\% \textit{(0.01)} & 100.0\% \textit{(0.0)} & 90.11\% \textit{(0.01)} \\
5 & 87.71\% \textit{(0.01)} & 75.49\% \textit{(0.01)} & 100.0\% \textit{(0.0)} & 86.03\% \textit{(0.01)} \\
7 & 84.38\% \textit{(0.01)} & 68.85\% \textit{(0.02)} & 100.0\% \textit{(0.0)} & 81.54\% \textit{(0.01)} \\
10 & 78.56\% \textit{(0.01)} & 57.24\% \textit{(0.01)} & 100.0\% \textit{(0.0)} & 72.8\% \textit{(0.01)} \\
15 & 71.95\% \textit{(0.01)} & 50.71\% \textit{(0.03)} & 93.32\% \textit{(0.02)} & 64.42\% \textit{(0.02)} \\
20 & 66.51\% \textit{(0.02)} & 48.99\% \textit{(0.02)} & 84.14\% \textit{(0.02)} & 59.46\% \textit{(0.02)} \\
----------------------------------------------------------------
3 & 91.94\%  & 83.69\%  & 100.0\%  & 91.12\% & [549 0; 107 672] \\
5 & 88.33\%  & 76.37\%  & 100.0\%  & 86.6\% & [501 0; 155 672] \\
7 & 85.17\%  & 69.97\%  & 100.0\%  & 82.33\% & [459 0; 197 672] \\
10 & 80.72\%  & 60.98\%  & 100.0\%  & 75.76\% & [400 0; 256 672] \\
15 & 74.47\%  &

### Decision Tree

In [172]:
include("functions.jl")
dtParameters = Dict("modelType" => :DecisionTree, "maxDepth" => 1)

depths = [3, 5, 7, 10, 15, nothing]
for depth in depths
    dtParameters["maxDepth"] = depth
    metricsCV = (modelCrossValidation(dtParameters["modelType"], dtParameters, train_input, train_balanced_binary_output, crossValidationIndexes))
    metricsCV["topology"] = depth

    generate_latex_table(metricsCV, false)

end

println("----------------------------------------------------------------")

for depth in depths
    dtParameters["maxDepth"] = depth
    metrics = createAndTrainFinalModel(dtParameters["modelType"], dtParameters, train_input, train_balanced_binary_output, test_input, test_balanced_binary_output)
    metrics["topology"] = depth

    generate_latex_table(metrics, true)

end


3 & 58.06\% \textit{(0.03)} & 39.6\% \textit{(0.19)} & 76.62\% \textit{(0.15)} & 46.11\% \textit{(0.14)} \\
5 & 64.25\% \textit{(0.03)} & 48.35\% \textit{(0.14)} & 80.25\% \textit{(0.13)} & 56.43\% \textit{(0.09)} \\
7 & 71.89\% \textit{(0.02)} & 58.19\% \textit{(0.09)} & 85.69\% \textit{(0.11)} & 67.25\% \textit{(0.04)} \\
10 & 80.97\% \textit{(0.02)} & 71.25\% \textit{(0.05)} & 90.75\% \textit{(0.04)} & 78.9\% \textit{(0.03)} \\
15 & 90.0\% \textit{(0.02)} & 81.05\% \textit{(0.04)} & 99.02\% \textit{(0.02)} & 89.0\% \textit{(0.02)} \\
nothing & 95.5\% \textit{(0.01)} & 91.03\% \textit{(0.02)} & 100.0\% \textit{(0.0)} & 95.3\% \textit{(0.01)} \\
----------------------------------------------------------------
3 & 56.4\%  & 16.46\%  & 95.39\%  & 27.17\% & [108 31; 548 641] \\
5 & 62.5\%  & 28.05\%  & 96.13\%  & 42.49\% & [184 26; 472 646] \\
7 & 69.95\%  & 74.85\%  & 65.18\%  & 71.11\% & [491 234; 165 438] \\
10 & 78.09\%  & 70.12\%  & 85.86\%  & 75.97\% & [460 95; 196 577] \\
15 & 88.

### SVM

In [167]:
include("functions.jl")
svmParameters = Dict("modelType" => :SVM, "C" => 1, "kernel" => "linear", "degree" => 3, "gamma" => "scale")

svms = [
    ("rbf", 0.1),
    ("rbf", 1.0),
    ("rbf", 10.0),
    ("poly", 0.1),
    ("poly", 1.0),
    ("linear", 0.1),
    ("linear", 1.0),
    ("linear", 10.0),
]

for (kernel, C) in svms
    svmParameters["kernel"] = kernel
    svmParameters["C"] = C
    metricsCV = (modelCrossValidation(svmParameters["modelType"], svmParameters, train_input, train_balanced_binary_output, crossValidationIndexes))
    metricsCV["topology"] = kernel * " & " * string(C)

    generate_latex_table(metricsCV, false)

end

println("----------------------------------------------------------------")

for (kernel, C) in svms
    svmParameters["kernel"] = kernel
    svmParameters["C"] = C
    metrics = createAndTrainFinalModel(svmParameters["modelType"], svmParameters, train_input, train_balanced_binary_output, test_input, test_balanced_binary_output)
    metrics["topology"] = kernel * " & " * string(C)

    generate_latex_table(metrics, true)

end


rbf0.1 & 55.87\% \textit{(0.02)} & 58.0\% \textit{(0.05)} & 53.74\% \textit{(0.07)} & 56.8\% \textit{(0.02)} \\
rbf1.0 & 60.54\% \textit{(0.03)} & 59.99\% \textit{(0.03)} & 61.1\% \textit{(0.03)} & 60.38\% \textit{(0.03)} \\
rbf10.0 & 66.83\% \textit{(0.03)} & 61.34\% \textit{(0.03)} & 72.35\% \textit{(0.05)} & 64.98\% \textit{(0.03)} \\
poly0.1 & 57.98\% \textit{(0.02)} & 73.65\% \textit{(0.03)} & 42.22\% \textit{(0.05)} & 63.73\% \textit{(0.02)} \\
poly1.0 & 61.56\% \textit{(0.03)} & 65.65\% \textit{(0.03)} & 57.44\% \textit{(0.05)} & 63.14\% \textit{(0.02)} \\
linear0.1 & 54.82\% \textit{(0.02)} & 61.04\% \textit{(0.04)} & 48.57\% \textit{(0.05)} & 57.49\% \textit{(0.02)} \\
linear1.0 & 56.46\% \textit{(0.03)} & 65.09\% \textit{(0.03)} & 47.77\% \textit{(0.04)} & 59.99\% \textit{(0.02)} \\
linear10.0 & 57.59\% \textit{(0.02)} & 66.78\% \textit{(0.04)} & 48.34\% \textit{(0.05)} & 61.21\% \textit{(0.02)} \\
----------------------------------------------------------------
rbf0.1 & 60.0

### ANN

In [113]:
include("functions.jl")

topologies = [[20], [40], [80], [100], [60, 120], [80, 50], [80, 100], [100, 40]]
annParameters = Dict("modelType" => :ANN, "maxEpochs" => 200,
    "learningRate" => 0.01, "maxEpochsVal" => 30,
    "repetitions" => 30, "validationRatio" => 0.1,
    "transferFunctions" => fill(σ, 2))

for topology in topologies
    annParameters["topology"] = topology
    metricsCV = modelCrossValidation(annParameters["modelType"], annParameters, train_input, train_balanced_binary_output, crossValidationIndexes)
    metricsCV["topology"] = topology 

    generate_latex_table(metricsCV, false)
end

for topology in topologies
    annParameters["topology"] = topology
    metrics = createAndTrainFinalModel(annParameters["modelType"], annParameters, train_input, train_balanced_binary_output, test_input, test_balanced_binary_output)
    metrics["topology"] = topology 

    generate_latex_table(metrics, true)
end

[20] & 52.58\% \textit{(0.01)} & 54.52\% \textit{(0.08)} & 50.63\% \textit{(0.08)} & 49.1\% \textit{(0.05)} \\
[40] & 51.29\% \textit{(0.0)} & 55.18\% \textit{(0.06)} & 47.37\% \textit{(0.05)} & 43.84\% \textit{(0.04)} \\
[80] & 50.78\% \textit{(0.0)} & 56.05\% \textit{(0.07)} & 45.48\% \textit{(0.07)} & 42.0\% \textit{(0.04)} \\
[100] & 50.54\% \textit{(0.0)} & 47.15\% \textit{(0.08)} & 53.95\% \textit{(0.07)} & 35.51\% \textit{(0.05)} \\
[60, 120] & 50.4\% \textit{(0.0)} & 49.59\% \textit{(0.08)} & 51.21\% \textit{(0.08)} & 35.54\% \textit{(0.05)} \\
[80, 50] & 50.68\% \textit{(0.0)} & 51.98\% \textit{(0.1)} & 49.37\% \textit{(0.1)} & 36.93\% \textit{(0.07)} \\
[80, 100] & 50.34\% \textit{(0.0)} & 51.99\% \textit{(0.13)} & 48.69\% \textit{(0.13)} & 36.53\% \textit{(0.09)} \\
[100, 40] & 50.68\% \textit{(0.0)} & 47.42\% \textit{(0.11)} & 53.96\% \textit{(0.11)} & 33.57\% \textit{(0.07)} \\
[20] & 51.2\%  & 88.26\%  & 15.03\%  & 64.12\% & [579 571; 77 101] \\
[40] & 49.32\%  & 98.32\% 

### Ensembles

In [115]:
include("functions.jl")

dtParameters = Dict("modelType" => :DecisionTree, "maxDepth" => typemax(Int))
knnParameters = Dict("modelType" => :kNN, "numNeighboors" => 3)
svmParameters = Dict("modelType" => :SVM, "kernel" => "rbf", "C" => 10)
Random.seed!(42)

ensemble_types = [:VotingHard, :Stacking]
final_estimators = [dtParameters, knnParameters, svmParameters]

for ensemble_type in ensemble_types
    for final_estimator in final_estimators
        metricsCV = trainClassEnsemble([:DecisionTree, :kNN, :SVM], [dtParameters, knnParameters, svmParameters], (train_input, train_balanced_binary_output), crossValidationIndexes; ensembleType = ensemble_type, final_estimator = final_estimator)
        metricsCV["topology"] = final_estimator
        generate_latex_table(metricsCV, false)

        if ensemble_type == :VotingHard
            break
        end

    end
end


for ensemble_type in ensemble_types
    for final_estimator in final_estimators
        metrics = createAndTrainFinalEnsemble([:DecisionTree, :kNN, :SVM], [dtParameters, knnParameters, svmParameters], (train_input, train_balanced_binary_output), (test_input, test_balanced_binary_output); ensembleType = ensemble_type, final_estimator = final_estimator)
        metrics["topology"] = final_estimator
        generate_latex_table(metrics, true)

        if ensemble_type == :VotingHard
            break
        end
    end
end


Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 9223372036854775807) & 92.32\% \textit{(0.01)} & 84.69\% \textit{(0.01)} & 100.0\% \textit{(0.0)} & 91.7\% \textit{(0.01)} \\
Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 9223372036854775807) & 95.9\% \textit{(0.01)} & 97.82\% \textit{(0.01)} & 93.96\% \textit{(0.03)} & 96.0\% \textit{(0.01)} \\
Dict{String, Any}("modelType" => :kNN, "numNeighboors" => 3) & 98.19\% \textit{(0.01)} & 97.75\% \textit{(0.01)} & 98.64\% \textit{(0.01)} & 98.19\% \textit{(0.01)} \\
Dict{String, Any}("modelType" => :SVM, "C" => 10, "kernel" => "rbf") & 98.87\% \textit{(0.0)} & 97.75\% \textit{(0.01)} & 100.0\% \textit{(0.0)} & 98.86\% \textit{(0.0)} \\
Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 9223372036854775807) & 93.83\%  & 87.5\%  & 100.0\%  & 93.33\% & [574 0; 82 672] \\
Dict{String, Any}("modelType" => :DecisionTree, "maxDepth" => 9223372036854775807) & 94.28\%  & 97.87\%  & 90.77\%  & 94.41\% & [642 62; 

## Third approach: multiclass classification

In [29]:
using MLDataPattern;
Random.seed!(42)
X_bal, y_bal = oversample((input_data', output_data), shuffle = true)
X_bal = getobs(X_bal)'
y_bal = getobs(y_bal)

5900-element PooledArrays.PooledVector{String31, UInt32, Vector{UInt32}}:
 "Benign"
 "Android_Adware"
 "Android_Scareware"
 "Android_Scareware"
 "Android_SMS_Malware"
 "Benign"
 "Android_Adware"
 "Android_SMS_Malware"
 "Android_SMS_Malware"
 "Android_Adware"
 "Android_Scareware"
 "Benign"
 "Benign"
 ⋮
 "Benign"
 "Benign"
 "Android_Adware"
 "Android_Adware"
 "Android_SMS_Malware"
 "Android_SMS_Malware"
 "Android_Scareware"
 "Android_Scareware"
 "Android_Scareware"
 "Android_SMS_Malware"
 "Android_Adware"
 "Android_Adware"

In [30]:
countmap(y_bal)

Dict{String31, Int64} with 4 entries:
  "Benign"              => 1475
  "Android_Scareware"   => 1475
  "Android_SMS_Malware" => 1475
  "Android_Adware"      => 1475

In [25]:
Random.seed!(42)

train_indexes, test_indexes = holdOut(size(input_data, 1), 0.2)

train_input = convert(Array{Float32, 2}, input_data[train_indexes, :])
train_output = output_data[train_indexes]

normalizationParameters = calculateMinMaxNormalizationParameters(train_input)

normalizeMinMax!(train_input, normalizationParameters)

test_input = convert(Array{Float32, 2}, input_data[test_indexes, :])
test_output = output_data[test_indexes]

normalizeMinMax!(test_input, normalizationParameters)

@assert size(test_input, 1) == size(test_output, 1)
@assert size(train_input, 1) == size(train_output, 1)

In [26]:
Random.seed!(42)

kFolds = 10
crossValidationIndexes = crossvalidation(train_output, kFolds);

In [175]:
include("functions.jl")
dtParameters = Dict("modelType" => :DecisionTree, "maxDepth" => 1)

depths = [3, 5, 7, 10, 15, typemax(Int)]
for depth in depths
    dtParameters["maxDepth"] = depth
    metricsCV = (modelCrossValidation(dtParameters["modelType"], dtParameters, train_input, train_output, crossValidationIndexes))
    metricsCV["topology"] = depth

    generate_latex_table(metricsCV, false)

end

println("----------------------------------------------------------------")

for depth in depths
    dtParameters["maxDepth"] = depth
    metrics = createAndTrainFinalModel(dtParameters["modelType"], dtParameters, train_input, train_output, test_input, test_output)
    metrics["topology"] = depth

    generate_latex_table(metrics, true)

end


LoadError: BoundsError: attempt to access 2845×70 Matrix{Float32} at index [5312-element BitVector, 1:70]

In [16]:
@sk_import decomposition:PCA


PyObject <class 'sklearn.decomposition._pca.PCA'>

In [17]:

# pcas = 1:20:4
pca = PCA(2)

#Ajust the matrix acording to the train data
fit!(pca, train_input)

#Once it is ajusted it can be used to transform the data
pca_train = pca.transform(train_input)
pca_test = pca.transform(test_input)

@assert (size(train_input)[1],2) == size(pca_train)
@assert (size(test_input)[1],2) == size(pca_test)

In [24]:
pcas = [2, 6, 10, 15, 20, 25, 30]

7-element Vector{Int64}:
  2
  6
 10
 15
 20
 25
 30