# Introduction

Here is a list of cybersecurity datasets, with a brief introduction to each.

In [13]:
library (dplyr)
library(plyr)

In [14]:
#' Build a quick per-column data quality report.
#'
#' Prints the banner "Data Quality Report" and returns one row per column of
#' `df`; the row names of the result are the column names of `df`.
#'
#' @param df A data frame.
#' @return A data frame with the columns `data_type` (the column's class;
#'   several class names are joined with "/"), `count` (non-missing values),
#'   `unique_values` (distinct values, where `NA` counts as one value) and
#'   `missing_values` (number of `NA`s).
data_report <- function(df) {
  # Class of each column. class() may return several names (e.g. for POSIXct
  # columns), so collapse them to exactly one string per column; vapply()
  # guarantees a character vector even when `df` has no columns.
  data_types <- data.frame(
    "data_type" = vapply(
      df, function(x) paste(class(x), collapse = "/"), character(1)
    )
  )

  # Number of non-missing values per column
  data_count <- data.frame("count" = colSums(!is.na(df)))

  # Number of distinct values per column (NA, if present, is counted once)
  unique_value_counts <- data.frame(
    "unique_values" = vapply(df, function(x) length(unique(x)), integer(1))
  )

  # Number of missing values per column
  missing_data_counts <- data.frame(
    "missing_values" = vapply(df, function(x) sum(is.na(x)), integer(1))
  )

  # Combine all per-column summaries side by side
  data_quality_report <- cbind(
    data_types, data_count, unique_value_counts, missing_data_counts
  )
  print('Data Quality Report')

  data_quality_report
}

                                                             
### 1. KDD Dataset

This is one of the most popular datasets for building network intrusion detectors. It contains a large number of intrusions simulated in a military network environment.

http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

In [61]:
# Load the KDD Cup 99 records. The file has no header row, so header = FALSE
# lets R assign the default column names (V1, V2, ...).
# stringsAsFactors = TRUE is set explicitly because its default became FALSE
# in R 4.0, and the is.factor() column selection in later cells needs factors.
kdd <- read.csv("kddcup.data.corrected.csv", header = FALSE,
                stringsAsFactors = TRUE)
print("Shape of Dataset:")
print(dim(kdd))

[1] "Shape of Dataset:"
[1] 4898431      42


In [62]:
str(kdd)

'data.frame':	4898431 obs. of  42 variables:
 $ V1 : int  0 0 0 0 0 0 0 0 0 0 ...
 $ V2 : Factor w/ 3 levels "icmp","tcp","udp": 2 2 2 2 2 2 2 2 2 2 ...
 $ V3 : Factor w/ 70 levels "aol","auth","bgp",..: 22 22 22 22 22 22 22 22 22 22 ...
 $ V4 : Factor w/ 11 levels "OTH","REJ","RSTO",..: 10 10 10 10 10 10 10 10 10 10 ...
 $ V5 : int  215 162 236 233 239 238 235 234 239 181 ...
 $ V6 : int  45076 4528 1228 2032 486 1282 1337 1364 1295 5450 ...
 $ V7 : int  0 0 0 0 0 0 0 0 0 0 ...
 $ V8 : int  0 0 0 0 0 0 0 0 0 0 ...
 $ V9 : int  0 0 0 0 0 0 0 0 0 0 ...
 $ V10: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V11: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V12: int  1 1 1 1 1 1 1 1 1 1 ...
 $ V13: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V14: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V15: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V16: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V17: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V18: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V19: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V20: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V21: int  0 0 0 0 0 0 0 0 0 0 ..

In [63]:
head(kdd, 5)

V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,⋯,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42
0,tcp,http,SF,215,45076,0,0,0,0,⋯,0,0,0,0.0,0,0,0,0,0,normal.
0,tcp,http,SF,162,4528,0,0,0,0,⋯,1,1,0,1.0,0,0,0,0,0,normal.
0,tcp,http,SF,236,1228,0,0,0,0,⋯,2,1,0,0.5,0,0,0,0,0,normal.
0,tcp,http,SF,233,2032,0,0,0,0,⋯,3,1,0,0.33,0,0,0,0,0,normal.
0,tcp,http,SF,239,486,0,0,0,0,⋯,4,1,0,0.25,0,0,0,0,0,normal.


In [64]:
#Call 'data_report' function for the dataset
data_report(kdd)

[1] "Data Quality Report"


Unnamed: 0,data_type,count,unique_values,missing_values
V1,integer,4898431,9883,0
V2,factor,4898431,3,0
V3,factor,4898431,70,0
V4,factor,4898431,11,0
V5,integer,4898431,7195,0
V6,integer,4898431,21493,0
V7,integer,4898431,2,0
V8,integer,4898431,3,0
V9,integer,4898431,6,0
V10,integer,4898431,30,0


In [65]:
# Keep only the KDD columns for which is.numeric() is TRUE, preview the first
# rows, and report how many such columns there are.
numeric_columns <- kdd %>% select_if(is.numeric)
numeric_columns %>% head()
sprintf("Number of numerical columns: %i", ncol(numeric_columns))

V1,V5,V6,V7,V8,V9,V10,V11,V12,V13,⋯,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41
0,215,45076,0,0,0,0,0,1,0,⋯,0,0,0,0,0.0,0,0,0,0,0
0,162,4528,0,0,0,0,0,1,0,⋯,1,1,1,0,1.0,0,0,0,0,0
0,236,1228,0,0,0,0,0,1,0,⋯,2,2,1,0,0.5,0,0,0,0,0
0,233,2032,0,0,0,0,0,1,0,⋯,3,3,1,0,0.33,0,0,0,0,0
0,239,486,0,0,0,0,0,1,0,⋯,4,4,1,0,0.25,0,0,0,0,0
0,238,1282,0,0,0,0,0,1,0,⋯,5,5,1,0,0.2,0,0,0,0,0


In [66]:
# Keep only the KDD columns stored as factors, preview the first rows, and
# report how many such columns there are.
categorical_columns <- kdd %>% select_if(is.factor)
categorical_columns %>% head()
sprintf("Number of categorical columns: %i", ncol(categorical_columns))

V2,V3,V4,V42
tcp,http,SF,normal.
tcp,http,SF,normal.
tcp,http,SF,normal.
tcp,http,SF,normal.
tcp,http,SF,normal.
tcp,http,SF,normal.


In [67]:
# Frequency of each level of every categorical column.
# The call is qualified as plyr::count because dplyr also exports a count()
# with different semantics; the string-column form used here is plyr's, and the
# bare name would silently change meaning if the packages were attached in a
# different order.
plyr::count(categorical_columns, "V2")
plyr::count(categorical_columns, "V3")
plyr::count(categorical_columns, "V4")
plyr::count(categorical_columns, "V42")

V2,freq
icmp,2833545
tcp,1870598
udp,194288


V3,freq
aol,2
auth,3382
bgp,1047
courier,1021
csnet_ns,1051
ctf,1068
daytime,1056
discard,1059
domain,1113
domain_u,57782


V4,freq
OTH,57
REJ,268874
RSTO,5344
RSTOS0,122
RSTR,8094
S0,869829
S1,532
S2,161
S3,50
SF,3744328


V42,freq
back.,2203
buffer_overflow.,30
ftp_write.,8
guess_passwd.,53
imap.,12
ipsweep.,12481
land.,21
loadmodule.,9
multihop.,7
neptune.,1072017


### 2. NSL-KDD Dataset

This dataset is an improvement over the KDD dataset and was created to overcome some of the problems that existed in it. Its considerable number of records for both train and test data makes it a good choice for experiments.

https://www.unb.ca/cic/datasets/nsl.html

In [68]:
# Load the 20% NSL-KDD training subset: comma-separated, no header row, so R
# assigns the default column names (V1, V2, ...).
# stringsAsFactors = TRUE is set explicitly because its default became FALSE
# in R 4.0, and the is.factor() column selection in later cells needs factors.
nsl_kdd <- read.table("KDDTrain+_20Percent.txt", sep = ",", header = FALSE,
                      stringsAsFactors = TRUE)
print("Shape of Dataset:")
print(dim(nsl_kdd))

[1] "Shape of Dataset:"
[1] 25192    43


In [69]:
str(nsl_kdd)

'data.frame':	25192 obs. of  43 variables:
 $ V1 : int  0 0 0 0 0 0 0 0 0 0 ...
 $ V2 : Factor w/ 3 levels "icmp","tcp","udp": 2 3 2 2 2 2 2 2 2 2 ...
 $ V3 : Factor w/ 66 levels "auth","bgp","courier",..: 17 40 45 20 20 45 45 45 47 45 ...
 $ V4 : Factor w/ 11 levels "OTH","REJ","RSTO",..: 10 10 6 10 10 2 6 6 6 6 ...
 $ V5 : int  491 146 0 232 199 0 0 0 0 0 ...
 $ V6 : int  0 0 0 8153 420 0 0 0 0 0 ...
 $ V7 : int  0 0 0 0 0 0 0 0 0 0 ...
 $ V8 : int  0 0 0 0 0 0 0 0 0 0 ...
 $ V9 : int  0 0 0 0 0 0 0 0 0 0 ...
 $ V10: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V11: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V12: int  0 0 0 1 1 0 0 0 0 0 ...
 $ V13: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V14: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V15: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V16: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V17: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V18: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V19: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V20: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V21: int  0 0 0 0 0 0 0 0 0 0 ...
 $ V22: int  0 0 0 0 0 0 0 0 0 0 ...
 $

In [70]:
head(nsl_kdd, 5)

V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,⋯,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43
0,tcp,ftp_data,SF,491,0,0,0,0,0,⋯,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
0,udp,other,SF,146,0,0,0,0,0,⋯,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
0,tcp,private,S0,0,0,0,0,0,0,⋯,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
0,tcp,http,SF,232,8153,0,0,0,0,⋯,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
0,tcp,http,SF,199,420,0,0,0,0,⋯,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [71]:
#Call 'data_report' function for the dataset
data_report(nsl_kdd)

[1] "Data Quality Report"


Unnamed: 0,data_type,count,unique_values,missing_values
V1,integer,25192,758,0
V2,factor,25192,3,0
V3,factor,25192,66,0
V4,factor,25192,11,0
V5,integer,25192,1665,0
V6,integer,25192,3922,0
V7,integer,25192,2,0
V8,integer,25192,3,0
V9,integer,25192,2,0
V10,integer,25192,22,0


In [72]:
# Keep only the NSL-KDD columns for which is.numeric() is TRUE, preview the
# first rows, and report how many such columns there are.
numeric_columns <- nsl_kdd %>% select_if(is.numeric)
numeric_columns %>% head()
sprintf("Number of numerical columns: %i", ncol(numeric_columns))

V1,V5,V6,V7,V8,V9,V10,V11,V12,V13,⋯,V33,V34,V35,V36,V37,V38,V39,V40,V41,V43
0,491,0,0,0,0,0,0,0,0,⋯,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,20
0,146,0,0,0,0,0,0,0,0,⋯,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,15
0,0,0,0,0,0,0,0,0,0,⋯,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,19
0,232,8153,0,0,0,0,0,1,0,⋯,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,21
0,199,420,0,0,0,0,0,1,0,⋯,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21
0,0,0,0,0,0,0,0,0,0,⋯,19,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,21


In [73]:
                                                             
# Keep only the NSL-KDD columns stored as factors, preview the first rows, and
# report how many such columns there are.
categorical_columns <- nsl_kdd %>% select_if(is.factor)
categorical_columns %>% head()
sprintf("Number of categorical columns: %i", ncol(categorical_columns))

V2,V3,V4,V42
tcp,ftp_data,SF,normal
udp,other,SF,normal
tcp,private,S0,neptune
tcp,http,SF,normal
tcp,http,SF,normal
tcp,private,REJ,neptune


In [74]:
                                                             
# Frequency of each level of every categorical column.
# The call is qualified as plyr::count because dplyr also exports a count()
# with different semantics; the string-column form used here is plyr's, and the
# bare name would silently change meaning if the packages were attached in a
# different order.
plyr::count(categorical_columns, "V2")
plyr::count(categorical_columns, "V3")
plyr::count(categorical_columns, "V4")
plyr::count(categorical_columns, "V42")

V2,freq
icmp,1655
tcp,20526
udp,3011


V3,freq
auth,189
bgp,146
courier,164
csnet_ns,111
ctf,127
daytime,107
discard,105
domain,109
domain_u,1820
echo,65


V4,freq
OTH,5
REJ,2216
RSTO,304
RSTOS0,21
RSTR,497
S0,7009
S1,88
S2,21
S3,15
SF,14973


V42,freq
back,196
buffer_overflow,6
ftp_write,1
guess_passwd,10
imap,5
ipsweep,710
land,1
loadmodule,1
multihop,2
neptune,8282


### 3. Credit Card Fraud

This highly unbalanced dataset contains transactions made by European cardholders in 2013. The features in this dataset are the result of a PCA transformation, so we do not have any information about the original features.

This is a labeled dataset.

https://www.kaggle.com/samkirkiles/credit-card-fraud/data

In [75]:
# Load the credit card transactions (the file has a header row).
# stringsAsFactors = TRUE mirrors the other loaders in this notebook, so any
# text column is read as a factor regardless of the R version's default
# (FALSE since R 4.0).
credit_data <- read.csv("creditcard.csv", stringsAsFactors = TRUE)
print("Shape of Dataset:")
print(dim(credit_data))

[1] "Shape of Dataset:"
[1] 284807     31


In [76]:
str(credit_data)

'data.frame':	284807 obs. of  31 variables:
 $ Time  : num  0 0 1 1 2 2 4 7 7 9 ...
 $ V1    : num  -1.36 1.192 -1.358 -0.966 -1.158 ...
 $ V2    : num  -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...
 $ V3    : num  2.536 0.166 1.773 1.793 1.549 ...
 $ V4    : num  1.378 0.448 0.38 -0.863 0.403 ...
 $ V5    : num  -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...
 $ V6    : num  0.4624 -0.0824 1.8005 1.2472 0.0959 ...
 $ V7    : num  0.2396 -0.0788 0.7915 0.2376 0.5929 ...
 $ V8    : num  0.0987 0.0851 0.2477 0.3774 -0.2705 ...
 $ V9    : num  0.364 -0.255 -1.515 -1.387 0.818 ...
 $ V10   : num  0.0908 -0.167 0.2076 -0.055 0.7531 ...
 $ V11   : num  -0.552 1.613 0.625 -0.226 -0.823 ...
 $ V12   : num  -0.6178 1.0652 0.0661 0.1782 0.5382 ...
 $ V13   : num  -0.991 0.489 0.717 0.508 1.346 ...
 $ V14   : num  -0.311 -0.144 -0.166 -0.288 -1.12 ...
 $ V15   : num  1.468 0.636 2.346 -0.631 0.175 ...
 $ V16   : num  -0.47 0.464 -2.89 -1.06 -0.451 ...
 $ V17   : num  0.208 -0.115 1.11 -0.684 -0.237 ...
 $ V

In [77]:
head(credit_data, 5)

Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,⋯,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.3598071,-0.07278117,2.5363467,1.3781552,-0.33832077,0.46238778,0.23959855,0.0986979,0.363787,⋯,-0.018306778,0.277837576,-0.1104739,0.06692807,0.1285394,-0.1891148,0.133558377,-0.02105305,149.62,0
0,1.1918571,0.26615071,0.1664801,0.4481541,0.06001765,-0.08236081,-0.07880298,0.08510165,-0.2554251,⋯,-0.225775248,-0.638671953,0.101288,-0.33984648,0.1671704,0.1258945,-0.008983099,0.01472417,2.69,0
1,-1.3583541,-1.34016307,1.7732093,0.3797796,-0.50319813,1.80049938,0.79146096,0.24767579,-1.5146543,⋯,0.247998153,0.771679402,0.9094123,-0.68928096,-0.3276418,-0.1390966,-0.055352794,-0.05975184,378.66,0
1,-0.9662717,-0.18522601,1.7929933,-0.8632913,-0.01030888,1.24720317,0.23760894,0.37743587,-1.3870241,⋯,-0.108300452,0.005273597,-0.1903205,-1.17557533,0.647376,-0.2219288,0.062722849,0.06145763,123.5,0
2,-1.1582331,0.87773675,1.5487178,0.4030339,-0.40719338,0.09592146,0.59294075,-0.27053268,0.8177393,⋯,-0.009430697,0.798278495,-0.1374581,0.14126698,-0.2060096,0.5022922,0.21942223,0.21515315,69.99,0


In [78]:
                                                             
#Call 'data_report' function for the dataset
data_report(credit_data)

[1] "Data Quality Report"


Unnamed: 0,data_type,count,unique_values,missing_values
Time,numeric,284807,124592,0
V1,numeric,284807,275663,0
V2,numeric,284807,275663,0
V3,numeric,284807,275663,0
V4,numeric,284807,275663,0
V5,numeric,284807,275663,0
V6,numeric,284807,275663,0
V7,numeric,284807,275663,0
V8,numeric,284807,275663,0
V9,numeric,284807,275663,0


In [79]:
                                                                                                                         
# Keep only the credit card columns for which is.numeric() is TRUE, preview
# the first rows, and report how many such columns there are.
numeric_columns <- credit_data %>% select_if(is.numeric)
numeric_columns %>% head()
sprintf("Number of numerical columns: %i", ncol(numeric_columns))

Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,⋯,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.3598071,-0.07278117,2.5363467,1.3781552,-0.33832077,0.46238778,0.23959855,0.0986979,0.363787,⋯,-0.018306778,0.277837576,-0.11047391,0.06692807,0.1285394,-0.1891148,0.133558377,-0.02105305,149.62,0
0,1.1918571,0.26615071,0.1664801,0.4481541,0.06001765,-0.08236081,-0.07880298,0.08510165,-0.2554251,⋯,-0.225775248,-0.638671953,0.10128802,-0.33984648,0.1671704,0.1258945,-0.008983099,0.01472417,2.69,0
1,-1.3583541,-1.34016307,1.7732093,0.3797796,-0.50319813,1.80049938,0.79146096,0.24767579,-1.5146543,⋯,0.247998153,0.771679402,0.90941226,-0.68928096,-0.3276418,-0.1390966,-0.055352794,-0.05975184,378.66,0
1,-0.9662717,-0.18522601,1.7929933,-0.8632913,-0.01030888,1.24720317,0.23760894,0.37743587,-1.3870241,⋯,-0.108300452,0.005273597,-0.19032052,-1.17557533,0.647376,-0.2219288,0.062722849,0.06145763,123.5,0
2,-1.1582331,0.87773675,1.5487178,0.4030339,-0.40719338,0.09592146,0.59294075,-0.27053268,0.8177393,⋯,-0.009430697,0.798278495,-0.13745808,0.14126698,-0.2060096,0.5022922,0.21942223,0.21515315,69.99,0
2,-0.4259659,0.96052304,1.1411093,-0.1682521,0.42098688,-0.02972755,0.47620095,0.26031433,-0.5686714,⋯,-0.208253515,-0.559824796,-0.02639767,-0.37142658,-0.2327938,0.1059148,0.253844225,0.08108026,3.67,0


In [80]:
                                                             
# Keep only the credit card columns stored as factors, preview the first rows,
# and report how many such columns there are.
categorical_columns <- credit_data %>% select_if(is.factor)
categorical_columns %>% head()
sprintf("Number of categorical columns: %i", ncol(categorical_columns))

In [81]:
                                                             
# Frequency of each value of the Class label column.
# The call is qualified as plyr::count because dplyr also exports a count()
# with different semantics; the string-column form used here is plyr's, and the
# bare name would silently change meaning if the packages were attached in a
# different order.
plyr::count(credit_data, "Class")

Class,freq
0,284315
1,492


### 4. DDS Dataset Collection

There are two datasets: the first contains information about AWS honeypots, and the second adds geolocation information.

http://datadrivensecurity.info/blog/pages/dds-dataset-collection.html

In [82]:
# Load the AWS honeypot records (the file has a header row).
# stringsAsFactors = TRUE is set explicitly because its default became FALSE
# in R 4.0, and the is.factor() column selection in later cells needs factors.
marx <- read.csv("marx.csv", stringsAsFactors = TRUE)
print("Shape of Dataset:")
print(dim(marx))

[1] "Shape of Dataset:"
[1] 451581      7


In [83]:
str(marx)

'data.frame':	451581 obs. of  7 variables:
 $ datetime: Factor w/ 380323 levels "2013-03-03 21:53:59",..: 1 2 4 3 5 6 7 8 9 10 ...
 $ host    : Factor w/ 9 levels "groucho-eu","groucho-norcal",..: 3 3 3 8 5 7 3 5 3 5 ...
 $ src     : num  1.03e+09 1.35e+09 2.95e+09 8.42e+08 3.59e+09 ...
 $ proto   : Factor w/ 3 levels "ICMP","TCP","UDP": 2 3 2 3 2 2 2 2 2 2 ...
 $ type    : int  NA NA NA NA NA NA NA NA NA NA ...
 $ spt     : int  6000 5270 2489 43235 56577 32628 6000 6000 6000 6000 ...
 $ dpt     : int  1433 5060 1080 1900 80 2323 1433 3306 1433 1433 ...


In [84]:
head(marx, 5)

datetime,host,src,proto,type,spt,dpt
2013-03-03 21:53:59,groucho-oregon,1032051418,TCP,,6000,1433
2013-03-03 21:57:01,groucho-oregon,1347834426,UDP,,5270,5060
2013-03-03 21:58:10,groucho-oregon,2947856490,TCP,,2489,1080
2013-03-03 21:58:09,groucho-us-east,841842716,UDP,,43235,1900
2013-03-03 21:58:20,groucho-singapore,3587648279,TCP,,56577,80


In [85]:
                                                             
#Call 'data_report' function for the dataset
data_report(marx)

[1] "Data Quality Report"


Unnamed: 0,data_type,count,unique_values,missing_values
datetime,factor,451581,380323,0
host,factor,451581,9,0
src,numeric,451581,69602,0
proto,factor,451581,3,0
type,integer,44811,8,406770
spt,integer,406770,46189,44811
dpt,integer,406770,4042,44811


In [86]:
                                                             
# Keep only the honeypot columns for which is.numeric() is TRUE, preview the
# first rows, and report how many such columns there are.
numeric_columns <- marx %>% select_if(is.numeric)
numeric_columns %>% head()
sprintf("Number of numerical columns: %i", ncol(numeric_columns))

src,type,spt,dpt
1032051418,,6000,1433
1347834426,,5270,5060
2947856490,,2489,1080
841842716,,43235,1900
3587648279,,56577,80
3323217250,,32628,2323


In [87]:
                                                             
# Keep only the honeypot columns stored as factors, preview the first rows,
# and report how many such columns there are.
categorical_columns <- marx %>% select_if(is.factor)
categorical_columns %>% head()
sprintf("Number of categorical columns: %i", ncol(categorical_columns))

datetime,host,proto
2013-03-03 21:53:59,groucho-oregon,TCP
2013-03-03 21:57:01,groucho-oregon,UDP
2013-03-03 21:58:10,groucho-oregon,TCP
2013-03-03 21:58:09,groucho-us-east,UDP
2013-03-03 21:58:20,groucho-singapore,TCP
2013-03-03 21:58:41,groucho-tokyo,TCP


In [88]:
                                                             
# Frequency of each level of the host and proto factors, plus the integer
# `type` column taken from the full data frame (it is not a factor, so it is
# absent from categorical_columns).
# The call is qualified as plyr::count because dplyr also exports a count()
# with different semantics; the string-column form used here is plyr's, and the
# bare name would silently change meaning if the packages were attached in a
# different order.
plyr::count(categorical_columns, "host")
plyr::count(categorical_columns, "proto")
plyr::count(marx, "type")

host,freq
groucho-eu,23954
groucho-norcal,24566
groucho-oregon,94076
groucho-sa,24316
groucho-singapore,78151
groucho-sydney,24456
groucho-tokyo,126189
groucho-us-east,31779
zeppo-norcal,24094


proto,freq
ICMP,44811
TCP,327991
UDP,78779


type,freq
0.0,536
3.0,4251
5.0,127
8.0,38597
11.0,1156
12.0,2
13.0,142
,406770


In [89]:
                                                             
# Load the honeypot records with the added geolocation columns (the file has a
# header row).
# stringsAsFactors = TRUE is set explicitly because its default became FALSE
# in R 4.0, and the is.factor() column selection in later cells needs factors.
marx_geo <- read.csv("marx-geo.csv", stringsAsFactors = TRUE)
print("Shape of Dataset:")
print(dim(marx_geo))

[1] "Shape of Dataset:"
[1] 451581     16


In [90]:
str(marx_geo)

'data.frame':	451581 obs. of  16 variables:
 $ datetime  : Factor w/ 185118 levels "3/10/13 0:00",..: 17291 17292 17293 17293 17293 17293 17294 17295 17296 17297 ...
 $ host      : Factor w/ 9 levels "groucho-eu","groucho-norcal",..: 3 3 3 8 5 7 3 5 3 5 ...
 $ src       : num  1.03e+09 1.35e+09 2.95e+09 8.42e+08 3.59e+09 ...
 $ proto     : Factor w/ 3 levels "ICMP","TCP","UDP": 2 3 2 3 2 2 2 2 2 2 ...
 $ type      : int  NA NA NA NA NA NA NA NA NA NA ...
 $ spt       : int  6000 5270 2489 43235 56577 32628 6000 6000 6000 6000 ...
 $ dpt       : int  1433 5060 1080 1900 80 2323 1433 3306 1433 1433 ...
 $ srcstr    : Factor w/ 69602 levels "1.0.0.38","1.1.162.110",..: 56491 62595 23980 52966 40813 34587 46361 45824 43150 55791 ...
 $ cc        : Factor w/ 177 levels "","AD","AE","AF",..: 36 43 161 165 56 165 36 36 36 36 ...
 $ country   : Factor w/ 178 levels "","Afghanistan",..: 38 59 159 171 56 171 38 38 38 38 ...
 $ locale    : Factor w/ 1180 levels "","Aargau","Abu Dhabi",..: 392 1 1

In [91]:
head(marx_geo, 5)

datetime,host,src,proto,type,spt,dpt,srcstr,cc,country,locale,localeabbr,postalcode,latitude,longitude,X
3/3/13 21:53,groucho-oregon,1032051418,TCP,,6000,1433,61.131.218.218,CN,China,Jiangxi Sheng,36,,28.55,115.9333,
3/3/13 21:57,groucho-oregon,1347834426,UDP,,5270,5060,80.86.82.58,DE,Germany,,,,51.0,9.0,
3/3/13 21:58,groucho-oregon,2947856490,TCP,,2489,1080,175.180.184.106,TW,Taiwan,Taipei,,,25.0392,121.525,
3/3/13 21:58,groucho-us-east,841842716,UDP,,43235,1900,50.45.128.28,US,United States,Oregon,OR,97124.0,45.5848,-122.9117,
3/3/13 21:58,groucho-singapore,3587648279,TCP,,56577,80,213.215.43.23,FR,France,,,,48.86,2.35,


In [92]:
                                                             
#Call 'data_report' function for the dataset
data_report(marx_geo)

[1] "Data Quality Report"


Unnamed: 0,data_type,count,unique_values,missing_values
datetime,factor,451581,185118,0
host,factor,451581,9,0
src,numeric,451581,69602,0
proto,factor,451581,3,0
type,integer,44811,8,406770
spt,integer,406770,46189,44811
dpt,integer,406770,4042,44811
srcstr,factor,451581,69602,0
cc,factor,451580,178,1
country,factor,451581,178,0


In [93]:
                                                             
# Keep only the geolocated honeypot columns for which is.numeric() is TRUE,
# preview the first rows, and report how many such columns there are.
numeric_columns <- marx_geo %>% select_if(is.numeric)
numeric_columns %>% head()
sprintf("Number of numerical columns: %i", ncol(numeric_columns))

src,type,spt,dpt,latitude,longitude,X
1032051418,,6000,1433,28.55,115.9333,
1347834426,,5270,5060,51.0,9.0,
2947856490,,2489,1080,25.0392,121.525,
841842716,,43235,1900,45.5848,-122.9117,
3587648279,,56577,80,48.86,2.35,
3323217250,,32628,2323,41.8825,-87.6441,


In [94]:
                                                             
# Keep only the geolocated honeypot columns stored as factors, preview the
# first rows, and report how many such columns there are.
categorical_columns <- marx_geo %>% select_if(is.factor)
categorical_columns %>% head()
sprintf("Number of categorical columns: %i", ncol(categorical_columns))

datetime,host,proto,srcstr,cc,country,locale,localeabbr,postalcode
3/3/13 21:53,groucho-oregon,TCP,61.131.218.218,CN,China,Jiangxi Sheng,36,
3/3/13 21:57,groucho-oregon,UDP,80.86.82.58,DE,Germany,,,
3/3/13 21:58,groucho-oregon,TCP,175.180.184.106,TW,Taiwan,Taipei,,
3/3/13 21:58,groucho-us-east,UDP,50.45.128.28,US,United States,Oregon,OR,97124.0
3/3/13 21:58,groucho-singapore,TCP,213.215.43.23,FR,France,,,
3/3/13 21:58,groucho-tokyo,TCP,198.20.69.98,US,United States,Illinois,IL,60661.0


In [95]:
                                                             
# Frequency of each level of the host and proto factors, plus the integer
# `type` column taken from the full data frame (it is not a factor, so it is
# absent from categorical_columns).
# The call is qualified as plyr::count because dplyr also exports a count()
# with different semantics; the string-column form used here is plyr's, and the
# bare name would silently change meaning if the packages were attached in a
# different order.
plyr::count(categorical_columns, "host")
plyr::count(categorical_columns, "proto")
plyr::count(marx_geo, "type")

host,freq
groucho-eu,23954
groucho-norcal,24566
groucho-oregon,94076
groucho-sa,24316
groucho-singapore,78151
groucho-sydney,24456
groucho-tokyo,126189
groucho-us-east,31779
zeppo-norcal,24094


proto,freq
ICMP,44811
TCP,327991
UDP,78779


type,freq
0.0,536
3.0,4251
5.0,127
8.0,38597
11.0,1156
12.0,2
13.0,142
,406770


### 5. UNSW-NB15 Dataset

The Australian Center for Cyber Security (ACCS) created this dataset, which contains nine types of attack. It is another network intrusion detection dataset and can be compared with the KDD dataset.

This is a labeled dataset.

https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/

In [20]:
# Load the UNSW-NB15 training set (the file has a header row).
# stringsAsFactors = TRUE is set explicitly because its default became FALSE
# in R 4.0, and the is.factor() column selection in later cells needs factors.
unsw_data <- read.csv("UNSW_NB15_training-set.csv", stringsAsFactors = TRUE)
print("Shape of Dataset:")
print(dim(unsw_data))

[1] "Shape of Dataset:"
[1] 82332    45


In [21]:
str(unsw_data)

'data.frame':	82332 obs. of  45 variables:
 $ id               : int  1 2 3 4 5 6 7 8 9 10 ...
 $ dur              : num  1.1e-05 8.0e-06 5.0e-06 6.0e-06 1.0e-05 3.0e-06 6.0e-06 2.8e-05 0.0 0.0 ...
 $ proto            : Factor w/ 131 levels "3pc","a/n","aes-sp3-d",..: 118 118 118 118 118 118 118 118 7 7 ...
 $ service          : Factor w/ 13 levels "-","dhcp","dns",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ state            : Factor w/ 7 levels "ACC","CLO","CON",..: 5 5 5 5 5 5 5 5 5 5 ...
 $ spkts            : int  2 2 2 2 2 2 2 2 1 1 ...
 $ dpkts            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ sbytes           : int  496 1762 1068 900 2126 784 1960 1384 46 46 ...
 $ dbytes           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ rate             : num  90909 125000 200000 166667 100000 ...
 $ sttl             : int  254 254 254 254 254 254 254 254 0 0 ...
 $ dttl             : int  0 0 0 0 0 0 0 0 0 0 ...
 $ sload            : num  1.80e+08 8.81e+08 8.54e+08 6.00e+08 8.50e+08 ...
 $ dload            : num  0 0 

In [22]:
head(unsw_data, 5)


id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,⋯,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
1,1.1e-05,udp,-,INT,2,0,496,0,90909.09,⋯,1,2,0,0,0,1,2,0,Normal,0
2,8e-06,udp,-,INT,2,0,1762,0,125000.0,⋯,1,2,0,0,0,1,2,0,Normal,0
3,5e-06,udp,-,INT,2,0,1068,0,200000.01,⋯,1,3,0,0,0,1,3,0,Normal,0
4,6e-06,udp,-,INT,2,0,900,0,166666.66,⋯,1,3,0,0,0,2,3,0,Normal,0
5,1e-05,udp,-,INT,2,0,2126,0,100000.0,⋯,1,3,0,0,0,2,3,0,Normal,0


In [23]:
                                                           
#Call 'data_report' function for the dataset
data_report(unsw_data)

[1] "Data Quality Report"


Unnamed: 0,data_type,count,unique_values,missing_values
id,integer,82332,82332,0
dur,numeric,82332,39888,0
proto,factor,82332,131,0
service,factor,82332,13,0
state,factor,82332,7,0
spkts,integer,82332,420,0
dpkts,integer,82332,436,0
sbytes,integer,82332,4489,0
dbytes,integer,82332,4034,0
rate,numeric,82332,40616,0


In [24]:
                                                             
# Keep only the UNSW-NB15 columns for which is.numeric() is TRUE, preview the
# first rows, and report how many such columns there are.
numeric_columns <- unsw_data %>% select_if(is.numeric)
numeric_columns %>% head()
sprintf("Number of numerical columns: %i", ncol(numeric_columns))

id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,⋯,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
1,1.1e-05,2,0,496,0,90909.09,254,0,180363632,⋯,1,1,2,0,0,0,1,2,0,0
2,8e-06,2,0,1762,0,125000.0,254,0,881000000,⋯,1,1,2,0,0,0,1,2,0,0
3,5e-06,2,0,1068,0,200000.01,254,0,854400000,⋯,1,1,3,0,0,0,1,3,0,0
4,6e-06,2,0,900,0,166666.66,254,0,600000000,⋯,2,1,3,0,0,0,2,3,0,0
5,1e-05,2,0,2126,0,100000.0,254,0,850400000,⋯,2,1,3,0,0,0,2,3,0,0
6,3e-06,2,0,784,0,333333.32,254,0,1045333312,⋯,2,1,2,0,0,0,2,2,0,0


In [25]:
                                                             
# Keep only the UNSW-NB15 columns stored as factors, preview the first rows,
# and report how many such columns there are.
categorical_columns <- unsw_data %>% select_if(is.factor)
categorical_columns %>% head()
sprintf("Number of categorical columns: %i", ncol(categorical_columns))

proto,service,state,attack_cat
udp,-,INT,Normal
udp,-,INT,Normal
udp,-,INT,Normal
udp,-,INT,Normal
udp,-,INT,Normal
udp,-,INT,Normal


In [26]:
                                                             
# Frequency of each level of every categorical column.
# The call is qualified as plyr::count because dplyr also exports a count()
# with different semantics; the string-column form used here is plyr's, and the
# bare name would silently change meaning if the packages were attached in a
# different order.
plyr::count(categorical_columns, "proto")
plyr::count(categorical_columns, "service")
plyr::count(categorical_columns, "state")
plyr::count(categorical_columns, "attack_cat")


proto,freq
3pc,32
a/n,32
aes-sp3-d,32
any,96
argus,33
aris,32
arp,987
ax.25,32
bbn-rcc,34
bna,32


service,freq
-,47153
dhcp,26
dns,21367
ftp,1552
ftp-data,1396
http,8287
irc,5
pop3,423
radius,9
smtp,1851


state,freq
ACC,4
CLO,1
CON,6982
FIN,39339
INT,34163
REQ,1842
RST,1


attack_cat,freq
Analysis,677
Backdoor,583
DoS,4089
Exploits,11132
Fuzzers,6062
Generic,18871
Normal,37000
Reconnaissance,3496
Shellcode,378
Worms,44


### 6. CSIC 2010 http dataset

The CSIC 2010 HTTP dataset includes web application penetration testing packets and is created with the goal of feature selection before working on classification.

This is a labeled dataset.

https://petescully.co.uk/research/csic-2010-http-dataset-in-csv-format-for-weka-analysis/

In [103]:
# Load the CSIC 2010 HTTP requests (the file has a header row).
# stringsAsFactors = TRUE is set explicitly because its default became FALSE
# in R 4.0, and the is.factor() column selection in later cells needs factors.
csic_data <- read.csv("output_http_csic_2010.csv", stringsAsFactors = TRUE)
print("Shape of Dataset:")
print(dim(csic_data))

[1] "Shape of Dataset:"
[1] 223585     18


In [104]:
str(csic_data)

'data.frame':	223585 obs. of  18 variables:
 $ index         : int  0 0 0 0 0 1 1 1 1 1 ...
 $ method        : Factor w/ 3 levels "GET","POST","PUT": 1 1 1 1 1 1 1 1 1 1 ...
 $ url           : Factor w/ 1643 levels "http://localhost:8080.bak",..: 1212 1212 1212 1212 1212 1212 1212 1212 1212 1212 ...
 $ protocol      : Factor w/ 1 level "HTTP/1.1": 1 1 1 1 1 1 1 1 1 1 ...
 $ userAgent     : Factor w/ 1 level "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko)": 1 1 1 1 1 1 1 1 1 1 ...
 $ pragma        : Factor w/ 1 level "no-cache": 1 1 1 1 1 1 1 1 1 1 ...
 $ cacheControl  : Factor w/ 1 level "no-cache": 1 1 1 1 1 1 1 1 1 1 ...
 $ accept        : Factor w/ 1 level "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5": 1 1 1 1 1 1 1 1 1 1 ...
 $ acceptEncoding: Factor w/ 1 level "x-gzip, x-deflate, gzip, deflate": 1 1 1 1 1 1 1 1 1 1 ...
 $ acceptCharset : Factor w/ 1 level "utf-8, utf-8;q=0.5, *;q=0.5": 1 1 1 1 1 1 1 1

In [105]:
head(csic_data, 5)

index,method,url,protocol,userAgent,pragma,cacheControl,accept,acceptEncoding,acceptCharset,acceptLanguage,host,connection,contentLength,contentType,cookie,payload,label
0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko),no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,,,JSESSIONID=B92A8B48B9008CD29F622A994E0F650D,id=2,anom
0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko),no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,,,JSESSIONID=B92A8B48B9008CD29F622A994E0F650D,nombre=Jam%F3n+Ib%E9rico,anom
0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko),no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,,,JSESSIONID=B92A8B48B9008CD29F622A994E0F650D,precio=85,anom
0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko),no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,,,JSESSIONID=B92A8B48B9008CD29F622A994E0F650D,cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+*+FROM+datos+WHERE+nombre+LIKE+%27%25,anom
0,GET,http://localhost:8080/tienda1/publico/anadir.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko),no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,,,JSESSIONID=B92A8B48B9008CD29F622A994E0F650D,B1=A%F1adir+al+carrito,anom


In [106]:
                                                             
#Call 'data_report' function for the dataset
data_report(csic_data)


  

[1] "Data Quality Report"


Unnamed: 0,data_type,count,unique_values,missing_values
index,integer,223585,36000,0
method,factor,223585,3,0
url,factor,223585,1643,0
protocol,factor,223585,1,0
userAgent,factor,223585,1,0
pragma,factor,223585,1,0
cacheControl,factor,223585,1,0
accept,factor,223585,1,0
acceptEncoding,factor,223585,1,0
acceptCharset,factor,223585,1,0


In [107]:
                                                           
# Keep only the CSIC 2010 columns for which is.numeric() is TRUE, preview the
# first rows, and report how many such columns there are.
numeric_columns <- csic_data %>% select_if(is.numeric)
numeric_columns %>% head()
sprintf("Number of numerical columns: %i", ncol(numeric_columns))

index
0
0
0
0
0
1


In [108]:
                                                             
# Keep only the CSIC 2010 columns stored as factors, preview the first rows,
# and report how many such columns there are.
categorical_columns <- csic_data %>% select_if(is.factor)
categorical_columns %>% head()
sprintf("Number of categorical columns: %i", ncol(categorical_columns))

 

method,url,protocol,userAgent,pragma,cacheControl,accept,acceptEncoding,acceptCharset,acceptLanguage,host,connection,contentLength,contentType,cookie,payload,label
GET,http://localhost:8080/tienda1/publico/anadir.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko),no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,,,JSESSIONID=B92A8B48B9008CD29F622A994E0F650D,id=2,anom
GET,http://localhost:8080/tienda1/publico/anadir.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko),no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,,,JSESSIONID=B92A8B48B9008CD29F622A994E0F650D,nombre=Jam%F3n+Ib%E9rico,anom
GET,http://localhost:8080/tienda1/publico/anadir.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko),no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,,,JSESSIONID=B92A8B48B9008CD29F622A994E0F650D,precio=85,anom
GET,http://localhost:8080/tienda1/publico/anadir.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko),no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,,,JSESSIONID=B92A8B48B9008CD29F622A994E0F650D,cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+*+FROM+datos+WHERE+nombre+LIKE+%27%25,anom
GET,http://localhost:8080/tienda1/publico/anadir.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko),no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,,,JSESSIONID=B92A8B48B9008CD29F622A994E0F650D,B1=A%F1adir+al+carrito,anom
GET,http://localhost:8080/tienda1/publico/anadir.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko),no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,close,,,JSESSIONID=F563B5262843F12ECAE41815ABDEEA54,id=2%2F,anom


In [109]:
                                                            
# Frequency of each level of the categorical columns.
# plyr::count() is qualified explicitly: dplyr also exports a count() that
# does not take column names as strings, so the bare name only works when
# plyr happens to be attached after dplyr.
plyr::count(categorical_columns, vars = 'method')
plyr::count(categorical_columns, vars = 'host')
plyr::count(categorical_columns, vars = 'label')

method,freq
GET,123450
POST,97942
PUT,2193


host,freq
localhost:8080,221392
localhost:9090,2193


label,freq
anom,119585
norm,104000


### 7. Malware apps from Drebin project

The dataset contains more than 200 features extracted from malware and benign Android apps, and is useful for developing and evaluating multilevel classification approaches.

This is a labeled dataset.

https://figshare.com/articles/Android_malware_dataset_for_machine_learning_2/5854653/1

In [110]:
# Load the Drebin feature table and report its dimensions (rows, columns).
# stringsAsFactors = TRUE is set explicitly: the later
# select_if(drebin_data, is.factor) step needs text columns to be factors,
# but R >= 4.0 reads them as character by default.
drebin_data <- read.csv('drebin.csv', stringsAsFactors = TRUE)
print("Shape of Dataset:")
print(dim(drebin_data))

[1] "Shape of Dataset:"
[1] 15036   216


In [111]:
# Compact structure overview: one line per column with its type and first values
utils::str(drebin_data)

'data.frame':	15036 obs. of  216 variables:
 $ transact                                       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ onServiceConnected                             : int  0 0 0 0 0 0 1 0 0 0 ...
 $ bindService                                    : int  0 0 0 0 0 0 1 0 0 0 ...
 $ attachInterface                                : int  0 0 0 0 0 0 0 0 0 0 ...
 $ ServiceConnection                              : int  0 0 0 0 0 0 1 0 0 0 ...
 $ android.os.Binder                              : int  0 0 0 0 0 0 1 0 0 0 ...
 $ SEND_SMS                                       : int  1 1 1 0 0 0 0 0 1 1 ...
 $ Ljava.lang.Class.getCanonicalName              : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Ljava.lang.Class.getMethods                    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Ljava.lang.Class.cast                          : int  0 0 0 1 0 0 0 1 0 0 ...
 $ Ljava.net.URLDecoder                           : int  0 0 0 1 1 0 0 1 0 0 ...
 $ android.content.pm.Signature                   : int  0 0 0 0 

In [112]:
# Preview the first five rows of the dataset
head(x = drebin_data, n = 5)

transact,onServiceConnected,bindService,attachInterface,ServiceConnection,android.os.Binder,SEND_SMS,Ljava.lang.Class.getCanonicalName,Ljava.lang.Class.getMethods,Ljava.lang.Class.cast,⋯,READ_CONTACTS,DEVICE_POWER,HARDWARE_TEST,ACCESS_WIFI_STATE,WRITE_EXTERNAL_STORAGE,ACCESS_FINE_LOCATION,SET_WALLPAPER_HINTS,SET_PREFERRED_APPLICATIONS,WRITE_SECURE_SETTINGS,class
0,0,0,0,0,0,1,0,0,0,⋯,0,0,0,0,1,0,0,0,0,S
0,0,0,0,0,0,1,0,0,0,⋯,0,0,0,0,1,0,0,0,0,S
0,0,0,0,0,0,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,S
0,0,0,0,0,0,0,0,0,1,⋯,0,0,0,1,1,1,0,0,0,S
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,1,0,1,0,0,0,S


In [113]:
                                                             
# Build the data quality summary for the Drebin data: per-column data type,
# non-missing count, number of unique values and number of missing values
data_report(df = drebin_data)

[1] "Data Quality Report"


Unnamed: 0,data_type,count,unique_values,missing_values
transact,integer,15036,2,0
onServiceConnected,integer,15036,2,0
bindService,integer,15036,2,0
attachInterface,integer,15036,2,0
ServiceConnection,integer,15036,2,0
android.os.Binder,integer,15036,2,0
SEND_SMS,integer,15036,2,0
Ljava.lang.Class.getCanonicalName,integer,15036,2,0
Ljava.lang.Class.getMethods,integer,15036,2,0
Ljava.lang.Class.cast,integer,15036,2,0


In [114]:
                                                             
# Keep only the numeric columns, preview the first rows, and report how many
# numeric columns there are
numeric_columns <- select_if(.tbl = drebin_data, .predicate = is.numeric)
head(x = numeric_columns, n = 6L)
sprintf(fmt = "Number of numerical columns: %i", ncol(numeric_columns))

transact,onServiceConnected,bindService,attachInterface,ServiceConnection,android.os.Binder,SEND_SMS,Ljava.lang.Class.getCanonicalName,Ljava.lang.Class.getMethods,Ljava.lang.Class.cast,⋯,SET_ORIENTATION,READ_CONTACTS,DEVICE_POWER,HARDWARE_TEST,ACCESS_WIFI_STATE,WRITE_EXTERNAL_STORAGE,ACCESS_FINE_LOCATION,SET_WALLPAPER_HINTS,SET_PREFERRED_APPLICATIONS,WRITE_SECURE_SETTINGS
0,0,0,0,0,0,1,0,0,0,⋯,0,0,0,0,0,1,0,0,0,0
0,0,0,0,0,0,1,0,0,0,⋯,0,0,0,0,0,1,0,0,0,0
0,0,0,0,0,0,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,1,⋯,0,0,0,0,1,1,1,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,1,0,1,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,1,1,0,0,0,0


In [115]:
                                                             
# Keep only the factor (categorical) columns, preview the first rows, and
# report how many categorical columns there are
categorical_columns <- select_if(.tbl = drebin_data, .predicate = is.factor)
head(x = categorical_columns, n = 6L)
sprintf(fmt = "Number of categorical columns: %i", ncol(categorical_columns))

TelephonyManager.getSimCountryIso,class
0,S
0,S
0,S
0,S
0,S
0,S


In [116]:
                                                             
# Frequency of each level of the categorical columns.
# plyr::count() is qualified explicitly: dplyr also exports a count() that
# does not take column names as strings, so the bare name only works when
# plyr happens to be attached after dplyr.
# NOTE(review): the '?' level reported for TelephonyManager.getSimCountryIso
# looks like a missing-value marker -- confirm against the data source.
plyr::count(categorical_columns, vars = 'TelephonyManager.getSimCountryIso')
plyr::count(categorical_columns, vars = 'class')

TelephonyManager.getSimCountryIso,freq
?,5
0,12508
1,2523


class,freq
B,9476
S,5560
