In [14]:
from pyspark import SparkContext, SparkConf

# Initializing Spark
# Run locally, using every core available to the driver ("local[*]").
conf = SparkConf().setAppName("KDDCup_Analytics").setMaster("local[*]")
# getOrCreate() hands back the SparkContext that is already active in this
# kernel (if any) instead of raising an error, so this cell can be re-run safely.
sc = SparkContext.getOrCreate(conf=conf)

In [1]:
# How many cores in the current machine?
# NOTE(review): this reads Spark's default parallelism rather than querying the
# hardware; presumably it matches the core count when the master is "local[*]"
# — confirm against the Spark documentation.
sc.defaultParallelism

4

In [2]:
# Importing data into the environment using its URL
# Source: http://kdd.ics.uci.edu/databases/kddcup99/
import os
import urllib.request

KDD_URL = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz"
KDD_FILE = "kddcup.data.gz"

# Only hit the network when the archive is not on disk yet, so that
# "Restart & Run All" does not download the whole dataset again.
if not os.path.exists(KDD_FILE):
    # Download under a temporary name and rename on success: an interrupted
    # transfer can then never be mistaken for a complete archive.
    partial_file = KDD_FILE + ".part"
    urllib.request.urlretrieve(KDD_URL, partial_file)
    os.replace(partial_file, KDD_FILE)

('kddcup.data.gz', <http.client.HTTPMessage at 0x7fb308b2bbe0>)

In [3]:
# Load the gzipped dataset as an RDD of text lines, requesting at least 8 partitions
# NOTE(review): gzip is generally not a splittable format, so the partition
# hint may not take effect — verify with KDDcup_rdd.getNumPartitions().
KDDcup_rdd = sc.textFile(name="./kddcup.data.gz", minPartitions=8)

In [4]:
# Take 5 records to understand the data
KDDcup_rdd.take(5)

['0,tcp,http,SF,215,45076,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0,0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,162,4528,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0.00,0.00,0.00,0.00,1.00,0.00,0.00,1,1,1.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,236,1228,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.00,0.00,0.00,0.00,1.00,0.00,0.00,2,2,1.00,0.00,0.50,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,233,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0.00,0.00,0.00,0.00,1.00,0.00,0.00,3,3,1.00,0.00,0.33,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,3,0.00,0.00,0.00,0.00,1.00,0.00,0.00,4,4,1.00,0.00,0.25,0.00,0.00,0.00,0.00,0.00,normal.']

In [5]:
# Count the total number of records in the dataset (each RDD element is one text line)
KDDcup_rdd.count()

4898431

In [6]:
# Count the number of "normal" connections
# The label is the last comma-separated field of each record, so compare that
# field exactly rather than searching the whole line for the substring 'normal.',
# which could also match text appearing in any other column.
Normal_rdd = KDDcup_rdd.filter(lambda line: line.rsplit(",", 1)[-1] == "normal.")
Normal_rdd.count()

972781

In [7]:
# Enumerate the distinct label values present in the dataset
# Break every raw line into its comma-separated fields
Split_rdd = KDDcup_rdd.map(lambda row: row.split(","))
Label_rdd = (
    Split_rdd
    .map(lambda fields: fields[-1])  # the label is the final field
    .distinct()
)
Label_rdd.collect()

['normal.',
 'buffer_overflow.',
 'loadmodule.',
 'perl.',
 'neptune.',
 'smurf.',
 'guess_passwd.',
 'pod.',
 'teardrop.',
 'portsweep.',
 'ipsweep.',
 'land.',
 'ftp_write.',
 'back.',
 'imap.',
 'satan.',
 'phf.',
 'nmap.',
 'multihop.',
 'warezmaster.',
 'warezclient.',
 'spy.',
 'rootkit.']

In [8]:
# Enumerate the distinct protocol values present in the dataset
Protocol_rdd = (
    Split_rdd
    .map(lambda fields: fields[1])  # the protocol is the field at index 1
    .distinct()
)
Protocol_rdd.collect()

['tcp', 'udp', 'icmp']