In [1]:
# NOTE(review): findspark.init() is deliberately called before the pyspark
# import below; presumably it makes the local Spark installation importable.
# Keep this ordering when reorganizing imports.
import findspark
findspark.init()
import sys
from pyspark import SparkContext,SparkConf

In [2]:
# Spark master URL. Defaults to "local"; when exactly one extra command-line
# argument is present it is used as the master instead.
# NOTE(review): inside a Jupyter kernel sys.argv is populated by the kernel
# launcher, so the override is presumably only meaningful when this code is
# run as a plain script -- confirm.
master = sys.argv[1] if len(sys.argv) == 2 else "local"

conf = SparkConf().setMaster(master).setAppName("rdd-sampling")

# getOrCreate() returns the already-running SparkContext if there is one, so
# re-running this cell in the same kernel no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# Location of the gzipped KDD Cup "10 percent" data file, relative to the
# notebook's working directory.
data_file_path = "../data/kddcup.data_10_percent.gz"

# Build the base RDD used by every later cell; each element is one text line
# of the file (one comma-separated record).
raw_data = sc.textFile(name=data_file_path)

In [6]:
# Peek at the first five records to see how the data file is laid out.
raw_data.take(num=5)

['0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,39,39,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,49,49,1.00,0.00,0.02,0.00,0.00,0.00,0.00,0.00,normal.']

In [8]:
# Draw a random sample of the data set as a new RDD.
# sample() arguments: withReplacement (may the same record be drawn more than
# once), fraction (expected share of the data to keep), seed (random seed).
raw_data_sample = raw_data.sample(withReplacement=False, fraction=0.1, seed=1234)

# Both counts are Spark actions; the full-data count is reused by later cells.
total_size = raw_data.count()
sample_size = raw_data_sample.count()

print("Sample size is {} of {}".format(sample_size, total_size))

Sample size is 49493 of 494021


In [11]:
from time import time

# Split every sampled line into its list of fields. map() (not flatMap()) keeps
# one element per interaction, so the count below counts interactions rather
# than individual fields that happen to contain the label text.
raw_data_sample_items = raw_data_sample.map(lambda line: line.split(","))

# List-membership test: keep interactions that have a field equal to "normal.".
sample_normal_items = raw_data_sample_items.filter(lambda fields: "normal." in fields)

# Transformations are lazy; only the count() action is timed.
t0 = time()
sample_normal_items_count = sample_normal_items.count()
t1 = time() - t0

# sample_size is the sample's record count computed in the sampling cell.
sample_normal_ratio = sample_normal_items_count / float(sample_size)

print("The ratio of 'normal' interactions is {}".format(round(sample_normal_ratio,3)))
print("Count done in {} seconds".format(round(t1,3)))

The ratio of 'normal' interactions is 0.195
Count done in 1.925 seconds


In [12]:
# Same computation as on the sample, but over the complete data set, so the
# ratio and the elapsed time can be compared with the sampled estimate.
# map() (not flatMap()) keeps one element per interaction, so count() counts
# interactions rather than individual fields.
raw_data_items = raw_data.map(lambda line: line.split(","))

# List-membership test: keep interactions that have a field equal to "normal.".
normal_items = raw_data_items.filter(lambda fields: "normal." in fields)

# Transformations are lazy; only the count() action is timed.
t0 = time()
normal_items_count = normal_items.count()
t1 = time() - t0

# total_size is the full record count computed in the sampling cell.
normal_item_ratio = normal_items_count / float(total_size)

print("The ratio of 'normal' is {}".format(round(normal_item_ratio,3)))
print("Count done in {} seconds".format(round(t1,3)))

The radio of 'normal' is 0.197
Count done in 9.077 seconds


In [17]:
# Sampling with the takeSample action.
# Unlike sample(), it takes the number of elements to draw instead of a
# fraction, and it returns a plain Python list rather than an RDD.
take_sample_size = 100000

t0 = time()
# NOTE(review): this rebinds raw_data_sample from the RDD created by sample()
# to a list; re-running the earlier RDD-based cell afterwards requires
# re-running the sampling cell first.
raw_data_sample = raw_data.takeSample(False, take_sample_size, 1234)
# The filtering now runs locally on the returned list, not in Spark.
normal_data_sample = [x.split(",") for x in raw_data_sample if "normal." in x]
t1 = time() - t0

normal_sample_size = len(normal_data_sample)
# Divide by the number of rows actually returned: without replacement,
# takeSample cannot return more rows than the data set contains, so the
# requested size is not always the real denominator.
taken_size = len(raw_data_sample)
normal_ratio = normal_sample_size / float(taken_size) if taken_size else 0.0

print("The ratio of 'normal' is {}".format(round(normal_ratio,3)))
print("Count done in {} seconds".format(round(t1,3)))

The radio of 'normal' is 0.196
Count done in 3.054 seconds
