In [1]:
import os
import time
import pyspark
import numpy as np

In [2]:
# File name of the input dataset; it is joined onto the '../dataset' directory when loaded.
FILE_NAME = 'household_power_consumption.txt'

In [3]:
# Reuse the running SparkContext if this cell is executed again; constructing a
# second SparkContext in the same kernel raises an error. Note that when a
# context already exists, getOrCreate returns it as-is and the conf below is
# not re-applied.
sc = pyspark.SparkContext.getOrCreate(
    conf=pyspark.SparkConf()
    .setAppName('hw1_code')
    .setMaster('spark://spark-master:7077')
)
# RDD of raw text lines, one element per line of the dataset file.
text_file = sc.textFile(os.path.join('../', 'dataset', FILE_NAME))

In [4]:
# Number of records to preview with take().
take_size = 3
# Zero-based positions of the fields to analyse in each ';'-separated row
# (Global_active_power, Global_reactive_power, Voltage, Global_intensity
# according to the header line of the file).
attr_idx = [2, 3, 4, 5]
# Broadcast the index list so the partition functions can read it via .value.
bc_attr_idx = sc.broadcast(attr_idx)
# Display the broadcast value together with the context's default parallelism.
bc_attr_idx.value, sc.defaultParallelism

([2, 3, 4, 5], 2)

In [5]:
# Preview the first `take_size` raw lines of the file (the header line comes first).
text_file.take(take_size)

['Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;Sub_metering_1;Sub_metering_2;Sub_metering_3',
 '16/12/2006;17:24:00;4.216;0.418;234.840;18.400;0.000;1.000;17.000',
 '16/12/2006;17:25:00;5.360;0.436;233.630;23.000;0.000;1.000;16.000']

In [6]:
def trans_type(row):
    """Convert ``row`` to double precision, or return ``None`` if it cannot be parsed.

    Args:
        row: Value or array of values to pass to ``np.double`` (in this
            notebook, an array of field strings taken from one text line).

    Returns:
        The result of ``np.double(row)``, or ``None`` when the conversion
        raises ``ValueError`` or ``TypeError`` (e.g. a non-numeric field).
    """
    try:
        return np.double(row)
    except (ValueError, TypeError):
        # Only conversion failures mark a row as unusable. Anything else
        # (KeyboardInterrupt, SystemExit, MemoryError, ...) must propagate
        # instead of being silently turned into a dropped row.
        return None

def get_attrs(row_gen):
    """Parse ';'-separated text lines into ``(min, max, count)`` seed tuples.

    For every line, the fields at the positions listed in
    ``bc_attr_idx.value`` are selected and converted with ``trans_type``.
    Lines that do not contain every selected position, or whose selected
    fields cannot be converted, are skipped.

    Args:
        row_gen: Iterable of text lines (the rows of one partition).

    Yields:
        ``(attrs, attrs, 1)``: the converted values twice (as the running
        minimum and the running maximum) and a count of 1.
    """
    for row in row_gen:
        fields = row.split(';')
        try:
            selected = np.take(fields, bc_attr_idx.value)
        except IndexError:
            # Truncated or blank line: np.take raises on an out-of-range
            # position. Skip the line like any other unparsable row instead
            # of failing the whole job.
            continue
        attrs = trans_type(selected)
        if attrs is not None:
            yield (attrs, attrs, 1)  # (current_min_value, current_max_value, count)

# Try the parser on a handful of records.
(
    text_file
    .repartition(sc.defaultParallelism)
    .mapPartitions(get_attrs)
    .take(take_size)
)

[(array([  3.662,   0.51 , 233.86 ,  15.8  ]),
  array([  3.662,   0.51 , 233.86 ,  15.8  ]),
  1),
 (array([  4.448,   0.498, 232.86 ,  19.6  ]),
  array([  4.448,   0.498, 232.86 ,  19.6  ]),
  1),
 (array([  5.412,   0.47 , 232.78 ,  23.2  ]),
  array([  5.412,   0.47 , 232.78 ,  23.2  ]),
  1)]

## (1) minimum, maximum, count

In [7]:
%%time
def task1_reduce(a, b):
    """Merge two ``(min, max, count)`` partial results into one.

    Args:
        a: Tuple ``(min_values, max_values, count)``.
        b: Tuple of the same layout as ``a``.

    Returns:
        Tuple of the element-wise minimum of both minima, the element-wise
        maximum of both maxima (NaN entries are ignored by ``np.nanmin`` /
        ``np.nanmax``), and the sum of both counts.
    """
    (lo_a, hi_a, n_a), (lo_b, hi_b, n_b) = a, b
    lowest = np.nanmin((lo_a, lo_b), axis=0)
    highest = np.nanmax((hi_a, hi_b), axis=0)
    total = n_a + n_b
    return lowest, highest, total

# Reduce all parsed rows to a single (min, max, count) tuple.
task1_result = (
    text_file
    .repartition(sc.defaultParallelism)
    .mapPartitions(get_attrs)
    .reduce(task1_reduce)
)

CPU times: user 28.9 ms, sys: 1.72 ms, total: 30.6 ms
Wall time: 44.3 s


In [8]:
# Print the attribute names, then the per-attribute minimum and maximum and the
# number of rows that were parsed successfully.
print('Global_active_power;Global_reactive_power;Voltage;Global_intensity'.split(';'))
print('min :', task1_result[0])
print('max :', task1_result[1])
print('count :', task1_result[2])

['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity']
min : [7.600e-02 0.000e+00 2.232e+02 2.000e-01]
max : [ 11.122   1.39  254.15   48.4  ]
count : 2049280


## (2) mean, standard deviation

In [9]:
%%time
def get_attrs_2(row_gen):
    """Parse ';'-separated text lines into ``(attrs, 1)`` pairs.

    Parsing is delegated to ``get_attrs`` so the field selection and
    conversion rules are defined in one place; any line ``get_attrs`` skips
    is skipped here as well.

    Args:
        row_gen: Iterable of text lines (the rows of one partition).

    Yields:
        ``(attrs, 1)``: the converted attribute values and a count of 1.
    """
    # get_attrs yields (min, max, count) with min and max being the same
    # array for a single row, so the first element is the row's values.
    for attrs, _, count in get_attrs(row_gen):
        yield (attrs, count)

def add_attrs(a, b):
    """Combine two ``(values, count)`` pairs by element-wise addition.

    Args:
        a: Tuple ``(values, count)``.
        b: Tuple of the same layout as ``a``.

    Returns:
        Tuple of the element-wise sum of both value arrays and the sum of
        both counts.
    """
    (values_a, n_a), (values_b, n_b) = a, b
    combined = np.add(values_a, values_b)
    return combined, n_a + n_b

# First pass: per-attribute sum and row count, reduced to one (sum, count) pair.
sum_count_pair = (
    text_file
    .repartition(sc.defaultParallelism)
    .mapPartitions(get_attrs_2)
    .reduce(add_attrs)
)
mean = sum_count_pair[0] / sum_count_pair[1]
# Broadcast the mean so the second pass can read it inside partition functions.
bc_mean = sc.broadcast(mean)

CPU times: user 19.5 ms, sys: 3.29 ms, total: 22.8 ms
Wall time: 18.6 s


In [10]:
%%time

def minus_mean(row_gen):
    """Yield the squared deviation from the broadcast mean for every row.

    Args:
        row_gen: Iterable of ``(attrs, count)`` pairs; only ``attrs`` is used.

    Yields:
        ``(attrs - bc_mean.value) ** 2`` computed element-wise. The incoming
        ``attrs`` array is left unmodified.
    """
    # Read the broadcast once instead of once per row.
    mean_value = bc_mean.value
    for attrs, _count in row_gen:
        # Build a new array rather than using ``-=`` so the upstream record
        # is not mutated as a side effect.
        yield np.power(attrs - mean_value, 2)


# Second pass: sum of squared deviations from the mean, per attribute.
summed = (
    text_file
    .repartition(sc.defaultParallelism)
    .mapPartitions(get_attrs_2, preservesPartitioning=True)
    .mapPartitions(minus_mean, preservesPartitioning=True)
    .sum()
)
# Population std divides by N, sample std by N - 1.
population_std = np.sqrt(summed / sum_count_pair[1])
sample_std = np.sqrt(summed / (sum_count_pair[1] - 1))

CPU times: user 21.7 ms, sys: 11.1 ms, total: 32.8 ms
Wall time: 14.1 s


In [11]:
# Print the attribute names, then the per-attribute mean and both standard
# deviations (population: divided by N, sample: divided by N - 1).
print('Global_active_power;Global_reactive_power;Voltage;Global_intensity'.split(';'))
print('mean : ', mean)
print('std (population) : ', population_std)
print('std (sample) : ', sample_std)

['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity']
mean :  [1.09161504e+00 1.23714476e-01 2.40839858e+02 4.62775931e+00]
std (population) :  [1.0572939  0.11272195 3.23998589 4.44439518]
std (sample) :  [1.05729416 0.11272198 3.23998668 4.44439626]


## (3) min-max normalization

In [12]:
# Broadcast the per-attribute minimum and maximum from task 1 so that
# min_max_norm can read them via .value.
bc_min = sc.broadcast(task1_result[0])
bc_max = sc.broadcast(task1_result[1])

def min_max_norm(row_gen):
    """Yield the min-max normalized attribute values for every row.

    Each row is scaled as ``(attrs - min) / (max - min)`` using the broadcast
    ``bc_min`` and ``bc_max`` values.

    Args:
        row_gen: Iterable of ``(attrs, count)`` pairs; only ``attrs`` is used.

    Yields:
        The normalized attribute array for each row.
    """
    # Both values are the same for every row, so compute them once per
    # partition instead of once per row.
    lower = bc_min.value
    value_range = bc_max.value - lower
    # NOTE: an attribute whose max equals its min has a zero range, so the
    # division below is a division by zero for that attribute.
    for attrs, _count in row_gen:
        yield (attrs - lower) / value_range
print('Global_active_power;Global_reactive_power;Voltage;Global_intensity'.split(';'))
# Show the first 20 normalized rows.
(
    text_file
    .mapPartitions(get_attrs_2, preservesPartitioning=True)
    .mapPartitions(min_max_norm, preservesPartitioning=True)
    .take(20)
)

['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity']


[array([0.37479631, 0.30071942, 0.37609047, 0.37759336]),
 array([0.47836321, 0.31366906, 0.33699515, 0.47302905]),
 array([0.47963064, 0.35827338, 0.32600969, 0.47302905]),
 array([0.48089806, 0.36115108, 0.34054927, 0.47302905]),
 array([0.32500453, 0.37985612, 0.40323102, 0.32365145]),
 array([0.31178707, 0.37553957, 0.3819063 , 0.30705394]),
 array([0.32826362, 0.37410072, 0.38416801, 0.32365145]),
 array([0.32808256, 0.37410072, 0.38836834, 0.32365145]),
 array([0.32518559, 0.36690647, 0.34862682, 0.32365145]),
 array([0.3246424 , 0.36690647, 0.34442649, 0.32365145]),
 array([0.39579938, 0.35827338, 0.31211632, 0.40248963]),
 array([0.48307079, 0.3381295 , 0.3095315 , 0.47717842]),
 array([0.46605106, 0.34388489, 0.31631664, 0.46058091]),
 array([0.4700344 , 0.28633094, 0.31373183, 0.46473029]),
 array([0.36013036, 0.30359712, 0.38901454, 0.36099585]),
 array([0.29947492, 0.2028777 , 0.45040388, 0.29045643]),
 array([0.28915445, 0.10935252, 0.4371567 , 0.28215768]),
 array([0.3036

In [13]:
# Summary of every statistic computed above, one line per statistic, with the
# values ordered like the attribute names printed first.
print('Global_active_power;Global_reactive_power;Voltage;Global_intensity'.split(';'))
print('min :', task1_result[0])
print('max :', task1_result[1])
print('mean : ', mean)
print('std (population) : ', population_std)
print('std (sample) : ', sample_std)
print('count :', task1_result[2])

['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity']
min : [7.600e-02 0.000e+00 2.232e+02 2.000e-01]
max : [ 11.122   1.39  254.15   48.4  ]
mean :  [1.09161504e+00 1.23714476e-01 2.40839858e+02 4.62775931e+00]
std (population) :  [1.0572939  0.11272195 3.23998589 4.44439518]
std (sample) :  [1.05729416 0.11272198 3.23998668 4.44439626]
count : 2049280
