# Q1 - Daily and monthly statistics
For each sensor, compute the minimum, average and maximum values of the two sensor metrics. Produce results for each day.

* MapReduce (mrjob)
* Spark Core

In [1]:
# @title Mount Google Drive (Optional)
from google.colab import drive

# Make Google Drive available under /content/drive; the next cell copies the
# input CSV from /content/drive/MyDrive.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# MapReduce

In [2]:
# Copy the sds011 readings for 2020-01-01 from Drive into the working directory as day_1.csv.
!cp '/content/drive/MyDrive/2020-01/sds011-2020-01-01.csv' day_1.csv
# Install mrjob, which sensor_stats.py imports.
!pip install mrjob --quiet
# Download the mrjob.conf published under lab2 of the smduarte/spbd-2223 repository to /etc/mrjob.conf.
!wget -q -O /etc/mrjob.conf https://raw.githubusercontent.com/smduarte/spbd-2223/main/lab2/mrjob.conf

[?25l[K     |▊                               | 10 kB 26.9 MB/s eta 0:00:01[K     |█▌                              | 20 kB 30.4 MB/s eta 0:00:01[K     |██▎                             | 30 kB 36.9 MB/s eta 0:00:01[K     |███                             | 40 kB 18.5 MB/s eta 0:00:01[K     |███▊                            | 51 kB 18.9 MB/s eta 0:00:01[K     |████▌                           | 61 kB 21.5 MB/s eta 0:00:01[K     |█████▏                          | 71 kB 12.1 MB/s eta 0:00:01[K     |██████                          | 81 kB 12.9 MB/s eta 0:00:01[K     |██████▊                         | 92 kB 14.2 MB/s eta 0:00:01[K     |███████▌                        | 102 kB 15.4 MB/s eta 0:00:01[K     |████████▏                       | 112 kB 15.4 MB/s eta 0:00:01[K     |█████████                       | 122 kB 15.4 MB/s eta 0:00:01[K     |█████████▊                      | 133 kB 15.4 MB/s eta 0:00:01[K     |██████████▍                     | 143 kB 15.4 MB/s eta 0:

In [3]:
# Sanity check on the copied input: print its line, word and byte counts.
!wc day_1.csv

  3290317   3290317 194835673 day_1.csv


In [5]:
%%file sensor_stats.py

from statistics import * 
from mrjob.job import MRJob, MRStep
import numpy as np

class MRSensorStats(MRJob):
  """Per-sensor minimum, mean and maximum of the two metric columns.

  Input lines are ';'-separated; field 0 is used as the sensor key and
  fields 5 and 6 are parsed as the two metrics. The job outputs, for every
  sensor: (min m1, min m2, mean m1, mean m2, max m1, max m2).

  Readings travel between the steps as mergeable partial aggregates
  (min m1, min m2, sum m1, sum m2, count, max m1, max m2). That lets a
  combiner shrink the mapper output before the shuffle and lets the reducer
  work in a single streaming pass instead of buffering every reading of a
  sensor in memory.

  NOTE(review): the key is the sensor only. The notebook feeds the job a
  single day's file, so this gives one result per sensor for that day; a
  multi-day input would presumably need the date in the key as well.
  """

  def mapper(self, _, line):
    """Emit (sensor, partial aggregate) for one input line.

    Lines with fewer than 7 fields, or whose metric fields are not numeric
    (e.g. a header row or an empty reading), are skipped instead of
    aborting the whole job.
    """
    fields = line.strip().split(';')
    if len(fields) < 7:
      return
    try:
      metric_1 = float(fields[5])
      metric_2 = float(fields[6])
    except ValueError:
      return
    # A single reading is its own min, sum and max, with a count of 1.
    yield fields[0], (metric_1, metric_2, metric_1, metric_2, 1, metric_1, metric_2)

  @staticmethod
  def _merge_partials(partials):
    """Fold an iterable of partial aggregates into one partial aggregate."""
    partials = iter(partials)
    # mrjob only calls combiner/reducer for keys that have at least one value,
    # so the first next() always succeeds.
    min_1, min_2, sum_1, sum_2, count, max_1, max_2 = next(partials)
    for part in partials:
      min_1 = min(min_1, part[0])
      min_2 = min(min_2, part[1])
      sum_1 += part[2]
      sum_2 += part[3]
      count += part[4]
      max_1 = max(max_1, part[5])
      max_2 = max(max_2, part[6])
    return (min_1, min_2, sum_1, sum_2, count, max_1, max_2)

  def combiner(self, sensor, partials):
    """Pre-aggregate one sensor's readings on the mapper side."""
    # Output has the same shape as the mapper's, so it stays correct no matter
    # how many times (including zero) the combiner is applied.
    yield sensor, self._merge_partials(partials)

  def reducer(self, sensor, partials):
    """Emit (min m1, min m2, mean m1, mean m2, max m1, max m2) for a sensor."""
    min_1, min_2, sum_1, sum_2, count, max_1, max_2 = self._merge_partials(partials)
    # count >= 1 here because every partial carries at least one reading.
    yield sensor, (min_1, min_2, sum_1 / count, sum_2 / count, max_1, max_2)
  
      
if __name__ == '__main__':
    # Entry point used when the file is executed as a script/module
    # (the notebook runs it with `python -m sensor_stats ...`).
    MRSensorStats.run()

Overwriting sensor_stats.py


In [6]:
%%shell
# Start from a clean output directory so part files of a previous run are not mixed in.
rm -rf results
# Run the job on day_1.csv, writing into ./results; the first lines of each
# part file are shown only if the job exits successfully (&&).
python -m sensor_stats --output-dir results --cleanup NONE day_1.csv && head results/*

Using configs in /etc/mrjob.conf
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory /tmp/sensor_stats.root.20221123.164228.815732
job output is in results
==> results/part-00000 <==
"1000"	[9.47, 8.17, 27.262298850574716, 21.99685344827586, 341.53, 275.53]
"10009"	[16.57, 11.37, 36.141113013698636, 21.538407534246577, 77.23, 45.33]
"10011"	[22.83, 13.1, 66.72313253012048, 29.71526678141136, 325.23, 100.83]
"10017"	[41.0, 25.0, 88.66052132701424, 37.248056872037914, 260.7, 77.73]
"10029"	[0.3, 0.3, 1.6616637168141593, 1.3946194690265485, 15.37, 7.83]
"10035"	[7.3, 4.87, 48.52620689655173, 38.71199233716475, 501.25, 447.08]
"10039"	[15.33, 10.23, 57.28409090909091, 34.483181818181826, 167.2, 92.33]
"1004"	[8.83, 7.4, 119.68597955706984, 72.39698466780239, 637.47, 455.2]
"10041"	[44.63, 20.67, 81.0609785202864, 34.961455847255365, 147.33, 57.27]
"10056"	[2.1, 1.8, 19.31492385786802, 12.663489847715736, 191.23, 124.13]

==> results/part-00001 <==
"18503"	



# Q1 Spark Core

In [7]:
# @title Install Pyspark
# Install PySpark, which the Spark Core cell imports.
!pip install --quiet pyspark

[K     |████████████████████████████████| 281.4 MB 40 kB/s 
[K     |████████████████████████████████| 199 kB 52.1 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [22]:
import pyspark
from operator import *
import numpy as np

def to_partial(values):
    """Turn one split CSV row into (sensor, partial aggregate).

    The partial aggregate is
    (min m1, min m2, sum m1, sum m2, count, max m1, max m2); for a single
    reading every min/sum/max is the reading itself and the count is 1.
    """
    metric_1, metric_2 = float(values[5]), float(values[6])
    return values[0], (metric_1, metric_2, metric_1, metric_2, 1, metric_1, metric_2)


def merge_partials(a, b):
    """Combine two partial aggregates of the same sensor into one."""
    return (
        min(a[0], b[0]),
        min(a[1], b[1]),
        a[2] + b[2],
        a[3] + b[3],
        a[4] + b[4],
        max(a[5], b[5]),
        max(a[6], b[6]),
    )


def to_summary(kv):
    """Turn (sensor, partial aggregate) into the final record.

    Result: (sensor, (min m1, min m2, mean m1, mean m2, max m1, max m2)),
    where each mean is the running sum divided by the reading count.
    """
    sensor, (min_1, min_2, sum_1, sum_2, count, max_1, max_2) = kv
    return sensor, (min_1, min_2, sum_1 / count, sum_2 / count, max_1, max_2)


sc = pyspark.SparkContext("local[*]")
try:
    # Strip each line so trailing newlines/whitespace do not end up in the last field.
    lines = sc.textFile("day_1.csv").map(lambda line: line.strip())

    sensors = (
        lines.map(lambda line: line.split(";"))
        # Keep only rows with exactly the 7 expected fields.
        .filter(lambda values: len(values) == 7)
        # (sensor, partial aggregate) for every reading.
        .map(to_partial)
        # One merged partial aggregate per sensor.
        .reduceByKey(merge_partials)
        # Replace the sums and count by the two means.
        .map(to_summary)
        # Keys are strings, so this ordering is lexicographic ('10009' < '1004').
        .sortByKey()
    )

    for sensor in sensors.take(100):
        print(sensor)
except Exception as e:
    # Report the failure without raising, as the cell did before.
    print(e)
finally:
    # Always release the context, including on KeyboardInterrupt, which is not
    # an Exception subclass; a leaked context makes the next
    # SparkContext(...) call in this kernel fail.
    sc.stop()

('1000', (9.47, 8.17, 27.26229885057471, 21.996853448275864, 341.53, 275.53))
('10009', (16.57, 11.37, 36.14111301369863, 21.538407534246577, 77.23, 45.33))
('10011', (22.83, 13.1, 66.72313253012048, 29.71526678141136, 325.23, 100.83))
('10017', (41.0, 25.0, 88.66052132701424, 37.24805687203791, 260.7, 77.73))
('10029', (0.3, 0.3, 1.6616637168141593, 1.3946194690265483, 15.37, 7.83))
('10035', (7.3, 4.87, 48.52620689655173, 38.71199233716476, 501.25, 447.08))
('10039', (15.33, 10.23, 57.28409090909092, 34.48318181818181, 167.2, 92.33))
('1004', (8.83, 7.4, 119.68597955706981, 72.39698466780241, 637.47, 455.2))
('10041', (44.63, 20.67, 81.0609785202864, 34.961455847255365, 147.33, 57.27))
('10056', (2.1, 1.8, 19.314923857868017, 12.663489847715734, 191.23, 124.13))
('10059', (23.63, 15.67, 58.429828326180264, 31.333369098712446, 206.17, 113.4))
('10063', (0.1, 0.1, 0.305, 0.2730081300813008, 3.2, 1.2))
('10069', (31.43, 17.1, 103.03190053285964, 47.94619893428063, 720.35, 502.02))
('100