### *.Preparar o ambiente

In [2]:
# import libraries
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from operator import add

In [3]:
# Spark configurations
conf = (SparkConf()
        .setMaster("local")
        .setAppName("nasa")
        .set("spark.executor.memory", "4g"))

In [4]:
# initialize SparkContext
sc = SparkContext(conf=conf)

### *. Carregar arquivos

In [5]:
# load file access_log_Jul95
jul = sc.textFile("access_log_Jul95")
jul = jul.cache()

In [6]:
# load file access_log_Aug95
aug= sc.textFile("access_log_Aug95")
aug = aug.cache()

### 1. Número de hosts únicos

In [8]:
def count_unique_host(rdd, period):
    try:
        hosts = rdd.flatMap(lambda i: i.split(" ")[0]).distinct().count()  
    except ValueError as e:
        print(e)
    return print("Number of unique hosts on {}: {}.".format(period, hosts))

In [9]:
count_unique_host(jul, "july")

Number of unique hosts on july: 55.


In [10]:
count_unique_host(aug, "august")

Number of unique hosts on august: 53.


### 2. O total de erros 404

In [11]:
def count_error_404(rdd, period):
    try:
        def url_404(line):
            try:
                url = line.split(" ")[-2]
                if url == "404":
                    return True
            except:
                pass
            return False
        count = rdd.filter(url_404).cache()
        count_404 = count.count()
    except ValueError as e:
        print(e)
    return print("Total number of 404 erros on {}: {}.".format(period, count_404))

In [12]:
count_error_404(jul, "july")

Total number of 404 erros on july: 10845.


In [13]:
count_error_404(aug, "august")

Total number of 404 erros on august: 10056.


### 3. Os 5 URLs que mais causaram erro 404

In [14]:
def url_404(rdd):
    try:
        url = rdd.split(" ")[-2]
        if url == "404":
            return True
    except ValueError as e:
        print(e)
    return False

In [15]:
jul_404 = jul.filter(url_404).cache()

In [16]:
aug_404 = aug.filter(url_404).cache()

In [17]:
def top5_url_error_404(rdd_url_404):
    try:
        url_404 = rdd_url_404.map(lambda line: line.split('"')[1].split(' ')[1])
        count_404 = url_404.map(lambda i: (i, 1)).reduceByKey(add)
        top_5 = count_404.sortBy(lambda i: -i[1]).take(5)
        print("Top 5 url with most frequent 404 errors:")
        for url_404, count_404 in top_5:
            print("{}: {}".format(url_404, count_404))
        return top_5
    except ValueError as e:
        print(e)
    return False

In [18]:
top5_url_error_404(jul_404)

Top 5 url with most frequent 404 errors:
/pub/winvn/readme.txt: 667
/pub/winvn/release.txt: 547
/history/apollo/apollo-13.html: 286
/shuttle/resources/orbiters/atlantis.gif: 232
/history/apollo/a-001/a-001-patch-small.gif: 230


[('/pub/winvn/readme.txt', 667),
 ('/pub/winvn/release.txt', 547),
 ('/history/apollo/apollo-13.html', 286),
 ('/shuttle/resources/orbiters/atlantis.gif', 232),
 ('/history/apollo/a-001/a-001-patch-small.gif', 230)]

In [19]:
top5_url_error_404(aug_404)

Top 5 url with most frequent 404 errors:
/pub/winvn/readme.txt: 1337
/pub/winvn/release.txt: 1185
/shuttle/missions/STS-69/mission-STS-69.html: 683
/images/nasa-logo.gif: 319
/shuttle/missions/sts-68/ksc-upclose.gif: 253


[('/pub/winvn/readme.txt', 1337),
 ('/pub/winvn/release.txt', 1185),
 ('/shuttle/missions/STS-69/mission-STS-69.html', 683),
 ('/images/nasa-logo.gif', 319),
 ('/shuttle/missions/sts-68/ksc-upclose.gif', 253)]

### 4. Quantidade de erros 404 por dia

In [20]:
def error_404_daily(rdd_url_404, period):
    try:
        days = rdd_url_404.map(lambda i: i.split("[")[1].split(":")[0])
        counts = days.map(lambda day: (day, 1)).reduceByKey(add).collect()
        print("Number of errors 404 per day on {}:".format(period))
        for day, count in counts:
            print("{}: {}".format(day, count))
        return counts
    except ValueError as e:
        print(e)
    return False       

In [21]:
error_404_daily(jul_404, "july")

Number of errors 404 per day on july:
13/Jul/1995: 532
21/Jul/1995: 334
25/Jul/1995: 461
09/Jul/1995: 348
15/Jul/1995: 254
16/Jul/1995: 257
18/Jul/1995: 465
17/Jul/1995: 406
07/Jul/1995: 570
12/Jul/1995: 471
19/Jul/1995: 639
22/Jul/1995: 192
23/Jul/1995: 233
03/Jul/1995: 474
05/Jul/1995: 497
10/Jul/1995: 398
14/Jul/1995: 413
01/Jul/1995: 316
02/Jul/1995: 291
04/Jul/1995: 359
06/Jul/1995: 640
08/Jul/1995: 302
11/Jul/1995: 471
20/Jul/1995: 428
24/Jul/1995: 328
26/Jul/1995: 336
27/Jul/1995: 336
28/Jul/1995: 94


[('13/Jul/1995', 532),
 ('21/Jul/1995', 334),
 ('25/Jul/1995', 461),
 ('09/Jul/1995', 348),
 ('15/Jul/1995', 254),
 ('16/Jul/1995', 257),
 ('18/Jul/1995', 465),
 ('17/Jul/1995', 406),
 ('07/Jul/1995', 570),
 ('12/Jul/1995', 471),
 ('19/Jul/1995', 639),
 ('22/Jul/1995', 192),
 ('23/Jul/1995', 233),
 ('03/Jul/1995', 474),
 ('05/Jul/1995', 497),
 ('10/Jul/1995', 398),
 ('14/Jul/1995', 413),
 ('01/Jul/1995', 316),
 ('02/Jul/1995', 291),
 ('04/Jul/1995', 359),
 ('06/Jul/1995', 640),
 ('08/Jul/1995', 302),
 ('11/Jul/1995', 471),
 ('20/Jul/1995', 428),
 ('24/Jul/1995', 328),
 ('26/Jul/1995', 336),
 ('27/Jul/1995', 336),
 ('28/Jul/1995', 94)]

In [22]:
error_404_daily(aug_404, "august")

Number of errors 404 per day on august:
01/Aug/1995: 243
07/Aug/1995: 537
09/Aug/1995: 279
10/Aug/1995: 315
21/Aug/1995: 305
27/Aug/1995: 370
30/Aug/1995: 571
03/Aug/1995: 304
06/Aug/1995: 373
08/Aug/1995: 391
16/Aug/1995: 259
20/Aug/1995: 312
05/Aug/1995: 236
11/Aug/1995: 263
12/Aug/1995: 196
13/Aug/1995: 216
15/Aug/1995: 327
17/Aug/1995: 271
22/Aug/1995: 288
23/Aug/1995: 345
24/Aug/1995: 420
26/Aug/1995: 366
28/Aug/1995: 410
14/Aug/1995: 287
18/Aug/1995: 256
19/Aug/1995: 209
31/Aug/1995: 526
04/Aug/1995: 346
25/Aug/1995: 415
29/Aug/1995: 420


[('01/Aug/1995', 243),
 ('07/Aug/1995', 537),
 ('09/Aug/1995', 279),
 ('10/Aug/1995', 315),
 ('21/Aug/1995', 305),
 ('27/Aug/1995', 370),
 ('30/Aug/1995', 571),
 ('03/Aug/1995', 304),
 ('06/Aug/1995', 373),
 ('08/Aug/1995', 391),
 ('16/Aug/1995', 259),
 ('20/Aug/1995', 312),
 ('05/Aug/1995', 236),
 ('11/Aug/1995', 263),
 ('12/Aug/1995', 196),
 ('13/Aug/1995', 216),
 ('15/Aug/1995', 327),
 ('17/Aug/1995', 271),
 ('22/Aug/1995', 288),
 ('23/Aug/1995', 345),
 ('24/Aug/1995', 420),
 ('26/Aug/1995', 366),
 ('28/Aug/1995', 410),
 ('14/Aug/1995', 287),
 ('18/Aug/1995', 256),
 ('19/Aug/1995', 209),
 ('31/Aug/1995', 526),
 ('04/Aug/1995', 346),
 ('25/Aug/1995', 415),
 ('29/Aug/1995', 420)]

### 5. O total de bytes retornados

In [23]:
def size_file_bytes(rdd, period):
    try:
        def count_byte(line):
            try:
                count = int(line.split(" ")[-1])
                if count < 0:
                    raise ValueError()
                return count
            except:
                return 0
        count = rdd.map(count_byte).reduce(lambda x, y: x + y)
        return print("Number of bytes of {} file: {}".format(period, count))
    except ValueError as e:
        print(e)
    return ""

In [24]:
size_file_bytes(jul, "july")

Number of bytes of july file: 38695973491


In [25]:
size_file_bytes(aug, "august")

Number of bytes of august file: 26828341424


### *.Fim

In [26]:
sc.stop()