In [1]:
from pyspark.sql import SparkSession
# Build the SparkSession for this notebook, or reuse one that is already
# running; the last line displays the session object as the cell output.
spark = (
    SparkSession.builder
    .appName("Process RDD Logs")
    .getOrCreate()
)
spark

In [2]:
import os
# Number of logical CPUs the OS reports for the machine running this kernel.
os.cpu_count()

32

# Apache Access Logs

In [3]:
# Read every file matching access_log* under the 2004 httpd log directory as
# an RDD of raw text lines (asking for a minimum of 1 partition), then count
# the lines.
rdd_2004 = spark.sparkContext.textFile(
    "data/linux_2004/log/httpd/access_log*",
    minPartitions=1,
)
rdd_2004.count()

38487

In [4]:
# Read every file matching access_log* under the 2005 httpd log directory as
# an RDD of raw text lines (asking for a minimum of 1 partition), then count
# the lines.
rdd_2005 = spark.sparkContext.textFile(
    "data/linux_2005/log/httpd/access_log*",
    minPartitions=1,
)
rdd_2005.count()

5936

In [5]:
# Read every file matching access_log* under the 2006 httpd log directory as
# an RDD of raw text lines (asking for a minimum of 1 partition), then count
# the lines.
rdd_2006 = spark.sparkContext.textFile(
    "data/linux_2006/log/httpd/access_log*",
    minPartitions=1,
)
rdd_2006.count()

36310

In [6]:
# Read the large Apache access log as an RDD of raw text lines; no
# minPartitions hint is given here, so Spark picks the partitioning itself.
rdd_access = spark.sparkContext.textFile(
    "data/apache/access.log"
)
rdd_access.count()

6016792

## Combine into a single RDD

In [7]:
# Concatenate the four line-RDDs into one RDD. union() keeps duplicates, so
# `total` is simply the sum of the individual line counts.
rdd = spark.sparkContext.union([
    rdd_access,
    rdd_2004,
    rdd_2005,
    rdd_2006,
])
total = rdd.count()
total

6097525

In [8]:
# How many partitions the combined RDD is split into.
rdd.getNumPartitions()

108

# EDA

In [9]:
# Drop exact duplicate log lines and count what is left; comparing this with
# `total` shows how many lines are repeated verbatim.
rdd_distinct = rdd.distinct()
total_distinct = rdd_distinct.count()
total_distinct

4256761

In [10]:
# Draw roughly ten lines for eyeballing. `fraction` is the probability that
# each individual line is kept (sampling without replacement), so the sample
# size is only *about* 10, not exactly 10.
# The fixed seed makes the same lines come back on every Restart & Run All;
# without it Spark picks a fresh random seed each time the cell runs.
sample_rdd = rdd.sample(False, fraction=10/total, seed=42)
print(sample_rdd.count())

10


In [11]:
# Pull the sampled lines back to the driver as a plain Python list.
sample = sample_rdd.collect()

## Using regular expressions to extract the log fields as columns

In [12]:
import re
from pyspark.sql import functions as F, types as T

In [13]:
# Column names, one per capture group of `pattern`, in capture order.
log_fields = (
    'IP',
    'UserIdentity',
    'Username',
    'Timestamp',
    'Request',
    'StatusCode',
    'Size(bytes)',
    'Referrer',
    'UserAgent',
    'Unknown',   # optional trailing quoted field; None when absent
)

# Regex applied from the start of each log line. Ten groups: three
# space-separated tokens, the bracketed timestamp, the quoted request, two
# all-digit fields, two lazily matched quoted fields and one optional quoted
# field at the end.
pattern = r'(.+) (.+) (.+) \[(.+)\] "(.+)" (\d+) (\d+) "(.+?)" "(.+?)" ?(".+")?'

#### Notes on the regular expression
<pre>
(.+?)   - non-greedy (lazy) group: captures as few characters as possible
(".+")? - optional group: the trailing quoted field may be absent
</pre>

In [14]:
def extract(string):
    """Parse one raw access-log line with the module-level ``pattern``.

    Args:
        string: a single log line.

    Returns:
        The tuple of captured groups when the line matches from its start
        (an absent optional trailing group is ``None``); otherwise the
        empty string ``''``.
    """
    hit = re.match(pattern, string)
    if hit is None:
        # NOTE(review): this fallback was written as ('')*len(log_fields),
        # which is a parenthesised *string* repeated -- i.e. '' -- not a
        # tuple of blanks. It is kept as '' because the downstream filter
        # drops exactly this empty value; switching to
        # ('',) * len(log_fields) has to be done together with that filter.
        return ''
    return hit.groups()

## Using map transformation in RDD

In [17]:
%%time
# Apply extract() to every line. map() is lazy, so the count() below is what
# actually triggers parsing of the whole data set (hence the wall time).
matched_rdd = rdd.map(extract)
matched_rdd.count()

CPU times: user 14 ms, sys: 4.93 ms, total: 18.9 ms
Wall time: 1min 18s


6097525

In [18]:
%%time
# Keep only the lines that extract() managed to parse.
# extract() signals "no match" with an empty value, for which any(row) is
# False; a parsed row always has a non-empty first field (the leading (.+)
# group cannot match nothing), so any(row) is True for it.
# Test by value, not identity: an `is not` check against a computed string
# only works because CPython happens to share one empty-string object.
filtered_rdd = matched_rdd.filter(lambda row: any(row))
filtered_rdd.count()

CPU times: user 16.9 ms, sys: 5.15 ms, total: 22 ms
Wall time: 1min 52s


6084776