In [1]:
# Parse an Apache access log. Assumes Python 3
import re
from pyspark.sql import Row
from datetime import datetime

# Regex for one Apache access log line that carries referrer and user-agent
# fields. Capture groups, in order:
#   1 client IP, 2 identd, 3 user id, 4 timestamp (with UTC offset),
#   5 HTTP method, 6 endpoint, 7 protocol, 8 status code, 9 response size,
#   10 referrer, 11 user agent.
# Written as a raw string so that \S, \d, \w, \[ ... reach the regex engine
# verbatim instead of being processed as (invalid) string escape sequences.
# NOTE: group 9 requires digits, so a line whose size field is "-" will not match.
# NOTE(review): groups 10 and 11 use the typographic quote ” (U+201D), so the
# alternation `[^”]|”` matches any character at all. This looks like a
# smart-quote substitution of an ASCII-quote pattern — confirm the intent
# before tightening it.
APACHE_ACCESS_LOG_PATTERN = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+) "((?:[^”]|”)+)" "((?:[^”]|”)+)"$'
# strptime format for capture group 4, e.g. "08/Aug/2016:13:55:36 -0700".
DATETIME_PARSE_PATTERN = '%d/%b/%Y:%H:%M:%S %z'

# Returns a Row containing the Apache Access Log info
def parse_apache_log_line(logline):
    """Parse one Apache access log line into a Row.

    Args:
        logline: a single line of the access log, as a string.

    Returns:
        A Row with the fields ipAddress, clientIdentd, userId, dateTime,
        timestamp, month, method, endpoint, protocol, referrer, userAgent,
        responseCode and contentSize; or None when the line does not match
        APACHE_ACCESS_LOG_PATTERN or its timestamp cannot be parsed with
        DATETIME_PARSE_PATTERN.
    """
    match = re.search(APACHE_ACCESS_LOG_PATTERN, logline)
    if match is None:
        return None
    try:
        date_obj = datetime.strptime(match.group(4), DATETIME_PARSE_PATTERN)
    except ValueError:
        # The regex only checks the *shape* of the timestamp, so text such as
        # "99/Foo/2016:..." still matches. Treat it like any other unparseable
        # line (None) instead of letting one bad record raise out of the parser.
        return None
    return Row(
        ipAddress    = match.group(1),
        clientIdentd = match.group(2),
        userId       = match.group(3),
        # Raw timestamp text exactly as it appeared in the log.
        dateTime     = match.group(4),
        # POSIX seconds (float) of the offset-aware timestamp.
        timestamp    = date_obj.timestamp(),
        # "YYYY-MM" taken in the log line's own UTC offset (no conversion).
        month        = date_obj.strftime('%Y-%m'),
        method       = match.group(5),
        endpoint     = match.group(6),
        protocol     = match.group(7),
        referrer     = match.group(10),
        userAgent    = match.group(11),
        responseCode = int(match.group(8)),
        contentSize  = int(match.group(9)))

In [2]:
# Read the raw access log from HDFS; each element is one line of text.
# `sc` is not defined in this notebook — presumably the SparkContext supplied
# by the PySpark kernel; verify before running elsewhere.
access_logs_raw = sc.textFile(
    "hdfs://master:9000/user/michael/data/diybigdata.20160808.log"
)

In [3]:
# Parse every raw line, then drop the ones the parser rejected
# (parse_apache_log_line returns None for those).
access_logs = (
    access_logs_raw
    .map(parse_apache_log_line)
    .filter(lambda parsed: parsed is not None)
)

In [4]:
# Convert the RDD of parsed Rows into a DataFrame.
# NOTE(review): "accoss" is a misspelling of "access". The later cells refer to
# this variable with the same spelling, so a rename has to cover all of them.
accoss_logs_df = access_logs.toDF()

In [5]:
# Show the DataFrame's column names and types as a sanity check.
accoss_logs_df.printSchema()

root
 |-- clientIdentd: string (nullable = true)
 |-- contentSize: long (nullable = true)
 |-- dateTime: string (nullable = true)
 |-- endpoint: string (nullable = true)
 |-- ipAddress: string (nullable = true)
 |-- method: string (nullable = true)
 |-- month: string (nullable = true)
 |-- protocol: string (nullable = true)
 |-- referrer: string (nullable = true)
 |-- responseCode: long (nullable = true)
 |-- timestamp: double (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [6]:
# Persist the parsed log as Parquet, partitioned by the "month" column,
# replacing any output already present at the destination path.
(
    accoss_logs_df.write
    .partitionBy("month")
    .mode('overwrite')
    .parquet("hdfs://master:9000/user/michael/data/diybigdata.20160808.parquet")
)