In [1]:
from pyspark.sql import SparkSession
# Build (or reuse, via getOrCreate) the Spark session used by this notebook.
# It points at the master at spark://spark-master:7077 and enables dynamic
# allocation with shuffle tracking.
spark = (
    SparkSession.builder
    .appName("access-logs")
    .master("spark://spark-master:7077")
    # Per-executor resources.
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", 1)
    # Dynamic allocation: start with 2 executors, allow between 0 and 6.
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true")
    .config("spark.dynamicAllocation.initialExecutors", 2)
    .config("spark.dynamicAllocation.minExecutors", 0)
    .config("spark.dynamicAllocation.maxExecutors", 6)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/19 16:28:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql.functions import regexp_extract, col, to_timestamp
from pyspark.sql.types import LongType, TimestampType

import re
import os
import time
from tqdm import tqdm

In [3]:
# Location of the raw access log read by this notebook.
log_file_path = "/opt/workspace/datasets/web-logs/access.log"

# Output column names, in the same order as the capture groups of
# `combined_regex` (group 1 -> 'client', ..., group 9 -> 'user_agent').
columns = [
    'client',
    'userid',
    'datetime',
    'method',
    'request',
    'status',
    'size',
    'referer',
    'user_agent',
]

# One capture group per entry in `columns`. The second whitespace-delimited
# field of the line is matched but not captured, the request path is optional,
# and the size may be a number or a literal "-".
combined_regex = r'^(\S+) \S+ (\S+) \[([^\]]+)\] "([A-Z]+) ([^ "]+)? HTTP/[0-9.]+" ([0-9]{3}) ([0-9]+|-) "([^"]*)" "([^"]*)'

In [4]:
# Load the access log with the text data source; the extraction step below
# reads each line from the resulting `value` column.
web_logs = spark.read.format("text").load(log_file_path)

In [21]:
# Apply `combined_regex` to the raw `value` column once per output column:
# capture group i (1-based) becomes the column named columns[i - 1].
# The group index is derived from the position in `columns`, replacing nine
# hand-numbered calls so names and group numbers cannot drift apart.
# regexp_extract yields '' when the regex does not match, so non-matching log
# lines come out as rows of empty strings.
web_logs_raw = web_logs.select(
    *[
        regexp_extract("value", combined_regex, group_idx).alias(name)
        for group_idx, name in enumerate(columns, start=1)
    ]
)

In [20]:
# Preview the first 5 extracted rows without truncating long cell values.
web_logs_raw.show(n=5, truncate=False)

+-------------+------+--------------------------+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----+-----------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
|client       |userid|datetime                  |method|request                                                                                                                                                               |status|size |referer                            |user_agent                                                                                                                                    |
+-------------+------+--------------------------+------+----------------------------------------------------------------------------------------------------------------

In [7]:
# Total number of rows in web_logs_raw.
web_logs_raw.count()

                                                                                

10365152

In [22]:
# A row is treated as "empty" when every extracted column is the empty string.
first_column, *other_columns = columns
condition = col(first_column) == ''
for column in other_columns:
    condition = condition & (col(column) == '')

# Report how many rows are empty in every column.
num_empty_rows = web_logs_raw.filter(condition).count()
print("Num empty rows: ", num_empty_rows)

# Keep only the rows in which at least one column is non-empty.
web_logs_df = web_logs_raw.where(~condition)



Num empty rows:  287


                                                                                

In [8]:
# Predicate that is true when every extracted column is the empty string.
# NOTE(review): this repeats the predicate built in the combined filter cell
# earlier in the notebook; consider keeping only one of the two.
first_column, *other_columns = columns
condition = col(first_column) == ''
for column in other_columns:
    condition = condition & (col(column) == '')

In [9]:
# Number of rows in which every extracted column is empty (rows matching `condition`).
web_logs_raw.filter(condition).count()

                                                                                

287

In [10]:
# Drop the all-empty rows: keep rows for which `condition` is false.
web_logs_df = web_logs_raw.where(~condition)

In [11]:
# Number of rows remaining after the all-empty rows were filtered out.
web_logs_df.count()

                                                                                

10364865

In [12]:
# Show one raw `datetime` string for a known client to inspect its format.
# The filter on `client` runs before the projection, so it no longer refers to
# a column that the select has already dropped.
(
    web_logs_df
    .filter("client = '54.36.149.41'")
    .select("datetime")
    .limit(1)
    .show(truncate=False)
)

+--------------------------+
|datetime                  |
+--------------------------+
|22/Jan/2019:03:56:14 +0330|
+--------------------------+



In [13]:
# Inspect the column types before any casting is applied.
web_logs_df.printSchema()

root
 |-- client: string (nullable = true)
 |-- userid: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- method: string (nullable = true)
 |-- request: string (nullable = true)
 |-- status: string (nullable = true)
 |-- size: string (nullable = true)
 |-- referer: string (nullable = true)
 |-- user_agent: string (nullable = true)



In [14]:
# Convert the string columns produced by the regex into typed columns:
#   status, size -> long
#   datetime     -> timestamp, parsed with 'dd/MMM/yyyy:HH:mm:ss x'
#                   (input looks like "22/Jan/2019:03:56:14 +0330")
# NOTE(review): the regex allows size to be "-"; presumably the cast turns
# those values into null -- confirm this is the intended handling.
web_logs_df = (
    web_logs_df
    .withColumn("status", col("status").cast(LongType()))
    .withColumn("size", col("size").cast(LongType()))
    .withColumn(
        "datetime",
        to_timestamp(col("datetime"), 'dd/MMM/yyyy:HH:mm:ss x'),
    )
)

In [15]:
# Confirm the casts: status and size should now be long, datetime a timestamp.
web_logs_df.printSchema()

root
 |-- client: string (nullable = true)
 |-- userid: string (nullable = true)
 |-- datetime: timestamp (nullable = true)
 |-- method: string (nullable = true)
 |-- request: string (nullable = true)
 |-- status: long (nullable = true)
 |-- size: long (nullable = true)
 |-- referer: string (nullable = true)
 |-- user_agent: string (nullable = true)



In [16]:
# Preview the typed rows using show()'s default row limit and truncation.
web_logs_df.show()

+-------------+------+-------------------+------+--------------------+------+-----+--------------------+--------------------+
|       client|userid|           datetime|method|             request|status| size|             referer|          user_agent|
+-------------+------+-------------------+------+--------------------+------+-----+--------------------+--------------------+
| 54.36.149.41|     -|2019-01-22 00:26:14|   GET|/filter/27|13%20%...|   200|30577|                   -|Mozilla/5.0 (comp...|
|  31.56.96.51|     -|2019-01-22 00:26:16|   GET|/image/60844/prod...|   200| 5667|https://www.zanbi...|Mozilla/5.0 (Linu...|
|  31.56.96.51|     -|2019-01-22 00:26:16|   GET|/image/61474/prod...|   200| 5379|https://www.zanbi...|Mozilla/5.0 (Linu...|
|40.77.167.129|     -|2019-01-22 00:26:17|   GET|/image/14925/prod...|   200| 1696|                   -|Mozilla/5.0 (comp...|
|  91.99.72.15|     -|2019-01-22 00:26:17|   GET|/product/31893/62...|   200|41483|                   -|Mozilla/5.0 (W

In [17]:
# Most recent request: sort by datetime descending and show the top row.
web_logs_df.sort(col("datetime").desc()).show(1)



+-------------+------+-------------------+------+--------------------+------+----+--------------------+--------------------+
|       client|userid|           datetime|method|             request|status|size|             referer|          user_agent|
+-------------+------+-------------------+------+--------------------+------+----+--------------------+--------------------+
|192.15.51.231|     -|2019-01-26 16:59:13|   GET|/image/267/produc...|   200|3423|https://www.zanbi...|Mozilla/5.0 (Linu...|
+-------------+------+-------------------+------+--------------------+------+----+--------------------+--------------------+
only showing top 1 row



                                                                                

In [18]:
# Earliest request: sort by datetime ascending and show the top row.
web_logs_df.sort(col("datetime").asc()).show(1)



+------------+------+-------------------+------+--------------------+------+-----+-------+--------------------+
|      client|userid|           datetime|method|             request|status| size|referer|          user_agent|
+------------+------+-------------------+------+--------------------+------+-----+-------+--------------------+
|54.36.149.41|     -|2019-01-22 00:26:14|   GET|/filter/27|13%20%...|   200|30577|      -|Mozilla/5.0 (comp...|
+------------+------+-------------------+------+--------------------+------+-----+-------+--------------------+
only showing top 1 row



                                                                                