In [8]:
import re
from datetime import datetime

LOG_RE = re.compile(r'''
    (?P<host>\S+)\s+                # host (or -)
    (?P<ident>\S+)\s+               # ident (or -)
    \[(?P<time>[^\]]+)\]\s+         # [timestamp]
    "(?P<method>\S+)\s+(?P<path>[^"]+?)\s+(?P<proto>[^"]+)"\s+  # "METHOD path PROTO"
    (?P<status>\d{3})\s+            # status
    (?P<size>\S+)\s+                # size in bytes or -
    "(?P<referer>[^"]*)"\s+         # "referer" (may be empty)
    (?P<token>\S+)\s+               # custom token (session id or cookie)
    "(?P<agent>[^"]+)"              # "user-agent"
''', re.VERBOSE)

def parse_apache_line(line):
    m = LOG_RE.search(line)
    if not m:
        raise ValueError("line did not match expected format")
    d = m.groupdict()
    # print(m)

    # parse time to datetime
    # example time format: 01/Nov/2019:16:32:09 +0000
    dt = datetime.strptime(d['time'], "%d/%b/%Y:%H:%M:%S %z")

    # normalize size
    size = None if d['size'] == '-' else int(d['size'])

    # Optional minimal UA parsing (heuristic)
    

    

    parsed = {
        'timestamp': dt,
        'method': d['method'],
        'path': d['path'],
        'protocol': d['proto'],
        'status': int(d['status']),
        'size': size,
        'referer': None if d['referer'] == '' else d['referer'],
        'token': d['token'],
    }
    return parsed

# Example usage:
line = '- - [01/Nov/2019:16:32:09 +0000] "POST /storage/store_sess_total_mousemv_db.php HTTP/1.1" 200 449 "https://160.40.52.164/content/big_data.php" htodnmm7tjpihgeuqk64c0gjes "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36 OPR/64.0.3417.61"'
line2 = '- - [30/Oct/2019:09:42:02 +0000] "GET / HTTP/1.0" 200 2045 "-" - "-"'
line3 = '- - [30/Oct/2019:11:50:36 +0000] "GET / HTTP/1.1" 200 2770 "-" - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36 OPR/64.0.3417.61"'
print(parse_apache_line(line3))


{'timestamp': datetime.datetime(2019, 10, 30, 11, 50, 36, tzinfo=datetime.timezone.utc), 'method': 'GET', 'path': '/', 'protocol': 'HTTP/1.1', 'status': 200, 'size': 2770, 'referer': '-', 'token': '-'}
