In [None]:
import pandas as pd
import re
from datetime import datetime


dataset = 'access-1year-sorted.log'

# Read the whole access log into memory as text. The encoding is pinned so the
# result does not depend on the platform's default locale, and undecodable
# bytes are substituted with U+FFFD instead of aborting the load with a
# UnicodeDecodeError.
with open(dataset, 'r', encoding='utf-8', errors='replace') as f:
    log_data = f.read()

# Number of lines as delimited by str.splitlines().
total_lines = len(log_data.splitlines())

print(f"Total baris data: {total_lines:,}")
# NOTE: len(log_data) counts decoded characters, not bytes on disk; the two
# agree only when every character encodes to a single byte.
print(f"Ukuran data: {len(log_data) / (1024*1024):.2f} MB")

Total baris data: 56,190
Ukuran data: 9.90 MB


In [9]:

# Regex for one access-log line: client address, bracketed timestamp, quoted
# request (method / URI / protocol), status code, response size, quoted
# referrer and quoted user agent.
# The pattern requires the literal " - - " after the client address, so lines
# whose identity/user fields are anything other than "-" do not match and are
# counted as parse failures below.
log_pattern = re.compile(
    r'(?P<ip>\S+) - - \[(?P<datetime>[^\]]+)\] '
    r'"(?P<method>\S*) ?(?P<uri>\S*)? ?(?P<protocol>[^"]*)?" '
    r'(?P<status>\d{3}) (?P<bytes>\d+|-) '
    r'"(?P<referrer>[^"]*)" "(?P<user_agent>[^"]*)"'
)

# Column order of the resulting DataFrame; pinned explicitly so the frame has
# the same schema even when no line could be parsed.
LOG_COLUMNS = ['ip', 'timestamp', 'request', 'status', 'size', 'referer', 'user_agent']


def parse_log_line(line):
    """Parse a single access-log line into a flat record.

    Parameters
    ----------
    line : str
        One raw line of the log, without its trailing newline.

    Returns
    -------
    dict or None
        A dict with the keys listed in ``LOG_COLUMNS`` when the line matches
        ``log_pattern``; ``None`` when it does not match.
        ``timestamp`` is a timezone-aware ``datetime``, or ``pd.NaT`` when the
        bracketed value does not follow ``%d/%b/%Y:%H:%M:%S %z``.
        ``size`` is 0 when the log records "-" instead of a byte count.
    """
    match = log_pattern.match(line)
    if match is None:
        return None

    row = match.groupdict()

    try:
        timestamp = datetime.strptime(row['datetime'], "%d/%b/%Y:%H:%M:%S %z")
    except ValueError:
        # strptime signals a format mismatch with ValueError; keep the row and
        # mark only the timestamp as missing.
        timestamp = pd.NaT

    # groupdict() yields None for a group that did not participate in the
    # match; `or ''` keeps the text "None" out of the request string.
    request = f"{row['method'] or ''} {row['uri'] or ''} {row['protocol'] or ''}".strip()

    # Keep only the fields that are needed.
    return {
        'ip': row['ip'],
        'timestamp': timestamp,
        'request': request,
        # The pattern guarantees exactly three digits here.
        'status': int(row['status']),
        'size': int(row['bytes']) if row['bytes'].isdigit() else 0,
        'referer': row['referrer'],
        'user_agent': row['user_agent'],
    }


parsed_logs = []
failed_parsed_logs = 0  # Counter for lines that did not match the pattern

for line in log_data.splitlines():
    parsed_row = parse_log_line(line)
    if parsed_row is None:
        failed_parsed_logs += 1
    else:
        parsed_logs.append(parsed_row)

df_logs = pd.DataFrame(parsed_logs, columns=LOG_COLUMNS)

# The rate is derived from this cell's own counters, so it does not rely on a
# value computed in another cell, and an empty log does not divide by zero.
lines_seen = len(parsed_logs) + failed_parsed_logs
success_rate = (len(df_logs) / lines_seen) * 100 if lines_seen else 0.0

# Show summary and a sample
print(f"Berhasil parsing: {len(df_logs):,} baris")
print(f"Gagal parsing  : {failed_parsed_logs:,} baris")
print(f"Sukses rate    : {success_rate:.2f}%")
print("Sample parsed log entries:")
display(df_logs.head())

Berhasil parsing: 56,069 baris
Gagal parsing  : 121 baris
Sukses rate    : 99.78%
Sample parsed log entries:


Unnamed: 0,ip,timestamp,request,status,size,referer,user_agent
0,185.244.104.2,2025-10-01 00:01:29+07:00,PROPFIND / HTTP/1.1,301,162,http://207.148.71.21:443/,-
1,165.227.100.212,2025-10-01 00:02:23+07:00,GET /robots.txt HTTP/1.0,400,248,-,xfa1
2,167.172.180.168,2025-10-01 00:03:57+07:00,GET /favicon.ico HTTP/1.1,404,146,http://207.148.71.21/,Mozilla/5.0 (X11; Linux x86_64; rv:137.0) Geck...
3,167.172.180.168,2025-10-01 00:03:57+07:00,GET / HTTP/1.1,404,146,-,Mozilla/5.0 (X11; Linux x86_64; rv:137.0) Geck...
4,79.124.58.198,2025-10-01 00:24:06+07:00,GET /?XDEBUG_SESSION_START=phpstorm HTTP/1.1,404,178,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...


In [10]:
# Export the parsed log table to CSV; the DataFrame index is not written.
df_logs.to_csv(path_or_buf='access-1year-sorted.csv', index=False)
