In [1]:
import json
import os
import re
from datetime import datetime

import geoip2.database
import geoip2.errors



In [None]:
# === REGEX DEFINITIONS ===
# Shared by every parser: first dotted-quad found in the line.
ip_regex = re.compile(r'\d+\.\d+\.\d+\.\d+')

# --- auth log (sshd) ---
# Syslog-style stamp such as "May 10 13:55:36" (the format carries no year).
timestamp_authlog = re.compile(r'[A-Za-z]{3}\s+\d+\s\d+:\d+:\d+')
# Account name in "... for [invalid user ]<name> from ...". \S+ rather than \w+
# so names containing '-', '.' or '$' are captured instead of being dropped.
user_regex = re.compile(r'for (?:invalid user\s)?(\S+) from')
# First two words after the "sshd[pid]:" tag, e.g. "Failed password".
action_regex = re.compile(r'sshd\[\d+\]:\s(\w+\s\w+)')
port_regex = re.compile(r'port\s+(\d+)')
# Last word of the line (e.g. "ssh2").
service_regex = re.compile(r'(\w+)$')

# --- nginx access log ---
# Date and time inside "[...]"; the zone offset that follows is not captured.
timestamp_nginx = re.compile(r'\[(\d{2}/[A-Za-z]{3}/\d{4}:\d{2}:\d{2}:\d{2})')
# Anchored to the quote that opens the request so a method name embedded in the
# path or user agent (e.g. "GET" inside "/WIDGETS") cannot be picked up, and
# extended with the methods the previous alternation missed (PUT, CONNECT, TRACE).
http_method_regex = re.compile(r'"(GET|POST|PUT|DELETE|HEAD|OPTIONS|PATCH|CONNECT|TRACE)\s')
status_regex = re.compile(r'"\s(\d{3})\s')
path_regex = re.compile(r'"[A-Z]+\s([^\s]+)\sHTTP')

# --- firewall log ---
# "<date> <time> <ALLOW|DENY> <TCP|UDP> <src ip> <src port> -> <dst ip> <dst port>"
log_pattern_fw = re.compile(
    r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+'
    r'(ALLOW|DENY)\s+'
    r'(TCP|UDP)\s+'
    r'(\d+\.\d+\.\d+\.\d+)\s+'
    r'(\d+)\s+->\s+'
    r'(\d+\.\d+\.\d+\.\d+)\s+'
    r'(\d+)'
)



In [None]:
# === PARSERS ===

def parse_authlog_line(line):
    """Extract the recognised fields from one auth-log line.

    Each field is looked up independently with the module-level regexes, so a
    key is present in the result only when its pattern matched.

    Args:
        line: One raw line of the auth log.

    Returns:
        dict with 'log_type' == 'authlog' plus any of 'timestamp', 'src_ip',
        'user', 'action', 'dst_port' (int) and 'service'.
    """
    parsed = {'log_type': 'authlog'}

    # Kept as the raw matched text (no conversion is attempted here).
    if ts_match := timestamp_authlog.search(line):
        parsed['timestamp'] = ts_match.group().strip()

    if ip := ip_regex.search(line):
        parsed['src_ip'] = ip.group(0)
    if user := user_regex.search(line):
        parsed['user'] = user.group(1)
    if action := action_regex.search(line):
        parsed['action'] = action.group(1)
    if port := port_regex.search(line):
        # Stored as int so the value has the same type as the 'dst_port'
        # produced by parse_firewall_line once the records are combined.
        # NOTE(review): in sshd lines the number after "port" is presumably the
        # client's source port -- confirm whether 'dst_port' is the right key.
        parsed['dst_port'] = int(port.group(1))
    if service := service_regex.search(line):
        parsed['service'] = service.group(1)

    return parsed


def parse_nginx_line(line):
    """Extract the recognised fields from one nginx access-log line.

    Args:
        line: One raw line of the access log.

    Returns:
        dict with 'log_type' == 'nginx' plus any of 'src_ip', 'timestamp',
        'http_method', 'path' and 'status_code' (string) whose pattern matched.
        'timestamp' is formatted as "YYYY-MM-DDTHH:MM:SSZ"; when the line
        carries a "+hhmm"/"-hhmm" offset right after the time, the value is
        converted to UTC first so the trailing "Z" is accurate. If the date
        cannot be parsed the raw matched text is stored instead.
    """
    parsed = {'log_type': 'nginx'}
    if ip := ip_regex.search(line):
        parsed['src_ip'] = ip.group()
    if ts := timestamp_nginx.search(line):
        # timestamp_nginx stops after the seconds; look at what follows for a
        # zone offset such as " +0200]" instead of silently discarding it.
        offset = re.match(r'\s([+-]\d{4})\]', line[ts.end():])
        try:
            if offset:
                aware = datetime.strptime(
                    ts.group(1) + ' ' + offset.group(1),
                    "%d/%b/%Y:%H:%M:%S %z",
                )
                # Shift to UTC wall-clock time and drop tzinfo before formatting.
                dt = (aware - aware.utcoffset()).replace(tzinfo=None)
            else:
                dt = datetime.strptime(ts.group(1), "%d/%b/%Y:%H:%M:%S")
            parsed['timestamp'] = dt.strftime("%Y-%m-%dT%H:%M:%SZ")
        except (ValueError, OverflowError):
            # Unparseable date or offset: keep the raw text rather than fail.
            parsed['timestamp'] = ts.group(1)
    if method := http_method_regex.search(line):
        parsed['http_method'] = method.group(1)
    if path := path_regex.search(line):
        parsed['path'] = path.group(1)
    if status := status_regex.search(line):
        parsed['status_code'] = status.group(1)

    return parsed


def parse_firewall_line(line):
    """Parse one firewall-log line with log_pattern_fw.

    Args:
        line: One raw line of the firewall log; surrounding whitespace is
            stripped before matching.

    Returns:
        dict with 'log_type' == 'firewall'. When the line matches the pattern
        it also contains 'timestamp', 'action', 'protocol', 'src_ip',
        'src_port' (int), 'dst_ip' and 'dst_port' (int). A line that does not
        match yields only the 'log_type' key.
    """
    parsed = {'log_type': 'firewall'}
    match = log_pattern_fw.match(line.strip())
    if not match:
        return parsed

    ts, action, protocol, src_ip, src_port, dst_ip, dst_port = match.groups()
    try:
        parsed['timestamp'] = datetime.strptime(ts, "%Y-%m-%d %H:%M:%S").isoformat() + 'Z'
    except ValueError:
        # The regex only checks digit counts, so "2025-13-40 25:61:61" gets
        # this far. Keep the raw text (as parse_nginx_line does) instead of
        # letting one bad line abort the whole file.
        parsed['timestamp'] = ts
    parsed.update({
        'action': action,
        'protocol': protocol,
        'src_ip': src_ip,
        'src_port': int(src_port),
        'dst_ip': dst_ip,
        'dst_port': int(dst_port)
    })
    return parsed


def parse_file(filepath, parser_func):
    """Apply parser_func to every non-blank line of a text file.

    Args:
        filepath: Path of the log file to read.
        parser_func: Callable that receives one raw line (trailing newline
            included) and returns the parsed record.

    Returns:
        list of the values returned by parser_func, in file order. Lines that
        are empty or whitespace-only are skipped.
    """
    # Decode explicitly as UTF-8 instead of the platform default, and replace
    # undecodable bytes: log files can contain arbitrary bytes sent by clients,
    # and a single one should not abort the whole parse.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as file:
        return [parser_func(line) for line in file if line.strip()]



In [None]:
# === MAIN PROCESS ===

# Everything below writes under outputs/; create it first so a fresh checkout
# does not fail with FileNotFoundError on the first open().
os.makedirs('outputs', exist_ok=True)

auth_logs = parse_file('inputs/authlog.txt', parse_authlog_line)
nginx_logs = parse_file('inputs/nginx.txt', parse_nginx_line)
firewall_logs = parse_file('inputs/firewall.txt', parse_firewall_line)

# Combine all logs (the list references the same dict objects as the
# per-source lists above).
combined_logs = auth_logs + nginx_logs + firewall_logs

# Save separate JSONs. This happens before harmonization, so each file only
# contains the keys its own parser produced.
with open('outputs/parsed_authlog.json', 'w') as f:
    json.dump(auth_logs, f, indent=2)

with open('outputs/parsed_nginx.json', 'w') as f:
    json.dump(nginx_logs, f, indent=2)

with open('outputs/parsed_firewall.json', 'w') as f:
    json.dump(firewall_logs, f, indent=2)


# Harmonize keys : Ensuring that all keys are present in every dictionary
all_keys = set()
for log in combined_logs:
    all_keys.update(log.keys())

# Iterate the keys in sorted order: the iteration order of a set of strings
# can change from one interpreter run to the next, which would make the key
# order of the padded records (and of combined_logs.json) differ between runs.
ordered_keys = sorted(all_keys)
print(ordered_keys)

for log in combined_logs:
    for key in ordered_keys:
        # Adds the key with None only when the record does not have it yet.
        log.setdefault(key, None)

with open('outputs/combined_logs.json', 'w') as f:
    json.dump(combined_logs, f, indent=2)




In [None]:
# === GEOIP ENRICHMENT ===
# Forward slashes, like every other path in this notebook: they are accepted on
# Windows as well, whereas the previous backslash-separated path only resolved
# on Windows. The reader is left open because enrich_log_with_country uses it
# on every call; call reader.close() once no more lookups are needed.
reader = geoip2.database.Reader('inputs/GeoLite2-Country_20250509/GeoLite2-Country_20250509/GeoLite2-Country.mmdb')
def enrich_log_with_country(log):
    """Return a copy of a log record with a 'src_geo_country' key added.

    Args:
        log: Parsed log record (dict). It is not modified.

    Returns:
        Shallow copy of ``log`` whose 'src_geo_country' is the country name
        returned by the module-level GeoIP ``reader`` for ``log['src_ip']``,
        or None when the record has no source IP, the IP is not a valid
        address, or the address is not in the database.
    """
    enriched = log.copy()
    # Default for every path that does not produce a country name.
    enriched['src_geo_country'] = None

    src_ip = log.get('src_ip')
    if src_ip:
        try:
            enriched['src_geo_country'] = reader.country(src_ip).country.name
        except (geoip2.errors.AddressNotFoundError, ValueError):
            # Only the two expected per-record failures are absorbed: address
            # missing from the database, or text that is not a valid IP (the
            # IP regex also accepts e.g. "999.1.1.1"). Anything else, such as
            # a wrong database type or a closed reader, is a setup problem and
            # is allowed to surface instead of turning every country into None.
            pass

    return enriched



# Enrich every harmonized record; each call returns a new dict.
enriched_logs = list(map(enrich_log_with_country, combined_logs))

# Persist the enriched records as pretty-printed JSON.
with open('outputs/enriched_logs.json', 'w') as out_file:
    out_file.write(json.dumps(enriched_logs, indent=2))

# Show the first five records as a quick visual check.
sample_records = enriched_logs[:5]
print("Enrichment complete. Sample:")
print(json.dumps(sample_records, indent=2))