Creating a simple dataset.

In [None]:
%%writefile weblogs.txt
# Date, Time, IP, Method, URL, Status, ResponseSize
2025-10-10,12:01:32,192.168.1.2,GET,/index.html,200,1024
2025-10-10,12:01:33,192.168.1.3,GET,/products.html,200,850
2025-10-10,12:01:35,192.168.1.4,GET,/contact.html,404,512
2025-10-10,12:01:38,192.168.1.5,POST,/checkout,500,128
2025-10-10,12:01:41,192.168.1.6,GET,/index.html,200,1024
2025-10-10,12:01:45,192.168.1.7,GET,/images/logo.png,200,256
2025-10-10,12:01:48,192.168.1.8,GET,/about.html,404,512
2025-10-10,12:01:53,192.168.1.9,POST,/login,403,64
2025-10-10,12:02:01,192.168.1.10,GET,/index.html,200,1024
2025-10-10,12:02:07,192.168.1.11,POST,/checkout,500,128
2025-10-10,12:02:12,192.168.1.12,GET,/contact.html,404,512
2025-10-10,12:02:15,192.168.1.13,GET,/index.html,200,1024
2025-10-10,12:02:21,192.168.1.14,GET,/products.html,200,850
2025-10-10,12:02:23,192.168.1.15,GET,/about.html,404,512
2025-10-10,12:02:29,192.168.1.16,POST,/checkout,500,128
2025-10-10,12:02:31,192.168.1.17,GET,/images/logo.png,200,256
2025-10-10,12:02:34,192.168.1.18,GET,/contact.html,404,512
2025-10-10,12:02:38,192.168.1.19,POST,/login,403,64
2025-10-10,12:02:41,192.168.1.20,GET,/index.html,200,1024
2025-10-10,12:02:47,192.168.1.21,GET,/products.html,200,850

Overwriting weblogs.txt


Implement the Mapper

In [12]:
def mapper(line):
    """Map one raw log line to a ``(status, 1)`` pair.

    Args:
        line: One line of the comma-separated web log. Leading/trailing
            whitespace (including the newline) is ignored.

    Returns:
        ``[(status, 1)]`` where ``status`` is the sixth field as a string,
        or ``[]`` when the line does not split into exactly seven fields or
        its first field starts with ``#``.
    """
    fields = line.strip().split(',')

    # Skip '#' header/comment lines and anything that is not a 7-field record.
    if len(fields) != 7 or fields[0].startswith('#'):
        return []

    # Strip padding so " 200" and "200" are counted under the same key,
    # consistent with the other mappers in this notebook.
    status = fields[5].strip()

    return [(status, 1)]

Shuffle Phase

In [13]:
from collections import defaultdict

def shuffle(mapped_data):
    """Group mapper output by key.

    Args:
        mapped_data: Iterable of ``(key, value)`` pairs.

    Returns:
        A ``defaultdict(list)`` mapping each key to the list of its values,
        in the order they were encountered.
    """
    buckets = defaultdict(list)
    for pair in mapped_data:
        k, v = pair
        buckets[k].append(v)
    return buckets

Reducer Phase

In [14]:
from collections import defaultdict

def reducer(mapped_data):
    """Sum the values emitted for each key.

    Args:
        mapped_data: Iterable of ``(key, value)`` pairs.

    Returns:
        A ``defaultdict(int)`` mapping each key to the sum of its values.
    """
    totals = defaultdict(int)
    for pair in mapped_data:
        k, v = pair
        totals[k] += v
    return totals

Combine the Phases

In [15]:
# Map: run the mapper over every log line and flatten the emitted pairs.
with open("weblogs.txt", "r") as log_file:
    mapped = [pair for line in log_file for pair in mapper(line)]

# Reduce: collapse the (status, 1) pairs into one count per status code.
reduced = reducer(mapped)

# Report the counts in ascending status-code order.
status_counts = sorted(reduced.items())
for code, count in status_counts:
    print(f"HTTP {code}: {count} requests")

HTTP 200: 10 requests
HTTP 403: 2 requests
HTTP 404: 5 requests
HTTP 500: 3 requests


1) Count requests per URL

In [None]:
def mapper_url(line):
    """Map one raw log line to a ``(url, 1)`` pair.

    Args:
        line: One line of the comma-separated web log.

    Returns:
        ``[(url, 1)]`` where ``url`` is the stripped fifth field, or ``[]``
        when the line does not have exactly seven fields or starts with ``#``.
    """
    parts = line.strip().split(',')
    is_record = len(parts) == 7 and not parts[0].startswith('#')
    if not is_record:
        return []
    return [(parts[4].strip(), 1)]

# Map phase: feed every log line through mapper_url and flatten the results.
with open("weblogs.txt", "r") as log_file:
    mapped_url = [pair for line in log_file for pair in mapper_url(line)]
# Reduce
from collections import defaultdict
def reducer_sum(mapped_data):
    """Add up the values of all pairs that share a key.

    Args:
        mapped_data: Iterable of ``(key, value)`` pairs.

    Returns:
        A ``defaultdict(int)`` mapping each key to the sum of its values.
    """
    sums = defaultdict(int)
    for pair in mapped_data:
        k, v = pair
        sums[k] += v
    return sums

reduced_url = reducer_sum(mapped_url)

# Rank by descending request count, ties broken alphabetically by URL.
# Two stable passes: order by URL first, then by count (reverse keeps stability).
ranked_urls = sorted(reduced_url.items(), key=lambda kv: kv[0])
ranked_urls.sort(key=lambda kv: kv[1], reverse=True)

# Show at most the 20 busiest URLs.
for url, count in ranked_urls[:20]:
    print(f"{url}: {count} requests")

/index.html: 5 requests
/checkout: 3 requests
/contact.html: 3 requests
/products.html: 3 requests
/about.html: 2 requests
/images/logo.png: 2 requests
/login: 2 requests


2) Compute total response size per status code

In [None]:
# Mapper
def mapper_status_size(line):
    """Map one raw log line to a ``(status, response_size)`` pair.

    Args:
        line: One line of the comma-separated web log.

    Returns:
        ``[(status, size)]`` where ``status`` is the stripped sixth field and
        ``size`` is the seventh field parsed as an int (``0`` if it is not a
        valid integer), or ``[]`` when the line does not have exactly seven
        fields or starts with ``#``.
    """
    fields = line.strip().split(',')
    # Skip '#' header/comment lines and anything that is not a 7-field record.
    if len(fields) != 7 or fields[0].startswith('#'):
        return []
    status = fields[5].strip()
    try:
        size = int(fields[6].strip())
    except ValueError:
        # Best effort: an unparsable size contributes nothing to the total.
        # Only ValueError is caught so KeyboardInterrupt etc. still propagate.
        size = 0
    return [(status, size)]

# Map phase: feed every log line through mapper_status_size and flatten the results.
with open("weblogs.txt", "r") as log_file:
    mapped_status_size = [pair for line in log_file for pair in mapper_status_size(line)]

# Reducer
from collections import defaultdict
def reducer_sum_sizes(mapped_data):
    """Total the sizes emitted for each key.

    Args:
        mapped_data: Iterable of ``(key, value)`` pairs.

    Returns:
        A ``defaultdict(int)`` mapping each key to the sum of its values.
    """
    size_totals = defaultdict(int)
    for pair in mapped_data:
        k, v = pair
        size_totals[k] += v
    return size_totals

reduced_sizes = reducer_sum_sizes(mapped_status_size)

# Iterating a dict yields its keys, so this walks the status codes in sorted order.
for status in sorted(reduced_sizes):
    print(f"HTTP {status}: total response size = {reduced_sizes[status]} bytes")

HTTP 200: total response size = 8182 bytes
HTTP 403: total response size = 128 bytes
HTTP 404: total response size = 2560 bytes
HTTP 500: total response size = 384 bytes


3) Filter out successful responses (status 200) — analyze only errors

In [None]:
def mapper_errors_only(line):
    """Map one raw log line to a ``(status, 1)`` pair, dropping status 200.

    Args:
        line: One line of the comma-separated web log.

    Returns:
        ``[(status, 1)]`` for records whose stripped sixth field is not
        ``'200'``; ``[]`` for status-200 records, lines that do not have
        exactly seven fields, and lines starting with ``#``.
    """
    parts = line.strip().split(',')
    is_record = len(parts) == 7 and not parts[0].startswith('#')
    if not is_record:
        return []
    code = parts[5].strip()
    # Status 200 is filtered out; every other status is emitted.
    return [(code, 1)] if code != '200' else []

# Map: keep only non-200 records, one (status, 1) pair each.
with open("weblogs.txt", "r") as log_file:
    mapped_errors = [pair for line in log_file for pair in mapper_errors_only(line)]

# Reduce: count occurrences per error status.
reduced_errors = reducer_sum(mapped_errors)


def _error_sort_key(kv):
    """Order by descending numeric status (non-numeric keys rank as 0), then descending count."""
    numeric_status = int(kv[0]) if kv[0].isdigit() else 0
    return (-numeric_status, -kv[1])


for status, count in sorted(reduced_errors.items(), key=_error_sort_key):
    print(f"HTTP {status}: {count} error requests")

HTTP 500: 3 error requests
HTTP 404: 5 error requests
HTTP 403: 2 error requests


In [None]:
def mapper_errors_size(line):
    """Map one raw log line to ``(status, response_size)``, dropping status 200.

    Args:
        line: One line of the comma-separated web log.

    Returns:
        ``[(status, size)]`` for records whose stripped sixth field is not
        ``'200'``, where ``size`` is the seventh field parsed as an int
        (``0`` if it is not a valid integer); ``[]`` for status-200 records,
        lines that do not have exactly seven fields, and lines starting
        with ``#``.
    """
    fields = line.strip().split(',')
    # Skip '#' header/comment lines and anything that is not a 7-field record.
    if len(fields) != 7 or fields[0].startswith('#'):
        return []
    status = fields[5].strip()
    if status == '200':
        return []
    try:
        size = int(fields[6].strip())
    except ValueError:
        # Best effort: an unparsable size contributes nothing to the total.
        # Only ValueError is caught so KeyboardInterrupt etc. still propagate.
        size = 0
    return [(status, size)]

# Map: keep only non-200 records, one (status, size) pair each.
with open("weblogs.txt", "r") as log_file:
    mapped_errors_size = [pair for line in log_file for pair in mapper_errors_size(line)]

# Reduce: total the response sizes per error status.
reduced_errors_sizes = reducer_sum_sizes(mapped_errors_size)

# Iterating a dict yields its keys, so this walks the status codes in sorted order.
for status in sorted(reduced_errors_sizes):
    print(f"HTTP {status}: total error response size = {reduced_errors_sizes[status]} bytes")


HTTP 403: total error response size = 128 bytes
HTTP 404: total error response size = 2560 bytes
HTTP 500: total error response size = 384 bytes
