Skip to content

Commit 9e2ad03

Browse files
fix(transform): use scheme as domain for file:// and other non-web URLs (#129)
When splitting URLs, file://, about:, and other URLs without a netloc produced an empty string for $domain. This caused all such events to cluster together as a single empty entry in "Top Browser Domains". split_url_events now falls back to using the URL scheme (e.g. "file", "about") as the domain when the netloc is empty, so local file activity is grouped under a visible "file" domain label instead of an invisible empty string. Fixes #67
1 parent e13c33c commit 9e2ad03

2 files changed

Lines changed: 21 additions & 6 deletions

File tree

aw_transform/split_url_events.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,16 @@ def split_url_events(events: List[Event]) -> List[Event]:
1414
url = event.data["url"]
1515
parsed_url = urlparse(url)
1616
event.data["$protocol"] = parsed_url.scheme
17-
event.data["$domain"] = (
18-
parsed_url.netloc[4:]
19-
if parsed_url.netloc[:4] == "www."
20-
else parsed_url.netloc
21-
)
17+
netloc = parsed_url.netloc
18+
if netloc:
19+
domain = netloc[4:] if netloc[:4] == "www." else netloc
20+
elif parsed_url.scheme:
21+
# For URLs without a domain (e.g. file://, about:),
22+
# use the scheme as domain so they don't all cluster as empty.
23+
domain = parsed_url.scheme
24+
else:
25+
domain = ""
26+
event.data["$domain"] = domain
2227
event.data["$path"] = parsed_url.path
2328
event.data["$params"] = parsed_url.params
2429
event.data["$options"] = parsed_url.query

tests/test_transforms.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,12 +345,22 @@ def test_url_parse_event():
345345
result = split_url_events([e3])
346346
print(result)
347347
assert result[0].data["$protocol"] == "file"
348-
assert result[0].data["$domain"] == ""
348+
assert result[0].data["$domain"] == "file"
349349
assert result[0].data["$path"] == "/home/johan/myfile.txt"
350350
assert result[0].data["$params"] == ""
351351
assert result[0].data["$options"] == ""
352352
assert result[0].data["$identifier"] == ""
353353

354+
# Test about: URLs
355+
e4 = Event(
356+
data={"url": "about:blank"},
357+
timestamp=now,
358+
duration=timedelta(seconds=1),
359+
)
360+
result = split_url_events([e4])
361+
assert result[0].data["$protocol"] == "about"
362+
assert result[0].data["$domain"] == "about"
363+
354364

355365
def test_union():
356366
now = datetime.now(timezone.utc)

0 commit comments

Comments
 (0)