/
15-heritrix.conf
35 lines (34 loc) · 1.14 KB
/
15-heritrix.conf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
filter {
  # Parse Heritrix crawl.log lines. Events of any other type are not touched
  # by this block.
  if [type] == "heritrix" {
    # Split the line into its twelve whitespace-separated columns. The first
    # two separators allow runs of spaces (" +"); the remaining columns are
    # separated by exactly one space.
    grok {
      match => [ "message", "%{NOTSPACE:log_timestamp} +%{NUMBER:fetch_status_code:int} +%{NOTSPACE:resource_size:int} %{NOTSPACE:downloaded_uri} %{NOTSPACE:discovery_path} %{NOTSPACE:referrer_uri} %{NOTSPACE:mime_type} %{NOTSPACE:worker_thread_id} %{NOTSPACE:fetch_timestamp} %{NOTSPACE:sha1_digest} %{NOTSPACE:source_tag} %{NOTSPACE:annotations}" ]
    }

    # Parse the line's own ISO8601 timestamp; the result goes to the date
    # filter's default target field.
    date {
      match => [ "log_timestamp", "ISO8601" ]
    }

    # Turn the comma-separated annotations string into an array.
    mutate {
      split => [ "annotations", "," ]
    }

    # Break the fetched URI into scheme / host / (port) / remainder.
    # The bare /^http/ prefix test also covers "https"; no trailing ".*" is
    # needed because =~ is not anchored at the end.
    if [downloaded_uri] =~ /^http/ {
      grok {
        # HOSTNAME never consumes a ":port" suffix, so capture an optional
        # port explicitly; otherwise it would end up at the front of
        # downloaded_uri_path_and_query (e.g. ":8080/index.html").
        match => [ "downloaded_uri", "%{WORD:downloaded_uri_scheme}://%{HOSTNAME:downloaded_uri_host}(?::%{POSINT:downloaded_uri_port:int})?(?:%{NOTSPACE:downloaded_uri_path_and_query}|)" ]
      }
    } else if [downloaded_uri] and [downloaded_uri] != "-" {
      # Non-http URIs of the form "scheme:rest"; everything after the colon is
      # stored as the host. The existence check skips this grok when the
      # primary grok above failed and downloaded_uri was never created.
      grok {
        match => [ "downloaded_uri", "%{WORD:downloaded_uri_scheme}:%{NOTSPACE:downloaded_uri_host}" ]
      }
    }

    # fetch_timestamp has the shape "<start>+<duration>"; "-" means no value.
    # The existence check avoids a pointless grok run on events whose primary
    # parse failed.
    if [fetch_timestamp] and [fetch_timestamp] != "-" {
      grok {
        match => [ "fetch_timestamp", "%{POSINT:fetch_start}\+%{NONNEGINT:fetch_duration:int}" ]
      }
    }
  }
}
output {
  # Ship only Heritrix events from this block.
  if [type] == "heritrix" {
    elasticsearch {
      # One index per crawl id per day: the crawl id is read from the nested
      # [fields][crawl_id] field and the date suffix from the event timestamp.
      # NOTE(review): this relies on every heritrix event carrying
      # [fields][crawl_id] -- confirm with whatever ships these events.
      index => "logstash-heritrix-%{[fields][crawl_id]}-%{+YYYY.MM.dd}"
    }
  }
}