From 9187761ac65a38c4879b8ca2b9c6ea64833a583e Mon Sep 17 00:00:00 2001 From: John Brooks Date: Sun, 26 Mar 2017 13:43:12 -0400 Subject: [PATCH 1/2] warc: Fix bad payload hash when HTTP response headers have extra whitespace The payload offset was being obtained by taking len(response.to_bytes()), but since leading/trailing whitespace is discarded from the response class's name/value pairs, the length of the generated string would not necessarily reflect the actual size of the received headers, leading to a checksum being calculated from the wrong position in the file. To prevent this, the WARC recorder will now independently figure out where the headers really end in the file. --- wpull/warc/recorder.py | 10 +++++++++- wpull/warc/recorder_test.py | 10 ++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/wpull/warc/recorder.py b/wpull/warc/recorder.py index 3d8c3dc8..c4790190 100644 --- a/wpull/warc/recorder.py +++ b/wpull/warc/recorder.py @@ -579,7 +579,15 @@ def response_data(self, data: bytes): self._response_temp_file.write(data) def end_response(self, response: HTTPResponse): - payload_offset = len(response.to_bytes()) + self._response_record.block_file.seek(0) + while True: + data = self._response_record.block_file.readline() + if data in (b'\r\n', b'\n'): + payload_offset = self._response_record.block_file.tell() + break + if not data: + payload_offset = 0 + break self._response_record.block_file.seek(0) self._recorder.set_length_and_maybe_checksums( diff --git a/wpull/warc/recorder_test.py b/wpull/warc/recorder_test.py index bf5fe41b..44ba6b90 100644 --- a/wpull/warc/recorder_test.py +++ b/wpull/warc/recorder_test.py @@ -51,6 +51,11 @@ def test_warc_recorder(self): warc_filename = 'asdf.warc' cdx_filename = 'asdf.cdx' + response_header_bytes = b"HTTP/1.1 200 OK\r\n" + response_header_bytes += b"Content-Type: text/plain\r\n" + response_header_bytes += b"X-Empty-Field: \r\n" + response_header_bytes += b"\r\n" + warc_recorder = WARCRecorder( file_prefix, params=WARCRecorderParams( @@ -65,6 +70,7 @@ def test_warc_recorder(self): request.address = ('0.0.0.0', 80) request.prepare_for_send() response = HTTPResponse(200, 'OK') + response.parse(response_header_bytes) response.body = Body() with wpull.util.reset_file_offset(response.body): @@ -75,7 +81,7 @@ def test_warc_recorder(self): session.request_data(request.to_bytes()) session.end_request(request) session.begin_response(response) - session.response_data(response.to_bytes()) + session.response_data(response_header_bytes) session.response_data(response.body.content()) session.end_response(response) session.close() @@ -135,7 +141,7 @@ def test_warc_recorder(self): self.assertTrue(cdx_lines[0].startswith(b' CDX')) self.assertEqual(b'http://example.com/', cdx_fields[0]) - self.assertEqual(b'-', cdx_fields[2]) + self.assertEqual(b'text/plain', cdx_fields[2]) self.assertEqual(b'200', cdx_fields[3]) self.assertNotEqual(b'-', cdx_fields[4]) self.assertNotEqual(b'0', cdx_fields[5]) From 451cd2ef7fe1f20b6b977916cd2d57c1fa721ce1 Mon Sep 17 00:00:00 2001 From: John Brooks Date: Sun, 26 Mar 2017 13:56:46 -0400 Subject: [PATCH 2/2] warc: Fix CDX fields missing with multi-line HTTP headers The regex used to find the end of the HTTP headers would not match if there was a newline in between, so get_http_header would return nothing and the status code and MIME type fields would be empty in the CDX record. --- wpull/warc/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wpull/warc/format.py b/wpull/warc/format.py index 93245576..21f02d8a 100644 --- a/wpull/warc/format.py +++ b/wpull/warc/format.py @@ -163,7 +163,7 @@ def get_http_header(self) -> Response: with wpull.util.reset_file_offset(self.block_file): data = self.block_file.read(4096) - match = re.match(br'(.*?\r?\n\r?\n)', data) + match = re.match(br'(.*?\r?\n\r?\n)', data, re.DOTALL) if not match: return