diff --git a/wpull/warc/format.py b/wpull/warc/format.py index 93245576..21f02d8a 100644 --- a/wpull/warc/format.py +++ b/wpull/warc/format.py @@ -163,7 +163,7 @@ def get_http_header(self) -> Response: with wpull.util.reset_file_offset(self.block_file): data = self.block_file.read(4096) - match = re.match(br'(.*?\r?\n\r?\n)', data) + match = re.match(br'(.*?\r?\n\r?\n)', data, re.DOTALL) if not match: return diff --git a/wpull/warc/recorder.py b/wpull/warc/recorder.py index 3d8c3dc8..c4790190 100644 --- a/wpull/warc/recorder.py +++ b/wpull/warc/recorder.py @@ -579,7 +579,15 @@ def response_data(self, data: bytes): self._response_temp_file.write(data) def end_response(self, response: HTTPResponse): - payload_offset = len(response.to_bytes()) + self._response_record.block_file.seek(0) + while True: + data = self._response_record.block_file.readline() + if data in (b'\r\n', b'\n'): + payload_offset = self._response_record.block_file.tell() + break + if not data: + payload_offset = 0 + break self._response_record.block_file.seek(0) self._recorder.set_length_and_maybe_checksums( diff --git a/wpull/warc/recorder_test.py b/wpull/warc/recorder_test.py index bf5fe41b..44ba6b90 100644 --- a/wpull/warc/recorder_test.py +++ b/wpull/warc/recorder_test.py @@ -51,6 +51,11 @@ def test_warc_recorder(self): warc_filename = 'asdf.warc' cdx_filename = 'asdf.cdx' + response_header_bytes = b"HTTP/1.1 200 OK\r\n" + response_header_bytes += b"Content-Type: text/plain\r\n" + response_header_bytes += b"X-Empty-Field: \r\n" + response_header_bytes += b"\r\n" + warc_recorder = WARCRecorder( file_prefix, params=WARCRecorderParams( @@ -65,6 +70,7 @@ def test_warc_recorder(self): request.address = ('0.0.0.0', 80) request.prepare_for_send() response = HTTPResponse(200, 'OK') + response.parse(response_header_bytes) response.body = Body() with wpull.util.reset_file_offset(response.body): @@ -75,7 +81,7 @@ def test_warc_recorder(self): session.request_data(request.to_bytes()) session.end_request(request) session.begin_response(response) - session.response_data(response.to_bytes()) + session.response_data(response_header_bytes) session.response_data(response.body.content()) session.end_response(response) session.close() @@ -135,7 +141,7 @@ def test_warc_recorder(self): self.assertTrue(cdx_lines[0].startswith(b' CDX')) self.assertEqual(b'http://example.com/', cdx_fields[0]) - self.assertEqual(b'-', cdx_fields[2]) + self.assertEqual(b'text/plain', cdx_fields[2]) self.assertEqual(b'200', cdx_fields[3]) self.assertNotEqual(b'-', cdx_fields[4]) self.assertNotEqual(b'0', cdx_fields[5])