From 9187761ac65a38c4879b8ca2b9c6ea64833a583e Mon Sep 17 00:00:00 2001
From: John Brooks <john@fastquake.com>
Date: Sun, 26 Mar 2017 13:43:12 -0400
Subject: [PATCH 1/2] warc: Fix bad payload hash when HTTP response headers
 have extra whitespace

The payload offset was being obtained by taking len(response.to_bytes()),
but since leading/trailing whitespace is discarded from the response
class's name/value pairs, the length of the generated string would not
necessarily reflect the actual size of the received headers, leading to a
checksum being calculated from the wrong position in the file.

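To prevent this, the WARC recorder now locates the end of the headers
itself by reading the recorded response block line by line until the
first blank line, and uses the file position just after it as the payload
offset (falling back to 0 if no blank line is found).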
---
 wpull/warc/recorder.py      | 10 +++++++++-
 wpull/warc/recorder_test.py | 10 ++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/wpull/warc/recorder.py b/wpull/warc/recorder.py
index 3d8c3dc8..c4790190 100644
--- a/wpull/warc/recorder.py
+++ b/wpull/warc/recorder.py
@@ -579,7 +579,15 @@ def response_data(self, data: bytes):
         self._response_temp_file.write(data)
 
     def end_response(self, response: HTTPResponse):
-        payload_offset = len(response.to_bytes())
+        self._response_record.block_file.seek(0)
+        while True:
+            data = self._response_record.block_file.readline()
+            if data in (b'\r\n', b'\n'):
+                payload_offset = self._response_record.block_file.tell()
+                break
+            if not data:
+                payload_offset = 0
+                break
 
         self._response_record.block_file.seek(0)
         self._recorder.set_length_and_maybe_checksums(
diff --git a/wpull/warc/recorder_test.py b/wpull/warc/recorder_test.py
index bf5fe41b..44ba6b90 100644
--- a/wpull/warc/recorder_test.py
+++ b/wpull/warc/recorder_test.py
@@ -51,6 +51,11 @@ def test_warc_recorder(self):
         warc_filename = 'asdf.warc'
         cdx_filename = 'asdf.cdx'
 
+        response_header_bytes  = b"HTTP/1.1 200 OK\r\n"
+        response_header_bytes += b"Content-Type: text/plain\r\n"
+        response_header_bytes += b"X-Empty-Field:  \r\n"
+        response_header_bytes += b"\r\n"
+
         warc_recorder = WARCRecorder(
             file_prefix,
             params=WARCRecorderParams(
@@ -65,6 +70,7 @@ def test_warc_recorder(self):
         request.address = ('0.0.0.0', 80)
         request.prepare_for_send()
         response = HTTPResponse(200, 'OK')
+        response.parse(response_header_bytes)
         response.body = Body()
 
         with wpull.util.reset_file_offset(response.body):
@@ -75,7 +81,7 @@ def test_warc_recorder(self):
         session.request_data(request.to_bytes())
         session.end_request(request)
         session.begin_response(response)
-        session.response_data(response.to_bytes())
+        session.response_data(response_header_bytes)
         session.response_data(response.body.content())
         session.end_response(response)
         session.close()
@@ -135,7 +141,7 @@ def test_warc_recorder(self):
         self.assertTrue(cdx_lines[0].startswith(b' CDX'))
 
         self.assertEqual(b'http://example.com/', cdx_fields[0])
-        self.assertEqual(b'-', cdx_fields[2])
+        self.assertEqual(b'text/plain', cdx_fields[2])
         self.assertEqual(b'200', cdx_fields[3])
         self.assertNotEqual(b'-', cdx_fields[4])
         self.assertNotEqual(b'0', cdx_fields[5])

From 451cd2ef7fe1f20b6b977916cd2d57c1fa721ce1 Mon Sep 17 00:00:00 2001
From: John Brooks <john@fastquake.com>
Date: Sun, 26 Mar 2017 13:56:46 -0400
Subject: [PATCH 2/2] warc: Fix CDX fields missing with multi-line HTTP headers

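The regex used to find the end of the HTTP headers would not match if a
newline appeared before the blank line that terminates the headers,
because `.` does not match newline characters unless re.DOTALL is set.
As a result get_http_header would return nothing and the status code and
MIME type fields would be empty in the CDX record.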
---
 wpull/warc/format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wpull/warc/format.py b/wpull/warc/format.py
index 93245576..21f02d8a 100644
--- a/wpull/warc/format.py
+++ b/wpull/warc/format.py
@@ -163,7 +163,7 @@ def get_http_header(self) -> Response:
         with wpull.util.reset_file_offset(self.block_file):
             data = self.block_file.read(4096)
 
-        match = re.match(br'(.*?\r?\n\r?\n)', data)
+        match = re.match(br'(.*?\r?\n\r?\n)', data, re.DOTALL)
 
         if not match:
             return