Merge commit '1af45d06f021166189bef471dbc10f1f1e1fbf47'
chfoo committed Apr 3, 2015
2 parents 2ff0a52 + 1af45d0 commit a7ca8c2
Showing 18 changed files with 420 additions and 194 deletions.
12 changes: 12 additions & 0 deletions doc/changelog.rst
@@ -7,6 +7,18 @@ Summary of notable changes.
.. Take advice from http://keepachangelog.com/.
1.1 (2015-04-03)
================

* Security: Updated certificate bundle.
* Fixed: ``--regex-type`` now accepts ``pcre`` instead of ``posix``. Regular expressions always use Python's regex library; POSIX regex is not supported.
* Fixed: when using ``--warc-max-size`` and ``--warc-append``, Wpull unnecessarily wrote to existing sequential WARC files.
* Fixed: input URLs were stored in memory instead of saved on disk. This issue was notable when many URLs were provided with the ``--input-file`` option.
* Changed: when using ``--warc-max-size`` and ``--warc-append``, the next sequential WARC file is created to avoid appending to corrupt files.
* Changed: WARC file writing now uses journal files, and the program refuses to start if any journal files exist. This avoids corrupting files through naive use of ``--warc-append`` and allows for future automated recovery (a sketch follows this list).
* Added: Open Graph and Twitter Card element links extraction.
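The two WARC-related entries above describe a journal-file guard. A minimal sketch of the idea, assuming hypothetical names (``open_warc_for_writing`` and ``close_warc`` are illustrative, not Wpull's API):

    import glob
    import os

    def open_warc_for_writing(warc_path):
        '''Open a WARC file for appending, guarded by a journal file.'''
        if glob.glob('*.journal'):
            # A leftover journal means a previous session died mid-write;
            # refuse to run rather than risk appending to a corrupt WARC.
            raise OSError('Journal file found; refusing to start.')
        journal_path = warc_path + '.journal'
        open(journal_path, 'w').close()  # mark the WARC as in progress
        return open(warc_path, 'ab'), journal_path

    def close_warc(warc_file, journal_path):
        warc_file.close()
        os.remove(journal_path)  # a missing journal signals a clean finish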


1.0 (2015-03-14)
================

2 changes: 1 addition & 1 deletion doc/differences.rst
@@ -23,7 +23,7 @@ Missing in Wpull
* ``--no-passive-ftp``
* ``--mirror``
* ``--strict-comments``: No plans for support of this option.
* No support yet for using SSL/TLS connections on proxies. They will be unencrypted to the destination.
* ``--regex-type=posix``: No plans to support posix regex.
* Features greater than Wget 1.15.


2 changes: 1 addition & 1 deletion doc/install.rst
@@ -75,7 +75,7 @@ Install Wpull from GitHub::
Pre-built Binaries
==================

Wpull has pre-built binaries located at https://launchpad.net/wpull/+download. These are unsupported and may not be update to date.
Wpull has pre-built binaries located at https://launchpad.net/wpull/+download. These are unsupported and may not be up to date.


Caveats
2 changes: 1 addition & 1 deletion wpull/body.py
@@ -93,7 +93,7 @@ def __iter__(self):
def new_temp_file(directory=None, hint=''):
    '''Return a new temporary file.'''
    return tempfile.NamedTemporaryFile(
        prefix='wpull-{0}-'.format(hint), suffix='.tmp', dir=directory)
        prefix='tmp-wpull-{0}-'.format(hint), suffix='.tmp', dir=directory)


def is_seekable(file):
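The renamed ``tmp-wpull-`` prefix (also applied to the temporary files in ``builder.py`` below) presumably makes leftovers from crashed sessions easy to find and sweep up, e.g.:

    import glob
    import os

    # Remove stray temporary files left behind by crashed sessions
    # (an illustration, not part of this commit).
    for leftover in glob.glob('tmp-wpull-*'):
        os.remove(leftover)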
38 changes: 28 additions & 10 deletions wpull/builder.py
@@ -8,6 +8,7 @@
import itertools
import logging
import os.path
import shelve
import socket
import ssl
import sys
@@ -160,7 +161,10 @@ def __init__(self, args, unit_test=False):
            'WebProcessorInstances': WebProcessorInstances,
            'YoutubeDlCoprocessor': YoutubeDlCoprocessor,
        })
        self._url_infos = None
        self._input_urls_temp_dir = tempfile.TemporaryDirectory(
            prefix='tmp-wpull', dir=os.getcwd())
        self._input_urls_db = shelve.open(
            os.path.join(self._input_urls_temp_dir.name, 'input_urls.db'))
        self._ca_certs_file = None
        self._file_log_handler = None
        self._console_log_handler = None
@@ -196,11 +200,15 @@ def build(self):
        resource_monitor = self._build_resource_monitor()

        self._build_demux_document_scraper()
        self._url_infos = tuple(self._build_input_urls())
        for url_info in self._build_input_urls():
            self._input_urls_db[url_info.url] = url_info

        statistics = self._factory.new('Statistics')
        statistics.quota = self._args.quota
        statistics.required_url_infos.update(self._url_infos)

        if self._args.quota:
            for url_info in self._input_urls_db.values():
                statistics.required_urls_db[url_info.url] = True

        url_table = self._build_url_table()
        processor = self._build_processor()
@@ -223,9 +231,19 @@ def build(self):
        self._warn_unsafe_options()
        self._warn_silly_options()

        url_table.add_many(
            [{'url': url_info.url} for url_info in self._url_infos]
        )
        batch = []

        for url_info in self._input_urls_db.values():
            batch.append({'url': url_info.url})
            if len(batch) > 1000:
                url_table.add_many(batch)
                batch = []

        url_table.add_many(batch)

        self._input_urls_db.close()
        self._input_urls_temp_dir.cleanup()
        self._input_urls_temp_dir = None

        return self._factory['Application']

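Taken together, the ``build()`` changes above replace the in-memory tuple of input URLs with a disk-backed ``shelve`` database and insert URLs into the table in bounded batches. A standalone sketch of the same pattern, where ``add_many`` is a stand-in for ``url_table.add_many``:

    import os
    import shelve
    import tempfile

    def add_many(rows):  # stand-in for url_table.add_many
        print('inserting', len(rows), 'rows')

    # Parsed URLs go to a shelve database on disk instead of a tuple in RAM.
    temp_dir = tempfile.TemporaryDirectory(prefix='tmp-wpull', dir=os.getcwd())
    urls_db = shelve.open(os.path.join(temp_dir.name, 'input_urls.db'))

    for url in ('http://example.com/a', 'http://example.com/b'):
        urls_db[url] = {'url': url}

    # Feed the table in bounded batches so memory use stays flat
    # no matter how many URLs the input file contains.
    batch = []
    for record in urls_db.values():
        batch.append({'url': record['url']})
        if len(batch) > 1000:
            add_many(batch)
            batch = []
    add_many(batch)  # flush the final partial batch

    urls_db.close()
    temp_dir.cleanup()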
@@ -506,7 +524,7 @@ def _read_input_file_as_lines(self):
        input_file = codecs.getreader(
            self._args.local_encoding or 'utf-8')(self._args.input_file)

        urls = [line.strip() for line in input_file if line.strip()]
        urls = (line.strip() for line in input_file if line.strip())

        if not urls:
            raise ValueError(_('No URLs found in input file.'))
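One caveat with the list-to-generator change: a generator expression is always truthy, so the ``if not urls:`` guard above can no longer fire. A lazy variant that keeps the emptiness check could peek at the first item (a sketch, not Wpull's code):

    import itertools

    def require_non_empty(iterable, message):
        '''Lazily yield items, raising ValueError if there are none.'''
        iterator = iter(iterable)
        try:
            first = next(iterator)
        except StopIteration:
            raise ValueError(message)
        return itertools.chain((first,), iterator)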
@@ -519,7 +537,7 @@ def _read_input_file_as_html(self):
            self._args.input_file,
            encoding=self._args.local_encoding or 'utf-8'
        )
        links = [context.link for context in scrape_result.link_contexts]
        links = (context.link for context in scrape_result.link_contexts)

        return links

@@ -537,7 +555,7 @@ def _build_url_filters(self):
                enabled=args.recursive, page_requisites=args.page_requisites
            ),
            SpanHostsFilter(
                self._url_infos,
                (url_info for url_info in self._input_urls_db.values()),
                enabled=args.span_hosts,
                page_requisites='page-requisites' in args.span_hosts_allow,
                linked_pages='linked-pages' in args.span_hosts_allow,
@@ -1427,7 +1445,7 @@ def _load_ca_certs(self):
            ))

        self._ca_certs_file = certs_filename = tempfile.mkstemp(
            suffix='.pem', prefix='wpull-')[1]
            suffix='.pem', prefix='tmp-wpull-')[1]

        def clean_certs_file():
            os.remove(certs_filename)
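For reference, ``tempfile.mkstemp`` returns an ``(fd, path)`` tuple, which is why the code above keeps only element ``[1]``. A small illustration of the idiom with the descriptor closed explicitly:

    import os
    import tempfile

    # mkstemp returns (fd, path); index [1] keeps only the filename.
    fd, certs_filename = tempfile.mkstemp(suffix='.pem', prefix='tmp-wpull-')
    os.close(fd)  # close the raw descriptor; reopen by name when needed
    print(certs_filename)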
