Merge commit '1af45d06f021166189bef471dbc10f1f1e1fbf47'
chfoo committed Apr 3, 2015
2 parents 2ff0a52 + 1af45d0 commit a7ca8c2
Showing 18 changed files with 420 additions and 194 deletions.
12 changes: 12 additions & 0 deletions doc/changelog.rst
@@ -7,6 +7,18 @@ Summary of notable changes.
.. Take advice from http://keepachangelog.com/.
1.1 (2015-04-03)
================

* Security: Updated certificate bundle.
* Fixed: ``--regex-type`` now accepts ``pcre`` instead of ``posix``. Regular expressions always use Python's regex library; POSIX regex is not supported.
* Fixed: when using ``--warc-max-size`` and ``--warc-append``, Wpull unnecessarily wrote to existing sequential WARC files.
* Fixed: input URLs were stored in memory instead of saved on disk. This issue was notable when many URLs were provided with the ``--input-file`` option.
* Changed: when using ``--warc-max-size`` and ``--warc-append``, the next sequential WARC file is created to avoid appending to corrupt files.
* Changed: WARC file writing now uses journal files, and the program refuses to start if any journal files exist. This avoids corrupting files through naive use of ``--warc-append`` and allows for future automated recovery (a sketch follows this list).
* Added: Open Graph and Twitter Card element links extraction.
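The two WARC-related entries above describe a journal-file guard. A minimal sketch of the idea, assuming hypothetical names (``open_warc_for_writing`` and ``close_warc`` are illustrative, not Wpull's API):

    import glob
    import os

    def open_warc_for_writing(warc_path):
        '''Open a WARC file for appending, guarded by a journal file.'''
        if glob.glob('*.journal'):
            # A leftover journal means a previous session died mid-write;
            # refuse to run rather than risk appending to a corrupt WARC.
            raise OSError('Journal file found; refusing to start.')
        journal_path = warc_path + '.journal'
        open(journal_path, 'w').close()  # mark the WARC as in progress
        return open(warc_path, 'ab'), journal_path

    def close_warc(warc_file, journal_path):
        warc_file.close()
        os.remove(journal_path)  # a missing journal signals a clean finish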


1.0 (2015-03-14)
================

2 changes: 1 addition & 1 deletion doc/differences.rst
@@ -23,7 +23,7 @@ Missing in Wpull
* ``--no-passive-ftp``
* ``--mirror``
* ``--strict-comments``: No plans for support of this option.
* No support yet for using SSL/TLS connections on proxies. They will be unencrypted to the destination.
* ``--regex-type=posix``: No plans to support posix regex.
* Features greater than Wget 1.15.


2 changes: 1 addition & 1 deletion doc/install.rst
@@ -75,7 +75,7 @@ Install Wpull from GitHub::
Pre-built Binaries
==================

Wpull has pre-built binaries located at https://launchpad.net/wpull/+download. These are unsupported and may not be update to date.
Wpull has pre-built binaries located at https://launchpad.net/wpull/+download. These are unsupported and may not be up to date.


Caveats
2 changes: 1 addition & 1 deletion wpull/body.py
@@ -93,7 +93,7 @@ def __iter__(self):
def new_temp_file(directory=None, hint=''):
    '''Return a new temporary file.'''
    return tempfile.NamedTemporaryFile(
        prefix='wpull-{0}-'.format(hint), suffix='.tmp', dir=directory)
        prefix='tmp-wpull-{0}-'.format(hint), suffix='.tmp', dir=directory)


def is_seekable(file):
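The renamed ``tmp-wpull-`` prefix (also applied to the temporary files in ``builder.py`` below) presumably makes leftovers from crashed sessions easy to find and sweep up, e.g.:

    import glob
    import os

    # Remove stray temporary files left behind by crashed sessions
    # (an illustration, not part of this commit).
    for leftover in glob.glob('tmp-wpull-*'):
        os.remove(leftover)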
38 changes: 28 additions & 10 deletions wpull/builder.py
@@ -8,6 +8,7 @@
import itertools
import logging
import os.path
import shelve
import socket
import ssl
import sys
@@ -160,7 +161,10 @@ def __init__(self, args, unit_test=False):
            'WebProcessorInstances': WebProcessorInstances,
            'YoutubeDlCoprocessor': YoutubeDlCoprocessor,
        })
        self._url_infos = None
        self._input_urls_temp_dir = tempfile.TemporaryDirectory(
            prefix='tmp-wpull', dir=os.getcwd())
        self._input_urls_db = shelve.open(
            os.path.join(self._input_urls_temp_dir.name, 'input_urls.db'))
        self._ca_certs_file = None
        self._file_log_handler = None
        self._console_log_handler = None
@@ -196,11 +200,15 @@ def build(self):
        resource_monitor = self._build_resource_monitor()

        self._build_demux_document_scraper()
        self._url_infos = tuple(self._build_input_urls())
        for url_info in self._build_input_urls():
            self._input_urls_db[url_info.url] = url_info

        statistics = self._factory.new('Statistics')
        statistics.quota = self._args.quota
        statistics.required_url_infos.update(self._url_infos)

        if self._args.quota:
            for url_info in self._input_urls_db.values():
                statistics.required_urls_db[url_info.url] = True

        url_table = self._build_url_table()
        processor = self._build_processor()
@@ -223,9 +231,19 @@ def build(self):
        self._warn_unsafe_options()
        self._warn_silly_options()

        url_table.add_many(
            [{'url': url_info.url} for url_info in self._url_infos]
        )
        batch = []

        for url_info in self._input_urls_db.values():
            batch.append({'url': url_info.url})
            if len(batch) > 1000:
                url_table.add_many(batch)
                batch = []

        url_table.add_many(batch)

        self._input_urls_db.close()
        self._input_urls_temp_dir.cleanup()
        self._input_urls_temp_dir = None

        return self._factory['Application']

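Taken together, the ``build()`` changes above replace the in-memory tuple of input URLs with a disk-backed ``shelve`` database and insert URLs into the table in bounded batches. A standalone sketch of the same pattern, where ``add_many`` is a stand-in for ``url_table.add_many``:

    import os
    import shelve
    import tempfile

    def add_many(rows):  # stand-in for url_table.add_many
        print('inserting', len(rows), 'rows')

    # Parsed URLs go to a shelve database on disk instead of a tuple in RAM.
    temp_dir = tempfile.TemporaryDirectory(prefix='tmp-wpull', dir=os.getcwd())
    urls_db = shelve.open(os.path.join(temp_dir.name, 'input_urls.db'))

    for url in ('http://example.com/a', 'http://example.com/b'):
        urls_db[url] = {'url': url}

    # Feed the table in bounded batches so memory use stays flat
    # no matter how many URLs the input file contains.
    batch = []
    for record in urls_db.values():
        batch.append({'url': record['url']})
        if len(batch) > 1000:
            add_many(batch)
            batch = []
    add_many(batch)  # flush the final partial batch

    urls_db.close()
    temp_dir.cleanup()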
@@ -506,7 +524,7 @@ def _read_input_file_as_lines(self):
        input_file = codecs.getreader(
            self._args.local_encoding or 'utf-8')(self._args.input_file)

        urls = [line.strip() for line in input_file if line.strip()]
        urls = (line.strip() for line in input_file if line.strip())

        if not urls:
            raise ValueError(_('No URLs found in input file.'))
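One caveat with the list-to-generator change: a generator expression is always truthy, so the ``if not urls:`` guard above can no longer fire. A lazy variant that keeps the emptiness check could peek at the first item (a sketch, not Wpull's code):

    import itertools

    def require_non_empty(iterable, message):
        '''Lazily yield items, raising ValueError if there are none.'''
        iterator = iter(iterable)
        try:
            first = next(iterator)
        except StopIteration:
            raise ValueError(message)
        return itertools.chain((first,), iterator)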
@@ -519,7 +537,7 @@ def _read_input_file_as_html(self):
            self._args.input_file,
            encoding=self._args.local_encoding or 'utf-8'
        )
        links = [context.link for context in scrape_result.link_contexts]
        links = (context.link for context in scrape_result.link_contexts)

        return links

@@ -537,7 +555,7 @@ def _build_url_filters(self):
                enabled=args.recursive, page_requisites=args.page_requisites
            ),
            SpanHostsFilter(
                self._url_infos,
                (url_info for url_info in self._input_urls_db.values()),
                enabled=args.span_hosts,
                page_requisites='page-requisites' in args.span_hosts_allow,
                linked_pages='linked-pages' in args.span_hosts_allow,
@@ -1427,7 +1445,7 @@ def _load_ca_certs(self):
            ))

        self._ca_certs_file = certs_filename = tempfile.mkstemp(
            suffix='.pem', prefix='wpull-')[1]
            suffix='.pem', prefix='tmp-wpull-')[1]

        def clean_certs_file():
            os.remove(certs_filename)
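For reference, ``tempfile.mkstemp`` returns an ``(fd, path)`` tuple, which is why the code above keeps only element ``[1]``. A small illustration of the idiom with the descriptor closed explicitly:

    import os
    import tempfile

    # mkstemp returns (fd, path); index [1] keeps only the filename.
    fd, certs_filename = tempfile.mkstemp(suffix='.pem', prefix='tmp-wpull-')
    os.close(fd)  # close the raw descriptor; reopen by name when needed
    print(certs_filename)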
