Skip to content

Commit

Permalink
Merge commit '0717f36b22242f9b9da9ffdc2778455a0984eda8'
Browse files Browse the repository at this point in the history
  • Loading branch information
chfoo committed Feb 19, 2015
2 parents bb93b41 + 0717f36 commit ad06324
Show file tree
Hide file tree
Showing 50 changed files with 819 additions and 211 deletions.
11 changes: 11 additions & 0 deletions doc/changelog.rst
Expand Up @@ -2,6 +2,17 @@
What's New
==========


0.1007 (2015-02-19)
===================

* Fixed malformed URLs being printed to logs without sanitization.
* Fixed AttributeError crash on FTP servers that support MLSD.
* Improved link recursion heuristics when extracting from JavaScript and HTML.
* Added ``--retr-symlinks``.
* Added ``--session-timeout``.


0.1006.1 (2015-02-09)
=====================

Expand Down
3 changes: 3 additions & 0 deletions test/README.md
Expand Up @@ -5,3 +5,6 @@ This directory contains tests that are outside the normal unit tests.

* `fuzz_fusil`: Fuzz testing with single HTML pages
* `fuzz_fusil_2`: Fuzz testing with a web server
* `perf_profile`: CPU profiling helper script. See `wpull/__main__.py` for details on how the profile file is created.

The tests may require huhhttp to be installed or available on the Python path.
62 changes: 62 additions & 0 deletions test/perf_profile/runner.py
@@ -0,0 +1,62 @@
'''Profiling runner.'''

import os
import subprocess
import time
import atexit


def main():
    '''Run the fuzz web server and a profiled wpull client against it.

    Starts huhhttp on port 8855, then launches wpull with RUN_PROFILE=1
    (see wpull/__main__.py) so a cProfile stats file is written.  Both
    child processes are terminated on exit.
    '''
    server_proc = subprocess.Popen([
        'python3', '-m', 'huhhttp', '--port=8855', '--seed=4567',
        '--fuzz-period=999999'
    ])

    # Inherit the current environment (PATH etc.) instead of replacing
    # it wholesale; point PYTHONPATH at the project root so the in-tree
    # wpull package is the one being profiled.
    client_env = dict(os.environ)
    client_env['PYTHONPATH'] = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), '..', '..'
    )
    client_env['RUN_PROFILE'] = '1'

    client_proc = subprocess.Popen(
        [
            'python3', '-m', 'wpull', 'localhost:8855', '--waitretry=0',
            '--timeout=0.5', '-r', '--page-requisites', '-l=4', '--tries=1',
            '--delete-after',
        ],
        env=client_env
    )

    def cleanup():
        '''Terminate, then kill, any still-running child process.'''
        for proc in (server_proc, client_proc):
            # poll() refreshes returncode; without it, returncode stays
            # None even after the child has already exited.
            if proc.poll() is None:
                try:
                    proc.terminate()
                except OSError:
                    pass
                time.sleep(0.1)

        # Forcefully kill anything that ignored terminate().
        for proc in (server_proc, client_proc):
            try:
                proc.kill()
            except OSError:
                pass

    atexit.register(cleanup)

    client_proc.wait()
    cleanup()


if __name__ == '__main__':
    main()
6 changes: 4 additions & 2 deletions wpull/__main__.py
Expand Up @@ -49,8 +49,10 @@ def main(exit=True, install_tornado_bridge=True, prefer_trollius=True):
if os.environ.get('RUN_PROFILE'):
import cProfile
cProfile.run('main()', 'stats-{0}.profile'.format(int(time.time())))
# I suggest installing runsnakerun to view the profile file graphically
# Or, for Python 3.4, use kcachegrind and pyprof2calltree
# For Python 3.2, I suggest installing runsnakerun to view the
# profile file graphically
# For Python 3.4, use kcachegrind and pyprof2calltree, or
# try snakeviz
elif os.environ.get('RUN_PDB'):
import pdb

Expand Down
37 changes: 25 additions & 12 deletions wpull/abstract/client.py
Expand Up @@ -7,11 +7,16 @@
import trollius

from wpull.connection import ConnectionPool
from wpull.errors import NetworkTimedOut


_logger = logging.getLogger(__name__)


class DurationTimeout(NetworkTimedOut):
    '''Raised when a download did not complete within the specified
    session duration.  Subclasses :class:`.errors.NetworkTimedOut` so
    existing timeout handling also catches it.'''


class BaseClient(object, metaclass=abc.ABCMeta):
'''Base client.'''
def __init__(self, connection_pool=None, recorder=None,
Expand All @@ -36,6 +41,7 @@ def __init__(self, connection_pool=None, recorder=None,
@abc.abstractmethod
def _session_class(self):
    '''Return the session class this client instantiates.

    Subclasses return their concrete :class:`BaseSession` subclass.
    '''
    # Abstract, but returns a value anyway so static code checkers
    # can infer the session interface.
    return BaseSession  # return something for code checkers

@contextlib.contextmanager
def session(self):
Expand All @@ -58,11 +64,12 @@ def session(self):
yield session
except Exception as error:
if not isinstance(error, StopIteration):
_logger.debug('Close session.')
session.close()
_logger.debug('Early close session.')
session.abort()
session.recycle()
raise
finally:
session.clean()
else:
session.recycle()
else:
session = self._session_class()(
connection_pool=self._connection_pool,
Expand All @@ -72,11 +79,12 @@ def session(self):
yield session
except Exception as error:
if not isinstance(error, StopIteration):
_logger.debug('Close session.')
session.close()
_logger.debug('Early close session.')
session.abort()
session.recycle()
raise
finally:
session.clean()
else:
session.recycle()

def close(self):
'''Close the connection pool and recorders.'''
Expand All @@ -95,14 +103,19 @@ def __init__(self, connection_pool=None, recorder_session=None,
self._connection_pool = connection_pool
self._recorder_session = recorder_session
self._proxy_adapter = proxy_adapter
self._request = None
self._connection = None

@abc.abstractmethod
def close(self):
'''Close any connections.'''
def abort(self):
'''Terminate early and close any connections.'''

@abc.abstractmethod
def clean(self):
'''Return connection back to the pool.'''
def recycle(self):
'''Clean up and return connection back to the pool.
Connections should be kept alive if supported.
'''

@trollius.coroutine
def _check_out_connection(self, request):
Expand Down
74 changes: 68 additions & 6 deletions wpull/app_test.py
Expand Up @@ -196,6 +196,7 @@ def test_app_args(self):
'--no-strong-crypto',
'--no-skip-getaddrinfo',
'--limit-rate', '1m',
'--session-timeout', '300',
])
with cd_tempdir():
builder = Builder(args, unit_test=True)
Expand Down Expand Up @@ -980,9 +981,6 @@ def test_page_requisite_level(self):
self.assertEqual(0, exit_code)
self.assertEqual(2, builder.factory['Statistics'].files)

# FIXME: not entirely working yet in JS scraper
# it still grabs too much
@unittest.skip('not entirely working yet in JS scraper')
@wpull.testing.async.async_test(timeout=DEFAULT_TIMEOUT)
def test_link_type(self):
arg_parser = AppArgumentParser()
Expand Down Expand Up @@ -1201,7 +1199,7 @@ def test_ssl_bad_certificate(self):

class MockWebSession(WebSession):
@trollius.coroutine
def fetch(self, file=None, callback=None):
def fetch(self, file=None, callback=None, duration_timeout=None):
raise SSLVerificationError('A very bad certificate!')

class MockWebClient(builder.factory.class_map['WebClient']):
Expand All @@ -1219,6 +1217,7 @@ def session(self, request):


class PhantomJSMixin(object):
@unittest.skipIf(IS_PYPY, 'Broken under Travis CI')
@wpull.testing.async.async_test(timeout=DEFAULT_TIMEOUT)
def test_app_phantomjs(self):
arg_parser = AppArgumentParser()
Expand Down Expand Up @@ -1445,6 +1444,23 @@ def test_no_content(self):
self.assertEqual(0, exit_code)
self.assertEqual(1, builder.factory['Statistics'].files)

@wpull.testing.async.async_test(timeout=DEFAULT_TIMEOUT)
def test_session_timeout(self):
arg_parser = AppArgumentParser()
args = arg_parser.parse_args([
self.get_url('/sleep_long'),
'--tries=1',
'--session-timeout=0.1'
])
builder = Builder(args, unit_test=True)

with cd_tempdir():
app = builder.build()
exit_code = yield From(app.run())

self.assertEqual(4, exit_code)
self.assertEqual(0, builder.factory['Statistics'].files)


class TestAppFTP(FTPTestCase):
def setUp(self):
Expand Down Expand Up @@ -1529,12 +1545,14 @@ def test_args(self):
exit_code = yield From(app.run())

self.assertEqual(8, exit_code)
self.assertEqual(5, builder.factory['Statistics'].files)
self.assertEqual(6, builder.factory['Statistics'].files)

print(os.listdir())
print(os.listdir('.'))

self.assertTrue(os.path.exists('.listing'))
self.assertTrue(os.path.exists('example.txt'))
self.assertTrue(os.path.exists('readme.txt'))
self.assertFalse(os.path.islink('readme.txt'))
self.assertTrue(os.path.exists('example1/.listing'))
self.assertTrue(os.path.exists('example2/.listing'))
self.assertTrue(os.path.exists('mywarc.warc.gz'))
Expand All @@ -1547,6 +1565,50 @@ def test_args(self):
.encode('utf-8'),
data)

@wpull.testing.async.async_test(timeout=DEFAULT_TIMEOUT)
def test_retr_symlinks_off(self):
arg_parser = AppArgumentParser()
args = arg_parser.parse_args([
self.get_url('/'),
'-r',
'--level', '1',
'--tries', '1',
'--no-host-directories',
'--retr-symlinks=off',
])
builder = Builder(args, unit_test=True)

with cd_tempdir():
app = builder.build()
exit_code = yield From(app.run())

self.assertEqual(0, exit_code)

print(os.listdir('.'))

self.assertTrue(os.path.exists('example.txt'))
self.assertTrue(os.path.exists('readme.txt'))
self.assertTrue(os.path.islink('readme.txt'))

# TODO: todo
# @wpull.testing.async.async_test(timeout=DEFAULT_TIMEOUT)
# def test_file_vs_directory(self):
# arg_parser = AppArgumentParser()
# args = arg_parser.parse_args([
# self.get_url('/example2'),
# '--no-host-directories',
# '--no-remove-listing',
# ])
# builder = Builder(args, unit_test=True)
#
# with cd_tempdir():
# app = builder.build()
# exit_code = yield From(app.run())
#
# self.assertEqual(0, exit_code)
#
# self.assertTrue(os.path.exists('example2/.listing'))


@trollius.coroutine
def tornado_future_adapter(future):
Expand Down
16 changes: 12 additions & 4 deletions wpull/builder.py
Expand Up @@ -13,7 +13,6 @@
import sys
import tempfile

import tornado.testing
import tornado.netutil
import tornado.web
import trollius
Expand All @@ -24,12 +23,14 @@
from wpull.connection import Connection, ConnectionPool, SSLConnection
from wpull.converter import BatchDocumentConverter
from wpull.cookie import DeFactoCookiePolicy, RelaxedMozillaCookieJar
from wpull.coprocessor.phantomjs import PhantomJSCoprocessor, PhantomJSParams
from wpull.coprocessor.proxy import ProxyCoprocessor
from wpull.coprocessor.youtubedl import YoutubeDlCoprocessor
from wpull.database.sqltable import URLTable as SQLURLTable, GenericSQLURLTable
from wpull.database.wrap import URLTableHookWrapper
from wpull.debug import DebugConsoleHandler
from wpull.dns import Resolver, PythonResolver
from wpull.driver.phantomjs import PhantomJSDriver
from wpull.engine import Engine
from wpull.factory import Factory
from wpull.ftp.client import Client as FTPClient
Expand All @@ -42,14 +43,12 @@
from wpull.http.stream import Stream as HTTPStream
from wpull.http.web import WebClient
from wpull.namevalue import NameValueRecord
from wpull.driver.phantomjs import PhantomJSDriver
from wpull.options import LOG_QUIET, LOG_VERY_QUIET, LOG_NO_VERBOSE, LOG_VERBOSE, \
LOG_DEBUG
from wpull.processor.delegate import DelegateProcessor
from wpull.processor.ftp import FTPProcessor, FTPProcessorFetchParams, \
FTPProcessorInstances
from wpull.processor.rule import FetchRule, ResultRule, ProcessingRule
from wpull.coprocessor.phantomjs import PhantomJSCoprocessor, PhantomJSParams
from wpull.processor.web import WebProcessor, WebProcessorFetchParams, \
WebProcessorInstances
from wpull.proxy import HTTPProxyServer
Expand Down Expand Up @@ -79,6 +78,8 @@
from wpull.writer import (PathNamer, NullWriter, OverwriteFileWriter,
IgnoreFileWriter, TimestampingFileWriter,
AntiClobberFileWriter)
import wpull.coprocessor.youtubedl
import wpull.driver.phantomjs
import wpull.version


Expand Down Expand Up @@ -795,6 +796,7 @@ def _build_web_processor(self):
url_filter=url_filter, robots_txt_checker=robots_txt_checker,
http_login=(http_username, http_password),
ftp_login=(ftp_username, ftp_password),
duration_timeout=args.session_timeout,
)

waiter = self._factory.new('Waiter',
Expand Down Expand Up @@ -869,7 +871,8 @@ def _build_ftp_processor(self):

fetch_params = self._factory.new(
'FTPProcessorFetchParams',
remove_listing=self._args.remove_listing
remove_listing=self._args.remove_listing,
retr_symlinks=self._args.retr_symlinks,
)

instances = self._factory.new(
Expand Down Expand Up @@ -1463,6 +1466,11 @@ def _warn_unsafe_options(self):
_('The use of unsafe options may lead to unexpected behavior '
'or file corruption.'))

if not self._args.retr_symlinks:
_logger.warning(
_('The --retr-symlinks=off option is a security risk.')
)

def _get_stderr(self):
'''Return stderr or something else if under unit testing.'''
if self._unit_test:
Expand Down

0 comments on commit ad06324

Please sign in to comment.