Skip to content
Permalink
Browse files

Move everything and make grab-site installable with pip3

  • Loading branch information...
ivan committed Jul 18, 2015
1 parent 1266cf6 commit 43d8a9594ff18d60c8806c2546e220a20200f3ce
@@ -1 +1 @@
/__pycache__
__pycache__
@@ -37,17 +37,18 @@ echo "global,$igsets" > "$dir/igsets"
touch "$dir/igoff"
touch "$dir/ignores"

LIBGRABSITE="$(python3 -c 'import os, libgrabsite; print(os.path.dirname(libgrabsite.__file__))')"

# Note: we use the default html5lib parser instead of the lxml parser that ArchiveBot uses.
# html5lib is slower, but it is better at parsing and, unlike lxml, does not (on rare occasions) corrupt the heap.

GRAB_SITE_WORKING_DIR="$dir" PYTHONPATH="$self" "$self/wpull" \
GRAB_SITE_WORKING_DIR="$dir" "$self/patched-wpull" \
-U "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0" \
--header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
--header="Accept-Language: en-US,en;q=0.5" \
-o "$dir/wpull.log" \
--database "$dir/wpull.db" \
--plugin-script "$self/plugin.py" \
--python-script "$self/wpull_hooks.py" \
--plugin-script "$LIBGRABSITE/plugin.py" \
--python-script "$LIBGRABSITE/wpull_hooks.py" \
--plugin-args " --dupes-db $dir/dupes_db" \
--save-cookies "$dir/cookies.txt" \
--no-check-certificate \
@@ -0,0 +1,4 @@
#!/usr/bin/python3
"""Launcher script: hands control to ``libgrabsite.server.main()``."""

from libgrabsite.server import main

main()
@@ -0,0 +1 @@
__version__ = '0.1.0'
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -6,8 +6,8 @@
from wpull.document.html import HTMLReader
import wpull.processor.rule

import dupespotter
from dupes import DupesInMemory, DupesOnDisk
from libgrabsite import dupespotter
from libgrabsite.dupes import DupesInMemory, DupesOnDisk



File renamed without changes.
@@ -7,7 +7,7 @@
import trollius as asyncio
from urllib.request import urlopen
from autobahn.asyncio.websocket import WebSocketClientFactory, WebSocketClientProtocol
from ignoracle import Ignoracle, parameterize_record_info
from libgrabsite.ignoracle import Ignoracle, parameterize_record_info

realStdoutWrite = sys.stdout.buffer.write
realStderrWrite = sys.stderr.buffer.write
File renamed without changes.
@@ -0,0 +1,28 @@
#!/usr/bin/python3
"""Packaging script for grab-site.

Installs the ``libgrabsite`` package (including its ``*.html`` data files)
together with the ``grab-site``, ``gs-server`` and ``patched-wpull`` scripts.
"""

try:
    from setuptools import setup
except ImportError:
    # setuptools is not importable; use the distutils implementation instead.
    from distutils.core import setup

import libgrabsite

# Trove classifiers passed to setup().
CLASSIFIERS = [
    "Programming Language :: Python :: 3",
    "Development Status :: 3 - Alpha",
    "Intended Audience :: End Users/Desktop",
    "License :: OSI Approved :: MIT License",
    "Topic :: Internet :: WWW/HTTP",
]

# Executable scripts installed alongside the package.
SCRIPTS = ["grab-site", "gs-server", "patched-wpull"]

# Distributions required at install time.
REQUIREMENTS = ["wpull", "manhole", "lmdb", "autobahn", "aiohttp", "trollius"]

setup(
    name="grab-site",
    # The version is read from the package itself (libgrabsite.__version__).
    version=libgrabsite.__version__,
    description="The archivist's web crawler: WARC output, dashboard for all crawls, dynamic ignore patterns",
    url="https://github.com/ludios/grab-site",
    author="Ivan Kozik",
    author_email="ivan@ludios.org",
    classifiers=CLASSIFIERS,
    scripts=SCRIPTS,
    packages=["libgrabsite"],
    package_data={"libgrabsite": ["*.html"]},
    install_requires=REQUIREMENTS,
)

0 comments on commit 43d8a95

Please sign in to comment.
You can’t perform that action at this time.