{{obj.title|truncatechars:55 }}
+ + {% endif %} + {% comment %}TEXT If needed.
{% endcomment %} +diff --git a/archivebox-0.5.3/.pc/.quilt_patches b/archivebox-0.5.3/.pc/.quilt_patches new file mode 100644 index 0000000..6857a8d --- /dev/null +++ b/archivebox-0.5.3/.pc/.quilt_patches @@ -0,0 +1 @@ +debian/patches diff --git a/archivebox-0.5.3/.pc/.quilt_series b/archivebox-0.5.3/.pc/.quilt_series new file mode 100644 index 0000000..c206706 --- /dev/null +++ b/archivebox-0.5.3/.pc/.quilt_series @@ -0,0 +1 @@ +series diff --git a/archivebox-0.5.3/.pc/.version b/archivebox-0.5.3/.pc/.version new file mode 100644 index 0000000..0cfbf08 --- /dev/null +++ b/archivebox-0.5.3/.pc/.version @@ -0,0 +1 @@ +2 diff --git a/archivebox-0.5.3/.pc/applied-patches b/archivebox-0.5.3/.pc/applied-patches new file mode 100644 index 0000000..e69de29 diff --git a/archivebox-0.5.3/MANIFEST.in b/archivebox-0.5.3/MANIFEST.in new file mode 100644 index 0000000..c9ae153 --- /dev/null +++ b/archivebox-0.5.3/MANIFEST.in @@ -0,0 +1,4 @@ +graft archivebox +global-exclude .DS_Store +global-exclude __pycache__ +global-exclude *.pyc diff --git a/archivebox-0.5.3/PKG-INFO b/archivebox-0.5.3/PKG-INFO new file mode 100644 index 0000000..b6534de --- /dev/null +++ b/archivebox-0.5.3/PKG-INFO @@ -0,0 +1,591 @@ +Metadata-Version: 2.1 +Name: archivebox +Version: 0.5.3 +Summary: The self-hosted internet archive. +Home-page: https://github.com/ArchiveBox/ArchiveBox +Author: Nick Sweeting +Author-email: git@nicksweeting.com +License: MIT +Project-URL: Source, https://github.com/ArchiveBox/ArchiveBox +Project-URL: Documentation, https://github.com/ArchiveBox/ArchiveBox/wiki +Project-URL: Bug Tracker, https://github.com/ArchiveBox/ArchiveBox/issues +Project-URL: Changelog, https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog +Project-URL: Roadmap, https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap +Project-URL: Community, https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community +Project-URL: Donate, https://github.com/ArchiveBox/ArchiveBox/wiki/Donations +Description:
+ "Your own personal internet archive" (网站存档 / 爬虫) ++ + + +
docker-compose
on any platform (recommended, everything included out-of-the-box)docker
on any platformapt
on Ubuntu >=20.04brew
on macOS >=10.13pip
on any platform+"Your own personal internet archive" (网站存档 / 爬虫) ++ + + +
docker-compose
on any platform (recommended, everything included out-of-the-box)docker
on any platformapt
on Ubuntu >=20.04brew
on macOS >=10.13pip
on any platform+ "Your own personal internet archive" (网站存档 / 爬虫) ++ + + +
docker-compose
on any platform (recommended, everything included out-of-the-box)docker
on any platformapt
on Ubuntu >=20.04brew
on macOS >=10.13pip
on any platform+"Your own personal internet archive" (网站存档 / 爬虫) ++ + + +
docker-compose
on any platform (recommended, everything included out-of-the-box)docker
on any platformapt
on Ubuntu >=20.04brew
on macOS >=10.13pip
on any platform{}
',
+ obj.url_hash[:8],
+ )
+
+ def title_str(self, obj):
+ canon = obj.as_link().canonical_outputs()
+ tags = ''.join(
+ format_html('{} ', tag.id, tag)
+ for tag in obj.tags.all()
+ if str(tag).strip()
+ )
+ return format_html(
+ ''
+ '{}
',
+ obj.url,
+ obj.url.split('://www.', 1)[-1].split('://', 1)[-1][:64],
+ )
+
+ def grid_view(self, request):
+     """Render the changelist using the grid template.
+
+     Temporarily points this admin at 'admin/grid_change_list.html' with a
+     page size of 20, delegates to ``changelist_view``, and then puts the
+     previous ``change_list_template``, ``list_per_page`` and
+     ``list_max_show_all`` back. The restore runs in a ``finally`` clause so
+     an exception raised by ``changelist_view`` cannot leave the regular
+     list view stuck with the grid settings.
+     """
+     # Save before monkey patching to restore for changelist list view
+     saved_change_list_template = self.change_list_template
+     saved_list_per_page = self.list_per_page
+     saved_list_max_show_all = self.list_max_show_all
+
+     # Monkey patch here plus core_tags.py
+     self.change_list_template = 'admin/grid_change_list.html'
+     self.list_per_page = 20
+     self.list_max_show_all = self.list_per_page
+
+     try:
+         # Call monkey patched view
+         return self.changelist_view(request)
+     finally:
+         # Restore values, whether or not the view raised
+         self.change_list_template = saved_change_list_template
+         self.list_per_page = saved_list_per_page
+         self.list_max_show_all = saved_list_max_show_all
+
+
+ id_str.short_description = 'ID'
+ title_str.short_description = 'Title'
+ url_str.short_description = 'Original URL'
+
+ id_str.admin_order_field = 'id'
+ title_str.admin_order_field = 'title'
+ url_str.admin_order_field = 'url'
+
+class TagAdmin(admin.ModelAdmin):
+ # Admin options for Tag: list columns, searchable fields, and the edit-form layout.
+ list_display = ('slug', 'name', 'id')
+ # NOTE(review): sort_fields is not among Django's documented ModelAdmin options
+ # (ordering / sortable_by are) -- confirm that it has any effect.
+ sort_fields = ('id', 'name', 'slug')
+ readonly_fields = ('id',)
+ search_fields = ('id', 'name', 'slug')
+ # The edit form lists the read-only id first, followed by name and slug.
+ fields = (*readonly_fields, 'name', 'slug')
+
+
+class ArchiveBoxAdmin(admin.AdminSite):
+ # Custom admin site: sets the ArchiveBox titles and adds an "Add URLs" page to the admin URLs.
+ site_header = 'ArchiveBox'
+ index_title = 'Links'
+ site_title = 'Index'
+
+ def get_urls(self):
+ # The custom add page is listed ahead of the URLs returned by AdminSite.get_urls().
+ return [
+ path('core/snapshot/add/', self.add_view, name='Add'),
+ ] + super().get_urls()
+
+ def add_view(self, request):
+ # Renders add_links.html. GET shows an empty AddLinkForm; a valid POST calls add()
+ # with the submitted URLs and shows its captured stdout; an invalid POST re-renders
+ # the bound form.
+ # NOTE(review): for any other HTTP method no 'form' key is placed in the context.
+ if not request.user.is_authenticated:
+ return redirect(f'/admin/login/?next={request.path}')
+
+ request.current_app = self.name
+ context = {
+ **self.each_context(request),
+ 'title': 'Add URLs',
+ }
+
+ if request.method == 'GET':
+ context['form'] = AddLinkForm()
+
+ elif request.method == 'POST':
+ form = AddLinkForm(request.POST)
+ if form.is_valid():
+ url = form.cleaned_data["url"]
+ print(f'[+] Adding URL: {url}')
+ # The form delivers depth as a string; anything other than "0" is treated as depth 1.
+ depth = 0 if form.cleaned_data["depth"] == "0" else 1
+ # NOTE(review): only "url" and "depth" are read from cleaned_data here;
+ # the form's archive_methods selection is not forwarded to add().
+ input_kwargs = {
+ "urls": url,
+ "depth": depth,
+ "update_all": False,
+ "out_dir": OUTPUT_DIR,
+ }
+ # Capture what add() prints so it can be displayed on the page.
+ add_stdout = StringIO()
+ with redirect_stdout(add_stdout):
+ add(**input_kwargs)
+ print(add_stdout.getvalue())
+
+ # Show the captured output (converted by ansi_to_html) together with a fresh, empty form.
+ context.update({
+ "stdout": ansi_to_html(add_stdout.getvalue().strip()),
+ "form": AddLinkForm()
+ })
+ else:
+ context["form"] = form
+
+ return render(template_name='add_links.html', request=request, context=context)
+
+# Replace the default admin site with ArchiveBoxAdmin and register the user model,
+# Snapshot and Tag on it.
+admin.site = ArchiveBoxAdmin()
+admin.site.register(get_user_model())
+admin.site.register(Snapshot, SnapshotAdmin)
+admin.site.register(Tag, TagAdmin)
+# Turn off the 'delete_selected' action for the whole site.
+admin.site.disable_action('delete_selected')
diff --git a/archivebox-0.5.3/archivebox/core/apps.py b/archivebox-0.5.3/archivebox/core/apps.py
new file mode 100644
index 0000000..26f78a8
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/apps.py
@@ -0,0 +1,5 @@
+from django.apps import AppConfig
+
+
+class CoreConfig(AppConfig):
+ # Django application config for the app whose import name is 'core'.
+ name = 'core'
diff --git a/archivebox-0.5.3/archivebox/core/forms.py b/archivebox-0.5.3/archivebox/core/forms.py
new file mode 100644
index 0000000..86b29bb
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/forms.py
@@ -0,0 +1,67 @@
+__package__ = 'archivebox.core'
+
+from django import forms
+
+from ..util import URL_REGEX
+from ..vendor.taggit_utils import edit_string_for_tags, parse_tags
+
+CHOICES = (
+ ('0', 'depth = 0 (archive just these URLs)'),
+ ('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
+)
+
+from ..extractors import get_default_archive_methods
+
+ARCHIVE_METHODS = [
+ (name, name)
+ for name, _, _ in get_default_archive_methods()
+]
+
+
+class AddLinkForm(forms.Form):
+ # Form with a URL textarea validated against URL_REGEX, an archive-depth radio
+ # choice, and an optional multi-select of archive methods.
+ # NOTE(review): min_length is passed as the string '6' rather than an int -- confirm this is intended.
+ url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
+ depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
+ archive_methods = forms.MultipleChoiceField(
+ required=False,
+ widget=forms.SelectMultiple,
+ choices=ARCHIVE_METHODS,
+ )
+class TagWidgetMixin:
+ # Widget mixin that turns a non-string, non-None value into an editable tag string
+ # via edit_string_for_tags before handing it to the parent widget's format_value.
+ def format_value(self, value):
+ if value is not None and not isinstance(value, str):
+ value = edit_string_for_tags(value)
+ return super().format_value(value)
+
+class TagWidget(TagWidgetMixin, forms.TextInput):
+ # Text input that uses TagWidgetMixin.format_value to display its value.
+ pass
+
+class TagField(forms.CharField):
+ # Char field rendered with TagWidget whose cleaned value is the result of parse_tags().
+ widget = TagWidget
+
+ def clean(self, value):
+ # Run the normal CharField cleaning, then parse the text into tags; a ValueError
+ # from parse_tags is reported to the user as a validation error.
+ value = super().clean(value)
+ try:
+ return parse_tags(value)
+ except ValueError:
+ raise forms.ValidationError(
+ "Please provide a comma-separated list of tags."
+ )
+
+ def has_changed(self, initial_value, data_value):
+ # Always return False if the field is disabled since self.bound_data
+ # always uses the initial value in this case.
+ if self.disabled:
+ return False
+
+ # If the submitted data does not validate it is compared as submitted.
+ try:
+ data_value = self.clean(data_value)
+ except forms.ValidationError:
+ pass
+
+ if initial_value is None:
+ initial_value = []
+
+ # Compare by tag name; the initial items are expected to expose a .name attribute.
+ # NOTE(review): the comparison presumably relies on parse_tags returning names in
+ # sorted order -- confirm against parse_tags.
+ initial_value = [tag.name for tag in initial_value]
+ initial_value.sort()
+
+ return initial_value != data_value
diff --git a/archivebox-0.5.3/archivebox/core/management/commands/archivebox.py b/archivebox-0.5.3/archivebox/core/management/commands/archivebox.py
new file mode 100644
index 0000000..a68b5d9
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/management/commands/archivebox.py
@@ -0,0 +1,18 @@
+__package__ = 'archivebox'
+
+from django.core.management.base import BaseCommand
+
+
+from .cli import run_subcommand
+
+
+class Command(BaseCommand):
+ # Management command that forwards a subcommand name and its arguments to run_subcommand.
+ help = 'Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)'
+
+ def add_arguments(self, parser):
+ # One required positional for the subcommand, followed by zero or more pass-through arguments.
+ parser.add_argument('subcommand', type=str, help='The subcommand you want to run')
+ parser.add_argument('command_args', nargs='*', help='Arguments to pass to the subcommand')
+
+
+ def handle(self, *args, **kwargs):
+ # The parsed options arrive in kwargs; the positional *args are not used.
+ run_subcommand(kwargs['subcommand'], args=kwargs['command_args'])
diff --git a/archivebox-0.5.3/archivebox/core/migrations/0001_initial.py b/archivebox-0.5.3/archivebox/core/migrations/0001_initial.py
new file mode 100644
index 0000000..73ac78e
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/migrations/0001_initial.py
@@ -0,0 +1,27 @@
+# Generated by Django 2.2 on 2019-05-01 03:27
+
+from django.db import migrations, models
+import uuid
+
+
+class Migration(migrations.Migration):
+
+ initial = True
+
+ dependencies = [
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name='Snapshot',
+ fields=[
+ ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
+ ('url', models.URLField(unique=True)),
+ ('timestamp', models.CharField(default=None, max_length=32, null=True, unique=True)),
+ ('title', models.CharField(default=None, max_length=128, null=True)),
+ ('tags', models.CharField(default=None, max_length=256, null=True)),
+ ('added', models.DateTimeField(auto_now_add=True)),
+ ('updated', models.DateTimeField(default=None, null=True)),
+ ],
+ ),
+ ]
diff --git a/archivebox-0.5.3/archivebox/core/migrations/0002_auto_20200625_1521.py b/archivebox-0.5.3/archivebox/core/migrations/0002_auto_20200625_1521.py
new file mode 100644
index 0000000..4811282
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/migrations/0002_auto_20200625_1521.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.0.7 on 2020-06-25 15:21
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0001_initial'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='snapshot',
+ name='timestamp',
+ field=models.CharField(default=None, max_length=32, null=True),
+ ),
+ ]
diff --git a/archivebox-0.5.3/archivebox/core/migrations/0003_auto_20200630_1034.py b/archivebox-0.5.3/archivebox/core/migrations/0003_auto_20200630_1034.py
new file mode 100644
index 0000000..61fd472
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/migrations/0003_auto_20200630_1034.py
@@ -0,0 +1,38 @@
+# Generated by Django 3.0.7 on 2020-06-30 10:34
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0002_auto_20200625_1521'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='snapshot',
+ name='added',
+ field=models.DateTimeField(auto_now_add=True, db_index=True),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='tags',
+ field=models.CharField(db_index=True, default=None, max_length=256, null=True),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='timestamp',
+ field=models.CharField(db_index=True, default=None, max_length=32, null=True),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='title',
+ field=models.CharField(db_index=True, default=None, max_length=128, null=True),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='updated',
+ field=models.DateTimeField(db_index=True, default=None, null=True),
+ ),
+ ]
diff --git a/archivebox-0.5.3/archivebox/core/migrations/0004_auto_20200713_1552.py b/archivebox-0.5.3/archivebox/core/migrations/0004_auto_20200713_1552.py
new file mode 100644
index 0000000..6983662
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/migrations/0004_auto_20200713_1552.py
@@ -0,0 +1,19 @@
+# Generated by Django 3.0.7 on 2020-07-13 15:52
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0003_auto_20200630_1034'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='snapshot',
+ name='timestamp',
+ field=models.CharField(db_index=True, default=None, max_length=32, unique=True),
+ preserve_default=False,
+ ),
+ ]
diff --git a/archivebox-0.5.3/archivebox/core/migrations/0005_auto_20200728_0326.py b/archivebox-0.5.3/archivebox/core/migrations/0005_auto_20200728_0326.py
new file mode 100644
index 0000000..f367aeb
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/migrations/0005_auto_20200728_0326.py
@@ -0,0 +1,28 @@
+# Generated by Django 3.0.7 on 2020-07-28 03:26
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0004_auto_20200713_1552'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='snapshot',
+ name='tags',
+ field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='title',
+ field=models.CharField(blank=True, db_index=True, max_length=128, null=True),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='updated',
+ field=models.DateTimeField(blank=True, db_index=True, null=True),
+ ),
+ ]
diff --git a/archivebox-0.5.3/archivebox/core/migrations/0006_auto_20201012_1520.py b/archivebox-0.5.3/archivebox/core/migrations/0006_auto_20201012_1520.py
new file mode 100644
index 0000000..694c990
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/migrations/0006_auto_20201012_1520.py
@@ -0,0 +1,70 @@
+# Generated by Django 3.0.8 on 2020-10-12 15:20
+
+from django.db import migrations, models
+from django.utils.text import slugify
+
+def forwards_func(apps, schema_editor):
+ # Data migration: split each snapshot's comma-separated tags_old string into Tag
+ # rows and attach them through the snapshot's tags relation.
+ SnapshotModel = apps.get_model("core", "Snapshot")
+ TagModel = apps.get_model("core", "Tag")
+
+ # NOTE(review): db_alias and the local name tags below are assigned but not used in this function.
+ db_alias = schema_editor.connection.alias
+ snapshots = SnapshotModel.objects.all()
+ for snapshot in snapshots:
+ tags = snapshot.tags
+ # A missing tags_old value is treated as an empty string; names are stripped and de-duplicated.
+ tag_set = (
+ set(tag.strip() for tag in (snapshot.tags_old or '').split(','))
+ )
+ # Drop the empty name produced by blank entries.
+ tag_set.discard("")
+
+ for tag in tag_set:
+ to_add, _ = TagModel.objects.get_or_create(name=tag, slug=slugify(tag))
+ snapshot.tags.add(to_add)
+
+
+def reverse_func(apps, schema_editor):
+ # Reverse data migration: write each snapshot's tag names back into tags_old as a
+ # comma-joined string and save the snapshot.
+ SnapshotModel = apps.get_model("core", "Snapshot")
+ TagModel = apps.get_model("core", "Tag")
+
+ # NOTE(review): TagModel and db_alias are assigned but not used in this function.
+ db_alias = schema_editor.connection.alias
+ snapshots = SnapshotModel.objects.all()
+ for snapshot in snapshots:
+ tags = snapshot.tags.values_list("name", flat=True)
+ snapshot.tags_old = ",".join([tag for tag in tags])
+ snapshot.save()
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0005_auto_20200728_0326'),
+ ]
+
+ operations = [
+ migrations.RenameField(
+ model_name='snapshot',
+ old_name='tags',
+ new_name='tags_old',
+ ),
+ migrations.CreateModel(
+ name='Tag',
+ fields=[
+ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('name', models.CharField(max_length=100, unique=True, verbose_name='name')),
+ ('slug', models.SlugField(max_length=100, unique=True, verbose_name='slug')),
+ ],
+ options={
+ 'verbose_name': 'Tag',
+ 'verbose_name_plural': 'Tags',
+ },
+ ),
+ migrations.AddField(
+ model_name='snapshot',
+ name='tags',
+ field=models.ManyToManyField(to='core.Tag'),
+ ),
+ migrations.RunPython(forwards_func, reverse_func),
+ migrations.RemoveField(
+ model_name='snapshot',
+ name='tags_old',
+ ),
+ ]
diff --git a/archivebox-0.5.3/archivebox/core/migrations/0007_archiveresult.py b/archivebox-0.5.3/archivebox/core/migrations/0007_archiveresult.py
new file mode 100644
index 0000000..a780376
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/migrations/0007_archiveresult.py
@@ -0,0 +1,97 @@
+# Generated by Django 3.0.8 on 2020-11-04 12:25
+
+import json
+from pathlib import Path
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+from config import CONFIG
+from index.json import to_json
+
+try:
+ JSONField = models.JSONField
+except AttributeError:
+ import jsonfield
+ JSONField = jsonfield.JSONField
+
+
+def forwards_func(apps, schema_editor):
+ # Data migration: create ArchiveResult rows from the "history" section of each
+ # snapshot's index.json under ARCHIVE_DIR/<timestamp>.
+ # NOTE(review): EXTRACTORS is imported here but not referenced in this function.
+ from core.models import EXTRACTORS
+
+ Snapshot = apps.get_model("core", "Snapshot")
+ ArchiveResult = apps.get_model("core", "ArchiveResult")
+
+ snapshots = Snapshot.objects.all()
+ for snapshot in snapshots:
+ out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
+
+ # Snapshots whose index.json cannot be opened or parsed are skipped without any report.
+ try:
+ with open(out_dir / "index.json", "r") as f:
+ fs_index = json.load(f)
+ except Exception as e:
+ continue
+
+ # history maps an extractor name to a list of result dicts.
+ history = fs_index["history"]
+
+ for extractor in history:
+ for result in history[extractor]:
+ ArchiveResult.objects.create(extractor=extractor, snapshot=snapshot, cmd=result["cmd"], cmd_version=result["cmd_version"],
+ start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"])
+
+
+def verify_json_index_integrity(snapshot):
+ # Adds to the snapshot's index.json history any ArchiveResult rows whose start_ts is
+ # not already recorded there, then rewrites index.json.
+ results = snapshot.archiveresult_set.all()
+ out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
+ with open(out_dir / "index.json", "r") as f:
+ index = json.load(f)
+
+ # Flatten every extractor's results and collect their start_ts values for the membership test.
+ history = index["history"]
+ index_results = [result for extractor in history for result in history[extractor]]
+ flattened_results = [result["start_ts"] for result in index_results]
+
+ # A database row counts as missing when its ISO-formatted start_ts is absent from the index.
+ missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results]
+
+ # NOTE(review): this indexes history by missing.extractor directly, so it presumably
+ # expects that key to exist already -- confirm.
+ for missing in missing_results:
+ index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(),
+ "start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output,
+ "schema": "ArchiveResult", "status": missing.status})
+
+ json_index = to_json(index)
+ with open(out_dir / "index.json", "w") as f:
+ f.write(json_index)
+
+
+def reverse_func(apps, schema_editor):
+ # Reverse data migration: copy database results back into each snapshot's index.json
+ # via verify_json_index_integrity, and delete all ArchiveResult rows.
+ Snapshot = apps.get_model("core", "Snapshot")
+ ArchiveResult = apps.get_model("core", "ArchiveResult")
+ for snapshot in Snapshot.objects.all():
+ verify_json_index_integrity(snapshot)
+
+ ArchiveResult.objects.all().delete()
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0006_auto_20201012_1520'),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name='ArchiveResult',
+ fields=[
+ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('cmd', JSONField()),
+ ('pwd', models.CharField(max_length=256)),
+ ('cmd_version', models.CharField(max_length=32)),
+ ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),
+ ('output', models.CharField(max_length=512)),
+ ('start_ts', models.DateTimeField()),
+ ('end_ts', models.DateTimeField()),
+ ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)),
+ ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
+ ],
+ ),
+ migrations.RunPython(forwards_func, reverse_func),
+ ]
diff --git a/archivebox-0.5.3/archivebox/core/migrations/0008_auto_20210105_1421.py b/archivebox-0.5.3/archivebox/core/migrations/0008_auto_20210105_1421.py
new file mode 100644
index 0000000..e5b3387
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/migrations/0008_auto_20210105_1421.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-01-05 14:21
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0007_archiveresult'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='cmd_version',
+ field=models.CharField(blank=True, default=None, max_length=32, null=True),
+ ),
+ ]
diff --git a/archivebox-0.5.3/archivebox/core/migrations/__init__.py b/archivebox-0.5.3/archivebox/core/migrations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/archivebox-0.5.3/archivebox/core/mixins.py b/archivebox-0.5.3/archivebox/core/mixins.py
new file mode 100644
index 0000000..538ca1e
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/mixins.py
@@ -0,0 +1,23 @@
+from django.contrib import messages
+
+from archivebox.search import query_search_index
+
+class SearchResultsAdminMixin(object):
+    def get_search_results(self, request, queryset, search_term):
+        """Enhance the admin search with results from the search backend.
+
+        Starts from the parent class's ``get_search_results`` result. When the
+        stripped search term is non-empty, ``query_search_index`` is consulted
+        and its result is combined with ``queryset`` using ``&``. If the
+        backend lookup or the combination raises, a warning message is
+        attached to the request and the parent's result is returned
+        unchanged.
+
+        Returns the ``(queryset, use_distinct)`` pair expected by the admin.
+        """
+        qs, use_distinct = super().get_search_results(request, queryset, search_term)
+
+        search_term = search_term.strip()
+        if not search_term:
+            return qs, use_distinct
+
+        try:
+            # qs is only rebound if both the lookup and the combination succeed.
+            qs = queryset & query_search_index(search_term)
+        except Exception as err:
+            # Best effort: fall back to the default admin search and tell the user why.
+            messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')
+
+        # Returned from normal flow rather than from a ``finally`` clause: a
+        # ``return`` inside ``finally`` discards any in-flight exception,
+        # including KeyboardInterrupt and SystemExit.
+        return qs, use_distinct
diff --git a/archivebox-0.5.3/archivebox/core/models.py b/archivebox-0.5.3/archivebox/core/models.py
new file mode 100644
index 0000000..13d75b6
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/models.py
@@ -0,0 +1,194 @@
+__package__ = 'archivebox.core'
+
+import uuid
+
+from django.db import models, transaction
+from django.utils.functional import cached_property
+from django.utils.text import slugify
+from django.db.models import Case, When, Value, IntegerField
+
+from ..util import parse_date
+from ..index.schema import Link
+from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
+
+EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+STATUS_CHOICES = [
+ ("succeeded", "succeeded"),
+ ("failed", "failed"),
+ ("skipped", "skipped")
+]
+
+try:
+ JSONField = models.JSONField
+except AttributeError:
+ import jsonfield
+ JSONField = jsonfield.JSONField
+
+
+class Tag(models.Model):
+ """
+ Based on django-taggit model
+ """
+ # Both the name and the slug are unique and limited to 100 characters.
+ name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100)
+ slug = models.SlugField(verbose_name="slug", unique=True, max_length=100)
+
+ class Meta:
+ verbose_name = "Tag"
+ verbose_name_plural = "Tags"
+
+ def __str__(self):
+ return self.name
+
+ def slugify(self, tag, i=None):
+ # Slug for the given tag text, with "_<i>" appended when i is given.
+ slug = slugify(tag)
+ if i is not None:
+ slug += "_%d" % i
+ return slug
+
+ def save(self, *args, **kwargs):
+ # A new tag without a slug gets one derived from its name; every other save goes
+ # straight to the parent implementation.
+ if self._state.adding and not self.slug:
+ self.slug = self.slugify(self.name)
+
+ with transaction.atomic():
+ # Existing slugs that start with the candidate slug.
+ slugs = set(
+ type(self)
+ ._default_manager.filter(slug__startswith=self.slug)
+ .values_list("slug", flat=True)
+ )
+
+ # Try the plain slug first, then "_1", "_2", ... until one is not already taken.
+ i = None
+ while True:
+ slug = self.slugify(self.name, i)
+ if slug not in slugs:
+ self.slug = slug
+ return super().save(*args, **kwargs)
+ i = 1 if i is None else i+1
+ else:
+ return super().save(*args, **kwargs)
+
+
+class Snapshot(models.Model):
+ # Database record for one archived URL; most derived attributes below are computed
+ # by converting the row to a Link (see as_link).
+ id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
+
+ url = models.URLField(unique=True)
+ timestamp = models.CharField(max_length=32, unique=True, db_index=True)
+
+ title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
+
+ added = models.DateTimeField(auto_now_add=True, db_index=True)
+ updated = models.DateTimeField(null=True, blank=True, db_index=True)
+ tags = models.ManyToManyField(Tag)
+
+ # Field names accepted by from_json and emitted by as_json when no explicit keys are given.
+ keys = ('url', 'timestamp', 'title', 'tags', 'updated')
+
+ def __repr__(self) -> str:
+ # Same text as __str__: timestamp, then URL and title each cut to 64 characters ('-' when untitled).
+ title = self.title or '-'
+ return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
+
+ def __str__(self) -> str:
+ title = self.title or '-'
+ return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
+
+ @classmethod
+ def from_json(cls, info: dict):
+ # Build an unsaved instance from the entries of info whose key is listed in cls.keys.
+ info = {k: v for k, v in info.items() if k in cls.keys}
+ return cls(**info)
+
+ def as_json(self, *args) -> dict:
+ # Dict of the requested attributes (default: self.keys); 'tags' is rendered through tags_str().
+ args = args or self.keys
+ return {
+ key: getattr(self, key)
+ if key != 'tags' else self.tags_str()
+ for key in args
+ }
+
+ def as_link(self) -> Link:
+ return Link.from_json(self.as_json())
+
+ def as_link_with_details(self) -> Link:
+ # Imported inside the method rather than at module level.
+ # NOTE(review): presumably to avoid a circular import -- confirm.
+ from ..index import load_link_details
+ return load_link_details(self.as_link())
+
+ def tags_str(self) -> str:
+ # Tag names ordered by name and joined with commas.
+ return ','.join(self.tags.order_by('name').values_list('name', flat=True))
+
+ @cached_property
+ def bookmarked(self):
+ return parse_date(self.timestamp)
+
+ @cached_property
+ def is_archived(self):
+ return self.as_link().is_archived
+
+ @cached_property
+ def num_outputs(self):
+ # Number of this snapshot's ArchiveResult rows whose status is 'succeeded'.
+ return self.archiveresult_set.filter(status='succeeded').count()
+
+ @cached_property
+ def url_hash(self):
+ return self.as_link().url_hash
+
+ @cached_property
+ def base_url(self):
+ return self.as_link().base_url
+
+ @cached_property
+ def link_dir(self):
+ return self.as_link().link_dir
+
+ @cached_property
+ def archive_path(self):
+ return self.as_link().archive_path
+
+ @cached_property
+ def archive_size(self):
+ return self.as_link().archive_size
+
+ @cached_property
+ def history(self):
+ # TODO: use ArchiveResult for this instead of json
+ return self.as_link_with_details().history
+
+ @cached_property
+ def latest_title(self):
+ # Stripped output of the most recent 'title' history entry, provided it succeeded
+ # and is non-blank; otherwise None.
+ if ('title' in self.history
+ and self.history['title']
+ and (self.history['title'][-1].status == 'succeeded')
+ and self.history['title'][-1].output.strip()):
+ return self.history['title'][-1].output.strip()
+ return None
+
+ def save_tags(self, tags=()):
+ # Replace this snapshot's tags with the given tag names, creating Tag rows as needed.
+ tags_id = []
+ for tag in tags:
+ tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
+ self.tags.clear()
+ self.tags.add(*tags_id)
+
+
+class ArchiveResultManager(models.Manager):
+ def indexable(self, sorted: bool = True):
+ # Succeeded results whose extractor is listed in ARCHIVE_METHODS_INDEXING_PRECEDENCE.
+ # Note: the parameter name shadows the builtin sorted() inside this method.
+ INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
+ qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
+
+ if sorted:
+ # Annotate each row with its extractor's precedence value (1000 when none matches) and order by it.
+ precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
+ qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
+ return qs
+
+
+class ArchiveResult(models.Model):
+ # One extractor run for a snapshot: the command, working directory, output, start/end
+ # times and status. Rows are deleted together with their snapshot (CASCADE).
+ snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
+ cmd = JSONField()
+ pwd = models.CharField(max_length=256)
+ cmd_version = models.CharField(max_length=32, default=None, null=True, blank=True)
+ output = models.CharField(max_length=512)
+ start_ts = models.DateTimeField()
+ end_ts = models.DateTimeField()
+ status = models.CharField(max_length=16, choices=STATUS_CHOICES)
+ extractor = models.CharField(choices=EXTRACTORS, max_length=32)
+
+ # Custom manager that adds the indexable() query.
+ objects = ArchiveResultManager()
+
+ def __str__(self):
+ return self.extractor
diff --git a/archivebox-0.5.3/archivebox/core/settings.py b/archivebox-0.5.3/archivebox/core/settings.py
new file mode 100644
index 0000000..e8ed6b1
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/settings.py
@@ -0,0 +1,165 @@
+__package__ = 'archivebox.core'
+
+import os
+import sys
+
+from pathlib import Path
+from django.utils.crypto import get_random_string
+
+from ..config import ( # noqa: F401
+ DEBUG,
+ SECRET_KEY,
+ ALLOWED_HOSTS,
+ PACKAGE_DIR,
+ ACTIVE_THEME,
+ TEMPLATES_DIR_NAME,
+ SQL_INDEX_FILENAME,
+ OUTPUT_DIR,
+)
+
+
+IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
+IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
+IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
+
+################################################################################
+### Django Core Settings
+################################################################################
+
+WSGI_APPLICATION = 'core.wsgi.application'
+ROOT_URLCONF = 'core.urls'
+
+LOGIN_URL = '/accounts/login/'
+LOGOUT_REDIRECT_URL = '/'
+PASSWORD_RESET_URL = '/accounts/password_reset/'
+APPEND_SLASH = True
+
+INSTALLED_APPS = [
+ 'django.contrib.auth',
+ 'django.contrib.contenttypes',
+ 'django.contrib.sessions',
+ 'django.contrib.messages',
+ 'django.contrib.staticfiles',
+ 'django.contrib.admin',
+
+ 'core',
+
+ 'django_extensions',
+]
+
+
+MIDDLEWARE = [
+ 'django.middleware.security.SecurityMiddleware',
+ 'django.contrib.sessions.middleware.SessionMiddleware',
+ 'django.middleware.common.CommonMiddleware',
+ 'django.middleware.csrf.CsrfViewMiddleware',
+ 'django.contrib.auth.middleware.AuthenticationMiddleware',
+ 'django.contrib.messages.middleware.MessageMiddleware',
+]
+
+AUTHENTICATION_BACKENDS = [
+ 'django.contrib.auth.backends.ModelBackend',
+]
+
+
+################################################################################
+### Staticfile and Template Settings
+################################################################################
+
+STATIC_URL = '/static/'
+
+STATICFILES_DIRS = [
+ str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / ACTIVE_THEME / 'static'),
+ str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'default' / 'static'),
+]
+
+TEMPLATE_DIRS = [
+ str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / ACTIVE_THEME),
+ str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'default'),
+ str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME),
+]
+
+TEMPLATES = [
+ {
+ 'BACKEND': 'django.template.backends.django.DjangoTemplates',
+ 'DIRS': TEMPLATE_DIRS,
+ 'APP_DIRS': True,
+ 'OPTIONS': {
+ 'context_processors': [
+ 'django.template.context_processors.debug',
+ 'django.template.context_processors.request',
+ 'django.contrib.auth.context_processors.auth',
+ 'django.contrib.messages.context_processors.messages',
+ ],
+ },
+ },
+]
+
+
+################################################################################
+### External Service Settings
+################################################################################
+
+DATABASE_FILE = Path(OUTPUT_DIR) / SQL_INDEX_FILENAME
+DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", DATABASE_FILE)
+
+DATABASES = {
+ 'default': {
+ 'ENGINE': 'django.db.backends.sqlite3',
+ 'NAME': DATABASE_NAME,
+ }
+}
+
+EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
+
+
+################################################################################
+### Security Settings
+################################################################################
+
+# Fall back to a freshly generated 50-character key when the imported SECRET_KEY is
+# empty; the fallback is regenerated every time this module is loaded.
+# NOTE(review): presumably anything signed with a previous fallback key stops
+# validating after a restart -- confirm.
+SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.')
+
+# The imported ALLOWED_HOSTS is a comma-separated string; it is split into a list here.
+ALLOWED_HOSTS = ALLOWED_HOSTS.split(',')
+
+SECURE_BROWSER_XSS_FILTER = True
+SECURE_CONTENT_TYPE_NOSNIFF = True
+
+CSRF_COOKIE_SECURE = False
+SESSION_COOKIE_SECURE = False
+SESSION_COOKIE_DOMAIN = None
+SESSION_COOKIE_AGE = 1209600 # 2 weeks
+SESSION_EXPIRE_AT_BROWSER_CLOSE = False
+SESSION_SAVE_EVERY_REQUEST = True
+
+AUTH_PASSWORD_VALIDATORS = [
+ {'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator'},
+ {'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator'},
+ {'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator'},
+ {'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
+]
+
+
+################################################################################
+### Shell Settings
+################################################################################
+
+SHELL_PLUS = 'ipython'
+SHELL_PLUS_PRINT_SQL = False
+IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner']
+IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell'
+if IS_SHELL:
+ os.environ['PYTHONSTARTUP'] = str(Path(PACKAGE_DIR) / 'core' / 'welcome_message.py')
+
+
+################################################################################
+### Internationalization & Localization Settings
+################################################################################
+
+LANGUAGE_CODE = 'en-us'
+TIME_ZONE = 'UTC'
+USE_I18N = False
+USE_L10N = False
+USE_TZ = False
+
+DATETIME_FORMAT = 'Y-m-d g:iA'
+SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
diff --git a/archivebox-0.5.3/archivebox/core/templatetags/__init__.py b/archivebox-0.5.3/archivebox/core/templatetags/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/archivebox-0.5.3/archivebox/core/templatetags/core_tags.py b/archivebox-0.5.3/archivebox/core/templatetags/core_tags.py
new file mode 100644
index 0000000..25f0685
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/templatetags/core_tags.py
@@ -0,0 +1,47 @@
+from django import template
+from django.urls import reverse
+from django.contrib.admin.templatetags.base import InclusionAdminNode
+from django.templatetags.static import static
+
+
+from typing import Union
+
+from core.models import ArchiveResult
+
+register = template.Library()
+
+@register.simple_tag
+def snapshot_image(snapshot):
+ # Template tag: URL of the snapshot's first succeeded screenshot result, or the
+ # static 'archive.png' placeholder when there is none.
+ result = ArchiveResult.objects.filter(snapshot=snapshot, extractor='screenshot', status='succeeded').first()
+ if result:
+ # The 'LinkAssets' route receives "<timestamp>/<output path>" as its single argument.
+ return reverse('LinkAssets', args=[f'{str(snapshot.timestamp)}/{result.output}'])
+
+ return static('archive.png')
+
+@register.filter
+def file_size(num_bytes: Union[int, float]) -> str:
+    """Format a byte count as a one-decimal number followed by a unit.
+
+    The value is divided by 1024 until its magnitude drops below 1024,
+    stepping through Bytes, KB, MB and GB; whatever is left after GB is
+    reported in TB.
+    """
+    value = num_bytes
+    for unit in ('Bytes', 'KB', 'MB', 'GB'):
+        if -1024.0 < value < 1024.0:
+            return '%3.1f %s' % (value, unit)
+        value = value / 1024.0
+    return '%3.1f %s' % (value, 'TB')
+
+def result_list(cl):
+ """
+ Monkey patched result
+ """
+ # Template context containing the changelist, its result list, and a
+ # sorted-field count that is always 0.
+ num_sorted_fields = 0
+ return {
+ 'cl': cl,
+ 'num_sorted_fields': num_sorted_fields,
+ 'results': cl.result_list,
+ }
+
+@register.tag(name='snapshots_grid')
+def result_list_tag(parser, token):
+ # Registers the {% snapshots_grid %} tag: an InclusionAdminNode that passes
+ # result_list's context to the snapshots_grid.html template.
+ return InclusionAdminNode(
+ parser, token,
+ func=result_list,
+ template_name='snapshots_grid.html',
+ takes_context=False,
+ )
diff --git a/archivebox-0.5.3/archivebox/core/tests.py b/archivebox-0.5.3/archivebox/core/tests.py
new file mode 100644
index 0000000..4d66077
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/tests.py
@@ -0,0 +1,3 @@
+#from django.test import TestCase
+
+# Create your tests here.
diff --git a/archivebox-0.5.3/archivebox/core/urls.py b/archivebox-0.5.3/archivebox/core/urls.py
new file mode 100644
index 0000000..b8e4baf
--- /dev/null
+++ b/archivebox-0.5.3/archivebox/core/urls.py
@@ -0,0 +1,36 @@
+from django.contrib import admin
+
+from django.urls import path, include
+from django.views import static
+from django.conf import settings
+from django.views.generic.base import RedirectView
+
+from core.views import MainIndex, LinkDetails, PublicArchiveView, AddView
+
+
+# print('DEBUG', settings.DEBUG)
+
+urlpatterns = [
+ path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}),
+ path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}),
+
+ path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
+
+ path('archive/', RedirectView.as_view(url='/')),
+ path('archive/