-
Notifications
You must be signed in to change notification settings - Fork 18
/
_testsite.py
executable file
·100 lines (74 loc) · 3.31 KB
/
_testsite.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/python
# 'basesite.py' contains lots of functionality required for a ripper
from basesite import basesite
"""
Example class for ripping sites.
Contains skeleton code for creating a new ripper.
Inherits functionality from abstract 'basesite' super-class
We have to override:
* sanitize_url() - Ensures URL is rippable, alters URL as needed
* get_dir() - Creates unique working directory for album
* download() - Downloads images from the album
There are lots of helpful methods, such as:
* self.log() - writes text to 'log.txt'
* self.debug() - prints text to stderr (only when debugging is enabled)
* self.download_image() - downloads an image using threads
* self.wait_for_threads() - waits for threads to finish
* self.hit_image_limit() - returns True if we hit the max number of images
* self.get_size() - gets size of a file (in bytes)
* self.create_thumb() - creates thumbnail of a file
And helpful fields:
* self.url - URL of album to be downloaded - set by sanitize_url()
* self.working_dir - working directory for this album - set by get_dir()
* self.max_threads - maximum number of threads to run at one time
* self.thread_count - current number of threads running
"""
class testsite(basesite):
    """
    Example ripper for 'testsite.com' albums.

    Skeleton showing the three methods a ripper overrides on top of the
    'basesite' super-class: sanitize_url(), get_dir() and download().
    """

    def sanitize_url(self, url):
        """
        Verify [and alter] URL to an acceptable format.

        Returns the URL with any '#fragment' and '?query' removed.
        Raises Exception('') when the URL does not belong to this site, and
        an Exception with a message when the URL lacks the required part.
        """
        # If this site isn't in the URL, raise an empty exception.
        # This tells the main script to move on to the next ripper.
        if 'testsite.com/' not in url:
            raise Exception('')

        # If this ripper requires a specific URL, ensure we have that.
        # Ex: We might require the URL contains "?galleryid=" or similar.
        if '/something/' not in url:
            raise Exception('required /something/ not found in URL')

        # Strip hashtags and query strings from URL
        # Ex: http://site.com/galleryid#image becomes http://site.com/galleryid
        url = url.split('#', 1)[0]
        url = url.split('?', 1)[0]

        # Return the properly-formatted URL (this is what ends up in self.url)
        return url

    def get_dir(self, url):
        """
        Discover directory path based on URL.

        Returns 'testsite_<galleryid>', where <galleryid> is the last
        non-empty path segment of the URL.
        Ex: http://site.com/012345 -> 'testsite_012345'
        """
        # The required URL format can be enforced in sanitize_url(), so here
        # we only need to pull out the part that is unique to this album.

        # Ignore trailing slashes; otherwise 'http://site.com/012345/' would
        # give an empty gallery ID and every such album would share the
        # same 'testsite_' directory.
        trimmed = url.rstrip('/')

        # Get gallery ID after the last / in the URL
        galleryid = trimmed[trimmed.rfind('/') + 1:]

        # Return the site prefix + the unique gallery name
        return 'testsite_%s' % galleryid

    def download(self):
        """
        Download images in album.

        Fetches self.url, treats every '<a href="...">' target on the page
        as an image link, and queues each one for download until the links
        run out or the image limit is hit.
        """
        # Create & initialize working directory
        self.init_dir()

        # Example "logging" statement, written to log.txt and included in
        # the archive once completed. Logged before the request so the URL
        # is on record even if fetching the page fails.
        self.log('loading %s' % self.url)

        # Get webpage source
        r = self.web.get(self.url)

        # Find all links on page
        links = self.web.between(r, '<a href="', '"')
        total = len(links)

        # Iterate over links, numbering them from 1
        for number, link in enumerate(links, 1):
            # Download the image (threaded)
            self.download_image(link, number, total=total)
            # Stop if we hit the maximum number of images
            if self.hit_image_limit():
                break

        # Wait for existing threads to finish.
        # Per the original note, this also deletes the working directory
        # if the album could not be downloaded.
        self.wait_for_threads()