#!/usr/bin/env python
"""Prototype Mail Archiver"""
__version__ = "0.2" #
__author__ = "Rui Carmo (http://the.taoofmac.com)"
__copyright__ = "(C) 2006 Rui Carmo. Distributed under BSD license."
__contributors__ = "Based on newspipe source code."
import base64
import cStringIO
import gzip
import hashlib
import mimetools
import MimeWriter
import posixpath
import Queue
import re
import sha
import smtplib
import urllib2
import urlparse

import BeautifulSoup
# User-Agent header value sent with every request issued by Gatherer.fetch.
USER_AGENT='Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
class Gatherer:
    """Archive one web page as a self-contained MIME message.

    ``spider`` downloads the page at ``self.url``, strips active content,
    inlines its stylesheets and queues every fetched resource in
    ``self.parts`` as ``(name, {'data': ..., 'content-type': ...})``.
    ``assemble`` then serialises the rewritten HTML plus the queued parts
    into a single message string.

    The code targets Python 2 (2.6+ because of ``except ... as``).
    """

    # One CSS ``url(...)`` reference.  The argument is captured lazily and may
    # not contain ")" so several references on one (minified) line are matched
    # separately instead of being swallowed by a single greedy match.
    _CSS_URL_RE = re.compile(r"url\(\s*([^)]+?)\s*\)")

    def __init__(self, url):
        """Remember the page *url*; nothing is fetched until ``spider``/``run``."""
        self.url = url
        self.html = ''
        self.parts = Queue.Queue()

    def fetch(self, url, referrer=None):
        """Download *url* and return a dict describing the response.

        The dict always has ``url``, ``data``, ``content-type`` and
        ``last-modified`` keys (the last two may be None) and, when the
        response object exposes them, ``status`` and ``code``.  Gzip-encoded
        bodies are decompressed.  Returns None (after printing a diagnostic)
        when the request fails.
        """
        headers = {'User-Agent': USER_AGENT, 'Accept-encoding': 'gzip'}
        if referrer:
            headers['Referer'] = referrer
        req = urllib2.Request(url, None, headers)
        try:
            f = urllib2.urlopen(req)
        except (urllib2.URLError, ValueError) as e:
            # HTTPError subclasses URLError and can expose both ``code`` and
            # ``reason``, so ``code`` must be tested first or HTTP failures
            # are misreported as connection failures.
            if hasattr(e, 'code'):
                print('The server couldn\'t fulfill the request for %s' % url)
                print('Error code: %s' % (e.code,))
            elif hasattr(e, 'reason'):
                print('We failed to reach a server.')
                print('Reason:  %s' % (e.reason,))
            else:
                # ValueError: urllib2 rejects URLs without a usable scheme.
                print('Could not fetch %s: %s' % (url, e))
            return None

        result = {'url': url, 'content-type': None, 'last-modified': None}
        try:
            data = f.read()
            info = getattr(f, 'headers', None)
            if info is not None:
                encoding = (info.get('content-encoding') or '').strip().lower()
                if encoding in ('gzip', 'x-gzip'):
                    data = gzip.GzipFile(fileobj=cStringIO.StringIO(data)).read()
                result['last-modified'] = info.get('Last-Modified')
                result['content-type'] = info.get('Content-Type')
            result['data'] = data
            # ``url`` reflects any redirect that was followed.
            for attr in ('url', 'status', 'code'):
                if hasattr(f, attr):
                    result[attr] = getattr(f, attr)
        finally:
            # Release the connection even if reading or decompressing fails.
            f.close()
        return result

    def buildURL(self, url, referrer='', base=''):
        """Resolve *url* against *base* when given, otherwise against *referrer*."""
        if base == '':
            return urlparse.urljoin(referrer, url)
        return urlparse.urljoin(base, url)

    def run(self, headers):
        """Spider the page and return the assembled message carrying *headers*."""
        self.spider()
        return self.assemble(headers)

    def filename(self, response):
        """Return ``<sha1 of the response URL>.<extension>`` for a fetched resource.

        The extension comes from the last path component of the URL; when
        the path has none it falls back to the MIME subtype of the
        ``content-type`` entry (parameters such as ``; charset=`` removed),
        and finally to ``bin``.  ``jpeg`` is normalised to ``jpg``.
        """
        url = response['url']
        path = urlparse.urlsplit(url)[2]
        # splitext only looks at the final component, so dots in directory
        # names or earlier in the file name cannot leak into the extension.
        extension = posixpath.splitext(path)[1][1:]
        if not extension:
            content_type = response.get('content-type') or ''
            mime = content_type.split(';', 1)[0].strip()
            extension = mime.split('/', 1)[-1]
        extension = extension.lower() or 'bin'
        if extension == 'jpeg':
            extension = 'jpg'
        # hashlib wants a byte string; plain Python 2 str passes through as is.
        if not isinstance(url, bytes):
            url = url.encode('utf-8')
        return hashlib.sha1(url).hexdigest() + '.' + extension

    def _embed(self, url, referrer, cache):
        """Fetch *url* once, queue it as a MIME part, return its cid name or None.

        *cache* maps absolute URLs to the name already assigned (or None for
        a failed fetch) so a resource referenced repeatedly is downloaded and
        attached only once.
        """
        if url in cache:
            return cache[url]
        name = None
        response = self.fetch(url, referrer)
        if response is not None:
            name = self.filename(response)
            self.parts.put((name, {'data': response['data'],
                                   'content-type': response['content-type']}))
        cache[url] = name
        return name

    def _inline_css_images(self, css, cache):
        """Return the text of fetched stylesheet *css* with images turned into cid: links.

        Relative references are resolved against the stylesheet's own URL.
        References that are empty, ``data:`` URIs or cannot be fetched are
        left exactly as they were.
        """
        sheet_url = css['url']

        def rewrite(match):
            ref = match.group(1).strip()
            if ref[:1] in ('"', "'"):
                ref = ref[1:-1].strip()
            if not ref or ref.lower().startswith('data:'):
                return match.group(0)
            name = self._embed(urlparse.urljoin(sheet_url, ref), sheet_url, cache)
            if name is None:
                return match.group(0)
            return 'url(cid:' + name + ')'

        return self._CSS_URL_RE.sub(rewrite, css['data'])

    def spider(self):
        """Fetch ``self.url``, localise its resources and store the HTML in ``self.html``.

        Scripts, embeds, objects and iframes are removed; linked screen
        stylesheets are merged into one inline ``<style>`` element; images
        (from the page and from the stylesheets) are queued in ``self.parts``
        and referenced through ``cid:`` URLs.  Returns silently, leaving
        ``self.html`` untouched, for non-HTTP(S) URLs or a failed fetch.
        """
        scheme = urlparse.urlsplit(self.url)[0].lower()
        if scheme not in ('http', 'https'):
            return
        # Kick off fetching by getting the base URL
        response = self.fetch(self.url)
        if response is None:
            # fail silently
            return
        # Relative links must be resolved against the URL actually served,
        # which differs from self.url after a redirect.
        page_url = response['url']
        soup = BeautifulSoup.BeautifulSoup(response['data'])

        # Remove all scripting and other nuisances
        for tag_name in ('script', 'embed', 'object', 'iframe'):
            for node in soup(tag_name):
                node.extract()

        # grab any base href
        base = ''
        try:
            node = soup('base')[0]
            base = urlparse.urljoin(page_url, node['href'])
            node.extract()
        except (IndexError, KeyError):
            pass

        cache = {}
        css_chunks = []
        # Use the stylesheets a screen browser would apply: those without a
        # media attribute and those whose media list names "screen" or "all".
        for link in soup('link', rel='stylesheet'):
            href = link.get('href')
            media = (link.get('media') or '').lower()
            if not href:
                continue
            if media and 'screen' not in media and 'all' not in media:
                continue
            css = self.fetch(self.buildURL(href, page_url, base), page_url)
            if css is not None:
                css_chunks.append(self._inline_css_images(css, cache))

        # remove all CSS link tags from the document (they will only confuse the MUA)
        for link in soup('link', {'rel': 'stylesheet'}):
            link.extract()

        # Assemble a style tag with the accreted CSS and insert it into the
        # head, or at the top of the document when there is no head tag.
        style = BeautifulSoup.Tag(soup, 'style')
        style.insert(0, BeautifulSoup.NavigableString("\n".join(css_chunks)))
        heads = soup('head')
        if heads:
            container = heads[0]
        else:
            container = soup
        container.insert(0, style)

        for img in soup('img'):
            src = img.get('src')
            if not src or src.lower().startswith('data:'):
                continue
            name = self._embed(self.buildURL(src, page_url, base), page_url, cache)
            if name is not None:
                img['src'] = 'cid:' + name

        self.html = soup.prettify()

    def assemble(self, headers):
        """Serialise ``self.html`` and all queued parts into one MIME message string.

        *headers* is a mapping of extra message headers.  When resources were
        queued, the HTML and the resources are wrapped in a multipart/related
        container inside the outer multipart/alternative body; otherwise the
        HTML is the only part.  The queue is drained in the process.
        """
        # A Queue object is always truthy, so emptiness has to be asked for
        # explicitly -- and before the queue is drained below.
        has_parts = not self.parts.empty()
        body = cStringIO.StringIO(self.html)
        out = cStringIO.StringIO()  # output buffer for our message
        writer = MimeWriter.MimeWriter(out)
        for key in headers:
            writer.addheader(key, headers[key])
        writer.addheader("MIME-Version", "1.0")
        writer.startmultipartbody("alternative", boundary="F7A30D4E-ED0B-4BBE-8A45-D4E88DBC2FBF")
        writer.flushheaders()
        if has_parts:
            htmlpart = writer.nextpart()
            htmlpart.startmultipartbody("related", boundary="F7A30D4E-ED0B-4BBE-8A45-3244235533221")
            subpart = htmlpart.nextpart()
        else:
            subpart = writer.nextpart()
        subpart.addheader("Content-Transfer-Encoding", "quoted-printable")
        pout = subpart.startbody("text/html", [("charset", 'utf-8')])
        mimetools.encode(body, pout, 'quoted-printable')
        if has_parts:
            while not self.parts.empty():
                (key, data) = self.parts.get()
                subpart = htmlpart.nextpart()
                subpart.addheader("Content-Transfer-Encoding", "base64")
                subpart.addheader("Content-ID", "<" + key + ">")
                subpart.addheader("Content-Location", key)
                subpart.addheader("Content-Disposition", "inline; filename=\"" + key + "\"")
                # Servers may omit Content-Type; MimeWriter needs a string.
                ctype = data['content-type'] or 'application/octet-stream'
                f = subpart.startbody(ctype, [["name", key]])
                f.write(base64.encodestring(data['data']))
            htmlpart.lastpart()
        writer.lastpart()
        message = out.getvalue()
        out.close()
        return message
if __name__ == '__main__':
    # Demo run: archive one page and mail it to a fixed address.
    url = 'http://slashdot.org'
    sender = recipient = 'me@myaccount'
    g = Gatherer(url)
    message = g.run({'Subject':'Archive of %s' % url, 'X-Keywords':'archive'})
    smtp = smtplib.SMTP('myserver')
    try:
        smtp.sendmail(sender, [recipient], message)
    finally:
        # Always end the SMTP session, even when delivery fails.
        smtp.quit()