initial

ruebot · Sep 19, 2012 · 557f923 · 557f923
commit 557f923
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,13 @@
+# Internet Archive collection torrent collector
+
+## Description
+
+Grabs all of the torrents for a given Internet Archive collection via the collection's RSS feed.
+
+## Usage
+
+    python ia-torrent.py
+
+## License
+
+![CC0](http://i.creativecommons.org/p/zero/1.0/88x31.png "CC0")
diff --git a/ia-torrent.py b/ia-torrent.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+import sys, os, re, urllib
+from BeautifulSoup import BeautifulSoup
+
+#if len(sys.argv) != 2:
+#  print('Please verify RSS feed & download location')
+#  sys.exit(-1)
+
+feed = 'http://archive.org/services/collection-rss.php?collection=YorkUniversity&query=%28collection%3Ayorkuniversity%20AND%20format%3Apdf%29%20AND%20-mediatype%3Acollection' #sys.argv[1]      #user supplied RSS feed
+download = '/tmp/ia-torrent'  #sys.argv[2]   #user supplied storage location
+
+data = urllib.urlopen(feed)
+soup = BeautifulSoup(data)
+items = soup.findAll('item')
+
+for item in items:
+  title = item.find('title').string.strip()
+  link = item.find('guid').string.strip()
+  matchObj = re.match(r'https://archive.org/details/(.*)', link, re.M|re.I)
+  identifier = matchObj.group(1)
+  urllib.urlretrieve("https://archive.org/download" + identifier + "/" + identifier +"_archive.torrent", os.path.join(download, identifier + ".torrent"))
+  print "Snatching " + title + "\n"