github
Advanced Search
  • Home
  • Pricing and Signup
  • Explore GitHub
  • Blog
  • Login

bhyde / Ghostery-Scrap

  • Admin
  • Watch Unwatch
  • Fork
  • Your Fork
  • Pull Request
  • Download Source
    • 2
    • 1
  • Source
  • Commits
  • Network (1)
  • Issues (0)
  • Downloads (0)
  • Wiki (1)
  • Graphs
  • Tree: 60261f9

click here to add a description

click here to add a homepage

  • Branches (1)
    • master
  • Tags (0)
Sending Request…
Enable Donations

Pledgie Donations

Once activated, we'll place the following badge in your repository's detail box:
Pledgie_example
This service is courtesy of Pledgie.

Lame: scrape info from the directory of trackers — Read more

  cancel

  cancel
  • Private
  • Read-Only
  • HTTP Read-Only

This URL has Read+Write access

a little sorting 
bhyde (author)
Tue Jun 23 17:00:45 -0700 2009
commit  60261f915c9a257b4f9acf40970993e2f7fa23b2
tree    10bebc84ea835d39a5a57ccd1ec2c34498617e2f
parent  b6c8175334e4ca1b19bc109a7c8b10dbde04ae77
Ghostery-Scrap / scrape.lisp scrape.lisp
100644 69 lines (52 sloc) 2.071 kb
edit raw blame history
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
(in-package "COMMON-LISP-USER")
 
;;; In the tradition of Forth: a single scratch "pad".
;;; GET-PAGE stores the response it fetched here, and PLUCK runs its
;;; regular expressions against whatever is currently stored.
;;; Declared without an initial value, so it is unbound until GET-PAGE runs.
 
(defvar *pad*)
 
(defun get-page (url &key parameters)
  "Request URL through DRAKMA:HTTP-REQUEST, handing PARAMETERS over as its
:PARAMETERS argument.  The primary value of the request is saved in *PAD*
and is also returned to the caller."
  (let ((response (drakma:http-request url :parameters parameters)))
    (setf *pad* response)))
 
(defun pluck (pattern)
  "Match the regular expression PATTERN against the current contents of *PAD*.
Returns the text captured by the first register group, or NIL when the
pattern does not match."
  ;; Only the register vector (the second value) is of interest here; the
  ;; whole-match string is discarded.
  (let ((registers (nth-value 1 (cl-ppcre:scan-to-strings pattern *pad*))))
    (and registers
         (svref registers 0))))
 
 
 
;;; List of the known bugs gleaned from the Ghostery sources.
;;; Holds the name strings collected by RESET-BUG-LIST.  It starts out as
;;; NIL, which is the condition ESTABLISH-BUG-LIST-IF-NECESSARY checks
;;; before triggering a download.
 
(defvar *bugs* nil)
 
(defun establish-bug-list-if-necessary ()
  "Fill *BUGS* by calling RESET-BUG-LIST, but only while *BUGS* is still NIL.
Does nothing and returns NIL once the list has been populated."
  (when (null *bugs*)
    (reset-bug-list)))
 
(defun reset-bug-list ()
  "Download the ghostery-bugs.js file and rebuild *BUGS* from it.
Every string captured by the \"name\" pattern is collected, in the order it
appears in the file.  Returns the new value of *BUGS*."
  (let ((page (drakma:http-request
               "http://ghostery.googlecode.com/svn/trunk/firefox/ghostery-statusbar/ghostery/chrome/content/ghostery-bugs.js"))
        (names '()))
    (cl-ppcre:do-register-groups (name) ("\"name\": \"([^\"]*)\"" page)
      (push name names))
    ;; PUSH accumulated the names back to front; restore file order.
    (setf *bugs* (nreverse names))))
 
 
;;; The actual scraping
 
(defun scrape-tracker-page (url)
  "Fetch the tracker page at URL (for example
http://www.ghostery.com/apps/chartbeat) and print one report line to
standard output: the `found on' count, the website link, the <h1> heading
and URL itself.  Fields that cannot be plucked from the page print as NIL."
  (get-page url)
  (let* ((count-text (pluck "found on.*<b>([\\d,]+)</b>"))
         ;; The captured digits may contain commas; strip them before parsing.
         (site-count (and count-text
                          (parse-integer (remove #\, count-text))))
         (website (pluck "Website: <a rel=\"nofollow\" href=\"([^\"]*)\""))
         (title (pluck "<h1>([^<]*)</h1>")))
    (format t "~&~6A ~30A ~20A ~A" site-count website title url)))
 
(defun scrape-trackers-via-google ()
  "Request five pages of Google results for ghostery.com application pages
and run SCRAPE-TRACKER-PAGE on every http://www.ghostery.com/apps/ link
found in them."
  (let ((search-url "http://www.google.com/search?q=site:ghostery.com+%22Application+Owner%22&hl=en&sa=N"))
    (dolist (offset '("0" "10" "20" "30" "40"))
      ;; Hold the result page in a local: SCRAPE-TRACKER-PAGE calls GET-PAGE
      ;; itself, which replaces *PAD* while the links are still being walked.
      (let ((results (get-page search-url :parameters `(("start" . ,offset)))))
        (cl-ppcre:do-register-groups (link)
            ("href=\"(http://www.ghostery.com/apps/[^\"]*)\"" results)
          (scrape-tracker-page link))))))
 
(defun scrape-trackers-via-bugs ()
  "Scrape the ghostery.com apps page of every bug named in *BUGS*.
The bug list is loaded first if it is still empty.  Each page URL is formed
by appending the bug name, with spaces turned into underscores and letters
lowercased, to http://www.ghostery.com/apps/."
  (establish-bug-list-if-necessary)
  (loop for bug in *bugs*
        ;; SUBSTITUTE is allowed to return BUG itself when there is nothing to
        ;; replace, so the downcasing must be non-destructive; otherwise the
        ;; names cached in *BUGS* could be modified in place.
        as url = (concatenate 'string "http://www.ghostery.com/apps/"
                              (string-downcase
                               (substitute #\_ #\space bug)))
        do (scrape-tracker-page url)))
 
 
Blog | Support | Training | Contact | API | Status | Twitter | Help | Security
© 2010 GitHub Inc. All rights reserved. | Terms of Service | Privacy Policy
Powered by the Dedicated Servers and
Cloud Computing of Rackspace Hosting®
Dedicated Server