public
Description: Lame: scrape info from the directory of trackers
Homepage:
Clone URL: git://github.com/bhyde/Ghostery-Scrap.git
bhyde (author)
Tue Jun 23 17:00:45 -0700 2009
commit  60261f915c9a257b4f9acf40970993e2f7fa23b2
tree    10bebc84ea835d39a5a57ccd1ec2c34498617e2f
parent  b6c8175334e4ca1b19bc109a7c8b10dbde04ae77
Ghostery-Scrap / scrape.lisp
100644 69 lines (52 sloc) 2.071 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
(in-package "COMMON-LISP-USER")
 
;;; In the tradition of Forth.
 
;; Scratch pad shared by GET-PAGE and PLUCK: GET-PAGE stores the result of
;; its HTTP request here and PLUCK runs its regular expressions against it.
;; No initial value is given, so the variable is unbound until GET-PAGE
;; has been called at least once.
(defvar *pad*)
 
(defun get-page (url &key parameters)
  "Request URL through Drakma, passing PARAMETERS along as the request
parameters.  The primary value of the request is stored in *PAD* and is
also returned to the caller."
  (let ((response (drakma:http-request url :parameters parameters)))
    (setq *pad* response)
    response))
 
(defun pluck (pattern)
  "Return the text of the first register group of PATTERN matched against *PAD*.
Returns NIL when PATTERN does not match, when PATTERN declares no register
groups, or when the first group took no part in the match."
  (multiple-value-bind (match regs) (cl-ppcre:scan-to-strings pattern *pad*)
    (declare (ignore match))
    ;; A successful match of a pattern without register groups yields an
    ;; empty register vector, which is non-NIL; check the length before
    ;; indexing so that case returns NIL instead of signalling an error.
    (when (and regs (plusp (length regs)))
      (aref regs 0))))
 
 
 
;;; List of the known bugs gleaned from ghostery sources.
 
;; List of name strings filled in by RESET-BUG-LIST.  Starts out as NIL,
;; which ESTABLISH-BUG-LIST-IF-NECESSARY treats as "not loaded yet".
(defvar *bugs* nil)
 
(defun establish-bug-list-if-necessary ()
  "Populate *BUGS* by calling RESET-BUG-LIST, but only while *BUGS* is NIL.
An empty result leaves *BUGS* NIL, so the next call will fetch again."
  (when (null *bugs*)
    (reset-bug-list)))
 
(defun reset-bug-list ()
  "Download the ghostery-bugs.js file and replace *BUGS* with the list of
every value found in a `name' field, in order of appearance in the file.
Returns the new list."
  (let ((source (drakma:http-request
                 "http://ghostery.googlecode.com/svn/trunk/firefox/ghostery-statusbar/ghostery/chrome/content/ghostery-bugs.js"))
        (names '()))
    ;; Names are accumulated newest-first and put back in file order below.
    (cl-ppcre:do-register-groups (name) ("\"name\": \"([^\"]*)\"" source)
      (setf names (cons name names)))
    (setf *bugs* (reverse names))))
 
 
;;; The actual scraping
 
(defun scrape-tracker-page (url)
  "Fetch URL (a page like http://www.ghostery.com/apps/chartbeat) and print
one line to standard output with four columns: the number captured after
`found on' (parsed as an integer, commas removed), the href of the
`Website:' link, the text of the <h1> heading, and URL itself.
A field whose pattern does not match is printed as NIL."
  (get-page url)
  (let* ((count-text (pluck "found on.*<b>([\\d,]+)</b>"))
         ;; Strip thousands separators before parsing, e.g. 1,234 -> 1234.
         (site-count (and count-text
                          (parse-integer (delete #\, count-text))))
         (website (pluck "Website: <a rel=\"nofollow\" href=\"([^\"]*)\""))
         (heading (pluck "<h1>([^<]*)</h1>")))
    (format t "~&~6A ~30A ~20A ~A" site-count website heading url)))
 
(defun scrape-trackers-via-google ()
  "Query Google for ghostery.com pages at result offsets 0, 10, 20, 30 and
40, and call SCRAPE-TRACKER-PAGE on every http://www.ghostery.com/apps/
link found in each result page.  Returns NIL."
  (let ((search-url "http://www.google.com/search?q=site:ghostery.com+%22Application+Owner%22&hl=en&sa=N"))
    (dolist (offset '("0" "10" "20" "30" "40"))
      ;; Keep the result page in a local: SCRAPE-TRACKER-PAGE overwrites
      ;; *PAD* each time it fetches a tracker page.
      (let ((results-page (get-page search-url
                                    :parameters (list (cons "start" offset)))))
        (cl-ppcre:do-register-groups (app-url)
            ("href=\"(http://www.ghostery.com/apps/[^\"]*)\"" results-page)
          (scrape-tracker-page app-url))))))
 
(defun scrape-trackers-via-bugs ()
  "Make sure *BUGS* is loaded, then call SCRAPE-TRACKER-PAGE once per entry.
Each page URL is http://www.ghostery.com/apps/ followed by the bug name
with spaces replaced by underscores and letters downcased.  The strings
held in *BUGS* are left unmodified.  Returns NIL."
  (establish-bug-list-if-necessary)
  (loop for bug in *bugs*
        ;; SUBSTITUTE is allowed to return BUG itself when it contains no
        ;; spaces, so a destructive downcase here could alter the strings
        ;; cached in *BUGS*.  STRING-DOWNCASE never modifies its argument.
        as url = (concatenate 'string "http://www.ghostery.com/apps/"
                              (string-downcase
                               (substitute #\_ #\space bug)))
        do (scrape-tracker-page url)))