heyZeus / clj-web-crawler

A wrapper around Apache commons-client for the Clojure programming language.

This URL has Read+Write access

clj-web-crawler / clj_web_crawler.clj
100644 139 lines (117 sloc) 4.645 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
(ns clj-web-crawler
  (:import (org.apache.commons.httpclient HttpClient NameValuePair URI HttpStatus)
           (org.apache.commons.httpclient.cookie CookiePolicy CookieSpec)
           (org.apache.commons.httpclient.methods GetMethod PostMethod DeleteMethod
                                                  TraceMethod HeadMethod PutMethod))
  (:use [clojure.contrib.duck-streams :only (slurp*)]))
 
(defn redirect-location
  "Returns the redirection location string in the method, nil or false if
not being redirected."
  [method]
  (let [status-code (.getStatusCode method)
                    header (.getResponseHeader method "location")]
    (if (or (= status-code (HttpStatus/SC_MOVED_TEMPORARILY))
            (= status-code (HttpStatus/SC_MOVED_PERMANENTLY))
            (= status-code (HttpStatus/SC_SEE_OTHER))
            (= status-code (HttpStatus/SC_TEMPORARY_REDIRECT)))
      (if-let [location (and header (.getValue header))]
        location))))
 
(defn to-str
  "Converts a value to a string, accounts for keyword"
  [s]
  (if (keyword? s) (name s) (str s)))
 
(defn keys-values-to-strs
  "Converts the given map keys and values to strings."
  [map1]
  (apply hash-map (mapcat (fn [[k v]] [(to-str k) (to-str v)]) map1)))
 
(defn method
  "Creates a commons-client method type object with the given path and type.
A type can be one of: :post, :get, :put, :delete, :trace or :head. If no
type is supplied :get is the default. You can supply a url-params hash like:
{:login \"foo\" :password \"bar\"}."
  ([path type url-params]
   (let [key-type (cond
                    (> (count url-params) 0) :post
                    (nil? type) :get
                    :else (keyword type))
         p (if (.startsWith path "/") path (str \/ path))
         m (cond
             (= :post key-type) (PostMethod. p)
             (= :delete key-type) (DeleteMethod. p)
             (= :put key-type) (PutMethod. path)
             (= :trace key-type) (TraceMethod. p)
             (= :head key-type) (HeadMethod. p)
             :else (GetMethod. p))]
     (doseq [[k v] (keys-values-to-strs url-params)]
       (.addParameter m k v))
     m))
  ([path type] (method path type nil))
  ([path] (method path nil nil)))
 
(defn client
  "Creates a HttpClient for the given server."
  [host]
  (let [c (HttpClient.)]
    (.. c (getHostConfiguration) (setHost (URI. host true)))
    c))
 
(defmacro crawl
  "Sends an HTTP request to the server. Pass in a body to examine
the status code, response, etc. All resource associated with
the method will be freed up at the end of the macro."
  ([#^org.apache.commons.httpclient.HttpClient server
    #^org.apache.commons.httpclient.HttpMethodBase method & body]
    `(try
       (.executeMethod ~server ~method)
       ~@body
       (finally (.releaseConnection ~method))))
  ([server] (crawl server (method "/"))))
 
(defn response-str
  "Returns the response from the method as a string."
  ([method]
   ; uses slurp* here otherwise we get a annoying warning from commons-client
   (slurp* (.getResponseBodyAsStream method)))
  ([method client]
   (let [redirect (redirect-location method)
         new-method (if redirect (method redirect))]
     (if new-method
       (crawl client new-method
         (response-str new-method))
       (response-str method)))))
 
(defmulti crawl-response (fn [server method] [(class server) (class method)]))
 
(defmethod crawl-response
   [String String] [server http-method]
   (let [c (client server)
         m (method http-method)]
     (crawl c m
       (response-str m c))))
 
(defmethod crawl-response
  [String org.apache.commons.httpclient.HttpMethodBase] [server http-method]
  (let [c (client server)]
    (crawl c http-method
      (response-str http-method c))))
 
(defmethod crawl-response
  [org.apache.commons.httpclient.HttpClient String]
  [server http-method]
  (let [m (method http-method)]
    (crawl server m
      (response-str m server))))
 
(defmethod crawl-response
  [org.apache.commons.httpclient.HttpClient org.apache.commons.httpclient.HttpMethodBase]
  [server http-method]
  (crawl server http-method
    (response-str http-method server)))
 
(defn cookies
  "Convience function to get the cookies from the client."
  [client]
  (.. client getState getCookies))
 
(defn print-cookies
  "Prints the cookies from the client."
  [client]
  (doseq [c (cookies client)] (println c)))
 
(defn cookie-map
  [client]
  (reduce (fn [ret cookie] (conj ret [(.getName cookie) (.getValue cookie)]))
          {}
          (cookies client)))
 
(defn cookie-names
  [client]
  (set (keys (cookie-map client))))