dbr / zp_grabber

Grabs the .flv links for Zero Punctuation episodes

This URL has Read+Write access

zp_grabber / zp_grabber.py
100644 255 lines (210 sloc) 8.643 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
#!/usr/bin/env python
import os, sys, urllib, urllib2, tempfile, re
from cache import CacheHandler
from optparse import OptionParser
from BeautifulSoup import BeautifulSoup
 
def get_cache_dir(suffix):
    """
    Return the path of a directory named `suffix` inside the system temp
    directory, creating it first if it does not exist yet.

    `suffix` may contain several path components (e.g. "a/b"); any missing
    intermediate directories are created as well.

    Raises OSError if the directory cannot be created.
    """
    tmppath = os.path.join(tempfile.gettempdir(), suffix)
    if not os.path.isdir(tmppath):
        try:
            # makedirs rather than mkdir, so a nested suffix works too
            os.makedirs(tmppath)
        except OSError:
            # Another process may have created the directory between the
            # isdir() check and makedirs(); only re-raise real failures.
            if not os.path.isdir(tmppath):
                raise
    return tmppath
 
# Shared URL opener used for every page fetch in this script. Requests go
# through CacheHandler (presumably an on-disk response cache - see cache.py),
# which is handed a "zp_grabber" directory under the system temp dir, and are
# sent with a desktop Firefox User-agent header.
_BROWSER_USER_AGENT = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3'
_cache_handler = CacheHandler(get_cache_dir("zp_grabber"))

cached_opener = urllib2.build_opener(_cache_handler)
cached_opener.addheaders = [('User-agent', _BROWSER_USER_AGENT)]
 
####################
# Helper functions #
####################
 
class ZpCacher:
    """
    Stores a list of episodes we have FLV URLs for in a text file.

    It is used in the following way:
    - Load ZP index page
    - Grab a list of all video addresses, and grab the video-ID from the URL
    - Check if ZpCacher knows about video ID 123
    - If it does *NOT* have ID 123, we have to get the FLV for that ID.

    Cache file format is one episode per line:
        video_id|flv_url|web_url|title

    The in-memory copy is the dict `self.cache`, keyed by video ID (a string),
    each value being a dict with the keys 'flv', 'web', 'vid' and 'title'.
    """
    def __init__(self):
        # sys.path[0] is the directory of the script being run, so the cache
        # file lives next to the script.
        self.cache_file = os.path.join(sys.path[0], "zp_cache.txt")

        self.cache = {}
        self.load_cache()

    def load_cache(self):
        """
        Read the cache file into self.cache.

        A missing or unreadable file is not an error (the cache just stays
        as it is). Lines without the four expected fields are skipped.
        """
        try:
            cur_cache = open(self.cache_file)
        except IOError:
            return # No cache file yet, nothing to load

        try:
            lines = cur_cache.readlines()
        except IOError:
            lines = []
        finally:
            # Close the handle on the error path as well
            cur_cache.close()

        for current_line in lines:
            # The title is the last field, so split at most three times:
            # a "|" inside the title then stays part of the title instead
            # of making the whole line look invalid.
            fields = [x.strip() for x in current_line.split("|", 3)]
            if len(fields) != 4:
                continue # Invalid line, skip
            c_vid, c_flv_url, c_web_url, c_title = fields
            self.cache[c_vid] = {
                'flv':c_flv_url,
                'web':c_web_url,
                'vid':c_vid,
                'title':c_title
            }

    def add(self,vid,flv_url, web_url, title):
        """
        Record (or replace) the entry for video ID `vid` and immediately
        write the whole cache back to disk.
        """
        self.cache[vid] = {
            'flv':flv_url,
            'web':web_url,
            'vid':vid,
            'title':title
        }
        self.save()

    def save(self):
        """
        Rewrite the cache file from self.cache, one line per episode,
        ordered by numeric video ID.

        Raises ValueError if a video ID is not an integer string.
        """
        entries = sorted(self.cache.items(), key=lambda x: int(x[0]))
        out = "".join([
            "%s|%s|%s|%s\n" % (vid, values['flv'], values['web'], values['title'])
            for vid, values in entries
        ])
        f = open(self.cache_file, "w+")
        try:
            f.write(out)
        finally:
            f.close()
# end ZpCacher
 
 
class error_sitechange(Exception):
    """The FLV URL could not be located in the fetched page or response."""


class error_invalidurl(Exception):
    """The supplied URL does not look like an escapistmagazine.com video URL."""


class error_connection(Exception):
    """Not raised anywhere in this file; presumably reserved for network failures."""
 
class EscapistVideo:
    """
    Takes an EscapistMagazine /videos/view/... URL, retrieves the URL for the .flv file

    # Initialise
    t = EscapistVideo("http://www.escapistmagazine.com/videos/view/zero-punctuation/175-Ninja-Gaiden-2")
    # Get the URL
    t.get_flv_url()
    # Get the video ID
    t.get_vid()

    Working as of Oct 16, 2009 (flashvar="config=http://" system, and resolve 301 redirect to this URL)
    """
    def __init__(self, url):
        self.url = url

    def _parse_escapist_url(self):
        """
        Return the numeric video ID (as a string) embedded in self.url.

        Raises error_invalidurl if self.url is not an
        http(s)://[www.]escapistmagazine.com/videos/view/<show>/<id>-... URL.
        """
        # Raw string with escaped dots, so only the real host name matches
        vid_check = re.match(
            r"https?://(?:www\.)?escapistmagazine\.com/videos/view/.+?/(\d+)-",
            self.url)
        if vid_check:
            return vid_check.group(1)
        raise error_invalidurl("%s" % self.url)

    def _get_flv_link(self, url, postdata):
        """
        POST `postdata` to `url` and return the url-decoded text following
        the first "url=" in the response.

        Not called anywhere in this file; presumably kept from an earlier
        FLV retrieval scheme.

        Raises error_sitechange if the response contains no "url=".
        """
        src = cached_opener.open(url, postdata).read()
        if "url=" not in src:
            raise error_sitechange("Couldn't find the FLV url on %s, check it is a valid URL you supplied. If so, the FLV retrival system may have changed!" % (url))
        return urllib.unquote(str( # url decode..
            src.split("url=")[1] # ..the segment after the url=
        ))

    def get_vid(self):
        """
        Return the video ID from the URL.

        Raises error_invalidurl for a URL that is not a video page URL.
        """
        return self._parse_escapist_url()

    def get_flv_url(self):
        """
        Fetch the video page and follow its player config to the final
        .flv URL, which is returned.

        Raises error_invalidurl if self.url is not a video page URL, and
        error_sitechange if the page or the player config no longer has
        the expected structure.
        """
        # Check URL
        self._parse_escapist_url()

        src = cached_opener.open(self.url).read()
        soup = BeautifulSoup(src)

        # Extract player from the soup. A missing div, a missing <embed>, a
        # missing flashvars attribute or a flashvars without "config=" all
        # mean the page layout changed.
        try:
            vid_player = soup.findAll('div', id="video_player")
            embed = vid_player[0].find('embed')
            config_url = embed['flashvars'].split("config=")[1]
        except (IndexError, KeyError, TypeError):
            raise error_sitechange("Couldn't find the video player config on %s, the page layout may have changed!" % (self.url))

        # Got flashvars config path, load it
        config = cached_opener.open(config_url).read()

        # Ew. The contents doesn't parse as JSON, so this is necessary:
        # take the value of the second {'url':'...'} entry.
        try:
            flv_teller_url = config.split("{'url':'")[2].split("'")[0]
        except IndexError:
            raise error_sitechange("Couldn't find the FLV url in %s, the player config format may have changed!" % (config_url))

        # Deliberately not using cached_opener here: only the final URL
        # after redirects is wanted, the body is never read.
        webp = urllib2.urlopen(flv_teller_url)
        try:
            return webp.url
        finally:
            webp.close()
 
 
#end EscapistVideo
 
def parse_page_for_videos(zpc, soup):
    """
    Takes a BeautifulSoup instance of an escapistmagazine page,
    grabs all filmstrip_video div's from the gallery_display div.

    From each filmstrip_video div, it grabs the title, and URL.
    Using the URL, it grabs the video-ID, checks if the ZpCacher
    knows the FLV already, if not, finds the flv url and adds it to zpc.

    Returns a tuple (flv_requests, cache_hits): how many FLV URLs had to be
    looked up, and how many videos were already in the cache.

    Raises error_sitechange if the gallery_display div is missing.

    Working as of Sep 3, 2009. May break due to page layout changes.
    """
    # counters
    cache_hits = 0
    flv_requests = 0

    video_column = soup.find('div',{'id':'gallery_display'})
    if video_column is None:
        raise error_sitechange("Couldn't find the gallery_display div, the page layout may have changed!")

    # Find all div class='filmstrip_video', loop over each one
    for cv in video_column.findAll('div', {'class':'filmstrip_video'}):
        # Get title and the URL
        title = cv.findAll('div',{'class':'title'})[0].contents[0]
        href = cv.a['href']
        # Absolute links (http or https) are used as-is, anything else is
        # taken to be a path relative to the site root.
        if href.startswith("http://") or href.startswith("https://"):
            web_url = href
        else:
            web_url = "http://www.escapistmagazine.com" + href

        z = EscapistVideo(web_url)
        vid = z.get_vid()

        if vid in zpc.cache:
            cache_hits += 1
        else:
            # Get the flv URL!
            flv_url = z.get_flv_url()
            flv_requests += 1

            zpc.add(vid,flv_url, web_url, title)

    return flv_requests, cache_hits
    
def get_recent_zp_videos(get_all=False):
    """
    This is the main ZP-grabber function.

    It parses the first page of videos (and the rest, if get_all is true),
    storing any newly found FLV URLs in the ZpCacher file.
    It gets the page count using the pagination_pages div.

    Prints a one-line summary only when at least one new FLV URL was
    requested. Returns nothing.

    Raises error_sitechange if get_all is true and the pagination_pages
    div is missing.

    Working as of April 6, 2009. Getting all pages
    could break due to layout changes.
    """
    zpc = ZpCacher()

    # Load the newest ZP page (once), into BeautifulSoup
    url = "http://www.escapistmagazine.com/videos/view/zero-punctuation"
    first_page = BeautifulSoup(cached_opener.open(url).read())

    # Always parse first page
    flv_requests, cache_hits = parse_page_for_videos(zpc, first_page)

    if get_all:
        pagination = first_page.findAll('div',{'class':'pagination_pages'})
        if not pagination:
            raise error_sitechange("Couldn't find the pagination_pages div, the page layout may have changed!")

        for page in pagination[0].findAll('a'):
            label = page.contents[0]
            # Only numbered links are pages; page 1 was parsed above
            if label.isdigit() and int(label) > 1:
                url = "http://www.escapistmagazine.com/videos/view/zero-punctuation?page=%d" % (int(label))
                soup = BeautifulSoup(cached_opener.open(url).read())
                fr, ch = parse_page_for_videos(zpc, soup)
                flv_requests += fr
                cache_hits += ch

    # If we don't request any new FLV files, stay quiet
    if flv_requests > 0:
        # FLV's have been requested, display how many (this means new videos were grabbed!)
        print("Parsed %d videos (%d requests, %d cache hits)" % (flv_requests + cache_hits, flv_requests, cache_hits))
 
def main():
    """
    Either parses escapistmagazine.com for ZP episodes, and grabs their flv URL
    or
    Takes one (or more) ZP episode URLs, and prints the flv URL of each.

    The site is parsed when -g/--grab or -a/--all is given, or when no URLs
    are supplied on the command line; otherwise the URLs are looked up.
    Exits with status 1 on the first URL that is not a valid video URL.
    """
    parser = OptionParser()
    # default must be a real boolean: the former default="true" string was
    # always truthy, which made the URL mode below unreachable.
    parser.add_option("-g", "--grab", dest="grab", action="store_true", default=False,
                      help="retrieves and caches flv-links from escapistmagazine.com's ZP page (overrides)")
    parser.add_option("-a", "--all", dest="all", action="store_true", default=False,
                      help="retrieves videos from all Zero-Punctuation-list pages")

    (options, args) = parser.parse_args()

    # Grabbing stays the default when no URLs are given, and the grab
    # options take precedence over any URLs.
    if options.grab or options.all or not args:
        get_recent_zp_videos(options.all)
        return

    # Load the cache once, not once per URL
    zpc = ZpCacher()
    for cur_url in args:
        z = EscapistVideo(cur_url)
        try:
            vid = z.get_vid()
        except error_invalidurl:
            print("Invalid URL?")
            sys.exit(1)

        if vid in zpc.cache:
            print(zpc.cache[vid]['flv'])
        else:
            print(z.get_flv_url())
 
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()