rha7dotcom / subdivx

Subtitle downloader for http://www.subdivx.com/

This URL has Read+Write access

subdivx / subdivxclient.py
100644 162 lines (127 sloc) 5.83 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env python
#
# This file is part of SubDivX.
#
# SubDivX is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License.
#
# SubDivX is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with SubDivX, if not, see <http://www.gnu.org/licenses/>.
#
# See the LICENSE file.
#
 
import re
import urllib
import urllib2
import lxml.html
import os.path
import tempfile
 
class SubDivXClient(object):
  """
  Client for downloading Spanish subtitles from http://www.subdivx.com/

  This code targets Python 2 (it relies on urllib2 and unicode()).

  Examples:
    # List of dictionaries of results
    # [ {'title': title, 'link': link, 'description': description, 'cds': cds}, ... ]
    subs = SubDivXClient().search('The Matrix').get_results()
    # Path to the downloaded subtitles archive file, i.e. /tmp/subdivx/32425.rar
    filepath = SubDivXClient().search('The Matrix').fetch(0)
  """

  # Extracts the number of CDs from the serialized "datos" block of a result.
  _CDS_RE = re.compile(r'Cds:</b> ([0-9]+)')

  def __init__(self):
    # Filled by search(); one dictionary per search hit.
    self.results = []

  def get_results(self):
    """
    Returns the results list
    [ {'title': title, 'link': link, 'description': description, 'cds': cds}, ... ]
    """
    return self.results

  @staticmethod
  def _parse_html(str_raw):
    """Parses a raw page body (undecodable bytes are dropped) and returns its root element."""
    return lxml.html.fromstring(unicode(str_raw, errors='ignore')).getroottree().getroot()

  @staticmethod
  def _parse_cds(str_html):
    """Returns the CD count found in str_html as a string, or None when absent."""
    match = SubDivXClient._CDS_RE.search(str_html)
    return match.group(1) if match else None

  @staticmethod
  def _filename_from_header(str_header):
    """
    Returns the bare file name announced by a Content-Disposition header value.
    Quotes are stripped and any directory part is discarded, so the name can
    never point outside the directory it is later joined with.
    Raises IOError when no usable file name is present.
    """
    for param in (str_header or '').split(';')[1:]:
      key, sep, value = param.partition('=')
      if not sep or key.strip().lower() != 'filename':
        continue
      name = value.strip().strip('"\'')
      # Treat backslashes as separators too, so Windows-style paths are
      # reduced to their last component on every platform.
      name = os.path.basename(name.replace('\\', '/'))
      if name and name not in ('.', '..'):
        return name
    raise IOError("Response carries no usable file name in its content-disposition header")

  @staticmethod
  def _ensure_out_dir():
    """
    Returns {SYSTEM_TEMP_DIR}/subdivx, creating it when it doesn't exist.
    Raises IOError when that path exists but is not a directory.
    """
    out_dir = os.path.join(tempfile.gettempdir(), 'subdivx')
    if not os.path.isdir(out_dir):
      if os.path.exists(out_dir):
        raise IOError("Temp file subdivx couldn't be created or already exists as a file or with wrong permissions")
      os.makedirs(out_dir)
    return out_dir

  def search(self, str_terms, page=1):
    """
    Makes a search for subtitles and stores the hits in self.results,
    replacing any previous results.
    Returns self for chaining.
    str_terms -- The string to search for at subdivx.com
    page -- The page number to fetch (defaults to the first page)
    """

    # The magic url for searching
    str_url = "http://www.subdivx.com/index.php?buscar=%s&accion=5&masdesc=1&subtitulos=1&realiza_b=1" % urllib.quote(str_terms)

    # If not the first page, add the page parameter.
    if page > 1:
      str_url += "&pg=" + str(page)

    # Fetch the search results page, releasing the connection even if reading fails.
    response = urllib.urlopen(str_url)
    try:
      doc = self._parse_html(response.read())
    finally:
      response.close()

    # Titles and links come from the same anchor elements.
    titles = []
    links = []
    for link in doc.cssselect("a.titulo_menu_izq"):
      titles.append(link.text_content())
      links.append(link.get('href'))

    # Now the descriptions
    descriptions = [desc.text_content() for desc in doc.cssselect("div#buscador_detalle_sub")]

    # The number of cds; an entry without that field yields None so the
    # list stays aligned with the others.
    cds = [self._parse_cds(lxml.etree.tostring(dato)) for dato in doc.cssselect("div#buscador_detalle_sub_datos")]

    # Consolidate the parallel lists into a list of dictionaries. zip() stops
    # at the shortest list, so an incomplete page gives fewer results instead
    # of an IndexError half way through.
    self.results = [
      {'title': title, 'link': href, 'description': description, 'cds': num_cds}
      for title, href, description, num_cds in zip(titles, links, descriptions, cds)
    ]

    # And return self, for chaining, see SubDivXClient class doc.
    return self

  def fetch(self, int_num):
    """
    Fetches the selected result from results (which you fill up by calling search()),
    and returns the path to the downloaded file stored at {SYSTEM_TEMP_DIR}/subdivx/
    You MUST call search first.
    int_num -- The results index number, starting from 0 (zero), from results array
    Raises IndexError when int_num is not a valid results index or the result
    page lacks the expected hidden form fields, and IOError when the download
    has no usable file name or the output directory can't be used.
    """

    link = self.results[int_num]['link']

    # Request the result page holding the captcha form for the selected result.
    # From here we get the 'desub' and 'u' parameters needed to request the
    # actual file. A request object is needed so the Referer header can be added.
    req = urllib2.Request(link)
    req.add_header('Referer', link)

    # Fetch the above request and parse it.
    page = urllib2.urlopen(req)
    try:
      doc = self._parse_html(page.read())
    finally:
      page.close()

    # Get the 'desub' and 'u' form parameters
    desub = doc.cssselect("input[type='hidden'][name='desub']")[0].get('value')
    u = doc.cssselect("input[type='hidden'][name='u']")[0].get('value')

    # Now build a second request to fetch the actual file; the referer is the
    # result page fetched above.
    req = urllib2.Request("http://www.subdivx.com/bajar.php?captcha_user=ME2&idcaptcha=1007&desub=%s&u=%s" % (desub, u))
    req.add_header('Referer', link)

    # And open the subtitle actual file url
    subs = urllib2.urlopen(req)
    try:
      # subdivx.com returns the subtitles as rars and zips, so the file name
      # has to come from the content-disposition header.
      fn = self._filename_from_header(subs.info().get('content-disposition'))
      out_file = os.path.join(self._ensure_out_dir(), fn)

      # Write the downloaded data in binary mode; the with block closes the
      # file even when the read or the write fails.
      with open(out_file, 'wb') as of:
        of.write(subs.read())
    finally:
      subs.close()

    # And return the saved subtitle file path.
    return out_file