lethain / bossarray

A Python list-like wrapper around the Yahoo BOSS search results.

This URL has Read+Write access

bossarray / boss_array.py
9513cf96 » lethain 2008-07-27 First commit. Uses linear s... 1 from yos.boss import ysearch
2 from yos.yql import db
3
class BossArray(object):
    """Lazy, list-like view over the search results for one search term.

    Results are fetched on demand through ``ysearch.search`` and remembered
    in ``self.retrieved`` so that an index is never requested twice.

    Attributes:
        term: the search term passed to the constructor.
        retrieved: list of ``(index, result)`` tuples, kept ordered by index.
        length: total hit count reported by the last search response, or
            ``None`` until the first download has happened.
    """

    # Largest number of results asked for in a single search request.
    MAX_RESULTS_PER_REQUEST = 50

    def __init__(self, term):
        """Create an empty array for ``term``; nothing is downloaded yet."""
        self.term = term
        self.retrieved = []
        self.length = None

    @staticmethod
    def _by_index(pair):
        """Sort key: order ``(index, result)`` tuples by index only."""
        return pair[0]

    @staticmethod
    def _find_runs(indexes):
        """Group ascending indexes into ``(start, count)`` consecutive runs.

        ``[0, 1, 5]`` becomes ``[(0, 2), (5, 1)]``.
        """
        runs = []
        run_start = None
        prev = None
        for index in indexes:
            if run_start is None:
                run_start = index
            elif index != prev + 1:
                # Gap found: close the run at the previous index, not at the
                # one that broke it.
                runs.append((run_start, prev - run_start + 1))
                run_start = index
            prev = index
        if run_start is not None:
            # The trailing run (possibly a single index) must not be dropped.
            runs.append((run_start, prev - run_start + 1))
        return runs

    def _cache(self, start, results, sort=True):
        """Remember ``results`` as occupying indexes ``start, start+1, ...``.

        Pass ``sort=False`` when several batches are added in a row and the
        caller will sort ``self.retrieved`` once at the end.
        """
        self.retrieved.extend(
            (start + offset, result) for offset, result in enumerate(results)
        )
        if sort:
            # Key on the index so result objects are never compared.
            self.retrieved.sort(key=self._by_index)

    def _cached_portion(self, start, count):
        """Split ``[start, start+count)`` into cached and missing indexes.

        Returns:
            ``(cached, not_cached)`` where ``cached`` is a list of
            ``(index, result)`` tuples and ``not_cached`` a list of indexes,
            both in ascending order. A stored ``None`` counts as not cached.
        """
        # Build the lookup once instead of scanning the list per index;
        # reversing first keeps the earliest entry if an index repeats.
        lookup = dict(reversed(self.retrieved))
        cached = []
        not_cached = []
        for index in range(start, start + count):
            value = lookup.get(index)
            if value is not None:
                cached.append((index, value))
            else:
                not_cached.append(index)
        return cached, not_cached

    def _download(self, start, count, sort=True):
        """Fetch ``count`` results beginning at ``start`` and cache them.

        The request is split into pages of at most
        ``MAX_RESULTS_PER_REQUEST`` results. ``self.length`` is refreshed
        from the ``totalhits`` field of the last response.

        Returns:
            The downloaded rows, in order, as a list.
        """
        page_size = self.MAX_RESULTS_PER_REQUEST
        rows = []
        total = None
        for offset in range(0, count, page_size):
            pos = start + offset
            num_results = min(count - offset, page_size)
            data = ysearch.search(self.term, start=pos, count=num_results)
            total = data['ysearchresponse']['totalhits']
            # NOTE(review): assumes ``.rows`` is a list-like of result rows.
            page = db.create(data=data).rows
            # Cache each page at its own offset so a short page cannot shift
            # the indexes of the pages that follow it.
            self._cache(pos, page, sort=False)
            rows.extend(page)

        if total is not None:
            # Only touch the length when a request was actually made.
            self.length = int(total)
        if sort:
            self.retrieved.sort(key=self._by_index)
        return rows

    def __getitem__(self, i):
        """Return one result (integer index) or a tuple of results (slice).

        Missing results are downloaded and cached. Negative indexes and
        open-ended slices are resolved against ``len(self)``, which may
        trigger a request of its own.

        Raises:
            IndexError: if an integer index has no result.
            ValueError: if a slice has a zero or negative step.
        """
        if isinstance(i, slice):
            return self._get_slice(i)

        if i < 0:
            i += len(self)
            if i < 0:
                raise IndexError("BossArray index out of range")
        cached, _ = self._cached_portion(i, 1)
        if cached:
            return cached[0][1]  # [(index, value)]
        rows = self._download(i, 1)
        if not rows:
            raise IndexError("BossArray index out of range")
        return rows[0]

    def _get_slice(self, key):
        """Resolve a slice object to a tuple of results."""
        step = 1 if key.step is None else key.step
        if step <= 0:
            raise ValueError("BossArray slices require a positive step")

        start = 0 if key.start is None else key.start
        stop = key.stop
        if stop is None or stop < 0 or start < 0:
            # Only ask for the length when the bounds really depend on it.
            total = len(self)
            if stop is None:
                stop = total
            elif stop < 0:
                stop = max(stop + total, 0)
            if start < 0:
                start = max(start + total, 0)

        cached, not_cached = self._cached_portion(start, stop - start)
        downloaded = []
        for run_start, run_len in self._find_runs(not_cached):
            rows = self._download(run_start, run_len, sort=False)
            downloaded.extend(zip(range(run_start, run_start + run_len), rows))
        if downloaded:
            # Sorting was deferred while downloading so that all the new
            # entries are sorted once.
            self.retrieved.sort(key=self._by_index)

        merged = sorted(cached + downloaded, key=self._by_index)
        return tuple(value for _, value in merged)[::step]

    def __len__(self):
        """Return the total hit count, fetching one result if still unknown."""
        if self.length is None:
            self._download(0, 1)
        return self.length
115
116