/
querycouch.py
78 lines (57 loc) · 2.6 KB
/
querycouch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import sys
import getopt
import os
import csv
import cPickle
import gzip
import theano
import time, PIL.Image
import couchdb
from csv_unicode_helpers import UnicodeWriter
from struct import *
from numpy import *
from couchdb.mapping import Document, LongField, DateField, FloatField, TextField, IntegerField, BooleanField
from couchdb import Server
class QueryCouch(object):
    """Thin query helper around two reduce views of a CouchDB database.

    The methods read the ``research/tokens`` and ``research/all`` views and
    flatten the grouped rows into plain tuples.  Judging from how the keys
    are indexed here, ``research/tokens`` is keyed ``[type, token, ...]``;
    the third component is presumably a date (going by the method names) --
    confirm against the design document.
    """

    def __init__(self, server_url, database):
        """Connect to ``server_url`` and keep a handle on ``database``.

        :param server_url: URL passed straight to ``couchdb.Server``.
        :param database: name of the database; stored as ``self.db``.
        """
        couch = Server(server_url)
        self.db = couch[database]

    def tokens_overall(self):
        """Return every ``(count, type, token)`` tuple, largest first.

        Rows come from ``research/tokens`` grouped on the first two key
        components.  Tuples are ordered descending by plain tuple
        comparison, i.e. by count and then by type and token.
        """
        view = self.db.view('research/tokens', reduce=True, group_level=2)
        return sorted(
            ((row.value, row.key[0], row.key[1]) for row in view),
            reverse=True)

    def tokens_overall_by_type(self, type):
        """Return ``(count, token)`` tuples for one token type, largest first.

        :param type: value of the first key component to select.  (The name
            shadows the builtin but is kept for keyword-argument callers.)
        """
        # ``{}`` collates after every other JSON value, so [type, {}] is the
        # upper bound for exactly the keys whose first component is ``type``.
        # A string bound such as ``type + ' '`` also admits case/accent
        # variants, because CouchDB's ICU collation sorts "a" < "A" < "a ".
        view = self.db.view('research/tokens', reduce=True, group_level=2,
                            startkey=[type], endkey=[type, {}])
        # Same output as the Python 2 statement ``print 'literal', type``,
        # written so that it also parses as a function call.
        print('%s %s' % ('getting tokens of type ', type))
        return sorted(
            ((row.value, row.key[1]) for row in view),
            reverse=True)

    def top_tokens_by_date_type(self, type, limit=1000):
        """Break the most frequent tokens of ``type`` down one key level further.

        :param type: token type, as for :meth:`tokens_overall_by_type`.
        :param limit: how many of the highest-count tokens to expand
            (default 1000, the previously hard-coded value); ``None`` expands
            all of them.  One view query is issued per expanded token.
        :returns: list of ``(type, token, key[2], count, overall_count)``
            tuples, tokens in descending ``overall_count`` order and rows in
            view order within each token.
        """
        top_tokens = self.tokens_overall_by_type(type)
        results = []
        for overall_count, token in top_tokens[:limit]:
            # [type, token, {}] bounds the range to this exact token; see
            # tokens_overall_by_type for why a string end key is not enough.
            view = self.db.view('research/tokens', reduce=True, group_level=3,
                                startkey=[type, token],
                                endkey=[type, token, {}])
            results.extend(
                (row.key[0], row.key[1], row.key[2], row.value, overall_count)
                for row in view)
        return results

    def count_tweets_by_date(self):
        """Return ``(key[0], count)`` tuples from ``research/all`` in view order.

        The first key component is presumably the date -- confirm against
        the view's map function.
        """
        view = self.db.view('research/all', reduce=True, group_level=1)
        print('getting tweets by date count ')
        return [(row.key[0], row.value) for row in view]

    def count_tweets_by_hour(self):
        """Return ``(key[0], key[1], count)`` tuples from ``research/all``.

        Rows are grouped on the first two key components (presumably date
        and hour -- confirm against the view) and kept in view order.
        """
        view = self.db.view('research/all', reduce=True, group_level=2)
        print('getting tweets by hour count')
        return [(row.key[0], row.key[1], row.value) for row in view]