-
Notifications
You must be signed in to change notification settings - Fork 0
/
export-goodreads.py
96 lines (79 loc) · 2.89 KB
/
export-goodreads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""
Export GoodReads-specific data from the book data tools.
Usage:
export.py --book-ids
export.py --work-ratings [--implicit]
"""
from docopt import docopt
import pandas as pd
from bookgender.logutils import start_script
from bookgender.config import data_dir, db_uri
import bookgender.datatools as dt
_log = start_script(__file__)
def export_book_ids():
query = f'''
SELECT gr_book_rid, gr_book_id, gr_work_id, cluster
FROM gr.book_ids JOIN gr.book_cluster USING (gr_book_id)
ORDER BY gr_book_rid
'''
_log.info('reading book IDs')
books = pd.read_sql(query, db_uri())
csv_fn = data_dir / 'gr-book-ids.csv.gz'
pq_fn = data_dir / 'gr-book-ids.parquet'
_log.info('writing CSV to %s', csv_fn)
books.to_csv(csv_fn, index=False)
_log.info('writing parquet to %s', pq_fn)
books.to_parquet(pq_fn, index=False, compression='gzip')
def export_work_actions():
path = data_dir / 'GR-I' / 'work-ratings.parquet'
query = f'''
SELECT gr_user_rid AS user_id,
COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id)) AS book_id,
COUNT(rating) AS nactions,
MIN(EXTRACT(EPOCH FROM date_updated)) AS first_time,
MAX(EXTRACT(EPOCH FROM date_updated)) AS last_time
FROM gr.interaction JOIN gr.book_ids USING (gr_book_id)
GROUP BY gr_user_rid, COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id))
ORDER BY MIN(date_updated)
'''
_log.info('reading book shelf actions')
actions = dt.load_table(query, dtype={
'user': 'i4',
'item': 'i4',
'nactions': 'i4'
})
path.parent.mkdir(parents=True, exist_ok=True)
_log.info('writing ratings to %s', path)
actions.to_parquet(path, index=False)
def export_work_ratings():
path = data_dir / 'GR-E' / 'work-ratings.parquet'
query = f'''
SELECT gr_user_rid AS user_id,
COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id)) AS book_id,
MEDIAN(rating) AS rating,
(array_agg(rating ORDER BY date_updated DESC))[1] AS last_rating,
MEDIAN(EXTRACT(EPOCH FROM date_updated)) AS timestamp,
COUNT(rating) AS nratings
FROM gr.interaction JOIN gr.book_ids USING (gr_book_id)
WHERE rating > 0
GROUP BY gr_user_rid, COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id))
ORDER BY MIN(date_updated)
'''
_log.info('reading book ratings')
ratings = dt.load_table(query, dtype={
'user': 'i4',
'item': 'i4',
'rating': 'f4',
'nactions': 'i4'
})
path.parent.mkdir(parents=True, exist_ok=True)
_log.info('writing ratings to %s', path)
ratings.to_parquet(path, index=False)
args = docopt(__doc__)
if args['--book-ids']:
export_book_ids()
if args['--work-ratings']:
if args['--implicit']:
export_work_actions()
else:
export_work_ratings()