In [1]:
import gzip
import json
import re
import os
import sys
import time
import numpy as np
import pandas as pd
from itertools import islice
import psycopg2
from psycopg2.extras import execute_values
# Show floats with thousands separators (e.g. 1234.5 -> "1,234.5") whenever
# pandas renders a DataFrame in this notebook.
pd.options.display.float_format = '{:,}'.format

def batched(iterable, n):
    """Split ``iterable`` into consecutive tuples holding at most ``n`` items.

    Example: ``batched('ABCDEFG', 3)`` yields ``ABC``, ``DEF`` and ``G``.
    Every tuple has exactly ``n`` items except possibly the final one.

    Raises:
        ValueError: if ``n`` is smaller than one. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if n < 1:
        raise ValueError('n must be at least one')
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            # Source exhausted: stop without emitting an empty batch.
            return
        yield chunk

# convert string to int or none
def parseInt(string):
    """Convert a numeric string to ``int``; return ``None`` for falsy input.

    Args:
        string: Text such as ``"42"``, ``"3.0"`` or ``"1e3"``. Any falsy
            value (``""``, ``None``, ``0``) yields ``None``.

    Returns:
        The parsed integer, truncated toward zero when the text carries a
        fractional part, or ``None`` when ``string`` is falsy.

    Raises:
        ValueError: if the text is not numeric at all.
        OverflowError: if the text denotes an infinite float.
    """
    if not string:
        return None
    try:
        # Exact path: plain integer text keeps full precision. Going through
        # float first would corrupt values above 2**53.
        return int(string)
    except ValueError:
        # Decimal or exponent notation ("3.7", "1e3"): truncate toward zero.
        return int(float(string))

In [2]:
# Source database: the full local "reviews" dataset. In the cells below it is
# only read from, through `cur`.
# NOTE(review): these are hardcoded local-development credentials; move them
# to environment variables before pointing this at anything shared.
con = psycopg2.connect(
    host="localhost",
    database="reviews",
    user="postgres",
    password="postgres"
    )
cur = con.cursor()

# Target "lite" database, addressed by the connection string in NEON_URL.
# Only one connection is opened here: opening a local reviews_lite connection
# first and then rebinding `con_lite` would leave that first connection
# dangling and would require a local database that is never used.
# A missing NEON_URL deliberately raises KeyError so writes never go to an
# unintended database.
con_lite = psycopg2.connect(os.environ['NEON_URL'])
cur_lite = con_lite.cursor()

In [None]:
# For every review that has an embedding in the lite database, copy the
# review's book, that book's work, and the book's authors from the source
# database (`cur`) into the lite database (`cur_lite`), skipping rows that
# are already present. Everything is committed once at the end, so a failure
# part-way through leaves the lite database unchanged.
#
# All values are passed as bound parameters rather than spliced into the SQL
# text with Python's % operator.
cur_lite.execute('select id from review_embed')
review_ids = cur_lite.fetchall()

for review_id in review_ids:
    # review_id is the 1-tuple row from fetchall(); it doubles as the
    # parameter sequence for this query.
    cur.execute("select * from review where id = %s", review_id)
    review = cur.fetchone()
    # review[1] is used as the id of the reviewed book.
    cur.execute('select * from book where id = %s', (review[1],))
    book = cur.fetchone()

    cur.execute('select id, name from author left join authorForBook on author.id = authorForBook.authorId where bookId = %s', (book[0],))
    authors = cur.fetchall()

    cur.execute('select * from authorForBook where bookId = %s', (book[0],))
    authorsForBook = cur.fetchall()

    # book[1] is used as the id of the work the book belongs to.
    cur.execute('select * from work where id = %s', (book[1],))
    work = cur.fetchone()

    cur_lite.execute('select * from work where id = %s', (work[0],))
    work_lite = cur_lite.fetchone()

    if work_lite is None:
        cur_lite.execute('insert into work values (%s, %s, %s)', work)

    cur_lite.execute('select * from book where id = %s', (book[0],))
    book_lite = cur_lite.fetchone()

    if book_lite is None:
        cur_lite.execute('insert into book values (%s, %s, %s, %s, %s, %s, %s, %s)', book)

        # Back-fill an empty work title from this book's title (book[3]).
        # NOTE(review): book[7] is presumably the bestOfWork flag - confirm
        # against the book table's column order.
        if book[7] and work[1] == '':
            cur_lite.execute('update work set title = %s where id = %s', (book[3], work[0]))

        for author in authors:
            cur_lite.execute('select * from author where id = %s', (author[0],))
            author_lite = cur_lite.fetchone()

            if author_lite is None:
                cur_lite.execute('insert into author values (%s, %s)', author)

        for authorForBook in authorsForBook:
            cur_lite.execute('insert into authorForBook values (%s, %s, %s)', authorForBook)
con_lite.commit()

In [None]:
# Create one (review id, book id) row in the lite `review` table for every
# review that has an embedding. The book id is looked up in the source
# database, so a review whose book is missing there fails loudly instead of
# inserting a dangling reference.
cur_lite.execute('select id from review_embed')
review_ids = cur_lite.fetchall()
for review_id in review_ids:

    # review_id is the 1-tuple row from fetchall(); it doubles as the
    # parameter sequence for this query.
    cur.execute("select * from review where id = %s", review_id)
    review = cur.fetchone()
    # review[1] is used as the id of the reviewed book; bind it instead of
    # formatting it into the SQL text.
    cur.execute('select * from book where id = %s', (review[1],))
    book = cur.fetchone()

    # Pass the scalar id, not the row tuple: a tuple parameter only works by
    # accident of how psycopg2 renders one-element tuples.
    cur_lite.execute('insert into review values (%s, %s)', (review_id[0], book[0]))

con_lite.commit()

In [None]:
# Extend the lite `book` table with the two text columns filled in by the
# next cell. `if not exists` keeps the cell re-runnable: without it a second
# run aborts with "column already exists".
cur_lite.execute('alter table book add column if not exists image text')
cur_lite.execute('alter table book add column if not exists description text')
con_lite.commit()

In [None]:
# Fill the lite `book.image` and `book.description` columns from the source
# database, one book at a time, then commit once at the end.
cur_lite.execute('select id from book')
book_ids = cur_lite.fetchall()
for book_id in book_ids:
    # book_id is the 1-tuple row from fetchall(); here it is the parameter
    # sequence for the query.
    cur.execute('select image, description from book where id = %s', book_id)
    book = cur.fetchone()
    # In the update the id is a single value among several, so pass the
    # scalar (book_id[0]) rather than the row tuple.
    cur_lite.execute('update book set image = %s, description = %s where id = %s', (book[0], book[1], book_id[0]))
con_lite.commit()

In [None]:
# Imported here rather than in the first cell, so the database cells above
# can run without sentence_transformers being loaded.
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used to encode the search queries in the cells
# below; those queries cast the encoded vector to vector(384).
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

In [None]:
# Semantic search: take the 1000 review embeddings nearest to the query text,
# group them by work, and list the 20 works with the highest summed score,
# with each work's review count and the url of its best-of-work book.
#
# NOTE: the column aliased `distance` holds 1 - (embedding <=> query), so a
# larger value means a closer match (`<=>` is pgvector's cosine-distance
# operator, assuming pgvector provides the `vector` type here).
query_embedding = model.encode('interesting worlds in space')
# The embedding is bound as a parameter (its list text, cast to vector by
# Postgres) instead of being formatted into the SQL string.
cur_lite.execute('''
            with constants (query_embedding) as (
                values (%s::vector(384))
            )
            select res.title, count(*), sum(distance), res.url from (
                select work.id, work.title, 1 - (embedding <=> query_embedding) as distance, b.url
                from constants, review_embed
                left join review on review_embed.id = review.id
                left join book on review.bookId = book.id
                left join work on book.workId = work.id
                left join book as b on work.id = b.workId and b.bestOfWork = true
                order by embedding <=> query_embedding
                limit 1000) as res
            group by res.id, res.title, res.url
            order by sum(distance) desc
            limit 20
''', (str(query_embedding.tolist()),))

df = pd.DataFrame(cur_lite.fetchall())
df

In [None]:
# Semantic search using a full review text as the query. This variant also
# returns the description, url, image, year and author name/role attached to
# each work's best-of-work book, and keeps the top 5 grouped rows.
#
# NOTE: the column aliased `distance` holds 1 - (embedding <=> query), so a
# larger value means a closer match. Author name and role are part of the
# grouping key, so a work with several authors produces one row per author.
query_embedding = model.encode("F-ing brilliant. The storytelling...the images...I just can't get over it. And I loved the guest appearances by Chaucer and Shakespeare!")
# The embedding is bound as a parameter (its list text, cast to vector by
# Postgres) instead of being formatted into the SQL string.
cur_lite.execute('''
                    with constants (query_embedding) as (
                        values (%s::vector(384))
                    )
                    select res.title, res.description, res.url, res.image, res.year, res.name, res.role from (
                        select work.id, work.title, 1 - (embedding <=> query_embedding) as distance, b.description, b.url, b.image, work.year, author.name, authorForBook.role
                        from constants, review_embed
                        left join review on review_embed.id = review.id
                        left join book on review.bookId = book.id
                        left join work on book.workId = work.id
                        left join book as b on work.id = b.workId and b.bestOfWork = true
                        left join authorForBook on b.id = authorForBook.bookId
                        left join author on authorForBook.authorId = author.id
                        order by embedding <=> query_embedding
                        limit 1000) as res
                    group by res.id, res.title, res.description, res.url, res.image, res.year, res.name, res.role
                    order by sum(distance) desc
                    limit 5;
''', (str(query_embedding.tolist()),))

df = pd.DataFrame(cur_lite.fetchall())
df

In [None]:
# Sanity check: encode the text of a review and fetch the single nearest
# stored embedding, returning its work id, work title, book id and score.
# Inner joins are used, so an embedding without a matching review, book or
# work produces no row.
query_embedding = model.encode("I had no expectations heading in to this book, it just caught my eye at the library a couple of weeks back. I'm glad it did because it was a heartwarming story with likeable characters. The touch of fantasy and a cute dog helped a lot, too. I felt the religious turnaround was a little too easy/casual, but it made for a good read, so I'll go with it. ")
# The embedding is bound as a parameter (its list text, cast to vector by
# Postgres) instead of being formatted into the SQL string.
cur_lite.execute('''
            with constants (query_embedding) as (
                values (%s::vector(384))
            )
            select work.id, work.title, book.id, 1 - (embedding <=> query_embedding) as distance
            from constants, review_embed
            inner join review on review_embed.id = review.id
            inner join book on review.bookId = book.id
            inner join work on book.workId = work.id
            order by embedding <=> query_embedding
            limit 1
''', (str(query_embedding.tolist()),))

df = pd.DataFrame(cur_lite.fetchall())
df

In [None]:
# The query cells above only read data; this commit presumably just ends the
# transaction they left open on the lite connection.
con_lite.commit()

# Set an author string for each book

In [None]:
# Build a display string such as "Name A, Name B (Illustrator)" for every
# best-of-work book and store it in book.authors. Authors are ordered by
# role, so those with an empty role come first.
# NOTE(review): assumes the `authors` column already exists on the lite
# `book` table; the cells shown here do not create it.
# The update is committed by the following cell.
cur_lite.execute('select id, bestOfWork from book')
book_ids = cur_lite.fetchall()
for (book_id, best) in book_ids:
    if not best:
        continue
    # Bind the book id instead of formatting it into the SQL text.
    cur_lite.execute('select name, role from author left join authorForBook on author.id = authorForBook.authorId where bookId = %s', (book_id,))
    authors = cur_lite.fetchall()
    # Treat a NULL role like an empty one: None cannot be ordered against
    # strings and would otherwise be rendered as "(None)".
    authors = sorted(authors, key=lambda x: x[1] or '')
    author_string = ', '.join(
        f'{name} ({role})' if role else f'{name}'
        for (name, role) in authors
    )
    if len(authors) > 1:
        print(len(authors), author_string)
    cur_lite.execute('update book set authors = %s where id = %s', (author_string, book_id))


In [7]:

# Persist the book.authors updates made by the previous cell.
con_lite.commit()

# Set series for works

In [13]:
# Move series information out of work titles: a title such as
# "Clockwork Angel (The Infernal Devices, #1)" becomes title
# "Clockwork Angel" and series "The Infernal Devices, #1".
cur_lite.execute('select id, title from work')
works = cur_lite.fetchall()

# regex matching text within brackets, if it contains a hash
pattern = re.compile(r'\(([^)]*#[^)]*)\)')

for (work_id, title) in works:
    if title is None:
        # Nothing to parse; pattern.search(None) would raise TypeError.
        continue
    match = pattern.search(title)
    if match:
        # group(1) is the bracket content without the surrounding parentheses.
        series = match.group(1)
        # Drop the bracketed part, then normalise whitespace so no trailing
        # or doubled space is left where it used to be.
        title = ' '.join(title.replace(match.group(0), '').split())
        cur_lite.execute('update work set title = %s, series = %s where id = %s', (title, series, work_id))
        print(work_id, title, series)
con_lite.commit()

6674837 Clockwork Angel  The Infernal Devices, #1
19101555 Three Parts Dead  Craft Sequence, #1
26628578 The Atlantis Plague  The Origin Mystery, #2
2226612 In the Ocean of Night  Galactic Center, #1
760678 Secrets of a Summer Night  Wallflowers, #1
24064584 The Rogue's Proposal  House of Trent, #2
39907020 Vicious Cycle  Vicious Cycle, #1
2146233 Monday Mourning  Temperance Brennan, #7
52265970 Joseph: The Other Father  Intrepid Men of God #5
16813814 Fifty Shades Darker  Fifty Shades, #2
42246829 Chayot  Secrets and Sins, #4
6877681 Powerful Greek, Unworldly Wife  Innocent Wives #1
19256196 Seeking Pack Redemption  Pack, #3
40901308 Bender  The Core Four, #1
25332937 Night of the Hunter  Companions Codex, #1; Legend of Drizzt, #25
44486610 Savage Fire  Savage Angels MC, #2
15830154 The Hunt  The Hunt, #1
42713958 The Dark Forest  Remembrance of Earth’s Past, #2
6838626 The Iron King  The Iron Fey, #1
6171458 Catching Fire  The Hunger Games, #2
14533180 Confessions of an Improper Brid