In [1]:
import gzip
import json
import re
import os
import sys
import time
import numpy as np
import pandas as pd
from itertools import islice
import psycopg2
from psycopg2.extras import execute_values
# Display floats in pandas output using the '{:,}' format spec
# (thousands separators, no fixed number of decimals).
pd.options.display.float_format = '{:,}'.format

def batched(iterable, n):
    """Yield successive tuples of up to ``n`` items drawn from ``iterable``.

    Every tuple has exactly ``n`` items except possibly the final one, which
    holds whatever is left over.  Example: batched('ABCDEFG', 3) --> ABC DEF G

    Raises ValueError if ``n`` is smaller than one.  Because this is a
    generator, the check runs when iteration starts, not at call time.
    """
    if n < 1:
        raise ValueError('n must be at least one')
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            # The underlying iterator is exhausted.
            return
        yield chunk

def parseInt(string):
    """Convert a numeric string to an int, or return None for empty input.

    Falsy input ('' or None) yields None.  Plain integer strings are parsed
    directly with int() so that large values keep full precision; anything
    int() rejects (e.g. '3.7' or '1e3') goes through float() and is truncated
    toward zero, as before.

    Raises ValueError if the string is not numeric at all.
    """
    if not string:
        return None
    try:
        # Going through float() first would round integers above 2**53.
        return int(string)
    except ValueError:
        return int(float(string))

In [2]:
def connect_db(database):
    """Open a psycopg2 connection to ``database``.

    Host, user and password can be overridden with the environment variables
    REVIEWS_DB_HOST, REVIEWS_DB_USER and REVIEWS_DB_PASSWORD, so real
    credentials do not have to be written into the notebook.  When a variable
    is unset, the local development defaults used so far apply.
    """
    return psycopg2.connect(
        host=os.environ.get("REVIEWS_DB_HOST", "localhost"),
        database=database,
        user=os.environ.get("REVIEWS_DB_USER", "postgres"),
        password=os.environ.get("REVIEWS_DB_PASSWORD", "postgres"),
    )


# Full source database.
con = connect_db("reviews")
cur = con.cursor()

# Reduced ("lite") database that the cells below populate and query.
con_lite = connect_db("reviews_lite")
cur_lite = con_lite.cursor()

In [None]:
# For every review id listed in review_embed (lite database), copy the
# review's book, that book's work, its authors and its author-book links from
# the source database into the lite database, skipping rows already present.
cur_lite.execute('select id from review_embed')
review_ids = cur_lite.fetchall()

# `with con_lite` commits once the loop has finished and rolls back if any
# statement raises, so a failed run does not leave a half-copied transaction.
with con_lite:
    for (review_id,) in review_ids:
        # All values are passed as bound parameters rather than being
        # formatted into the SQL text.
        cur.execute('select * from review where id = %s', (review_id,))
        review = cur.fetchone()
        if review is None:
            raise LookupError(f'review {review_id} not found in source database')

        # review[1] is used as the id of the reviewed book.
        cur.execute('select * from book where id = %s', (review[1],))
        book = cur.fetchone()
        if book is None:
            raise LookupError(f'book {review[1]} (review {review_id}) not found in source database')

        cur.execute('select id, name from author left join authorForBook on author.id = authorForBook.authorId where bookId = %s', (book[0],))
        authors = cur.fetchall()

        cur.execute('select * from authorForBook where bookId = %s', (book[0],))
        authorsForBook = cur.fetchall()

        # book[1] is used as the id of the work the book belongs to.
        cur.execute('select * from work where id = %s', (book[1],))
        work = cur.fetchone()
        if work is None:
            raise LookupError(f'work {book[1]} (book {book[0]}) not found in source database')

        cur_lite.execute('select * from work where id = %s', (work[0],))
        work_lite = cur_lite.fetchone()

        if work_lite is None:
            cur_lite.execute('insert into work values (%s, %s, %s)', work)

        cur_lite.execute('select * from book where id = %s', (book[0],))
        book_lite = cur_lite.fetchone()

        # Authors and links are only copied together with a newly inserted
        # book, which keeps the link rows from being inserted twice.
        if book_lite is None:
            cur_lite.execute('insert into book values (%s, %s, %s, %s, %s, %s, %s, %s)', book)

            # When book[7] is truthy and the work's title (work[1]) is empty,
            # use book[3] as the work title.
            # NOTE(review): book[7] presumably flags the work's main edition - confirm.
            if book[7] and work[1] == '':
                cur_lite.execute('update work set title = %s where id = %s', (book[3], work[0]))

            for author in authors:
                cur_lite.execute('select * from author where id = %s', (author[0],))
                author_lite = cur_lite.fetchone()

                if author_lite is None:
                    cur_lite.execute('insert into author values (%s, %s)', author)

            for authorForBook in authorsForBook:
                cur_lite.execute('insert into authorForBook values (%s, %s, %s)', authorForBook)

In [None]:
# Insert one (review id, book id) row into the lite `review` table for every
# review id listed in review_embed.
cur_lite.execute('select id from review_embed')
review_ids = cur_lite.fetchall()

# `with con_lite` commits once the loop has finished and rolls back if any
# statement raises.
with con_lite:
    # fetchall() returns 1-tuples; unpack so the bare id is what gets bound.
    # (The original passed the whole row tuple as the value, relying on
    # psycopg2's tuple adaptation.)
    for (review_id,) in review_ids:
        cur.execute('select * from review where id = %s', (review_id,))
        review = cur.fetchone()
        if review is None:
            raise LookupError(f'review {review_id} not found in source database')

        # review[1] is used as the book id; the lookup also confirms that the
        # book exists in the source database before the review is linked to it.
        cur.execute('select * from book where id = %s', (review[1],))
        book = cur.fetchone()
        if book is None:
            raise LookupError(f'book {review[1]} (review {review_id}) not found in source database')

        cur_lite.execute('insert into review values (%s, %s)', (review_id, book[0]))

In [3]:
# Load the sentence-embedding model that the query cells use to encode text.
from sentence_transformers import SentenceTransformer
model_name = 'all-MiniLM-L6-v2'  # model identifier passed to SentenceTransformer
model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [128]:
# Rank works by how many of the 1000 reviews nearest to the query they own.
# `<=>` is pgvector's cosine-distance operator, so the column aliased
# "distance" actually holds 1 - distance, i.e. a similarity; the outer query
# sums it per work and returns the top 20.
query_embedding = model.encode('interesting worlds in space')

# The embedding is sent as a bound parameter (its '[x, y, ...]' text form cast
# to vector) instead of being formatted into the SQL string.
cur_lite.execute('''
            with constants (query_embedding) as (
                values (%s::vector(384))
            )
            select res.title, count(*), sum(distance) from (
                select work.id, work.title, 1 - (embedding <=> query_embedding) as distance
                from constants, review_embed
                inner join review on review_embed.id = review.id
                inner join book on review.bookId = book.id
                inner join work on book.workId = work.id
                order by embedding <=> query_embedding
                limit 1000) as res
            group by res.id, res.title
            order by sum(distance) desc
            limit 20
''', (str(query_embedding.tolist()),))

# Result columns: work title, number of matching reviews, summed similarity.
df = pd.DataFrame(cur_lite.fetchall())
df

Unnamed: 0,0,1,2
0,"The Long Way to a Small, Angry Planet (Wayfare...",39,15.364284122947282
1,"Leviathan Wakes (The Expanse, #1)",36,14.031804393631887
2,The Three-Body Problem (Remembrance of Earth’s...,25,9.42056230942164
3,Dark Matter,24,9.343772217497747
4,A Short History of Nearly Everything,23,8.939953480070695
5,Seveneves,22,8.627525400744462
6,"Old Man's War (Old Man's War, #1)",19,7.399221776691522
7,"Across the Universe (Across the Universe, #1)",18,7.164454248685684
8,"Hyperion (Hyperion Cantos, #1)",17,6.522570789973612
9,Foundation (Foundation #1),15,5.708442517942179


In [109]:
# Find the single stored review whose embedding is closest to the given text
# and show the work and book it belongs to.  `<=>` is pgvector's
# cosine-distance operator, so the column aliased "distance" holds
# 1 - distance, i.e. a similarity.
query_embedding = model.encode("I had no expectations heading in to this book, it just caught my eye at the library a couple of weeks back. I'm glad it did because it was a heartwarming story with likeable characters. The touch of fantasy and a cute dog helped a lot, too. I felt the religious turnaround was a little too easy/casual, but it made for a good read, so I'll go with it. ")

# The embedding is sent as a bound parameter (its '[x, y, ...]' text form cast
# to vector) instead of being formatted into the SQL string.
cur_lite.execute('''
            with constants (query_embedding) as (
                values (%s::vector(384))
            )
            select work.id, work.title, book.id, 1 - (embedding <=> query_embedding) as distance
            from constants, review_embed
            inner join review on review_embed.id = review.id
            inner join book on review.bookId = book.id
            inner join work on book.workId = work.id
            order by embedding <=> query_embedding
            limit 1
''', (str(query_embedding.tolist()),))

# Result columns: work id, work title, book id, similarity.
df = pd.DataFrame(cur_lite.fetchall())
df

Unnamed: 0,0,1,2,3
0,47207032,The Dog That Whispered,27170158,1.0


In [107]:
# Commit the current transaction on the lite database connection.
con_lite.commit()