In [1]:
import psycopg as pg
from pgvector.psycopg import register_vector
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.decomposition import TruncatedSVD
import numpy as np
from connections import dbname, user, password, host, port

In [2]:
import timeit
class DatabaseBenchmark:
    """
    Benchmark helper for a PostgreSQL table that stores vector embeddings.

    Bundles a database connection with a fitted text vectorizer and a fitted
    dimensionality reducer, and offers small timing helpers: queries per
    second, queries per dollar, single-query latency and index build time.

    Open points kept from the original notes: decide which index type to use
    and check how everything behaves with real data. Unlike Chroma ("chromi"
    in the original note), no default vectorizer was found, so TF-IDF can be
    used instead.
    """
    def __init__(self, db_connection, vectorizer, trunc, table = 'embeddings', cost_per_hour = False):
        """
        db_connection: database connection used to open cursors
        vectorizer: text-to-vector vectorizer (must expose `transform`)
        trunc: truncated SVD that maps the sparse matrix to a lower-dimensional one
        table: name of the table that is queried and indexed
        cost_per_hour: hourly cost used by `measure_qp_dollar`; False means unknown
        """
        self.db_connection = db_connection
        self.cost_per_hour = cost_per_hour
        self.vectorizer = vectorizer
        self.trunc = trunc
        self.table = table

    def query(self, query_string, n_results = 1):
        """
        Fetch the rows of `self.table` closest to a piece of text.

        query_string: raw text; it is vectorized with `self.vectorizer` and
            reduced with `self.trunc` before being sent to the database
        n_results: number of neighbours to fetch

        Returns the list of rows produced by the cursor, ordered by the `<->`
        distance between the `embedding` column and the query vector.
        """
        reduced = self.trunc.transform(self.vectorizer.transform([query_string]))
        query_vector = reduced[0].tolist()

        # Identifiers cannot be bound as parameters, so only the table name is
        # interpolated; `self.table` must therefore come from a trusted source.
        # The vector (as its '[...]' text form) and the limit are bound values.
        statement = "SELECT * FROM {0} ORDER BY embedding <-> %s::vector LIMIT %s;".format(self.table)

        with self.db_connection.cursor() as cur:
            cur.execute(statement, (str(query_vector), n_results))
            return cur.fetchall()

    def build_index(self, method = 'ivfflat'):
        """
        Create an index on the `embedding` column of `self.table`.

        method: index access method placed after USING

        Still incomplete (original note): needs to be tried with data.
        No commit is issued here; the caller owns the transaction.
        """
        # Table and method names are interpolated as-is; both must be trusted.
        statement = 'CREATE INDEX ON {0} USING {1} (embedding vector_l2_ops);'.format(self.table, method)

        with self.db_connection.cursor() as cur:
            # Must run inside the `with` block: the cursor is closed on exit.
            cur.execute(statement)

    def measure_qps(self, query_string, num_queries=5, n_results = 1):
        """
        Run the same query `num_queries` times and return queries per second.

        query_string: raw text forwarded to `query`
        num_queries: number of repetitions to time
        n_results: number of neighbours requested on every repetition
        """
        start_time = timeit.default_timer()
        for _ in range(num_queries):
            self.query(query_string=query_string, n_results=n_results)
        elapsed = timeit.default_timer() - start_time

        # Guard against a zero reading from a coarse timer.
        if elapsed <= 0:
            return float('inf')
        return num_queries / elapsed

    def measure_qp_dollar(self, qps):
        """
        Convert a queries-per-second figure into queries per dollar.

        qps: queries per second, e.g. the value returned by `measure_qps`

        Returns `(qps / cost_per_hour) * 3600`, or the message string below
        when no usable hourly cost was configured (False, None or 0).
        """
        if not self.cost_per_hour:
            return "No hay costo por hora"
        return (qps / self.cost_per_hour) * 3600

    def measure_latency(self, query_string):
        """
        Return the wall-clock seconds taken by one single-neighbour query.

        query_string: raw text forwarded to `query`
        """
        start_time = timeit.default_timer()
        self.query(query_string, 1)
        return timeit.default_timer() - start_time

    def measure_index_building_time(self, method = 'ivfflat'):
        """
        Return the wall-clock seconds taken by `build_index`.

        method: index access method forwarded to `build_index`
        """
        start_time = timeit.default_timer()
        self.build_index(method)
        return timeit.default_timer() - start_time

# Database loading and vectorization

In [3]:
# Load the corpus from CSV into a DataFrame.
textos = pd.read_csv('textos.csv')

In [4]:
# Fit TF-IDF on the raw texts and keep the resulting document-term matrix.
vectorizer = TfidfVectorizer()
emb = vectorizer.fit_transform(textos.loc[:, 'TEXTO'])

In [5]:
# The raw text is no longer needed once it has been vectorized.
# errors='ignore' keeps this cell re-runnable after 'TEXTO' is already gone.
textos = textos.drop(columns='TEXTO', errors='ignore').reset_index(drop=True)
textos

Unnamed: 0,AUTOR,TITULO
0,Miguel de Cervantes Saavedra,Quijote - Primera Parte
1,Miguel de Cervantes Saavedra,Quijote - Segunda Parte
2,Garcilaso de la Vega,ÉGLOGA PRIMERA
3,Garcilaso de la Vega,ÉGLOGA SEGUNDA
4,Garcilaso de la Vega,ÉGLOGA TERCERA
...,...,...
127,Benito Pérez Galdós,España trágica
128,Benito Pérez Galdós,Amadeo I
129,Benito Pérez Galdós,La Primera República
130,Benito Pérez Galdós,De Cartago a Sagunto


In [6]:
%%time
trunc = TruncatedSVD(128)
embr = trunc.fit_transform(emb)

CPU times: total: 21.2 s
Wall time: 6.46 s


In [7]:
# Store every reduced embedding as a plain Python list in a new 'emb' column.
textos['emb'] = embr.tolist()
textos['emb']

0      [0.947635730649286, 0.06068554806194847, -0.04...
1      [0.9350762788705504, 0.03510169759942758, -0.0...
2      [0.8349001953267468, 0.18602241097910138, 0.16...
3      [0.878782437213844, 0.14469031439018562, 0.148...
4      [0.8343033820264688, 0.008772285059165993, 0.2...
                             ...                        
127    [0.9688810912619967, -0.12942458429824633, 0.0...
128    [0.9765492402506006, -0.07754434140533913, 0.0...
129    [0.9705626227420445, -0.09337538083696006, 0.0...
130    [0.971511913489722, -0.0938445619699487, 0.027...
131    [0.9668054889319814, -0.10776291344748258, 0.0...
Name: emb, Length: 132, dtype: object

# DB CREATION

In [8]:
# Admin connection. Autocommit is required because CREATE/DROP DATABASE
# cannot run inside a transaction block.
conn = pg.connect(
    dbname=dbname,
    user=user,
    password=password,
    host=host,
    port=port,
    autocommit=True,
)

In [9]:
# Recreate the 'vector' database from scratch.
# The context manager closes the cursor even if one of the statements fails.
with conn.cursor() as cursor:
    cursor.execute('DROP database if exists vector')
    cursor.execute('CREATE database vector')


<psycopg.Cursor [COMMAND_OK] [IDLE] (host=127.0.0.1 database=postgres) at 0x21e9285b950>

In [10]:
# Close the previous connection before rebinding `conn`; otherwise it stays
# open until it happens to be garbage collected.
conn.close()
conn = pg.connect(dbname='vector', user=user, password=password, host=host, port=port)

In [11]:
%%time
conn.execute('CREATE EXTENSION IF NOT EXISTS vector') # installing this on Windows is a hassle: Visual Studio 2022 has to be installed

CPU times: total: 31.2 ms
Wall time: 23.9 ms


<psycopg.Cursor [COMMAND_OK] [INTRANS] (host=127.0.0.1 user=postgres database=vector) at 0x21e992941a0>

In [12]:
# Register the pgvector `vector` type with this connection so embeddings can be passed as numpy arrays.
register_vector(conn)

In [13]:

# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE TABLE would raise on
# the second run and leave the connection in an aborted transaction.
table_create_command = """
CREATE TABLE IF NOT EXISTS embeddings (
            id SERIAL primary key, 
            author text,
            title text,
            embedding vector(128)
            );
            """
# The context manager closes the cursor even if the statement fails.
with conn.cursor() as cur:
    cur.execute(table_create_command)
conn.commit()

In [14]:
# Preview the frame whose rows are inserted into the database below.
textos

Unnamed: 0,AUTOR,TITULO,emb
0,Miguel de Cervantes Saavedra,Quijote - Primera Parte,"[0.947635730649286, 0.06068554806194847, -0.04..."
1,Miguel de Cervantes Saavedra,Quijote - Segunda Parte,"[0.9350762788705504, 0.03510169759942758, -0.0..."
2,Garcilaso de la Vega,ÉGLOGA PRIMERA,"[0.8349001953267468, 0.18602241097910138, 0.16..."
3,Garcilaso de la Vega,ÉGLOGA SEGUNDA,"[0.878782437213844, 0.14469031439018562, 0.148..."
4,Garcilaso de la Vega,ÉGLOGA TERCERA,"[0.8343033820264688, 0.008772285059165993, 0.2..."
...,...,...,...
127,Benito Pérez Galdós,España trágica,"[0.9688810912619967, -0.12942458429824633, 0.0..."
128,Benito Pérez Galdós,Amadeo I,"[0.9765492402506006, -0.07754434140533913, 0.0..."
129,Benito Pérez Galdós,La Primera República,"[0.9705626227420445, -0.09337538083696006, 0.0..."
130,Benito Pérez Galdós,De Cartago a Sagunto,"[0.971511913489722, -0.0938445619699487, 0.027..."


In [15]:
%%time
 #agregar datos en pg
register_vector(conn)
cur = conn.cursor()

data_list = [(row['AUTOR'], row['TITULO'], np.array(row['emb'])) for index, row in textos.iterrows()]

cur.executemany("INSERT INTO embeddings (author, title, embedding) VALUES (%s, %s, %s)", data_list)



conn.commit()
cur.close()

CPU times: total: 15.6 ms
Wall time: 16 ms


In [16]:
# Run a full-table SELECT; conn.execute() opens a cursor, executes on it and returns it.
cur = conn.execute("SELECT * FROM embeddings")
cur

<psycopg.Cursor [TUPLES_OK] [INTRANS] (host=127.0.0.1 user=postgres database=vector) at 0x21e9285b740>

In [17]:
# Release the cursor used for the SELECT above.
cur.close()

# Benchmark

In [18]:
# Benchmark against the default 'embeddings' table, reusing the fitted TF-IDF and SVD models.
bm = DatabaseBenchmark(db_connection=conn, vectorizer=vectorizer, trunc=trunc)

In [19]:
# Queries per second over 100 repetitions of the same one-word query.
bm.measure_qps(query_string='dog', num_queries=100)

25.289514360147024