Skip to content

Commit

Permalink
feat: support smlar extension
Browse files Browse the repository at this point in the history
  • Loading branch information
mrdrivingduck committed Oct 16, 2023
1 parent 9aba484 commit 4c5827f
Show file tree
Hide file tree
Showing 91 changed files with 18,129 additions and 8 deletions.
1 change: 1 addition & 0 deletions external/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ SUBDIRS += polar_parameter_check
SUBDIRS += polar_stat_sql
SUBDIRS += polar_csn
SUBDIRS += polar_px
SUBDIRS += smlar
endif #enable_polar_minimal
# Common end

Expand Down
4 changes: 4 additions & 0 deletions external/smlar/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Generated subdirectories
/log/
/results/
/tmp_check/
26 changes: 26 additions & 0 deletions external/smlar/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

# Build description for the smlar extension.
# Variable names follow the PostgreSQL extension build (PGXS) conventions.

# Name of the module built from the object files listed in OBJS.
MODULE_big = smlar
OBJS = smlar.o smlar_gist.o smlar_gin.o smlar_cache.o \
tsarr.o smlar_guc.o smlar_stat.o

# Extension name and the SQL script files shipped with it.
EXTENSION = smlar
DATA = smlar--1.0.sql smlar--unpackaged--1.0.sql
# Regression test names, one per test case.
# NOTE(review): presumably each name maps to sql/<name>.sql and
# expected/<name>.out -- confirm against the directory contents.
REGRESS = smlar int2 int4 int8 float4 float8 money oid \
timestamp timestamptz time timetz date interval \
macaddr inet cidr \
text varchar char bytea bit varbit numeric \
int4g int8g intervalg textg \
int4i int8i intervali texti \
composite_int4 composite_text

# Two build modes: when USE_PGXS is defined, the build rules are taken from
# the location reported by `pg_config --pgxs`; otherwise the module is built
# as external/smlar inside the source tree, two levels below the top build
# directory.
ifdef USE_PGXS
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
else
subdir = external/smlar
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
# NOTE(review): top_srcdir is not set in this file; presumably it is
# provided by src/Makefile.global included above -- confirm.
include $(top_srcdir)/contrib/contrib-global.mk
endif

116 changes: 116 additions & 0 deletions external/smlar/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
float4 smlar(anyarray, anyarray)
- computes similarity of two arrays. Arrays should be of the same type.

float4 smlar(anyarray, anyarray, bool useIntersect)
- computes similarity of two arrays of composite types. The composite type looks like:
CREATE TYPE type_name AS (element_name anytype, weight_name FLOAT4);
The useIntersect option specifies that only intersected elements are used in the denominator;
see the examples in sql/composite_int4.sql or sql/composite_text.sql

float4 smlar( anyarray a, anyarray b, text formula );
- computes similarity of two arrays using the given formula; arrays should
be of the same type.
Predefined variables in formula:
N.i - number of common elements in both arrays (intersection)
N.a - number of unique elements in the first array
N.b - number of unique elements in the second array
Example:
smlar('{1,4,6}'::int[], '{5,4,6}' )
smlar('{1,4,6}'::int[], '{5,4,6}', 'N.i / sqrt(N.a * N.b)' )
These calls are equivalent.

anyarray % anyarray
- returns true if the similarity of the two arrays is greater than the limit

float4 show_smlar_limit() - deprecated
- shows the limit for % operation

float4 set_smlar_limit(float4) - deprecated
- sets the limit for % operation

Use the GUC variable smlar.threshold (see below) instead of
show_smlar_limit/set_smlar_limit


text[] tsvector2textarray(tsvector)
- transforms tsvector type to text array

anyarray array_unique(anyarray)
- sorts the array and removes duplicate elements

float4 inarray(anyarray, anyelement)
- returns zero if the second argument is not present in the first one,
and 1.0 otherwise

float4 inarray(anyarray, anyelement, float4, float4)
- returns the fourth argument if the second argument is not present in
the first one, and the third argument otherwise

GUC configuration variables:

smlar.threshold FLOAT
Arrays with similarity lower than the threshold are not considered similar
by the % operation

smlar.persistent_cache BOOL
Cache of global stat is stored in transaction-independent memory

smlar.type STRING
Type of similarity formula: cosine(default), tfidf, overlap

smlar.stattable STRING
Name of the table that stores set-wide statistics. The table should be
defined as
CREATE TABLE table_name (
value data_type UNIQUE,
ndoc int4 (or bigint) NOT NULL CHECK (ndoc>0)
);
The row with a null value holds the total number of documents.
See the examples in sql/*g.sql files
Note: used only for smlar.type = 'tfidf'

smlar.tf_method STRING
Calculation method for term frequency. Values:
"n" - simple counting of entries (default)
"log" - 1 + log(n)
"const" - TF is equal to 1
Note: used only for smlar.type = 'tfidf'

smlar.idf_plus_one BOOL
If false (default), calculate idf as log(d/df),
if true - as log(1+d/df)
Note: used only for smlar.type = 'tfidf'

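The module provides several GUC variables (smlar.threshold and others); it's highly
recommended to add them to postgresql.conf (the custom_variable_classes line is
needed only on PostgreSQL releases older than 9.2, where that parameter still exists):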
custom_variable_classes = 'smlar' # list of custom variable class names
smlar.threshold = 0.6 #or any other value > 0 and < 1
and other smlar.* variables

GiST/GIN support for % and && operations for:
Array Type | GIN operator class | GiST operator class
---------------+----------------------+----------------------
bit[] | _bit_sml_ops |
bytea[] | _bytea_sml_ops | _bytea_sml_ops
char[] | _char_sml_ops | _char_sml_ops
cidr[] | _cidr_sml_ops | _cidr_sml_ops
date[] | _date_sml_ops | _date_sml_ops
float4[] | _float4_sml_ops | _float4_sml_ops
float8[] | _float8_sml_ops | _float8_sml_ops
inet[] | _inet_sml_ops | _inet_sml_ops
int2[] | _int2_sml_ops | _int2_sml_ops
int4[] | _int4_sml_ops | _int4_sml_ops
int8[] | _int8_sml_ops | _int8_sml_ops
interval[] | _interval_sml_ops | _interval_sml_ops
macaddr[] | _macaddr_sml_ops | _macaddr_sml_ops
money[] | _money_sml_ops |
numeric[] | _numeric_sml_ops | _numeric_sml_ops
oid[] | _oid_sml_ops | _oid_sml_ops
text[] | _text_sml_ops | _text_sml_ops
time[] | _time_sml_ops | _time_sml_ops
timestamp[] | _timestamp_sml_ops | _timestamp_sml_ops
timestamptz[] | _timestamptz_sml_ops | _timestamptz_sml_ops
timetz[] | _timetz_sml_ops | _timetz_sml_ops
varbit[] | _varbit_sml_ops |
varchar[] | _varchar_sml_ops | _varchar_sml_ops

105 changes: 105 additions & 0 deletions external/smlar/expected/bit.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
set extra_float_digits =0;
SELECT set_smlar_limit(0.6);
set_smlar_limit
-----------------
0.6
(1 row)

SELECT
t,
ARRAY(
SELECT
v::int4::bit(10)
FROM
generate_series(1, t) as v
) AS v
INTO test_bit
FROM
generate_series(1, 200) as t;
SELECT t, smlar(v, '{10,9,8,7,6,5,4,3,2,1}'::int4[]::bit(10)[]) AS s FROM test_bit WHERE v % '{10,9,8,7,6,5,4,3,2,1}'::int4[]::bit(10)[] ORDER BY s DESC, t;
t | s
----+----------
10 | 1
11 | 0.953463
9 | 0.948683
12 | 0.912871
8 | 0.894427
13 | 0.877058
14 | 0.845154
7 | 0.83666
15 | 0.816497
16 | 0.790569
6 | 0.774597
17 | 0.766965
18 | 0.745356
19 | 0.725476
5 | 0.707107
20 | 0.707107
21 | 0.690066
22 | 0.6742
23 | 0.65938
24 | 0.645497
4 | 0.632456
25 | 0.632456
26 | 0.620174
27 | 0.608581
(24 rows)

SELECT t, smlar(v, '{50,49,8,7,6,5,4,33,2,1}'::int4[]::bit(10)[]) AS s FROM test_bit WHERE v % '{50,49,8,7,6,5,4,33,2,1}'::int4[]::bit(10)[] ORDER BY s DESC, t;
t | s
----+----------
8 | 0.782624
9 | 0.737865
7 | 0.717137
10 | 0.7
11 | 0.667424
6 | 0.645497
12 | 0.63901
13 | 0.613941
(8 rows)

CREATE INDEX idx_test_bit ON test_bit USING gin (v _bit_sml_ops);
SET enable_seqscan=off;
SELECT t, smlar(v, '{10,9,8,7,6,5,4,3,2,1}'::int4[]::bit(10)[]) AS s FROM test_bit WHERE v % '{10,9,8,7,6,5,4,3,2,1}'::int4[]::bit(10)[] ORDER BY s DESC, t;
t | s
----+----------
10 | 1
11 | 0.953463
9 | 0.948683
12 | 0.912871
8 | 0.894427
13 | 0.877058
14 | 0.845154
7 | 0.83666
15 | 0.816497
16 | 0.790569
6 | 0.774597
17 | 0.766965
18 | 0.745356
19 | 0.725476
5 | 0.707107
20 | 0.707107
21 | 0.690066
22 | 0.6742
23 | 0.65938
24 | 0.645497
4 | 0.632456
25 | 0.632456
26 | 0.620174
27 | 0.608581
(24 rows)

SELECT t, smlar(v, '{50,49,8,7,6,5,4,33,2,1}'::int4[]::bit(10)[]) AS s FROM test_bit WHERE v % '{50,49,8,7,6,5,4,33,2,1}'::int4[]::bit(10)[] ORDER BY s DESC, t;
t | s
----+----------
8 | 0.782624
9 | 0.737865
7 | 0.717137
10 | 0.7
11 | 0.667424
6 | 0.645497
12 | 0.63901
13 | 0.613941
(8 rows)

SET enable_seqscan=on;
Loading

0 comments on commit 4c5827f

Please sign in to comment.