In [19]:
from datasets import load_dataset
from unisim import TextSim
from tabulate import tabulate
import pandas as pd

## Load Dataset

For this example, we use a product matching dataset 

https://huggingface.co/datasets/RUC-DataLab/ER-dataset

https://github.com/ruc-datalab/DADER

There are additional datasets that test entity retrieval and matching

citation, product matching, restaurant matching

In [20]:
dataset = load_dataset("RUC-DataLab/ER-dataset", data_files="dblp_scholar.csv", split="train")



In [21]:
# restaurants, products are good to do like differences in matching addresses etc easy to do 

# restaurants1 not bad, better than fodors zagats which gets 100 percent
# beer is pretty good
# walmart amazon not bad
# itunes_amazon is meh maybe too easy
# dblp_sholcar and acm decent might be too easy , scholar is harder than ACM which is like 99% accuracy
# restauratnts 3 and 4 are decent but might be too easy
# fodors zagats too aesy

# not good because book editions books 4 > book 2
# anime not easy either its like the diference is saesons 
# cosmetics not good either, it's like different colors are not clasified as the same thing
# abt_buy not that good, missing descriptions and not super clean it seems 
# movies1 not good year range is kinda weird/ 
# shoes and comuters not great, there are near-dups that are marked as not the same item 
# ebboks not good, based on similarity description similarity

In [22]:
dataset[:5]

{'A_title': ['the demarcation protocol : a technique for maintaining constraints in distributed database systems ',
  'on-demand data elevation in hierarchical multimedia storage servers ',
  'database tuning : principles , experiments , and troubleshooting techniques ',
  'dna-miner : a system prototype for mining dna sequences ',
  'communication efficient distributed mining of association rules '],
 'A_authors': ['d barbarс , h garcia-molina ',
  'p triantafillou , t papadakis ',
  'd shasha , p bonnet ',
  'j han , h jamil , y lu , l chen , y liao , j pei ',
  'a schuster , r wolff '],
 'A_venue': ['vldb j. ',
  'vldb ',
  'vldb ',
  'sigmod conference ',
  'sigmod conference '],
 'A_year': [1994, 1997, 2002, 2001, 2001],
 'B_title': ['local verification of global integrity constraints in distributed databases ',
  'on-demand data elevation in a hierarchical multimedia storage server ',
  'database tuning : principles , experiments , and troubleshooting techniques ( part i ) ',
  '

In [23]:
print("Size of dataset:", len(dataset))

dataset_features = list(dataset.features.keys())
print("Dataset features:", dataset_features)

Size of dataset: 28707
Dataset features: ['A_title', 'A_authors', 'A_venue', 'A_year', 'B_title', 'B_authors', 'B_venue', 'B_year', 'label']


In [24]:
text1_features = [x for x in dataset_features if x.startswith("A")]
text2_features = [x for x in dataset_features if x.startswith("B")]
is_match_feature = "label"

def get_text_pair(idx):
    ex = dataset[idx]

    text1 = " ".join(str(ex[x]) for x in text1_features)
    text2 = " ".join(str(ex[x]) for x in text2_features)

    label = ex[is_match_feature]
    return [text1, text2, label]

### TextSim

In [25]:
text_sim = TextSim(index_type="exact")
text_sim.similarity("this is a text", "apples") 



INFO: UniSim is storing a copy of the indexed data
INFO: If you are using large data corpus, consider disabling this behavior using store_data=False


0.4220390021800995

### Computing Similarity between Texts

In [26]:
text_sim.similarity("this is an example text", "This is an example txt! 😀 ")

0.9418152570724487

In [27]:
text_sim.similarity("hello", "h3110")

0.5612893104553223

In [28]:
queries = ["apple", "appl", "icecream", "house", "random"]
targets = ["apple", "ice cream", "mouse"]

results_df = text_sim.match(queries, targets, similarity_threshold=0.9)
print(results_df.head())

      query     target  similarity  is_match
0     apple      apple    1.000000      True
1      appl      apple    0.914230      True
2  icecream  ice cream    0.950734      True
3     house      mouse    0.760066     False
4    random      mouse    0.456315     False


In [29]:
# You can directly compute text embeddings using TextSim's `embed()` method
example_texts = [
    "This is an example text!",
    "You can even use extremely long texts, TextSim is capable of handling and matching arbitrarily-long texts."
]
embeddings = text_sim.embed(example_texts)
embeddings.shape  # (2, 256) dimension

(2, 256)

In [30]:
# you can maintain an index of texts as well. 
# we support Approximate Nearest Neighbor search, with index_type="approx" using USearch, which will scale to millions or even billions of indexed examples

text_sim = TextSim()

# dataset
index_examples = [
    "I love ice cream and cookies",
    "Ice cream is super delicious",
    "my mom makes the best homemade cookies 🍪🍪🍪",
    "This is an example text.",
    "UniSim supports very long texts as well.",
    "UniSim supports multilingual texts too. 你好!",
]
text_sim.add(index_examples)

# you want to find nearest things in your index
query_examples = [
    "I luv ice cream and cookies🍦🍪",
    "This is an example query text.",
    "Unrelated text with no match in the dataset..."
]

result_collection = text_sim.search(query_examples, similarity_threshold=0.9, k=5)



INFO: UniSim is storing a copy of the indexed data
INFO: If you are using large data corpus, consider disabling this behavior using store_data=False


In [31]:
total_matches = result_collection.total_matches
total_matches

2

In [32]:
result = result_collection.results[0]
text_sim.visualize(result)

Query 0: "I luv ice cream and cookies🍦🍪"
Most similar matches:

  idx  is_match      similarity  text
-----  ----------  ------------  ---------------------------------------------
    0  True                0.91  I love ice cream and cookies
    1  False               0.66  Ice cream is super delicious
    2  False               0.53  my mom makes the best homemade cookies 🍪🍪🍪
    3  False               0.42  This is an example text.
    4  False               0.36  UniSim supports very long texts as well.


In [33]:
print(result_collection.total_matches)
results = result_collection.results

2


In [34]:
result_collection = text_sim.search(["This is some text??"], similarity_threshold=0.9, k=5)

In [35]:

text_sim.visualize(result_collection.results[0])

# text_sim.visualize(results[1])

Query 0: "This is some text??"
Most similar matches:

  idx  is_match      similarity  text
-----  ----------  ------------  ---------------------------------------------
    3  False               0.74  This is an example text.
    5  False               0.54  UniSim supports multilingual texts too. 你好!
    4  False               0.51  UniSim supports very long texts as well.
    1  False               0.45  Ice cream is super delicious
    2  False               0.43  my mom makes the best homemade cookies 🍪🍪🍪


In [36]:
example_data = [get_text_pair(idx) for idx in range(0, 5)]

for i in range(len(example_data)):
    text1, text2, is_match = example_data[i]
    similarity = text_sim.similarity(text1, text2)
    example_data[i].append(similarity)

print(tabulate(example_data, headers=["text1", "text2", "is_match", "similarity"]))

text1                                                                                                                                           text2                                                                                                                                                                                            is_match    similarity
----------------------------------------------------------------------------------------------------------------------------------------------  ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------  ----------  ------------
the demarcation protocol : a technique for maintaining constraints in distributed database systems  d barbarс , h garcia-molina  vldb j.  1994  local verification of global integrity constraints in distributed databases  a gupta , j widom                                          

In [37]:
targets = list(set([get_text_pair(idx)[0] for idx in range(0, len(dataset))]))  # TODO does not preserve order 
print("\n".join([t for t in targets[:5]]))

querying multi-dimensional data indexed using the hilbert space-filling curve  j lawder , p king  sigmod record  2001
energy and rate based mac protocol for wireless sensor networks  r kannan , r kalidindi , s iyengar , v kumar  sigmod record  2003
query processing over object views of relational data  g fahl , t risch  vldb j.  1997
message from editor-in-chief , acm transactions on database systems  w kim    1999
the spiffi scalable video-on-demand system  c freedman , d dewitt  sigmod conference  1995


In [38]:
queries = list(set([get_text_pair(idx)[1] for idx in range(0, len(dataset))]))
print("\n".join([t for t in queries[:5]]))

olap + + : powerful and easy-to-use federations of olap and object databases  j gu , tb pedersen , a shoshani  proceedings of the 26th international conference on very & hellip ; ,  2000.0 
loading databases using dataflow parallelism  j grayâ ? ¦  sigmod record ,   
squeezer : an efficient algorithm for clustering categorical data  h zengyou , x xiaofei , d shengchun  journal of computer science and technology ,  2002.0 
the federated data warehouse  c white     
mining fuzzy association rules in databases .  acms anthology  sigmod record ,  1998.0 


In [39]:
results_df = text_sim.match(queries, targets)


#  TODO find the ones that do have matches in the dataset and look them up
# TODO find a few that do not have matches and look them up --- will show that they have no match

with pd.option_context('display.max_colwidth', None):
    display(results_df.head(10))

Unnamed: 0,query,target,similarity,is_match
0,"olap + + : powerful and easy-to-use federations of olap and object databases j gu , tb pedersen , a shoshani proceedings of the 26th international conference on very & hellip ; , 2000.0","olap + + : powerful and easy-to-use federations of olap and object databases j gu , t pedersen , a shoshani vldb 2000",0.87611,False
1,"loading databases using dataflow parallelism j grayâ ? ¦ sigmod record ,","loading databases using dataflow parallelism t barclay , r barnes , j gray , p sundaresan sigmod record 1994",0.81794,False
2,"squeezer : an efficient algorithm for clustering categorical data h zengyou , x xiaofei , d shengchun journal of computer science and technology , 2002.0","efficient and extensible algorithms for multi query optimization p roy , s seshadri , s sudarshan , s bhobe sigmod conference 2000",0.67053,False
3,the federated data warehouse c white,"data warehouse configuration d theodoratos , t sellis vldb 1997",0.657057,False
4,"mining fuzzy association rules in databases . acms anthology sigmod record , 1998.0","mining fuzzy association rules in databases c kuok , a fu , m wong sigmod record 1998",0.871261,False
5,"querying multidimensional databases l cabibbo , r torlone","optimizing multiple dimensional queries simultaneously in multidimensional databases w liang , m orlowska , j yu vldb j. 2000",0.701327,False
6,"adaptable query optimization and evaluation in temporal middleware g slivinskas , cs jensen , rt snodgrass sigmod conference , 2001.0","adaptable query optimization and evaluation in temporal middleware g slivinskas , c jensen , r snodgrass sigmod conference 2001",0.976707,True
7,"design principles for data-intensive web sites . acms anthology sigmod record , 1999.0","design principles for data-intensive web sites s ceri , p fraternali , s paraboschi sigmod record 1999",0.844806,False
8,"â ?? sim : a database system based on semantic model . d jagannathan , rl guck , bl fritchman , jp thompson , proceedings of sigmod international conference on management","spartan : a model-based semantic compression system for massive data tables s babu , m garofalakis , r rastogi sigmod conference 2001",0.654835,False
9,"database abstractions : aggregation and generalization jm smith , dcp smith acm transactions on database systems , 1977.0","solving satisfiability and implication problems in database systems s guo , w sun , m weiss acm trans . database syst . 1996",0.762832,False


### Indexing and Querying Similar Texts

In [40]:
text_sim.reset_index()

# add list of texts to the index
text_sim.add(targets)

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [41]:
result_collection = text_sim.search(queries, similarity_threshold=0.9, k=5)

In [42]:
text_sim.visualize(result_collection.results[0])

Query 0: "olap + + : powerful and easy-to-use federations of olap and object databases  j gu , tb pedersen , a shoshani  proceedings of the 26th international conference on very & hellip ; ,  2000.0 "
Most similar matches:

  idx  is_match      similarity  text
-----  ----------  ------------  ----------------------------------------------------------------
 1869  False               0.88  olap + + : powerful and easy-to-use federations of olap and obje
  855  False               0.58  report on the first international conference on ontologies , dat
 1284  False               0.57  proceedings of the 2000 acm sigmod international conference on m
 1614  False               0.56  building scalable internet applications with oracle8i server  j
  136  False               0.56  application servers and associated technologies      2002


In [43]:
new_examples = ["Googleplex (650) 253-0000 1600 Amphitheatre Parkway, Mountain View, CA 94043"]
text_sim.add(new_examples)

[2524]

In [44]:
result_collection = text_sim.search(["googleplx 650-253-0000 1600 amphitheatre parkway, mountain view, ca 94043"])
result = result_collection.results[0]

text_sim.visualize(result)

Query 0: "googleplx 650-253-0000 1600 amphitheatre parkway, mountain view, ca 94043"
Most similar matches:

  idx  is_match      similarity  text
-----  ----------  ------------  ----------------------------------------------------------------
 2524  True                0.92  Googleplex (650) 253-0000 1600 Amphitheatre Parkway, Mountain Vi
