## Full-Text Search using SQLite

In [1]:
#!pip3 install --upgrade nltk

In [2]:
import re
import sqlite3
from collections import Counter
from nltk.tokenize import TweetTokenizer

In [3]:
# Module-level tokenizer shared by generate_vocabulary() and delete_nonvocab_words().
# TweetTokenizer keeps contractions such as "let's" as one token instead of
# splitting them into two parts.
tokenizer = TweetTokenizer()

def generate_vocabulary(train_captions, min_threshold):
    """Build a token -> index vocabulary from a collection of captions.

    Every caption is converted to ``str``, stripped of leading/trailing
    newlines, and all captions are joined with single spaces. The joined text
    is lower-cased and split with the module-level ``tokenizer``.

    Args:
        train_captions: Iterable of captions (each is passed through ``str``).
        min_threshold: Minimum number of occurrences a token needs in order
            to be included in the vocabulary.

    Returns:
        dict mapping each retained token to its position in the
        alphabetically sorted list of retained tokens.
    """
    cleaned_captions = (str(caption).strip('\n') for caption in train_captions)
    tokens = tokenizer.tokenize(' '.join(cleaned_captions).lower())

    frequencies = Counter(tokens)
    frequent_tokens = sorted(
        token for token, count in frequencies.items() if count >= min_threshold
    )

    # Indices follow the sorted order, so the mapping is deterministic.
    return {token: index for index, token in enumerate(frequent_tokens)}


def delete_nonvocab_words(text, vocab):
    """Drop every token of *text* that is not part of *vocab*.

    The text is split with the module-level ``tokenizer``. A token is kept
    when it is found in *vocab* as-is; otherwise its lower-cased form is
    looked up, because ``generate_vocabulary`` lower-cases its input before
    tokenizing. Without that fallback a query such as "Open" would be
    discarded even though "open" is a known word. Kept tokens are always
    emitted in the spelling stored in *vocab*.

    Args:
        text: The raw query string.
        vocab: Container of known tokens (e.g. the dict returned by
            ``generate_vocabulary``); only membership is tested.

    Returns:
        The kept tokens joined by single spaces, or the sentinel string
        ``"<empty>"`` when no token survives.
    """
    kept = []
    for word in tokenizer.tokenize(text):
        if word in vocab:
            kept.append(word)
            continue
        lowered = word.lower()
        if lowered in vocab:
            kept.append(lowered)

    # Collapse runs of spaces so the result is always single-space separated.
    new_text = re.sub(' +', ' ', " ".join(kept).strip())
    if new_text == "":
        new_text = "<empty>"
    return new_text

In [4]:
# Toy data set: each intent name maps to the example utterances ("variations")
# that should trigger it.
intent_dict = {
    "intent_1": ["play some music", "let's listen to music"],
    "intent_2": ["open first window", "open second window"],
    "intent_3": ["close all windows"],
}

# Flatten all variations into one list, preserving dict order. A nested
# comprehension avoids the quadratic list copying of sum(lists, []).
texts = [variation for variations in intent_dict.values() for variation in variations]

# min_threshold=1 keeps every token that appears at least once.
voc = generate_vocabulary(texts, min_threshold=1)

In [5]:
# Display the vocabulary: each token mapped to its index in sorted order.
voc

{'all': 0,
 'close': 1,
 'first': 2,
 "let's": 3,
 'listen': 4,
 'music': 5,
 'open': 6,
 'play': 7,
 'second': 8,
 'some': 9,
 'to': 10,
 'window': 11,
 'windows': 12}

In [6]:
# Connect to the SQLite database stored in the file 'fts5.db' (relative path).
con = sqlite3.connect('fts5.db')
# Cursor used by the following cells to create, fill and query the table.
cur = con.cursor()

In [7]:
# Recreate the full-text table from scratch so the cell can be re-run safely.
cur.execute('''DROP TABLE IF EXISTS intents''')
# `intent` is stored but not indexed (UNINDEXED); only `variation` is searchable.
cur.execute('''CREATE VIRTUAL TABLE intents USING FTS5(intent UNINDEXED, variation)''')

# One (intent, variation) row per example utterance.
rows = [
    (intent, variation)
    for intent, variations in intent_dict.items()
    for variation in variations
]

# Bind values through placeholders instead of concatenating them into the SQL
# text: quotes inside a variation can no longer break the statement, and an
# empty intent_dict simply inserts nothing.
cur.executemany("INSERT INTO intents VALUES (?, ?)", rows)
con.commit()

In [8]:
queries = ["open", "close", "play music", "play a song"]

# TODO: could use query relaxation (retry with fewer terms) when nothing matches.

for query in queries:
    # Remove words the index has never seen; "<empty>" means nothing is left.
    preprocessed_query = delete_nonvocab_words(query, voc)
    print("Initial query:", query)
    print("Preprocessed query:", preprocessed_query)
    if preprocessed_query != "<empty>":
        # Quote every token as an FTS5 string (doubling embedded double quotes)
        # so characters such as the apostrophe in "let's" are not parsed as
        # query syntax. Space-separated strings are implicitly AND-ed and the
        # trailing '*' turns the last one into a prefix match, as before.
        quoted_tokens = [
            '"' + token.replace('"', '""') + '"'
            for token in preprocessed_query.split()
        ]
        match_expression = " ".join(quoted_tokens) + "*"
        # Bind the expression as a parameter instead of splicing it into the
        # SQL text, where a single quote would terminate the literal.
        results = cur.execute(
            "SELECT t.*, rank FROM intents t WHERE variation MATCH ? ORDER BY rank;",
            (match_expression,),
        )
        for row in results:
            print(row)
    print("\n")

Initial query: open
Preprocessed query: open
('intent_2', 'open first window', -0.35348487779869003)
('intent_2', 'open second window', -0.35348487779869003)


Initial query: close
Preprocessed query: close
('intent_3', 'close all windows', -1.1541601010164977)


Initial query: play music
Preprocessed query: play music
('intent_1', 'play some music', -1.5076449788151878)


Initial query: play a song
Preprocessed query: play
('intent_1', 'play some music', -1.1541601010164977)


