# Bigram matches in Elasticsearch

This exercise is about getting ordered and unordered bigram matches using Elasticsearch.

In [None]:
%pip install ipytest
%pip install elasticsearch==7.15

In [None]:
from elasticsearch import Elasticsearch
from pprint import pprint

import ipytest
import pytest

# Configure ipytest for use inside this notebook (see the ipytest docs for
# the exact defaults that autoconfig applies).
ipytest.autoconfig()

## Indexing a toy collection

This time, we store **term position information** and perform minimal stemming, i.e., removing only plurals (for that, we specify a custom analyzer).

Check the [Elasticsearch documentation on analyzers](https://www.elastic.co/guide/en/elasticsearch/reference/current/analyzer.html).

In [None]:
# Name of the Elasticsearch index that holds the toy collection.
INDEX_NAME = "toy_index"

# Both document fields use the same mapping: analyzed text whose term vectors
# record token positions.
_POSITIONAL_TEXT_FIELD = {
    "type": "text",
    "term_vector": "with_positions",
    "analyzer": "my_english_analyzer",
}

# Index configuration: a single shard, a custom English analyzer, and the
# field mappings for "title" and "content".
INDEX_SETTINGS = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 1,
        },
        "analysis": {
            "analyzer": {
                # standard tokenizer -> lowercase -> stop filter -> minimal stemmer
                "my_english_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    # NOTE(review): "stopwords" does not appear to be a
                    # documented parameter of custom analyzers; stopword
                    # removal is done by the "english_stop" filter below.
                    # Confirm against the Elasticsearch docs before dropping.
                    "stopwords": "_english_",
                    "filter": [
                        "lowercase",
                        "english_stop",
                        "filter_english_minimal",
                    ],
                },
            },
            "filter": {
                "filter_english_minimal": {
                    "type": "stemmer",
                    "name": "minimal_english",
                },
                "english_stop": {
                    "type": "stop",
                    "stopwords": "_english_",
                },
            },
        },
    },
    "mappings": {
        "properties": {
            # Independent copies, so editing one field never affects the other.
            "title": dict(_POSITIONAL_TEXT_FIELD),
            "content": dict(_POSITIONAL_TEXT_FIELD),
        },
    },
}

In [None]:
# Toy collection, keyed by document ID. Document 4 stores its content as a
# list of two strings (a multi-valued field); all others use a single string.
DOCS = {
    1: {
        "title": "Rap God",
        "content": "gonna, gonna, Look, I was gonna go easy on you and not to hurt your feelings",
    },
    2: {
        "title": "Lose Yourself",
        "content": "Yo, if you could just, for one minute Or one split second in time, forget everything Everything that bothers you, or your problems Everything, and follow me",
    },
    3: {
        "title": "Love The Way You Lie",
        "content": "Just gonna stand there and watch me burn But that's alright, because I like the way it hurts",
    },
    4: {
        "title": "The Monster",
        "content": [
            "gonna gonna I'm friends with the monster",
            "That's under my bed Get along with the voices inside of my head",
        ],
    },
    5: {
        "title": "Beautiful",
        "content": "Lately I've been hard to reach I've been too long on my own Everybody has a private world Where they can be alone",
    },
    6: {
        "title": "Fake Eminem 1",
        "content": "This is not real Eminem, just some text to get more matches for a split second for a split second.",
    },
    7: {
        "title": "Fake Eminem 2",
        "content": "I have a monster friend and I'm friends with the monster and then there are some more friends who are monsters.",
    },
}

In [None]:
# URL of the Elasticsearch node this notebook talks to.
ES_NODES = "http://localhost:9200"

# Client object shared by all cells below.
es = Elasticsearch(hosts=[ES_NODES])

In [None]:
# Start from a clean slate: drop the index if an earlier run left it behind.
index_already_exists = es.indices.exists(index=INDEX_NAME)
if index_already_exists:
    es.indices.delete(index=INDEX_NAME)

In [None]:
# Create the index with the analyzer settings and field mappings defined above.
es.indices.create(
    index=INDEX_NAME,
    settings=INDEX_SETTINGS["settings"],
    mappings=INDEX_SETTINGS["mappings"],
)

Testing our analyzer.

In [None]:
# Run a sample phrase through the custom analyzer to inspect the tokens it emits.
es.indices.analyze(
    index=INDEX_NAME,
    body={
        'analyzer': 'my_english_analyzer',
        'text': 'monsters in my bed',
    },
)

In [None]:
# Index every toy document, using its key in DOCS as the document ID.
for doc_id, doc in DOCS.items():
    es.index(
        index=INDEX_NAME,
        id=doc_id,
        document=doc,
    )

Notice that you also get term position information when requesting a term vector.

In [None]:
# Fetch and print the term vectors of document 2 for both fields.
tv = es.termvectors(
    index=INDEX_NAME,
    id=2,
    fields='title,content',
)
pprint(tv)

## Recovering ordered sequence of terms from inverted index

This method returns the sequence of terms for a given document field, with `None` values for stopwords that got removed.

In [None]:
def get_term_sequence(es, doc_id, field, index=None):
    """Reconstructs the ordered term sequence of a document field.

    The field's term vector is requested from Elasticsearch and each term is
    written back to every position at which it was recorded.

    Args:
        es: Elasticsearch instance
        doc_id: Document ID
        field: Document field
        index: Name of the index to query. Defaults to the notebook-wide
            INDEX_NAME when omitted.

    Returns:
        List of terms ordered by position. Positions for which the term
        vector holds no term (e.g., removed stopwords) are None. The list ends
        at the last recorded position and is empty when the response holds no
        terms for the field.
    """
    tv = es.termvectors(
        index=INDEX_NAME if index is None else index,
        id=doc_id,
        fields=[field],
    )

    # A field without indexed terms (e.g., empty or stopword-only content) may
    # be missing from the response; treat that as "no terms" instead of failing.
    terms = tv["term_vectors"].get(field, {}).get("terms", {})

    # default=-1 yields a length of 0 when there are no tokens at all.
    last_position = max(
        (token["position"] for val in terms.values() for token in val["tokens"]),
        default=-1,
    )

    result = [None] * (last_position + 1)

    for term, val in terms.items():
        for token in val["tokens"]:
            result[token["position"]] = term

    return result


Tests.

In [None]:
%%run_pytest[clean]

def test_get_term_sequence():
    """Checks recovered term sequences, including None gaps for removed stopwords."""
    title_terms = get_term_sequence(es, 4, 'title')
    assert title_terms == [None, 'monster']

    expected_content_terms = [
        'i', 'have', None, 'monster', 'friend', None, "i'm", 'friend',
        None, None, 'monster', None, None, None, None, 'some', 'more',
        'friend', 'who', None, 'monster',
    ]
    assert get_term_sequence(es, 7, 'content') == expected_content_terms

## Getting ordered bigram matches

Use the `get_term_sequence()` method to get the document field's content as a sequence of terms, then check for ordered bigram matches yourself.

In [None]:
def count_ordered_bigram_matches(es, doc_id, field, bigram):
    """Counts how often a bigram occurs, in order, in a given document field.

    Args:
        es: Elasticsearch instance
        doc_id: Document ID
        field: Document field
        bigram: A sequence of two terms given as a list

    Returns:
        Number of positions at which the first bigram term is immediately
        followed by the second one.
    """
    terms = get_term_sequence(es, doc_id, field)

    # Pair every position with its direct successor. A None gap between the
    # two terms therefore prevents a match.
    neighbours = zip(terms, terms[1:])

    return sum(
        1
        for current, following in neighbours
        if bigram[0] == current and bigram[1] == following
    )

Tests.

In [None]:
%%run_pytest[clean]

@pytest.mark.parametrize(
    'doc_id, field, bigram, correct_value',
    [
        # "split second" appears twice in document 6, ...
        (6, 'content', ['split', 'second'], 2),
        # ... once in document 2, ...
        (2, 'content', ['split', 'second'], 1),
        # ... and not at all in document 1.
        (1, 'content', ['split', 'second'], 0),
    ],
)
def test_count_ordered_bigram_matches(doc_id, field, bigram, correct_value):
    """Checks ordered bigram counts against hand-counted values."""
    observed = count_ordered_bigram_matches(es, doc_id, field, bigram)
    assert observed == correct_value

## Getting unordered bigram matches

As before, use the `get_term_sequence()` method to get the document field's content as a sequence of terms, then check for unordered bigram matches (both terms within a window of `w` positions, in any order) yourself.

In [None]:
def count_unordered_bigram_matches(es, doc_id, field, bigram, w=4):
    """Counts the number of unordered bigram matches in a given document field.

    A match is a window of up to w consecutive positions that starts with one
    of the bigram terms and contains both bigram terms, in any order.

    Args:
        es: Elasticsearch instance
        doc_id: Document ID
        field: Document field
        bigram: A sequence of two terms given as a list
        w: Window size in term positions; two terms can match when they are
            at most w - 1 positions apart. Positions without a term (None)
            still take up space in the window.

    Returns:
        Number of windows that start with a bigram term and contain both
        bigram terms. A bigram made of the same term twice needs two separate
        occurrences inside the window. Returns 0 when w is smaller than 2 or
        the field has no terms.
    """
    # A window shorter than two positions cannot hold two term occurrences.
    if w < 2:
        return 0

    query_terms = list(bigram)
    term_sequence = get_term_sequence(es, doc_id, field)

    count = 0
    # Every position is a candidate window start. Windows near the end of the
    # sequence are simply shorter (slicing truncates them), so matches in the
    # last w - 1 positions are not lost.
    for start, first_term in enumerate(term_sequence):
        if first_term not in query_terms:
            continue

        window = term_sequence[start:start + w]

        # Compare occurrence counts rather than mere membership, so that a
        # repeated query term is not satisfied by a single token.
        if all(window.count(term) >= query_terms.count(term) for term in query_terms):
            count += 1

    return count

Tests.

In [None]:
%%run_pytest[clean]

@pytest.mark.parametrize('doc_id, field, bigram, correct_value', [
    (7, 'content', ['friend', 'monster'], 3),
    (4, 'content', ['friend', 'monster'], 1),
    (1, 'content', ['friend', 'monster'], 0),
])
def test_count_unordered_bigram_matches(doc_id, field, bigram, correct_value):
    """Checks unordered bigram counts (default window size) against hand-counted values."""
    # Named after the function under test; the previous name duplicated the
    # ordered-match test and mislabelled these cases in the pytest report.
    assert count_unordered_bigram_matches(es, doc_id, field, bigram) == correct_value