# Naive demonstration of boolean retrieval

In [1]:
import nbimporter
from corpora import FileStream
from indexing import Tokenizer, MIndex
import numpy as np

importing Jupyter notebook from corpora.ipynb
importing Jupyter notebook from indexing.ipynb


In [2]:
# Corpus location plus the shared preprocessing object.
folder = 'data/wikisearch/brat_20'
corpus = FileStream(folder, file_ext='txt')
tokenizer = Tokenizer(preserve_case=False)
# Two independent indexes: one is fed surface tokens, the other lemmata.
Btoken = MIndex()
Blemma = MIndex()

In [3]:
# Populate both indexes in a single pass over the corpus.
for doc_id in corpus.docs:
    doc = corpus.doc(doc_id)
    tokens, lemmata = tokenizer.pattern_processing(doc, lemmata=True)
    # Same treatment for each (index, term sequence) pair:
    # strip punctuation first, then register the terms under this document.
    for index, terms in ((Btoken, tokens), (Blemma, lemmata)):
        index.boolean(doc_id, tokenizer.remove_punctuation(terms))

In [5]:
print len(Btoken), len(Blemma)
print [x for x in Btoken.keys() if x not in Blemma.keys()][:10]

12388 9327
[u'writings', u'Hamas', u'Olympics', u'Foundation', u'granting', u'Dr.', u'Ronald', u'Western', u'Cases', u'Caen']


In [6]:
# Peek at the first ten keys of the lemma index.
print Blemma.keys()[:10]

[u'halligan', u'constan\u021ba', u'addictiveness', u'1,775', u'galactica', u'yellow', u'four', u'woods', u'ornate', u'towns']


## Using a term-document matrix

In [None]:
# Term-document matrix of the token index. This cell used to reference `B`,
# a name that is never defined in the notebook, so it failed on a fresh kernel.
# The token index is used because the lookups that follow ask for the
# surface form 'students'.
m, features, docs = Btoken.boolean_to_matrix()

In [None]:
# Pull the matrix column of each of the two example terms.
school, students = (m[:, features.index(term)] for term in ('school', 'students'))

In [None]:
# Element-wise AND of the two term columns (presumably marking the documents
# that contain both terms -- confirm against boolean_to_matrix).
a = np.logical_and(school, students)

In [None]:
# Map every row position where the conjunction holds back to its document entry.
results = []
for x in np.where(a)[0]:
    results.append(docs[x])

# Evaluation

In [7]:
import json

In [8]:
# Read the evaluation queries stored next to the corpus files.
with open(folder + '/queries.json', 'rU') as inj:
    queries = json.loads(inj.read())

In [9]:
# Query '10': E is its list of page ids (used below as the relevant set),
# Q is the query text.
E, Q = queries['10']['page_ids'], queries['10']['query']

In [10]:
# Show the raw query text.
print Q

government and education


In [11]:
# One matrix per index. As used in the cells that follow, columns are addressed
# through the feature list (`*_features.index(term)`) and row positions are
# mapped back to documents through the doc list (`*_docs[row]`).
mt, t_features, t_docs = Btoken.boolean_to_matrix()
ml, l_features, l_docs = Blemma.boolean_to_matrix()

In [12]:
# Run the query through the same pipeline as the documents so that its terms
# are produced the same way as the index vocabulary.
Qtokens, Qlemma = tokenizer.pattern_processing(Q, lemmata=True)
Qt, Ql = tokenizer.remove_punctuation(Qtokens), tokenizer.remove_punctuation(Qlemma)
# Drop the boolean connective; only the content terms are looked up.
Qt = [x for x in Qt if x != 'and']
# Filter the lemma list itself. This line used to iterate over Qt, which
# silently replaced the query lemmata with the query tokens.
Ql = [x for x in Ql if x != 'and']

In [13]:
# One matrix column per query term, each looked up in the matrix of its own index.
t_vectors = [mt[:, t_features.index(token)] for token in Qt]
l_vectors = [ml[:, l_features.index(lemma)] for lemma in Ql]

## Conjunctive and disjunctive queries

In [None]:
# Fold the token query vectors into one conjunctive (AND) and one disjunctive
# (OR) result vector. This cell used to read `vectors`, a name that is never
# defined in the notebook; the token vectors built in the evaluation section
# are used instead.
vand = t_vectors[0]
for x in t_vectors[1:]:
    vand = np.logical_and(vand, x)
vor = t_vectors[0]
for x in t_vectors[1:]:
    vor = np.logical_or(vor, x)

In [None]:
# Turn matching row positions into document names with '.txt' removed, so they
# can be compared with the page ids in E. The token matrix's own document list
# (`t_docs`) is used: the bare `docs` name only exists if the exploratory
# term-document cell was run, so relying on it breaks on a fresh kernel.
Ra = [t_docs[x].replace('.txt', '') for x in np.where(vand)[0]]
Ro = [t_docs[x].replace('.txt', '') for x in np.where(vor)[0]]

In [16]:
def precision(R, T):
    """Share of the items in R that also occur in T.

    R -- retrieved items (must support len() and iteration).
    T -- reference items (must support the `in` test).

    Returns a float; NaN when R is empty, since the ratio is undefined.
    """
    retrieved = len(R)
    if retrieved == 0:
        return np.nan
    hits = sum(1 for item in R if item in T)
    return float(hits) / retrieved

def recall(R, T):
    """Number of items in R that occur in T, divided by the size of T.

    R -- retrieved items (iterated once).
    T -- reference items (must support len() and the `in` test).

    Returns a float; NaN when T is empty, since the ratio is undefined.
    """
    hits = sum(1 for item in R if item in T)
    relevant = len(T)
    if relevant == 0:
        return np.nan
    return float(hits) / relevant

In [None]:
# Conjunctive (AND) query: precision, recall, number retrieved, size of E.
print precision(Ra, E), recall(Ra, E), len(Ra), len(E)

In [None]:
# Disjunctive (OR) query: precision, recall, number retrieved, size of E.
print precision(Ro, E), recall(Ro, E), len(Ro), len(E)

In [None]:
# Query tokens left after dropping 'and'.
print Qt

# Tokens vs lemmata

In [14]:
# OR together the query-term vectors of each index. `reduce` is a builtin in
# Python 2, which this notebook targets. Seeding the fold with the first vector
# means an empty list still raises IndexError.
vort = reduce(np.logical_or, t_vectors[1:], t_vectors[0])
vorl = reduce(np.logical_or, l_vectors[1:], l_vectors[0])

In [15]:
# Turn matching row positions into document names with '.txt' removed,
# once per index.
Rt, Rl = [], []
for x in np.where(vort)[0]:
    Rt.append(t_docs[x].replace('.txt', ''))
for x in np.where(vorl)[0]:
    Rl.append(l_docs[x].replace('.txt', ''))

In [17]:
print 'Tokens', precision(Rt, E), recall(Rt, E), len(Rt), len(E)
print 'Lemmata', precision(Rl, E), recall(Rl, E), len(Rl), len(E)

Tokens 0.182795698925 0.85 93 20
Lemmata 0.171171171171 0.95 111 20
