In [1]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import Q
import pandas as pd
import numpy as np
from elasticsearch.client import CatClient

In [2]:
def normalize(tw):
    """
    Normalizes the weights in tw so that they form a unit-length (L2) vector

    When every weight is 0 (or tw is empty) the norm is 0 and there is nothing to
    scale, so the weights are returned unchanged instead of being turned into NaN
    by a division by zero

    :param tw: dict that maps each term to its numeric weight
    :return: new dict with the same keys and every weight divided by the L2 norm
    """
    mod = np.sqrt(np.sum([x**2 for x in tw.values()]))
    if mod == 0:
        # Zero-length vector: dividing would produce NaN for every term
        return dict(tw)
    return {t: w/mod for t, w in tw.items()}

In [3]:
def document_term_vector(client, index, id):
    """
    Fetches the term vector of the 'text' field of a document, with term statistics,
    and returns it as two lists of (term, count) pairs, both sorted by term

    The first list holds the frequency of each term inside the document ('term_freq'),
    the second one the number of documents that contain the term ('doc_freq').
    Both lists are empty when the response has no term vector for 'text'

    :param client: client object exposing termvectors()
    :param index: name of the index that holds the document
    :param id: identifier of the document
    :return: tuple (term frequencies, document frequencies)
    """
    response = client.termvectors(index=index, id=id, fields=['text'],
                                  positions=False, term_statistics=True)
    vectors = response['term_vectors']

    # Nothing was indexed for 'text' in this document
    if 'text' not in vectors:
        return [], []

    terms = vectors['text']['terms']
    term_freqs = sorted((term, stats['term_freq']) for term, stats in terms.items())
    doc_freqs = sorted((term, stats['doc_freq']) for term, stats in terms.items())
    return term_freqs, doc_freqs

In [4]:
def doc_count(client, index):
    """
    Returns the number of documents in an index

    The value is read from the client's count API rather than from the cat API:
    the cat endpoints are meant for human-readable console output, while count
    returns the number directly in the 'count' field of its response

    :param client: Elasticsearch client
    :param index: name of the index
    :return: number of documents in the index, as an int
    """
    return int(client.count(index=index)['count'])

In [5]:
def toTFIDF(client, index, file_id):
    """
    Returns the normalized tf-idf weights of the terms of a document

    For every term, tf is its frequency in the document divided by the highest
    term frequency in that document, and idf is log2(N / df), where N is the
    number of documents in the index and df the number of documents that contain
    the term. The resulting weights are passed through normalize()

    :param client: Elasticsearch client
    :param index: name of the index that holds the document
    :param file_id: identifier of the document
    :return: dict that maps each term to its weight; empty dict when the
             document has no term vector
    """

    # Get the frequency of the term in the document, and the number of documents
    # that contain the term
    file_tv, file_df = document_term_vector(client, index, file_id)

    if not file_tv:
        # No terms at all: max() below would raise on an empty sequence and
        # there is nothing to weight, so skip the count request as well
        return {}

    max_freq = max(f for _, f in file_tv)

    dcount = doc_count(client, index)

    tfidfw = {}

    # Both lists are sorted by term, so the pairs line up term by term
    for (t, w), (_, df) in zip(file_tv, file_df):
        tf = w / max_freq
        idf = np.log2(dcount / df)
        tfidfw[t] = tf * idf

    return normalize(tfidfw)

In [6]:
def search(words_set, index, client, K):
    """
    Runs the weighted query described by words_set and returns the average
    tf-idf vector of the documents retrieved (at most the first K hits)

    Every column of words_set becomes a boosted query_string clause
    ("term^weight") and all clauses are AND-ed together. The tf-idf vectors of
    the hits are summed term by term and divided by the number of documents
    actually returned, so the result stays a true average when the query
    matches fewer than K documents

    :param words_set: DataFrame whose columns are the query terms and whose first
                      row holds the weight (boost) of each term
    :param index: name of the index to query
    :param client: Elasticsearch client
    :param K: maximum number of top hits to take into account
    :return: DataFrame with a single row 'Weight' and one column per term, sorted
             by decreasing weight; it has no columns when nothing was retrieved
    :raises ValueError: if words_set has no columns
    """
    if len(words_set.columns) == 0:
        raise ValueError('words_set must contain at least one query term')

    q = None
    for term in words_set.columns:
        clause = Q('query_string', query=term + '^' + str(words_set[term].values[0]))
        q = clause if q is None else q & clause

    response = Search(using=client, index=index).query(q)[0:K].execute()

    # Accumulate in a plain dict: cheaper than re-aligning a DataFrame per hit
    totals = {}
    n_docs = 0
    for r in response:  # yields at most K hits
        n_docs += 1
        for term, weight in toTFIDF(client, index, r.meta.id).items():
            totals[term] = totals.get(term, 0) + weight

    results = pd.DataFrame(totals, index=['Weight'])
    if n_docs == 0:
        # Nothing retrieved: return the empty frame, there is nothing to average
        return results

    return results.sort_values(by='Weight', axis=1, ascending=False).div(n_docs)
        

In [26]:
'''Declaration of variables to be used later'''

index = 'news'
beta = 0.6
alpha = 0.4

# The query is read as blank-separated tokens, each one "term" or "term^weight".
# split() without arguments ignores repeated/surrounding blanks, which would
# otherwise create empty terms
initial_query = input().split()
if not initial_query:
    raise ValueError('the initial query must contain at least one term')

# Weights have to be numeric: they are multiplied by alpha in every round, and a
# weight left as the string typed by the user would make that product fail
initial_terms = []
initial_weights = []
for token in initial_query:
    parts = token.split('^')
    initial_terms.append(parts[0])
    initial_weights.append(float(parts[1]) if len(parts) > 1 and parts[1] else 1.0)

words = pd.DataFrame(index=['Weight'], columns=initial_terms, data=[initial_weights])
k = 5
client = Elasticsearch()
R = 5
nrounds = 20

# Feedback loop: each round runs the current query, mixes the old weights (alpha)
# with the average tf-idf vector of the top-k documents (beta) and keeps only the
# R heaviest terms as the next query
for _ in range(nrounds):
    Res = search(words, index, client, k)*beta
    words = words*alpha
    words = words.add(Res, fill_value=0).sort_values(by ='Weight', axis=1, ascending=False)
    words = words[words.columns[:R]]
words


Unnamed: 0,nsa,trust,i'd,happy,government
Weight,0.212683,0.180221,0.135717,0.12022,0.079946


In [27]:
# Run the final expanded query and print the path of what it retrieves.
# Every column of `words` contributes one boosted clause, "term^weight"
clauses = [Q('query_string', query=elem + '^' + str(words[elem].values[0]))
           for elem in words.columns]

# The clauses are AND-ed together, one after another
q = clauses[0]
for clause in clauses[1:]:
    q &= clause

s = Search(using=client, index=index).query(q)
# NOTE(review): integer index on the Search — presumably limits the response to
# the first hit only; confirm against the elasticsearch_dsl documentation
response = s[0].execute()
for r in response:
    print(f'PATH= {r.path}')

PATH= C:\CAIM\Data\Dirty\20_newsgroups\sci.crypt/0011219
