In [None]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import Q
import pandas as pd
import numpy as np
from elasticsearch.client import CatClient

In [None]:
def normalize(tw):
    """
    Normalizes the weights in tw so that they form a unit-length vector.

    The Euclidean norm of the weights is computed and every weight is divided
    by it. When the norm is zero (empty dictionary or all weights equal to 0)
    there is no direction to preserve, so every weight is returned as 0.0
    instead of dividing by zero and producing NaN values.

    :param tw: dictionary mapping each term to its numeric weight
    :return: new dictionary with the same keys and the normalized weights
    """
    mod = np.sqrt(np.sum([x**2 for x in tw.values()]))
    if mod == 0:
        # Zero vector: dividing would yield NaN for every term
        return {t: 0.0 for t in tw}
    return {t: w/mod for t, w in tw.items()}

In [None]:
def document_term_vector(client, index, id):
    """
    Retrieves the term vector of the 'text' field of a document together
    with its term statistics, as two lists of pairs sorted by term.

    The first list holds (term, term_freq) pairs: the frequency of the term
    in the document. The second list holds (term, doc_freq) pairs: the number
    of documents that contain the term. Both lists are empty when the
    response carries no 'text' entry.

    :param client: object exposing a ``termvectors`` method
    :param index: name of the index that holds the document
    :param id: identifier of the document
    :return: tuple (term frequencies, document frequencies)
    """
    response = client.termvectors(index=index, id=id, fields=['text'],
                                  positions=False, term_statistics=True)

    vectors = response['term_vectors']
    if 'text' not in vectors:
        return [], []

    term_stats = vectors['text']['terms']
    # Terms are unique keys, so sorting the pairs orders them by term
    term_freqs = sorted((term, stats['term_freq']) for term, stats in term_stats.items())
    doc_freqs = sorted((term, stats['doc_freq']) for term, stats in term_stats.items())
    return term_freqs, doc_freqs

In [None]:
def doc_count(client, index):
    """
    Returns the number of documents in an index.

    The count API of the client is used instead of the cat API: the cat
    endpoints are meant for human consumption, while the count API returns
    the value directly in a structured response.

    :param client: Elasticsearch client (any object exposing ``count``)
    :param index: name of the index
    :return: number of documents in the index, as an int
    """
    return int(client.count(index=index)['count'])

In [None]:
def toTFIDF(client, index, file_id):
    """
    Returns the tf-idf term weights of a document, normalized with normalize().

    For every term of the document:
      - tf is the term frequency divided by the highest term frequency
        found in the document
      - idf is log2(number of documents in the index / number of documents
        that contain the term)

    :param client: Elasticsearch client
    :param index: name of the index that holds the document
    :param file_id: identifier of the document
    :return: dictionary mapping each term to its normalized weight; empty
             when the document has no terms
    """

    # Get the frequency of the term in the document, and the number of documents
    # that contain the term
    file_tv, file_df = document_term_vector(client, index, file_id)

    # A document without terms has no weights; max() below would raise on
    # an empty sequence
    if not file_tv:
        return {}

    max_freq = max(f for _, f in file_tv)

    dcount = doc_count(client, index)

    tfidfw = {}

    # Both lists are sorted by term, so they can be walked in parallel
    for (t, w), (_, df) in zip(file_tv, file_df):
        tf = w / max_freq
        idf = np.log2(dcount/df)
        tfidfw[t] = tf*idf

    return normalize(tfidfw)

In [None]:
def _boosted_term(term, weight):
    """
    Formats a term for a query_string query as ``term^weight`` so that the
    weight acts as a boost. The bare term is returned when the weight cannot
    be read as a finite, non-negative number.
    """
    try:
        boost = float(weight)
    except (TypeError, ValueError):
        return term
    if not np.isfinite(boost) or boost < 0:
        return term
    # Fixed-point notation: avoids exponents such as 1e-05 in the query text
    return '{}^{:.6f}'.format(term, boost)


def search(words_set, index, client, K):
    """
    Retrieves the (at most) K best documents for a weighted set of terms and
    returns the mean of their tf-idf vectors.

    The query requires every term of words_set (the clauses are combined
    with AND) and boosts each one with its weight using the ``term^weight``
    syntax of query_string. Terms are inserted verbatim in the query.

    :param words_set: DataFrame whose columns are the query terms and whose
                      first row holds their weights
    :param index: name of the index to search
    :param client: Elasticsearch client
    :param K: maximum number of documents to retrieve
    :return: DataFrame with a single row 'Weight' and one column per term,
             sorted by decreasing weight; it has no columns when there are
             no query terms or no retrieved terms
    """
    terms = list(words_set.columns)
    if not terms:
        return pd.DataFrame(index=['Weight'])

    # Weights are read by position so that repeated column names do not matter
    if len(words_set.index) > 0:
        weights = words_set.iloc[0].tolist()
    else:
        weights = [None] * len(terms)

    q = None
    for term, weight in zip(terms, weights):
        clause = Q('query_string', query=_boosted_term(term, weight))
        q = clause if q is None else q & clause

    response = Search(using=client, index=index).query(q)[0:K].execute()

    # Sum the document vectors term by term
    totals = {}
    ndocs = 0
    for r in response:  # only returns a specific number of results
        ndocs += 1
        for term, weight in toTFIDF(client, index, r.meta.id).items():
            totals[term] = totals.get(term, 0.0) + weight

    if not totals:
        return pd.DataFrame(index=['Weight'])

    # Average over the documents actually retrieved, which may be fewer than K
    results = pd.DataFrame(totals, index=['Weight'])
    return results.sort_values(by='Weight', axis=1, ascending=False).div(ndocs)
        

In [None]:
# Parameters of the Rocchio relevance-feedback loop

index = 'news'
beta = 0.6      # weight of the mean vector of the retrieved documents
alpha = 0.4     # weight of the current query
k = 5           # number of documents retrieved in every round
nterms = 5      # number of terms kept in the recomputed query
nrounds = 20

# The query is read as whitespace-separated terms, each optionally followed
# by ^weight; split() without arguments ignores repeated spaces
initial_query = input().split()
if not initial_query:
    raise ValueError('The initial query must contain at least one term')

query_terms = []
query_weights = []
for token in initial_query:
    parts = token.split('^')
    query_terms.append(parts[0])
    # Weights must be numeric so that they can be scaled by alpha below
    query_weights.append(float(parts[1]) if len(parts) > 1 and parts[1] else 1.0)

words = pd.DataFrame(index=['Weight'], columns=query_terms, data=[query_weights])
client = Elasticsearch()

for _ in range(nrounds):
    # Rocchio's rule: Query' = alpha * Query + beta * mean of the documents
    R = search(words, index, client, k)*beta
    words = words*alpha
    words = words.add(R, fill_value=0).sort_values(by='Weight', axis=1, ascending=False)
    # Keep only the heaviest terms for the next round
    words = words.iloc[:, :nterms]
    print(words)



***

## 2 Rocchio's Rule


To implement relevance feedback we are going to use Rocchio's rule. We are going to extend the query over a number of iterations, using the terms of the most relevant documents that are retrieved.

As described in the session documentation, you will need to write a script that, given a query, repeats the following steps a number of times ($nrounds$):

1. Obtain the $k$ most relevant documents
2. Compute a new query using the current query and the terms of the $k$ documents

Rocchio's rule involves computing the following:

$$Query' = 	\alpha \times Query + \beta \times \frac{d_1 + d_2 + \cdots + d_k}{k}$$

So we have different parameters to play with:

1. The number of rounds ($nrounds$)
2. The number of relevant documents ($k$)
3. The parameters of the Rocchio's rule ($\alpha$ and $\beta$)
4. The number of terms in the recomputed query ($R$)

**Read the documentation** and pay special attention to how you have to build the query that you pass to ElasticSearch so that it includes the weights computed by Rocchio's rule.

Keep in mind that some of the elements you need for this part are functions that you already programmed as part of the previous session's assignment.

**Pay attention** to the documentation that you have to deliver for this session.

