In [10]:
from porter import stem
from collections import Counter
import re
import numpy as np
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Exercice 1

In [11]:
corpus = [
    "the new home has been saled on top forecasts",
    'the home sales rise in july',
    'there is an increase in home sales in july',
    'july encounter a new home sales rise'
]

In [25]:
def analyzer(s):
    word_token = re.split(r'\W+', s.lower())
    word_token = [stem(w) for w in word_token if not w in stop_words]
    return word_token

def index(corpus):
    """
    Return document index of corpus in param
    """
    idx = {}
    for i, doc in enumerate(corpus):
        idx[i] = Counter(analyzer(doc))
    return idx

def inverse_index(corpus):
    inv_idx = {}
    for i, doc in enumerate(corpus):
        for word in analyzer(doc):
            d = inv_idx.get(word, {})
            nb = d.get(i, 0) + 1

            d[i] = nb
            inv_idx[word] = d
    return inv_idx

In [13]:
print(index(corpus))
print(inverse_index(corpus))

{0: Counter({'new': 1, 'home': 1, 'sale': 1, 'top': 1, 'forecast': 1}), 1: Counter({'home': 1, 'sale': 1, 'rise': 1, 'juli': 1}), 2: Counter({'increas': 1, 'home': 1, 'sale': 1, 'juli': 1}), 3: Counter({'juli': 1, 'encount': 1, 'new': 1, 'home': 1, 'sale': 1, 'rise': 1})}
{'new': {0: 1, 3: 1}, 'home': {0: 1, 1: 1, 2: 1, 3: 1}, 'sale': {0: 1, 1: 1, 2: 1, 3: 1}, 'top': {0: 1}, 'forecast': {0: 1}, 'rise': {1: 1, 3: 1}, 'juli': {1: 1, 2: 1, 3: 1}, 'increas': {2: 1}, 'encount': {3: 1}}


## Question 3

In [14]:
def index_tfidf(corpus):
    """
    Return document index of corpus in param
    """
    N = len(corpus)
    idx = index(corpus)
    inv_idx = inverse_index(corpus)
    vocab = inv_idx.keys()
    idf = {word: np.log((1 + N) / (1 + len(value))) for word, value in zip(vocab, inv_idx.values())}

    for doc in idx.keys():
        for word in vocab:
            idx[doc][word] = idx[doc][word] * idf[word]
    return idx

def inverse_index_tfidf(corpus):
    pass

In [15]:
print(index_tfidf(corpus))
print(inverse_index_tfidf(corpus))

{0: Counter({'top': 0.916290731874155, 'forecast': 0.916290731874155, 'new': 0.5108256237659907, 'home': 0.0, 'sale': 0.0, 'rise': 0.0, 'juli': 0.0, 'increas': 0.0, 'encount': 0.0}), 1: Counter({'rise': 0.5108256237659907, 'juli': 0.22314355131420974, 'home': 0.0, 'sale': 0.0, 'new': 0.0, 'top': 0.0, 'forecast': 0.0, 'increas': 0.0, 'encount': 0.0}), 2: Counter({'increas': 0.916290731874155, 'juli': 0.22314355131420974, 'home': 0.0, 'sale': 0.0, 'new': 0.0, 'top': 0.0, 'forecast': 0.0, 'rise': 0.0, 'encount': 0.0}), 3: Counter({'encount': 0.916290731874155, 'new': 0.5108256237659907, 'rise': 0.5108256237659907, 'juli': 0.22314355131420974, 'home': 0.0, 'sale': 0.0, 'top': 0.0, 'forecast': 0.0, 'increas': 0.0})}
None


# Exercice 2

In [30]:
import ir_datasets, time
import pandas as pd
dataset = ir_datasets.load("beir/scifact")
l = []
for doc in dataset.docs_iter():
    l.append(doc.text)
dataset = l

In [17]:
def eval(f, param):
    """
    Evalue les temps d'execution de la fonction f, avec le dictionnaire de paramètre reçu 
    """
    l = []
    for _ in range(5):
        start = time.time()
        f(**param)
        end = time.time()
        l.append(end - start)
    return np.mean(l)

In [33]:
def question_to_token(question):
    return analyzer(question)

def TAAT(corpus, question):
    question_word_list = question_to_token(question)
    inv_idx = inverse_index(corpus)
    # Filtering inv_idx to get only word that are in the question
    inv_idx = {question_word : inv_idx[question_word] 
                for question_word in question_word_list}
    score_dict = {}
    for d in inv_idx.values():
        for doc_id, score in zip(d.keys(), d.values()):
            score_dict[doc_id] = score_dict.get(doc_id, 0) + score
    return pd.Series(score_dict).sort_values(ascending=False)
TAAT(corpus, 'new home')

0    2
3    2
1    1
2    1
dtype: int64

```py
def add_if(ds, heap, k):
    if len(heap) k:
        heapq.push(heap, ds)
    elif heap[0][1] ds.score:
        heapq.heapreplace(heap,ds)
```