## Question 1.1

In [15]:
from porter import *
from collections import Counter
import re
import numpy as np

# Words that preprocess() drops (after lowercasing, before stemming).
stopwords = [
    'the', 'a', 'an',
    'of', 'behind', 'under',
    'there', 'in', 'on',
]

def preprocess(text):
    """Turn raw text into a bag of stemmed terms.

    Pipeline: tokenize on runs of word characters, lowercase, drop the
    entries of the module-level ``stopwords`` list, then stem each
    remaining token with ``stem`` (provided by the ``porter`` import).

    Parameters
    ----------
    text : str
        Raw document or query text.

    Returns
    -------
    dict
        Maps each stemmed term to its number of occurrences in ``text``.
        Empty or punctuation-only text yields an empty dict.
    """
    # re.findall(r'\w+') returns only the word runs. re.split(r'\W+') would
    # also return '' when the text starts or ends with punctuation (or is
    # empty), and that empty token would end up counted as a term.
    tokens = (w.lower() for w in re.findall(r'\w+', text))
    # Stopwords are filtered on the lowercase surface form, before stemming.
    kept = (w for w in tokens if w not in stopwords)
    # Tokens are already lowercase here, so stem them directly.
    return dict(Counter(stem(w) for w in kept))


# Four toy documents used to exercise the indexing functions.
doc1, doc2, doc3, doc4 = (
    "The new home has been saled on top of forecasts",
    "The home sales rise in July",
    "There is an increase in home sales in July",
    "July encounter a new home sales rise",
)

# The toy corpus keeps the documents in order, so position i holds doc(i+1).
corpus = [doc1, doc2, doc3, doc4]

# Display the stemmed term counts of the first toy document.
preprocess(doc1) 

{'new': 1, 'home': 1, 'ha': 1, 'been': 1, 'sale': 1, 'top': 1, 'forecast': 1}

## Question 1.2

In [16]:
def create_index(corp):
    """Build the forward index of a corpus.

    Parameters
    ----------
    corp : iterable of str
        Documents, in order.

    Returns
    -------
    dict
        Maps each document's position in ``corp`` (0-based) to the term-count
        dict that ``preprocess`` produces for that document.
    """
    return {position: preprocess(text) for position, text in enumerate(corp)}

In [17]:
# Display the forward index (document position -> term counts) of the toy corpus.
create_index(corpus)

{0: {'new': 1,
  'home': 1,
  'ha': 1,
  'been': 1,
  'sale': 1,
  'top': 1,
  'forecast': 1},
 1: {'home': 1, 'sale': 1, 'rise': 1, 'juli': 1},
 2: {'is': 1, 'increas': 1, 'home': 1, 'sale': 1, 'juli': 1},
 3: {'juli': 1, 'encount': 1, 'new': 1, 'home': 1, 'sale': 1, 'rise': 1}}

In [18]:
def create_index_inverse(corpus):
    """Build the inverted index of a corpus.

    The forward index from ``create_index`` is transposed so that it is keyed
    by term instead of by document.

    Parameters
    ----------
    corpus : iterable of str
        Documents, in order.

    Returns
    -------
    dict
        Maps each term to ``{document position: count of the term in it}``.
    """
    postings_by_term = {}
    for doc_id, term_counts in create_index(corpus).items():
        for term, count in term_counts.items():
            # Create the postings dict the first time a term is seen.
            postings_by_term.setdefault(term, {})[doc_id] = count
    return postings_by_term

# Display the inverted index (term -> {document position: count}) of the toy corpus.
create_index_inverse(corpus)

{'new': {0: 1, 3: 1},
 'home': {0: 1, 1: 1, 2: 1, 3: 1},
 'ha': {0: 1},
 'been': {0: 1},
 'sale': {0: 1, 1: 1, 2: 1, 3: 1},
 'top': {0: 1},
 'forecast': {0: 1},
 'rise': {1: 1, 3: 1},
 'juli': {1: 1, 2: 1, 3: 1},
 'is': {2: 1},
 'increas': {2: 1},
 'encount': {3: 1}}

In [45]:
def create_index_inverse_tfidf(corp):
    """Build an inverted index whose postings hold TF-IDF weights.

    Starts from the inverted index of ``create_index_inverse`` and replaces
    each raw count with ``count * ln(N / df)``, where ``N`` is ``len(corp)``
    and ``df`` is the number of documents that contain the term.

    Parameters
    ----------
    corp : sequence of str
        Documents, in order. ``len(corp)`` must be available.

    Returns
    -------
    dict
        Maps each term to ``{document position: tf-idf weight}``. A term
        present in every document gets weight 0.
    """
    inverted = create_index_inverse(corp)
    n_docs = len(corp)
    for postings in inverted.values():
        # The idf factor depends only on the term, so compute it once per term.
        idf = np.log(n_docs / len(postings))
        for doc_id, count in postings.items():
            # Overwriting an existing key is safe while iterating the dict.
            postings[doc_id] = count * idf
    return inverted
    
# Display the TF-IDF weighted inverted index of the toy corpus.
create_index_inverse_tfidf(corpus)

{'new': {0: 0.6931471805599453, 3: 0.6931471805599453},
 'home': {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 'ha': {0: 1.3862943611198906},
 'been': {0: 1.3862943611198906},
 'sale': {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 'top': {0: 1.3862943611198906},
 'forecast': {0: 1.3862943611198906},
 'rise': {1: 0.6931471805599453, 3: 0.6931471805599453},
 'juli': {1: 0.28768207245178085,
  2: 0.28768207245178085,
  3: 0.28768207245178085},
 'is': {2: 1.3862943611198906},
 'increas': {2: 1.3862943611198906},
 'encount': {3: 1.3862943611198906}}

# Exercise 2

In [6]:
pip install --upgrade ir_datasets


Note: you may need to restart the kernel to use updated packages.


In [62]:
import ir_datasets

# Load the 'beir/scifact' collection through ir_datasets.
dataset = ir_datasets.load('beir/scifact')
# NOTE(review): a bare expression inside a for loop is not displayed by the
# notebook, so this loop only walks the whole query iterator and leaves the
# last query bound to `query`. Judging by the log output of this cell, the
# iteration is what triggers the archive download. Confirm whether the
# queries were meant to be printed or collected.
for query in dataset.queries_iter():
    query

[INFO] [starting] opening zip file
[INFO] [starting] https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip
[INFO] [finished] https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip: [00:04] [2.82MB] [698kB/s]
[INFO] [finished] opening zip file [4.16s]                                                                


In [63]:
import heapq as hp

def taat(query, index, k):
    """Term-at-a-time retrieval: score documents for a query and keep the top k.

    The query goes through ``preprocess``. For each resulting term, the
    weights in that term's postings are added to a per-document accumulator.
    The ``k`` documents with the highest accumulated score are printed and
    returned.

    Parameters
    ----------
    query : str
        Raw query text.
    index : dict
        Inverted index mapping term -> {document id: weight}, such as the
        one built by ``create_index_inverse_tfidf``.
    k : int
        Number of top-scoring documents to keep.

    Returns
    -------
    list of tuple
        ``(document id, score)`` pairs sorted by decreasing score, at most
        ``k`` of them. Empty when no query term occurs in the index.
    """
    scores = {}
    for term in preprocess(query):
        # A query term absent from the collection has no postings; skip it
        # instead of raising KeyError on index[term].
        postings = index.get(term, {})
        for doc, weight in postings.items():
            if doc in scores:
                scores[doc] += weight
            else:
                scores[doc] = weight

    # heapq is imported in this cell under the alias `hp`; the bare name
    # `heapq` is not bound on a fresh kernel.
    top_K = hp.nlargest(k, scores.items(), key=lambda x: x[1])
    # The print is kept so existing cells show the same output as before.
    print(top_K)
    return top_K
        
    

In [64]:
# Rank the toy corpus for the one-word query "is" and keep only the best hit.
taat("is",create_index_inverse_tfidf(corpus), 1)


[(2, 1.3862943611198906)]


In [73]:
# NOTE(review): this rebinds `corpus`, which the earlier cells use for the four
# toy documents; re-running those cells after this one operates on this
# collection instead.
# doc[1] is presumably the text field of each ir_datasets document record;
# confirm against the dataset's document type.
corpus = [doc[1] for doc in dataset.docs_iter()]
# Build the TF-IDF weighted inverted index over every document of the dataset.
index = create_index_inverse_tfidf(corpus)