# INDEXATION WEB : Python rev

## Import des modules

In [None]:
from collections import Counter
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
stops = set(stopwords.words("english"))
import os
import time
from collections import defaultdict
import pickle
import gzip
from multiprocessing import Pool
import glob
from functools import reduce

In [None]:
# Fetch the NLTK corpora needed by the cleaning step and create the lemmatizer.
# NOTE(review): the import cell above already calls stopwords.words("english")
# before these downloads run, so on a machine without the corpus that line
# fails first — consider building `stops` after the downloads.
nltk.download("stopwords")
nltk.download("wordnet")
# Shared lemmatizer instance; used as the default `lem` argument of cleanText.
lem = WordNetLemmatizer()

<h1 style="color:blue; text-align:center">1. TEXT DATA LOADING AND CLEANING</h1>

<h3 style="color:blue">First, get the list of paths to all data files and then read them</h3>

In [None]:
# Paths of all corpus files.  Sorted so that document ids (positions in this
# list) are reproducible: glob returns matches in arbitrary order.
list_rep = sorted(glob.glob("data/**/*.txt", recursive=True))


def readText(path):
    """Return the whole content of the text file at *path*.

    The file is opened in text mode with the platform default encoding.
    """
    with open(path, "r") as f:
        return f.read()

In [None]:
# start = time.time()
# with Pool(2) as pool:
#     list_text = pool.map(readText, list_rep)
# end = time.time()
# print(end - start)

In [None]:
# Load every document into memory: list_text[i] is the content of list_rep[i].
start = time.time()
# Reuse readText instead of duplicating the open/read logic inline.
list_text = [readText(path) for path in list_rep]
end = time.time()
# Elapsed wall-clock seconds (shown as the cell output).
end - start

<h3 style="color:blue">Pre-processing</h3>

In [None]:
def cleanText(text, stopwords=stops, lem=lem):
    """Tokenize, filter and lemmatize a raw text.

    The text is lower-cased, split into runs of ASCII letters and digits,
    stripped of stop words, and every remaining token is lemmatized
    (this is lemmatization, not stemming).

    Parameters
    ----------
    text : str
        Raw document or query text.
    stopwords : collection of str
        Tokens to drop; defaults to the module-level ``stops`` set.
    lem : object exposing ``lemmatize(str) -> str``
        Defaults to the module-level ``lem`` lemmatizer.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # The text is already lower-cased, so "[a-z0-9]+" is sufficient.  The
    # former class "[a-zA-z0-9]" used the range A-z, which also matches the
    # characters [ \ ] ^ _ and ` and let them leak into tokens.
    tokens = re.findall(r"[a-z0-9]+", text.lower())
    # Honour the `stopwords` argument (it used to be ignored in favour of the
    # global `stops` set).
    return [lem.lemmatize(token) for token in tokens if token not in stopwords]

In [None]:
# Clean all documents with a pool of 8 worker processes (pool size is
# hard-coded); the result list keeps the order of list_text.
start = time.time()
with Pool(8) as pool:
    list_processed_texts = pool.map(cleanText, list_text)
end = time.time()
# Elapsed wall-clock seconds (shown as the cell output).
end-start

In [None]:
list_processed_texts[62]

In [None]:
# Sequential baseline for the cleaning step (single process, no pool).
start = time.time()
list_processed_texts = [cleanText(raw_text) for raw_text in list_text]
end = time.time()
end - start
# Author's observation: slower than the Pool.map version above.

In [None]:
# NOTE(review): `lis` is not defined anywhere in the visible notebook, so this
# cell raises NameError — it looks like a truncated name; confirm the intent
# or delete the cell.
lis

In [None]:
list_processed_texts[0]

In [None]:
# ##Récupérer La liste des mots de mon vocabulaire
# list_vocab = list(myCounter.keys())
# len(list_vocab)
len(list_text)

<h3 style="color:blue">Building the Vocabulary </h3>

In [None]:
def vocabOneFile(termlist):
    """Count how often each term occurs in one document.

    `termlist` is the list of words of the document; the result is a
    Counter mapping every distinct word to its number of occurrences.
    """
    frequencies = Counter()
    frequencies.update(termlist)
    return frequencies

def vocabGlobal(my_list):
    """Build the corpus-wide vocabulary with total term frequencies.

    Parameters
    ----------
    my_list : iterable of list of str
        One token list per document.

    Returns
    -------
    dict
        Maps every distinct word to its total number of occurrences over
        all documents.
    """
    totals = Counter()
    for text in my_list:
        # update() adds the counts in place.  `totals += Counter(text)` gave
        # the same result but rescanned the whole accumulated counter (to
        # drop non-positive entries) after every single document.
        totals.update(text)
    return dict(totals)

In [None]:
# Parallel variant: count terms per document in worker processes, then merge.
start = time.time()
with Pool(7) as pool:
    per_doc_counts = pool.map(vocabOneFile, list_processed_texts)
# Merge in place into a single Counter.  reduce(lambda x, y: x + y, ...)
# allocated a brand-new Counter for every document and raised TypeError on an
# empty corpus (reduce without an initial value).
merged_counts = Counter()
for counts in per_doc_counts:
    merged_counts.update(counts)
vocabulary = dict(merged_counts)
end = time.time()
# Elapsed wall-clock seconds (shown as the cell output).
end - start

In [None]:
# Sequential variant: build the corpus-wide vocabulary in the current process.
start = time.time()
global_vocabulary = vocabGlobal(list_processed_texts)
end = time.time()
# Elapsed wall-clock seconds (shown as the cell output).
end-start
# Author's observation: faster than the Pool.map variant above.

In [None]:
####Vocabulary size
len(vocabulary), len(global_vocabulary)

# Building the index

In [None]:
# input  : [word1, word2, ...]
# output : {word1: [pos1, pos2], word2: [pos2, pos434], ...}
def index_one_file(termlist):
    """Map every word of one document to the list of positions where it occurs."""
    positions_by_word = {}
    for position, term in enumerate(termlist):
        positions_by_word.setdefault(term, []).append(position)
    return positions_by_word

In [None]:
# Parallel variant: index every document in a worker process.
def joindic(x, y):
    """Merge dict `y` into dict `x` in place and return `x`.

    Keys present in both dicts end up with the value from `y`.
    """
    x.update(y)
    return x


s = time.time()
with Pool(8) as pool:
    per_doc_index = pool.map(index_one_file, list_processed_texts)
# Key each per-document index by its document id.  Folding the results with
# joindic collapsed them into a single {word: positions} dict in which later
# documents overwrote earlier ones, losing the {doc_id: {word: positions}}
# shape that inverted_index() takes as input.
dic = dict(enumerate(per_doc_index))
e = time.time()
# Elapsed wall-clock seconds (shown as the cell output).
e - s

In [None]:
# Sequential variant: dic maps doc id -> {word: [positions]}.
# Note: in this cell `e` holds the start time and `s` the end time.
e = time.time()
dic = dict(enumerate(map(index_one_file, list_processed_texts)))
s = time.time()
s - e

In [None]:
len(dic.keys())
dic

In [395]:
sum([len(word) for word in dic[0].values()])

218

In [397]:
len(list_processed_texts[0])

218

In [399]:
list_text[0]

'The fall in the Czech trade deficit to 10.5 billion crowns in September from 14.5 billion in August buoyed market sentiment, and the goods imported show industrial restructuring is on track, analysts said on Thursday.\nCzech Statistical Bureau (CSU) data released earlier showed the January-September trade shortfall hit an all-time high of 110.7 billion crowns, far surpassing the full 1995 deficit of 95.7 billion crowns.\nBut the September shortfall, the smallest one-month deficit this year, surprised analysts, who had forecast on average a gap of 13 to 15 billion crowns.\n"I\'m happily surprised. I think it\'s a relatively optimistic figure, though not so good as to make us revise our full year forecast," said Martin Kupka, an economist at Patria Finance, which has forecast a 150-160 billion crown deficit at year-end.\nThe CSU said September imports rose 15.2 percent year-on-year, the same as in August, while exports rose 5.9 percent after a 6.4 percent increase in August.\nIt added t

# Building the Inverted Index

In [None]:
# input  = {doc_id: {word: [pos1, pos2, ...], ...}}
# output = {word: {doc_id: {'doc_size': n, 'positions': [...], 'occurencies': k}, ...}, ...}
def inverted_index(index):
    """Turn a per-document positional index into an inverted index.

    Parameters
    ----------
    index : dict
        ``{doc_id: {word: [positions]}}`` as produced by index_one_file for
        each document.

    Returns
    -------
    collections.defaultdict
        ``{word: {doc_id: {'doc_size', 'positions', 'occurencies'}}}`` where
        ``doc_size`` is the number of tokens of the document, ``positions``
        the positions of the word in it and ``occurencies`` their count.
        Being a defaultdict, subscripting an unknown word yields ``{}``.
    """
    inv_index = defaultdict(dict)
    for doc_id, doc_index in index.items():
        # Document length in tokens.  len(doc_index) would only count the
        # distinct words, which skews the occurencies / doc_size weighting
        # and disagrees with create_index().
        doc_size = sum(len(positions) for positions in doc_index.values())
        for word, positions in doc_index.items():
            inv_index[word][doc_id] = {
                'doc_size': doc_size,
                'positions': positions,
                'occurencies': len(positions),
            }
    return inv_index

In [None]:
t=0  # NOTE(review): never read in this cell
# Build the inverted index 10 times to average out timing noise.
s=time.time()
for i in range(10): 
    inv_index=inverted_index(dic)
e=time.time()
# Mean seconds per build (shown as the cell output).
(e-s)/10

In [None]:
inv_index["china"]

In [None]:
list_processed_texts[62].count("china")

In [None]:
## Optimized index creation: build the inverted index straight from the token lists
def create_index(data):
    """Build a positional inverted index in a single pass over the corpus.

    Parameters
    ----------
    data : iterable of list of str
        One token list per document; a document's id is its position in
        `data`.

    Returns
    -------
    dict
        ``{word: {doc_id: {"occurencies": int, "positions": [int, ...],
        "doc_size": int}}}`` where ``doc_size`` is the token count of the
        document.
    """
    # word -> doc_id -> positions, filled in one scan of the corpus.  The
    # previous version re-scanned the whole document with np.where for every
    # (word, document) pair.
    positions = defaultdict(lambda: defaultdict(list))
    doc_sizes = {}
    for doc_num, words in enumerate(data):
        doc_sizes[doc_num] = len(words)
        for pos, word in enumerate(words):
            positions[word][doc_num].append(pos)

    res = {}
    for word, docs in positions.items():
        res[word] = {
            doc_num: {
                "occurencies": len(pos_list),
                "positions": pos_list,
                "doc_size": doc_sizes[doc_num],
            }
            for doc_num, pos_list in docs.items()
        }
    return res

In [None]:
#testind = create_index(clean_Docs)
#testind["china"]
#dict(Counter([1,2,2,2,3,3,2,5,7]))

In [None]:
#newIndex["china"]

# Time the optimized index construction.  This uses the `time` module already
# imported at the top of the notebook: `from time import time` would rebind
# the name `time` to the function and break every other cell that calls
# `time.time()`.
start = time.time()
test_Index = create_index(list_processed_texts)
end = time.time()
print(end - start)

# Serialize data with gzip

In [None]:
#store the object
# Write the inverted index to a gzip-compressed pickle file in the working
# directory.
with gzip.open("Index_Articles.pklz", "wb") as fout:
    pickle.dump(inv_index,fout)
    

#restore the object
# NOTE: unpickling can execute arbitrary code — only load index files that
# were produced by this notebook.
with gzip.open("Index_Articles.pklz", "rb") as fin:
    indexe_pkl= pickle.load(fin)

In [None]:
indexe_pkl["china"]

# Create Search function

# Querying
## Single-word Queries - Function *sing_woq(query, index)*
In which documents does a given word occur?
To do that, we clean the query, look up its first word in the index and return the keys of that entry, which are the ids of the documents containing the word. If the cleaned query is empty (e.g. it was a stop word) or the word has no documents in the index, the function returns `None`.

In [None]:
def sing_woq(query, index):
    """Return the ids of the documents that contain a single query word.

    The query is cleaned with cleanText; only the first remaining token is
    looked up in `index`.

    Parameters
    ----------
    query : str
        The word to search for.
    index : mapping
        Inverted index ``{word: {doc_id: ...}}``.

    Returns
    -------
    list or None
        The document ids recorded for the word, or ``None`` when the query
        has no usable token (e.g. only stop words) or the word has no entry.
    """
    terms = cleanText(query)
    if not terms:
        return None
    # .get() instead of index[...]: inverted_index() returns a defaultdict,
    # so subscripting an unknown word would silently insert an empty entry
    # into the index, while a plain dict would raise KeyError.
    postings = index.get(terms[0])
    if not postings:
        return None
    return list(postings)

In [None]:
query1 = sing_woq("tournament", indexe_pkl)
query1

In [None]:
sing_woq("the", indexe_pkl)

## Free-text Queries
* Which documents contain at least one word from a given list of words?

We apply the previous function to each word of the query (stop words are discarded by the cleaning step). The returned object is the union of the results obtained for the individual words.

In [None]:
def free_tq(query, index):
    """Return the ids of the documents containing at least one query word.

    The query is split on whitespace and every word is looked up with
    sing_woq; the result is the union of the individual results.

    Returns
    -------
    list or None
        Sorted list of distinct document ids, or ``None`` (after printing a
        message) when no word matches.
    """
    matches = set()
    for word in query.split():
        # One lookup per word (the former comprehension called sing_woq
        # twice for every word).
        doc_ids = sing_woq(word, index)
        if doc_ids:
            matches.update(doc_ids)
    if not matches:
        print("No match found!!!")
        return None
    return sorted(matches)
    
    

In [None]:
free_tq("tournament ", inv_index)==sing_woq("tournament",inv_index)

In [None]:
free_tq("The british tournament ", inv_index)

In [None]:
def getPosition(word, doc, index):
    """Return the positions of `word` in document `doc`, or None.

    Parameters
    ----------
    word : str
        Index key to look up.
    doc : hashable
        Document id.
    index : mapping
        Inverted index ``{word: {doc_id: {"positions": [...]}}}``.

    Returns
    -------
    list or None
        The stored ``"positions"`` value, or ``None`` when the lookup fails.
    """
    try:
        return index[word][doc]["positions"]
    except (KeyError, IndexError, TypeError):
        # Unknown word/document or a non-subscriptable index.  Anything else
        # is no longer swallowed as the former bare `except` did.
        return None

In [None]:
print(getPosition("china", 57, newIndex))

## Phrase Queries
* Which documents contain a given phrase in the same order?


1. We're looking for the texts belonging to the intersection of sing_woq for every word in the query text.
2. Then we check whether the words appear in the correct order.

In [None]:
def phrase_query(query, index):
    """Return the documents that contain the query words as a phrase.

    The query is cleaned with cleanText, so the phrase is matched on the
    cleaned token sequence (stop words removed, tokens lemmatized), against
    the positions stored in `index`.

    Parameters
    ----------
    query : str
        The phrase to search for.
    index : mapping
        Inverted index ``{word: {doc_id: {"positions": [...]}}}``.

    Returns
    -------
    list
        Ids (in ascending order) of the documents where the cleaned query
        tokens occur consecutively and in order.  Empty when the query has
        no usable token or when one of its words is not indexed.
    """
    list_words = cleanText(query)  # cleaning query
    if not list_words:
        # set.intersection() with no operand raises TypeError.
        return []

    # Look the cleaned tokens up directly; .get() avoids both KeyError on a
    # plain dict and phantom entries on a defaultdict.
    postings = [index.get(word) for word in list_words]
    if not all(postings):
        # A word that is not indexed cannot be part of any matching phrase
        # (this used to crash on set(None)).
        return []

    # Documents containing every word of the query.
    candidates = set.intersection(*(set(entry) for entry in postings))

    final_result = []
    for id_doc in sorted(candidates):
        # Shift the positions of the i-th word back by i: a value common to
        # all shifted sets is the start position of a full in-order match.
        shifted = [
            {pos - offset for pos in entry[id_doc]["positions"]}
            for offset, entry in enumerate(postings)
        ]
        if set.intersection(*shifted):
            final_result.append(id_doc)
    return final_result


In [None]:
query3 = phrase_query(list_text[1832], inv_index)
query3

In [None]:
[inv_index[word][0]["positions"] for word in ["september","fall"]] 

In [None]:
list_text[1832]

In [None]:
inv_index["china"][97]



## Autres façons de compter les fréquences des mots

def search_v3(query, index):
    """Return the documents where the first query word has a neighbour.

    A document is kept when it contains every query word and at least one
    occurrence of the first word lies at distance 1 from an occurrence of any
    other query word.  A single-word query therefore returns no document.

    Parameters
    ----------
    query : str
        Free text; split into runs of ASCII letters/digits and lower-cased.
    index : mapping
        Inverted index, as accepted by getPosition.

    Returns
    -------
    list
        The matching document ids.
    """
    item_list2 = [item.lower() for item in re.findall("[A-Za-z0-9]+", query)]
    if not item_list2:
        return []

    # Documents containing every query word.
    # NOTE(review): search_v1 is not defined in the visible notebook —
    # presumably an earlier name of the single-word lookup; confirm.
    res = None
    for item in item_list2:
        tmp = set(search_v1(item, index) or ())  # tolerate a None result
        res = tmp if res is None else res & tmp

    final_Result = []
    for doc_num in res:
        # Positions are collected per document.  The lists used to be created
        # once outside this loop, so positions and distances of earlier
        # documents leaked into the test for later ones.
        list_pos = [getPosition(item, doc_num, index) or [] for item in item_list2]
        first_positions = list_pos[0]
        is_near = any(
            abs(p - position) == 1
            for position in first_positions
            for other_pos in list_pos[1:]
            for p in other_pos
        )
        if is_near:
            final_Result.append(doc_num)
    return final_Result


<h1 style="color:red">Ordered Queries</h1>

### Ordered queries by absolute frequencies

In [None]:
def search_v1_sorted(query, index):
    """Return the documents containing `query`, most occurrences first.

    The query is lower-cased (but not lemmatized) before the lookup.

    Parameters
    ----------
    query : str
        A single word.
    index : mapping
        Inverted index ``{word: {doc_id: {"occurencies": int, ...}}}``.

    Returns
    -------
    list or None
        Document ids sorted by decreasing ``"occurencies"`` (ties keep the
        index order), or ``None`` after printing a message when the word
        cannot be looked up.
    """
    query = query.lower()
    try:
        postings = index[query]
    except (KeyError, TypeError):
        # Only a failed lookup means "no result"; the former bare `except`
        # around the whole body also hid unrelated errors.
        print("No result for specified key!!!")
        return None
    return sorted(
        postings,
        key=lambda doc_id: postings[doc_id]["occurencies"],
        reverse=True,
    )

In [None]:
search_v1_sorted("china", newIndex)

In [None]:
liste_rep[1407], liste_rep[774], liste_rep[2342]

### Ordered queries by weighted frequencies

In [None]:
def search_v1_weighted(query, index):
    """Return the documents containing `query`, highest relative frequency first.

    The score of a document is ``occurencies / doc_size``.  The query is
    lower-cased (but not lemmatized) before the lookup.

    Parameters
    ----------
    query : str
        A single word.
    index : mapping
        Inverted index ``{word: {doc_id: {"occurencies": int,
        "doc_size": int, ...}}}``.

    Returns
    -------
    list or None
        Document ids sorted by decreasing score (ties keep the index order),
        or ``None`` after printing a message when the word cannot be looked
        up.
    """
    query = query.lower()
    try:
        postings = index[query]
    except (KeyError, TypeError):
        # Only a failed lookup means "no result"; the former bare `except`
        # around the whole body also hid unrelated errors.
        print("No result for specified key!!!")
        return None

    def relative_frequency(doc_id):
        entry = postings[doc_id]
        return entry["occurencies"] / entry["doc_size"]

    return sorted(postings, key=relative_frequency, reverse=True)

In [None]:
search_v1_weighted("china", newIndex)
#newIndex["china"][871]

In [None]:
liste_rep[2409], liste_rep[2449], liste_rep[871]

### ordered queries weighted by tf_idf 

In [None]:
def search_v1_tf_idf(query, index, n_docs=None):
    """Return the documents containing `query`, ranked by tf-idf.

    The score of a document is
    ``(occurencies / doc_size) * log(D / (1 + N))`` where ``N`` is the number
    of documents containing the word and ``D`` the number of documents in
    the corpus.  The query is lower-cased (but not lemmatized) before the
    lookup.

    Parameters
    ----------
    query : str
        A single word.
    index : mapping
        Inverted index ``{word: {doc_id: {"occurencies": int,
        "doc_size": int, ...}}}``.
    n_docs : int, optional
        Corpus size ``D``.  When omitted it is computed as the number of
        distinct document ids in `index` (a full scan of the index; pass it
        explicitly when issuing many queries).

    Returns
    -------
    list or None
        Document ids sorted by decreasing score (ties keep the index order),
        or ``None`` after printing a message when the word cannot be looked
        up.
    """
    query = query.lower()
    try:
        postings = index[query]
    except (KeyError, TypeError):
        # Only a failed lookup means "no result"; the former bare `except`
        # around the whole body also hid unrelated errors.
        print("No result for specified key!!!")
        return None
    if not postings:
        return []

    if n_docs is None:
        # D must be the number of documents.  len(index) — used previously —
        # is the number of distinct words.
        n_docs = len({doc_id for docs in index.values() for doc_id in docs})
    # The factor is the same for every document of this word; it is <= 0
    # when the word occurs in (almost) every document.
    idf = np.log(n_docs / (1 + len(postings)))

    def tf_idf(doc_id):
        entry = postings[doc_id]
        return (entry["occurencies"] / entry["doc_size"]) * idf

    return sorted(postings, key=tf_idf, reverse=True)

In [None]:
search_v1_tf_idf("economy", newIndex) == search_v1_weighted("economy", newIndex)

<h1 style="color:red; text-align:center">Ordering for free Text queries</h1>

In [None]:
def search_v2_sorted(query, index):
    """Return the ids of the documents containing at least one query word.

    Parameters
    ----------
    query : str
        Free text; lower-cased and split into runs of ASCII letters/digits.
    index : mapping
        Inverted index passed through to the single-word lookup.

    Returns
    -------
    list
        Distinct document ids in ascending order (empty when nothing
        matches).
    """
    # Lower-case BEFORE tokenizing: the pattern only matches [a-z0-9], so
    # tokenizing first silently dropped every upper-case letter
    # ("China" became "hina").
    item_list = re.findall("[a-z0-9]+", query.lower())
    res = set()
    for item in item_list:
        # NOTE(review): search_v1 is not defined in the visible notebook —
        # presumably an earlier name of the single-word lookup; confirm.
        tmp = search_v1(item, index)
        if tmp:
            res.update(tmp)
    # Sorted for a deterministic result order (set order is arbitrary).
    return sorted(res)

In [None]:
#newIndex
#len(newIndex)
len(list_vocab)
#clean_Docs[0]

<h1> Bag of Words</h1>

In [None]:
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, OrderedDict
# NOTE(review): list_vocab and clean_Docs are not assigned anywhere in the
# visible notebook (the only list_vocab assignment is commented out) —
# presumably the vocabulary word list and list_processed_texts; confirm.
Corpus = list_vocab
v = DictVectorizer()

# Fit on a single pseudo-document in which every vocabulary word appears
# once, so the vectorizer sees every word as a feature name.
v.fit([OrderedDict.fromkeys(Corpus, 1)])
# Transform the per-document word counts into the matrix X.
X = v.transform(Counter(f) for f in (clean_Docs))

#print(type(X))
#print(X.A)


In [None]:
print(X.A)

In [None]:
np.where(X.A[0]==1)

In [None]:
import numpy as np
from scipy.sparse import csr_matrix

# Toy corpus used to build the three CSR arrays (data, indices, indptr) by hand.
docs = [["hello", "world", "hello", "alka"], ["goodbye", "cruel", "world"]]
vocabulary = {}  # term -> column id, assigned in order of first appearance
indices = []     # column id of every token, document after document
data = []        # one entry equal to 1 per token
indptr = [0]     # indptr[i]:indptr[i+1] delimits document i in indices/data
for d in docs:
    for term in d:
        if term not in vocabulary:
            vocabulary[term] = len(vocabulary)
        index = vocabulary[term]
        indices.append(index)
    data.extend([1] * len(d))
    indptr.append(len(indices))

#csr_matrix((data, indices, indptr), dtype=int).toarray()

In [None]:
vocabulary, data, indptr