# INDEXATION WEB : Python rev

## Import des modules

In [2]:
from collections import Counter
import numpy as np
import re
import nltk
############################
nltk.download("stopwords")##
nltk.download("wordnet")####
lem = WordNetLemmatizer()###
############################
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
stops = set(stopwords.words("english"))
import os
import time
from collections import defaultdict
import pickle
import gzip
from multiprocessing import Pool
import glob
from functools import reduce

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
#####
# nltk.download("stopwords")
# nltk.download("wordnet")
# lem = WordNetLemmatizer()

<h1 style="color:blue; text-align:center">1. TEXT DATA LOADING AND CLEANING</h1>

<h3 style="color:blue">First, get the list of paths to all data files and then read them</h3>

In [3]:
list_rep=glob.glob("data/**/*.txt",recursive=True)
def readText(path):
    with open(path, "r") as f:
        return f.read()

In [6]:
start = time.time()
with Pool() as pool:
    list_text = pool.map(readText, list_rep)
end = time.time()
print(end - start)

0.13456177711486816


In [7]:
#Loading data in a list
start=time.time()
list_text = []
for path in list_rep:
    with open(path, 'r') as f:
        text = f.read()
        list_text.append(text)
end=time.time()
end-start

0.05635857582092285

<h3 style="color:blue">Pre-processing</h3>

In [8]:
def cleanText(text, stopwords=stops, lem = lem):
    
    """Take a text file and put it in lower case
    then remove stopwords,
    and apply stemming to each word 
    """
    text = re.findall("[a-zA-z0-9]+", text.lower())
    return [lem.lemmatize(elt) for elt in text if elt not in stops]

In [10]:
start = time.time()
with Pool() as pool:
    list_processed_texts = pool.map(cleanText, list_text)
end = time.time()
end-start

7.7672319412231445

In [None]:
list_processed_texts[62]

In [11]:
start = time.time()
list_processed_texts = list(map(cleanText, list_text))
end = time.time()
end-start
####Slower than map with pool

4.3827290534973145

In [14]:
###Compare the length of list of original texts with list of processed texts
len(list_text), len(list_processed_texts)

(2500, 2500)

In [None]:
list_processed_texts[0]

<h3 style="color:blue">Building the Vocabulary </h3>

In [15]:
def vocabOneFile(termlist):
    """
        this function takes a list of words and return a dictionary
        with frequency for each word
    """
    return Counter(termlist)

def vocabGlobal(my_list):
    myCounter = Counter()
    for text in my_list:
        myCounter += Counter(text)
    return dict(myCounter)

In [16]:
start = time.time()
with Pool() as pool:
    vocabulary = pool.map(vocabOneFile, list_processed_texts)
vocabulary = dict(reduce(lambda x,y:x+y, vocabulary))
end = time.time()
end-start

19.79505729675293

In [17]:
start = time.time()
global_vocabulary = vocabGlobal(list_processed_texts)
end = time.time()
end-start
####faster than map with pool

1.7536182403564453

In [18]:
####Vocabulary size with the two methods
len(vocabulary), len(global_vocabulary)

(26058, 26058)

# Building the index

In [30]:
#input : doc = [word1, word2, ...]
#output = {word1: [pos1, pos2], word2: [pos2, pos434], ...}
def index_one_file(termlist):
    fileIndex = defaultdict(list)
    for index, word in enumerate(termlist):
        fileIndex[word].append(index)
    return dict(fileIndex)

In [46]:
#With map
def joindic(x,y): x.update(y); return x ###Fonction needed in the case of building index with the mapReduce technique
s=time.time()
with Pool(8) as pool:
    dic1=pool.map(index_one_file,list_processed_texts)
#dic1=reduce(joindic,dic1[1:],dic1[0]) ##Concatening the list of single dicts obtained from map step
e=time.time()
e-s

0.8226568698883057

In [32]:
####Index with dictionary comprehension : two times Faster than mapReduce
e=time.time()
dic={k:index_one_file(list_processed_texts[k]) for k in range(len(list_processed_texts))}
s=time.time()
s-e

0.6228201389312744

In [125]:
####Verifiy the length of the two indexes
#len(dic.keys()), len(dic1.keys())
#len(dic1)
list_text[0]

'Pall Corp and Memtec Ltd may have different reasons, but they seem equally determined to acquire Gelman Sciences Inc.\nAnalysts and arbitrageurs were reluctant to say whether that portends an all-out bidding war, especially after a rally in American Depositary Shares of Memtec on Thursday that reflected some doubt on Wall Street of Memtec\'s commitment.\nStill, Gelman would nicely complement the operations of either suitor and will likely be hotly pursued by both, filtration industry analysts said.  \nShares in Ann Arbor, Mich.-based Gelman closed up 3 at 29 on the American Stock Exchange on the news that it was in discussions with Pall about a possible acquisition.\nGelman reached a definitive agreement on August 30 to be acquired by Australia\'s Memtec, which said Thursday it intends to proceed with the agreement.\n"I don\'t think Memtec is going to back away," said Robert W. Baird &amp; Co analyst Walter Morris.\nMemtec wants Gelman\'s sophisticated membrane technology to put Memte

# Building the Inverted Index

In [106]:
#input = {doc_id: {word: [pos1, pos2, ...], ... }}
#res = {word1: {doc_id:{doc_size,[pos1, pos2],freq},....}, ..., ...}
def inverted_index(index):
    inv_index = defaultdict(dict)
    for doc_id in index.keys():
        for word in index[doc_id].keys():
#             inv_index[word][doc_id]={'doc_size':len(index[doc_id]),
#                               'positions':index[doc_id][word],
#                               'occurencies':len(index[doc_id][word])}
            inv_index[word][doc_id]={'doc_size':len(list_processed_texts[doc_id]),
                              'positions':index[doc_id][word],
                              'occurencies':len(index[doc_id][word])}
    return inv_index

In [105]:
t=0
s=time.time()
for i in range(10): 
    inv_index=inverted_index(dic)
e=time.time()
(e-s)/10

0.9438146114349365

In [107]:
inv_index["china"]

{57: {'doc_size': 409, 'occurencies': 3, 'positions': [213, 259, 264]},
 85: {'doc_size': 246, 'occurencies': 1, 'positions': [216]},
 91: {'doc_size': 411, 'occurencies': 1, 'positions': [219]},
 111: {'doc_size': 265,
  'occurencies': 6,
  'positions': [14, 67, 82, 112, 170, 198]},
 169: {'doc_size': 332, 'occurencies': 1, 'positions': [326]},
 177: {'doc_size': 332, 'occurencies': 1, 'positions': [326]},
 210: {'doc_size': 279, 'occurencies': 1, 'positions': [272]},
 243: {'doc_size': 267, 'occurencies': 1, 'positions': [263]},
 264: {'doc_size': 268, 'occurencies': 1, 'positions': [200]},
 278: {'doc_size': 364, 'occurencies': 1, 'positions': [88]},
 300: {'doc_size': 366, 'occurencies': 2, 'positions': [140, 152]},
 303: {'doc_size': 387, 'occurencies': 1, 'positions': [181]},
 307: {'doc_size': 331, 'occurencies': 1, 'positions': [90]},
 309: {'doc_size': 339,
  'occurencies': 8,
  'positions': [8, 39, 50, 97, 105, 119, 138, 197]},
 323: {'doc_size': 452, 'occurencies': 3, 'posit

In [59]:
list_processed_texts[57].count("china")

3

In [62]:
##create index from list of list_processed_texts
def create_index(data):
    index = defaultdict(list)
    res={}
    
    for i, words in enumerate(data):
        for word in words:
            index[word].append(i)
    index_new = {}
    for word in index.keys():
        index_new[word] = dict( Counter( index[word] ) )
        for doc_num in index_new[word].keys():
            pos = np.array((np.where(np.array(data[doc_num])==word))).tolist()
            index_new[word][doc_num] = {"occurencies":index_new[word][doc_num], "positions":pos[0], "doc_size":len(data[doc_num])}
        res[word] = index_new[word]
            
    return res

In [65]:
start = time.time()
test_Index = create_index(list_processed_texts)
end = time.time()
end-start
#####Slower than using the normal index

17.637397050857544

# Serialize data with gzip

In [108]:
#store the object
with gzip.open("Index_Articles.pklz", "wb") as fout:
    pickle.dump(inv_index,fout)
    

#restore the object
with gzip.open("Index_Articles.pklz", "rb") as fin:
    indexe_pkl= pickle.load(fin)

In [109]:
indexe_pkl["china"]

{57: {'doc_size': 409, 'occurencies': 3, 'positions': [213, 259, 264]},
 85: {'doc_size': 246, 'occurencies': 1, 'positions': [216]},
 91: {'doc_size': 411, 'occurencies': 1, 'positions': [219]},
 111: {'doc_size': 265,
  'occurencies': 6,
  'positions': [14, 67, 82, 112, 170, 198]},
 169: {'doc_size': 332, 'occurencies': 1, 'positions': [326]},
 177: {'doc_size': 332, 'occurencies': 1, 'positions': [326]},
 210: {'doc_size': 279, 'occurencies': 1, 'positions': [272]},
 243: {'doc_size': 267, 'occurencies': 1, 'positions': [263]},
 264: {'doc_size': 268, 'occurencies': 1, 'positions': [200]},
 278: {'doc_size': 364, 'occurencies': 1, 'positions': [88]},
 300: {'doc_size': 366, 'occurencies': 2, 'positions': [140, 152]},
 303: {'doc_size': 387, 'occurencies': 1, 'positions': [181]},
 307: {'doc_size': 331, 'occurencies': 1, 'positions': [90]},
 309: {'doc_size': 339,
  'occurencies': 8,
  'positions': [8, 39, 50, 97, 105, 119, 138, 197]},
 323: {'doc_size': 452, 'occurencies': 3, 'posit

# Create Search function

# Quering
## Single-word Queries - Function *sing_woq(ind,word)*
In which documents does a given word occur?
To do that, we loop through the item in the word's information and catch all values for tag *id_doc*. These values append the returned list wich is initialized to an empty list. If the word doesn't exist in the indexe, we return [].

In [68]:
def sing_woq(query, index):
    query = cleanText(query)
    if len(query)>0 and len(list(index[query[0]].keys())):
        return list(index[query[0]].keys())
    else:
        #print("No result for specified key!!!")
        return None

In [69]:
query1 = sing_woq("tournament", indexe_pkl)
query1

[492,
 650,
 660,
 662,
 665,
 666,
 667,
 671,
 674,
 698,
 1715,
 1967,
 1992,
 2171,
 2197]

In [73]:
sing_woq("the", indexe_pkl)

## Free-text Queries
* Which documents contain at least one word from a given list of words?

We use the previous founction on the list of words containing the text cleaned from stopwords. The returned object is the union of several results of the previous functions.

In [74]:
def free_tq(query, index):
    list_words=query.split()
    res = [id_doc for word in list_words if sing_woq(word, index) for id_doc in sing_woq(word, index) ]
    if len(res)>0:
        return sorted(list(set(res)))
    else:
        print("No match found!!!")
        return None
    
    

In [75]:
free_tq("tournament ", inv_index)==sing_woq("tournament",inv_index)

True

In [76]:
free_tq("The british tournament ", inv_index)

[2,
 7,
 34,
 52,
 55,
 56,
 57,
 58,
 59,
 60,
 62,
 63,
 66,
 67,
 68,
 72,
 73,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 86,
 88,
 90,
 93,
 94,
 96,
 97,
 151,
 154,
 155,
 156,
 157,
 160,
 163,
 169,
 170,
 174,
 175,
 176,
 177,
 178,
 179,
 181,
 182,
 183,
 186,
 189,
 190,
 191,
 193,
 194,
 196,
 199,
 201,
 202,
 203,
 204,
 205,
 206,
 209,
 211,
 212,
 213,
 215,
 216,
 217,
 218,
 219,
 221,
 222,
 224,
 225,
 226,
 227,
 228,
 233,
 234,
 235,
 237,
 238,
 239,
 240,
 241,
 243,
 244,
 245,
 246,
 247,
 248,
 249,
 250,
 251,
 254,
 255,
 257,
 258,
 260,
 261,
 263,
 264,
 265,
 266,
 267,
 268,
 270,
 272,
 273,
 274,
 275,
 276,
 278,
 280,
 283,
 284,
 285,
 289,
 290,
 291,
 293,
 294,
 295,
 296,
 298,
 299,
 300,
 312,
 323,
 331,
 340,
 343,
 349,
 354,
 401,
 402,
 403,
 406,
 408,
 410,
 412,
 413,
 414,
 416,
 417,
 418,
 419,
 421,
 423,
 425,
 426,
 429,
 432,
 433,
 434,
 436,
 437,
 439,
 440,
 441,
 442,
 445,
 450,
 451,
 452,
 453,
 455,
 456,
 45

## Phrase Queries
* Which documents contain a given phrase in the same order?


1. We're looking for the texts belonging to the intersection of sing_woq for every word in the query text.
2. The we check whether they are in correct order or not.

In [77]:
def phrase_query(query, index):
    list_words = cleanText(query) #cleaning query
    res = set()
    final_result = []

    l=[set(sing_woq(word,index)) for word in list_words]
    ids=list(set.intersection(*l)) 
        
    #we use set because list does't recognize intersection as method
    #ids=list(set.instersection(*l)) #gets id of documents containing all words in the query
    
    
    #Check whether terms are in correct order
    if len(ids) > 0: # If there is at least one document conatins all words in the query
        
        for id_doc in ids: # getting positions of words in documents
            posMinusOne = []
            list_pos=[index[word][id_doc]["positions"] for word in list_words] 
#             for word in list_words:
#                 list_pos=[item[id_doc]["positions"] for item in index[word]] 
            for i, lpos in enumerate(list_pos) : #Check whether terms are in correct order
                posMinusOne.append( set([x-i for x in lpos]) )
                
            if (len(set.intersection(*posMinusOne)) > 0):
                final_result.append(id_doc)
                
    return final_result


In [80]:
query3 = phrase_query("who will head the enlarged group.\nAdvance Bank shareholders would be offered a combination of A$2.10 in cash, a 20 cent special cash dividend and new St George shares up to a value of A$5.00", inv_index)
query3

[577]

In [79]:
list_text[577]

'Australian regional banks St George Bank Ltd and Advance Bank Australia Ltd on Monday unveiled a merger plan to create the nation\'s fifth largest bank with a market value of A$4.5 billion (US$3.55 billion).\n"We have that golden opportunity, we are not going to miss it," St George Bank managing director Jim Sweeney told reporters at a joint news conference. This is the fifth merger St George has tried with different banks in the past two years.\n"We are going to build a special new different bank here that will take advantage of all of things that many of us always want to do," said Sweeney, who will head the enlarged group.\nAdvance Bank shareholders would be offered a combination of A$2.10 in cash, a 20 cent special cash dividend and new St George shares up to a value of A$5.00 per Advance Bank share.\nThe total value of the offer was A$7.30 per Advance share, which values it at A$2.65 billion. St George would fund the offer by a mixture of cash and scrip and a new capital raising.

<h1 style="color:red; text-align:center">Ordering Queries</h1>

### Ordering single word queries by term frequency in the documents

In [81]:
def sing_wq_TF(query, index):
    query = cleanText(query)
    if len(query)>0 and len(list(index[query[0]].keys())): ##Check if the query is contained in the index
        tmp = list(index[query[0]].items()) ###Retrieve list of tuples (doc_Id, {docsize, occurencies, positions})
        ##sort this list of tuple by descending occurencies of the word in the doc
        tmp = sorted( list(map(lambda x:(x[0], x[1]["occurencies"]), tmp)), key=lambda a:a[1], reverse=True)
        ##Finally return the list of ordered doc_id's
        return list(map(lambda x:x[0], tmp))
    else :
        print("No result for specified key!!!")
        return None

In [82]:
sing_wq_TF("china", inv_index)

[1407,
 774,
 2342,
 2409,
 2449,
 871,
 1430,
 1447,
 1168,
 1762,
 799,
 1404,
 1751,
 1791,
 2061,
 2068,
 2445,
 729,
 775,
 1405,
 1406,
 1416,
 1428,
 1438,
 1443,
 1759,
 2428,
 454,
 863,
 890,
 1182,
 1421,
 1439,
 1444,
 1602,
 1620,
 1770,
 1795,
 1796,
 1799,
 2051,
 2422,
 462,
 464,
 1161,
 1169,
 1174,
 1194,
 1429,
 1449,
 1629,
 1766,
 1767,
 2060,
 2070,
 2311,
 2313,
 860,
 895,
 1170,
 1173,
 1186,
 1409,
 1418,
 1442,
 1600,
 1608,
 1610,
 1758,
 1772,
 1773,
 1784,
 1789,
 2052,
 2424,
 2431,
 452,
 474,
 740,
 771,
 797,
 850,
 866,
 872,
 886,
 888,
 889,
 1166,
 1175,
 1178,
 1189,
 1191,
 1199,
 1400,
 1417,
 1425,
 1426,
 1604,
 1630,
 1633,
 1753,
 1755,
 1756,
 1768,
 1798,
 2067,
 2079,
 2082,
 2317,
 2412,
 2415,
 2439,
 2441,
 2442,
 309,
 459,
 485,
 765,
 769,
 776,
 789,
 856,
 862,
 869,
 875,
 882,
 893,
 897,
 899,
 1155,
 1157,
 1180,
 1184,
 1408,
 1410,
 1423,
 1440,
 1613,
 1624,
 1754,
 1757,
 1765,
 1775,
 2085,
 2096,
 2332,
 2420,
 2429,
 4

In [86]:
####Check the validity of the query output
list_text[1407].count("China"), list_text[774].count("China"), list_text[2342].count("China")

(26, 24, 21)

### Ordering single word queries by weighted term frequency (normalized by the doc size)

In [110]:
def sing_wq_TF_weighted(query, index):
    query = cleanText(query) ##Pre-process the query
    
    if len(query)>0 and len(list(index[query[0]].keys())): ##Check if the query is contained in the index
        tmp = list(index[query[0]].items())
        tmp = sorted( list(map(lambda x:(x[0], x[1]["occurencies"]/x[1]["doc_size"]), tmp)),
                     key=lambda a:a[1], reverse=True) ###The same processing as in the absolute frequency query
                                                    ###but here we just add the division by the doc_size to normalize
        return list(map(lambda x:x[0], tmp))
    
    else :
        print("No result for specified key!!!")
        return None

In [111]:
sing_wq_TF_weighted("china", inv_index)

[2409,
 2449,
 871,
 740,
 454,
 1168,
 729,
 1406,
 1447,
 863,
 775,
 1762,
 799,
 2068,
 1182,
 1407,
 2428,
 1791,
 1602,
 1620,
 774,
 459,
 1194,
 1429,
 2445,
 1751,
 2422,
 2051,
 1169,
 1421,
 1439,
 872,
 1174,
 1633,
 1404,
 1430,
 2317,
 1770,
 1796,
 1405,
 2060,
 1187,
 2342,
 1184,
 2061,
 1191,
 2313,
 2070,
 1428,
 1155,
 1630,
 1189,
 1438,
 1443,
 1795,
 2052,
 1759,
 1170,
 1180,
 888,
 1157,
 481,
 462,
 474,
 1648,
 1629,
 1175,
 485,
 1418,
 1608,
 2431,
 1444,
 1610,
 1177,
 1186,
 890,
 1425,
 2442,
 1417,
 1799,
 797,
 895,
 1166,
 2311,
 869,
 1161,
 1183,
 1767,
 1625,
 2085,
 452,
 1449,
 1156,
 2424,
 1604,
 488,
 1640,
 2096,
 1789,
 2079,
 2067,
 852,
 771,
 1415,
 860,
 1426,
 1173,
 886,
 1772,
 850,
 495,
 856,
 1768,
 1758,
 457,
 889,
 1600,
 776,
 466,
 1402,
 789,
 866,
 1612,
 2412,
 1792,
 1624,
 1613,
 1153,
 498,
 2420,
 1798,
 1773,
 2153,
 2348,
 893,
 464,
 896,
 1400,
 2415,
 1784,
 2321,
 1757,
 1753,
 1408,
 2332,
 2082,
 1766,
 741,
 85

In [114]:
((list_processed_texts[2409].count("china")/len(list_processed_texts[2409])),
(list_processed_texts[2449].count("china")/len(list_processed_texts[2449])),
((list_processed_texts[871].count("china")/len(list_processed_texts[871]))))

(0.058823529411764705, 0.05847953216374269, 0.05714285714285714)

### ordered queries weighted by tf_idf 

In [118]:
def sing_wq_tf_idf(query, index):
    query = cleanText(query)
    
    if len(query)>0 and len(list(index[query[0]].keys())): ##Check if the query is contained in the index
        tmp = list(index[query[0]].items())
        N = len(tmp) ### Number of documents in which the query appears
        D = len(index) ##size of our index
        tmp = sorted( list(map(lambda x:(x[0], 
                                         (x[1]["occurencies"]/x[1]["doc_size"])*np.log(D/(1+N)),
                                        ), tmp)),
                     key=lambda a:a[1], reverse=True)
        return list(map(lambda x:x[0], tmp))
    
    else :
        print("No result for specified key!!!")
        return None

In [119]:
sing_wq_tf_idf("economy", inv_index)

[776,
 2145,
 732,
 37,
 15,
 769,
 765,
 325,
 779,
 853,
 875,
 897,
 1192,
 2089,
 2237,
 2409,
 2449,
 1158,
 2341,
 2442,
 1278,
 797,
 2223,
 2315,
 338,
 1272,
 2172,
 2331,
 2139,
 2408,
 2433,
 2436,
 606,
 1161,
 487,
 1270,
 744,
 1932,
 2115,
 2195,
 1078,
 2249,
 2494,
 2212,
 1085,
 520,
 774,
 1435,
 794,
 786,
 1318,
 2332,
 2319,
 597,
 585,
 643,
 2171,
 2197,
 96,
 711,
 668,
 550,
 640,
 2443,
 2240,
 1491,
 1256,
 2439,
 926,
 573,
 687,
 1347,
 2446,
 1940,
 599,
 482,
 1284,
 882,
 652,
 2358,
 602,
 560,
 83,
 1518,
 913,
 1020,
 2376,
 2175,
 1004,
 2317,
 1615,
 2186,
 1088,
 1056,
 2493,
 1153,
 322,
 1045,
 1936,
 2384,
 2085,
 472,
 499,
 1175,
 880,
 932,
 161,
 166,
 290,
 1031,
 1086,
 336,
 856,
 1074,
 2040,
 1311,
 2206,
 1169,
 1014,
 180,
 631,
 929,
 789,
 795,
 2342,
 1179,
 1917,
 1304,
 317,
 515,
 1346,
 2232,
 636,
 1602,
 1620,
 2369,
 1470,
 601,
 2146,
 871,
 276,
 893,
 1165,
 1925,
 2125,
 2422,
 780,
 1268,
 2343,
 491,
 771,
 1295,
 224

<h1 style="color:red; text-align:center">Ordering for free Text queries</h1>

In [122]:
voc = {}
voc.setdefault("a",1)
voc

{'a': 1}

In [None]:
def search_v2_sorted(query, index):
    #item_list = re.findall("[a-z0-9]+", query)
    item_list = list(map(str.lower, re.findall("[a-z0-9]+", query)))
    res = []
    for item in item_list:
        tmp = search_v1(item, index)
        if tmp:
            res.extend(tmp)
    return list(set(res))

<h1> Bag of Words</h1>

In [None]:
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, OrderedDict
Corpus = list_vocab
v = DictVectorizer()

v.fit([OrderedDict.fromkeys(Corpus, 1)])
X = v.transform(Counter(f) for f in (clean_Docs))

#print(type(X))
#print(X.A)


In [None]:
print(X.A)

In [None]:
np.where(X.A[0]==1)

In [None]:
import numpy as np
from scipy.sparse import csr_matrix

docs = [["hello", "world", "hello", "alka"], ["goodbye", "cruel", "world"]]
indptr = [0]
indices = []
data = []
vocabulary = {}
for d in docs:
    for term in d:
        index = vocabulary.setdefault(term, len(vocabulary))
        indices.append(index)
        data.append(1)
    indptr.append(len(indices))

#csr_matrix((data, indices, indptr), dtype=int).toarray()

In [None]:
vocabulary, data, indptr