# INDEXATION WEB : Python rev

## Import des modules & global variables

In [477]:
from collections import Counter
import numpy as np
import re
import nltk
############################
# nltk.download("stopwords")##
# nltk.download("wordnet")
############################
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
stops = set(stopwords.words("english"))
import os
import time
from collections import defaultdict
import pickle
import gzip
import operator 
from multiprocessing import Pool
import glob
from functools import reduce
import warnings
from sklearn.preprocessing import normalize
lem = WordNetLemmatizer()

In [5]:
#####
# nltk.download("stopwords")
# nltk.download("wordnet")
# lem = WordNetLemmatizer()

<h1 style="color:blue; text-align:center">1. TEXT DATA LOADING AND CLEANING</h1>

<h3 style="color:blue">First, get the list of paths to all data files and then read them</h3>

In [7]:
list_rep=glob.glob("data/**/*.txt",recursive=True)
def readText(path):
    with open(path, "r") as f:
        return f.read()

In [8]:
start = time.time()
with Pool() as pool:
    list_text = pool.map(readText, list_rep)
end = time.time()
print(end - start)

0.27991366386413574


In [9]:
#Loading data in a list
start=time.time()
list_text = []
for path in list_rep:
    with open(path, 'r') as f:
        text = f.read()
        list_text.append(text)
end=time.time()
end-start

0.05375790596008301

<h3 style="color:blue">Pre-processing</h3>

In [10]:
def cleanText(text, stopwords=stops, lem = lem):
    
    """Take a text file and put it in lower case
    then remove stopwords,
    and apply stemming to each word 
    """
    text = re.findall("[a-zA-z0-9]+", text.lower())
    return [lem.lemmatize(elt) for elt in text if elt not in stops]

In [11]:
start = time.time()
with Pool() as pool:
    list_processed_texts = pool.map(cleanText, list_text)
end = time.time()
end-start

3.7869677543640137

In [12]:
list_processed_texts[62]

['uk',
 'investment',
 'capital',
 'group',
 '3i',
 'group',
 'plc',
 'said',
 'thursday',
 'increase',
 'first',
 'half',
 'net',
 'asset',
 'value',
 'nav',
 'per',
 'share',
 'healthy',
 'partly',
 'held',
 'back',
 'continental',
 'europe',
 'strong',
 'pound',
 'pretty',
 'healthy',
 'increase',
 'held',
 'back',
 'bit',
 'continental',
 'europe',
 'market',
 'good',
 'also',
 'appreciation',
 'sterling',
 'ewen',
 'macpherson',
 '3i',
 'chief',
 'executive',
 'told',
 'reuters',
 '3i',
 'nav',
 'period',
 'rose',
 '454',
 'penny',
 'per',
 'share',
 '426',
 'penny',
 'period',
 'previous',
 'year',
 'macpherson',
 'said',
 'better',
 'indication',
 'group',
 'performance',
 'uk',
 'portfolio',
 'performed',
 'extremely',
 'well',
 'reflected',
 '18',
 'percent',
 'growth',
 'revenue',
 'said',
 'still',
 'plenty',
 'opportunity',
 'available',
 'sort',
 'company',
 '3i',
 'invested',
 'well',
 'also',
 'improvement',
 'general',
 'business',
 'confidence',
 'uk',
 'improved',
 'f

In [13]:
start = time.time()
list_processed_texts = list(map(cleanText, list_text))
end = time.time()
end-start
####Slower than map with pool

4.019930601119995

In [14]:
###Compare the length of list of original texts with list of processed texts
len(list_text), len(list_processed_texts)

(2500, 2500)

In [None]:
list_processed_texts[0]

<h3 style="color:blue">Building the Vocabulary </h3>

In [15]:
def vocabOneFile(termlist):
    """
        this function takes a list of words and return a dictionary
        with frequency for each word
    """
    return Counter(termlist)

def vocabGlobal(my_list):
    myCounter = Counter()
    for text in my_list:
        myCounter += Counter(text)
    return dict(myCounter)

In [16]:
start = time.time()
with Pool() as pool:
    vocabulary = pool.map(vocabOneFile, list_processed_texts)
vocabulary = dict(reduce(lambda x,y:x+y, vocabulary))
end = time.time()
end-start

19.856359243392944

In [18]:
start = time.time()
global_vocabulary = vocabGlobal(list_processed_texts)
end = time.time()
end-start
####faster than map with pool

1.7564656734466553

In [19]:
####Vocabulary size with the two methods
len(vocabulary), len(global_vocabulary)

(26058, 26058)

# Building the index

In [20]:
#input : doc = [word1, word2, ...]
#output = {word1: [pos1, pos2], word2: [pos2, pos434], ...}
def index_one_file(termlist):
    fileIndex = defaultdict(list)
    for index, word in enumerate(termlist):
        fileIndex[word].append(index)
    return dict(fileIndex)

In [31]:
#With map
def joindic(x,y): x.update(y); return x ###Fonction needed in the case of building index with the mapReduce technique
s=time.time()
with Pool(8) as pool:
    dic1=pool.map(index_one_file,list_processed_texts)
#dic1=reduce(joindic,dic1[1:],dic1[0]) ##Concatening the list of single dicts obtained from map step
dic1={k:dic1[k] for k in range(len(dic1))}
e=time.time()
e-s

1.0974690914154053

In [138]:
####Index with dictionary comprehension : two times Faster than mapReduce
e=time.time()
index={k:index_one_file(list_processed_texts[k]) for k in range(len(list_processed_texts))}
s=time.time()
s-e

0.22814726829528809

In [33]:
dic1[0]

{'0003': [217],
 '0840': [165],
 '10': [4],
 '110': [40],
 '13': [65],
 '14': [9],
 '15': [66, 102],
 '150': [90],
 '160': [91],
 '1995': [47],
 '2': [103, 176, 183, 215],
 '2423': [216],
 '26': [162, 168],
 '3': [128],
 '38': [127],
 '4': [114],
 '42': [214],
 '5': [5, 10, 110],
 '6': [113],
 '7': [41, 50],
 '80': [184],
 '87': [177],
 '9': [111],
 '939': [163],
 '945': [169],
 '95': [49],
 'accounting': [126],
 'added': [118, 211],
 'allowing': [141],
 'also': [147],
 'analyst': [22, 61, 132],
 'august': [12, 107, 117],
 'average': [63],
 'bank': [180],
 'basket': [175],
 'bear': [191],
 'benefit': [206],
 'better': [198],
 'billion': [6, 11, 42, 51, 67, 92],
 'buoyed': [13],
 'bureau': [27],
 'central': [179],
 'certainly': [209],
 'competitive': [143],
 'continued': [125],
 'crown': [7, 43, 52, 68, 93, 146, 160, 205],
 'csu': [28, 97],
 'currency': [174],
 'czech': [1, 25],
 'data': [29],
 'dealer': [203],
 'deficit': [3, 48, 58, 94, 196],
 'deviation': [172],
 'digested': [158],
 

In [139]:
index[0]

{'0003': [217],
 '0840': [165],
 '10': [4],
 '110': [40],
 '13': [65],
 '14': [9],
 '15': [66, 102],
 '150': [90],
 '160': [91],
 '1995': [47],
 '2': [103, 176, 183, 215],
 '2423': [216],
 '26': [162, 168],
 '3': [128],
 '38': [127],
 '4': [114],
 '42': [214],
 '5': [5, 10, 110],
 '6': [113],
 '7': [41, 50],
 '80': [184],
 '87': [177],
 '9': [111],
 '939': [163],
 '945': [169],
 '95': [49],
 'accounting': [126],
 'added': [118, 211],
 'allowing': [141],
 'also': [147],
 'analyst': [22, 61, 132],
 'august': [12, 107, 117],
 'average': [63],
 'bank': [180],
 'basket': [175],
 'bear': [191],
 'benefit': [206],
 'better': [198],
 'billion': [6, 11, 42, 51, 67, 92],
 'buoyed': [13],
 'bureau': [27],
 'central': [179],
 'certainly': [209],
 'competitive': [143],
 'continued': [125],
 'crown': [7, 43, 52, 68, 93, 146, 160, 205],
 'csu': [28, 97],
 'currency': [174],
 'czech': [1, 25],
 'data': [29],
 'dealer': [203],
 'deficit': [3, 48, 58, 94, 196],
 'deviation': [172],
 'digested': [158],
 

In [140]:
####Verifiy the length of the two indexes
len(index.keys()), len(dic1.keys())
#len(dic1)

(2500, 2500)

# Building the Inverted Index

In [36]:
#input = {doc_id: {word: [pos1, pos2, ...], ... }}
#res = {word1: {doc_id:{doc_size,[pos1, pos2],freq},....}, ..., ...}
def inverted_index(index):
    inv_index = defaultdict(dict)
    for doc_id in index.keys():
        doc_size=sum([len(list_pos) for list_pos in index[doc_id].values()]) #computes doc_size from list of positions in indexe
        for word in index[doc_id].keys():
#             inv_index[word][doc_id]={'doc_size':len(index[doc_id]),
#                               'positions':index[doc_id][word],
#                               'occurencies':len(index[doc_id][word])}
            inv_index[word][doc_id]={'doc_size':doc_size,
                              'positions':index[doc_id][word],
                              'occurencies':len(index[doc_id][word])}
    return inv_index

In [141]:
t=0
s=time.time()
for i in range(10): 
    inv_index=inverted_index(index)
e=time.time()
(e-s)/10

0.7802499055862426

In [38]:
inv_index["china"]

{62: {'doc_size': 279, 'occurencies': 1, 'positions': [272]},
 97: {'doc_size': 267, 'occurencies': 1, 'positions': [263]},
 150: {'doc_size': 341,
  'occurencies': 9,
  'positions': [0, 13, 28, 172, 198, 259, 277, 290, 306]},
 151: {'doc_size': 309,
  'occurencies': 7,
  'positions': [16, 48, 60, 80, 108, 119, 198]},
 152: {'doc_size': 365,
  'occurencies': 6,
  'positions': [0, 51, 86, 110, 135, 265]},
 153: {'doc_size': 375,
  'occurencies': 8,
  'positions': [26, 37, 69, 143, 159, 180, 189, 290]},
 154: {'doc_size': 358, 'occurencies': 3, 'positions': [0, 271, 346]},
 155: {'doc_size': 351,
  'occurencies': 7,
  'positions': [64, 84, 222, 242, 256, 291, 307]},
 156: {'doc_size': 309, 'occurencies': 2, 'positions': [0, 45]},
 157: {'doc_size': 267, 'occurencies': 5, 'positions': [7, 58, 92, 136, 204]},
 158: {'doc_size': 292,
  'occurencies': 7,
  'positions': [0, 93, 123, 174, 243, 250, 260]},
 159: {'doc_size': 373,
  'occurencies': 8,
  'positions': [0, 77, 108, 179, 199, 223, 26

In [41]:
list_processed_texts[62].count("china")
len(list_processed_texts[62])

279

In [43]:
##create index from list of list_processed_texts
def create_index(data):
    index = defaultdict(list)
    res={}
    
    for i, words in enumerate(data):
        for word in words:
            index[word].append(i)
    index_new = {}
    for word in index.keys():
        index_new[word] = dict( Counter( index[word] ) )
        for doc_num in index_new[word].keys():
            pos = np.array((np.where(np.array(data[doc_num])==word))).tolist()
            index_new[word][doc_num] = {"occurencies":index_new[word][doc_num], "positions":pos[0], "doc_size":len(data[doc_num])}
        res[word] = index_new[word]
            
    return res

In [44]:
start = time.time()
test_Index = create_index(list_processed_texts)
end = time.time()
end-start
#####Slower than using the normal index

16.567843675613403

# Serialize data with gzip

In [47]:
#store the object
with gzip.open("Index_Articles.pklz", "wb") as fout:
    pickle.dump(inv_index,fout)
    

#restore the object
with gzip.open("Index_Articles.pklz", "rb") as fin:
    indexe_pkl= pickle.load(fin)

In [48]:
indexe_pkl["china"]

{62: {'doc_size': 279, 'occurencies': 1, 'positions': [272]},
 97: {'doc_size': 267, 'occurencies': 1, 'positions': [263]},
 150: {'doc_size': 341,
  'occurencies': 9,
  'positions': [0, 13, 28, 172, 198, 259, 277, 290, 306]},
 151: {'doc_size': 309,
  'occurencies': 7,
  'positions': [16, 48, 60, 80, 108, 119, 198]},
 152: {'doc_size': 365,
  'occurencies': 6,
  'positions': [0, 51, 86, 110, 135, 265]},
 153: {'doc_size': 375,
  'occurencies': 8,
  'positions': [26, 37, 69, 143, 159, 180, 189, 290]},
 154: {'doc_size': 358, 'occurencies': 3, 'positions': [0, 271, 346]},
 155: {'doc_size': 351,
  'occurencies': 7,
  'positions': [64, 84, 222, 242, 256, 291, 307]},
 156: {'doc_size': 309, 'occurencies': 2, 'positions': [0, 45]},
 157: {'doc_size': 267, 'occurencies': 5, 'positions': [7, 58, 92, 136, 204]},
 158: {'doc_size': 292,
  'occurencies': 7,
  'positions': [0, 93, 123, 174, 243, 250, 260]},
 159: {'doc_size': 373,
  'occurencies': 8,
  'positions': [0, 77, 108, 179, 199, 223, 26

# Create Search functions

# Quering
## Single-word Queries - Function *sing_woq(ind,word)*
In which documents does a given word occur?
To do that, we loop through the item in the word's information and catch all values for tag *id_doc*. These values append the returned list wich is initialized to an empty list. If the word doesn't exist in the indexe, we return [].

In [68]:
def sing_woq(query, index):
    query = cleanText(query)
    if len(query)>0 and query[0] in index.keys():
        return list(index[query[0]].keys())
    else:
        #print("No result for specified key!!!")
        return None

In [69]:
query1 = sing_woq("tournament", indexe_pkl)
query1

[5, 16, 17, 22, 25, 26, 29, 37, 42, 846, 924, 1005, 1020, 1930, 1945]

In [67]:
sing_woq("the", indexe_pkl)

## Free-text Queries
* Which documents contain at least one word from a given list of words?

We use the previous founction on the list of words containing the text cleaned from stopwords. The returned object is the union of several results of the previous functions.

In [52]:
def free_tq(query, index):
    list_words=query.split()
    res = [id_doc for word in list_words if sing_woq(word, index) for id_doc in sing_woq(word, index) ]
    if len(res)>0:
        return sorted(list(set(res)))
    else:
        print("No match found!!!")
        return None
    
    

In [53]:
free_tq("tournament ", inv_index)==sing_woq("tournament",inv_index)

True

In [54]:
free_tq("The british tournament ", inv_index)

[1,
 5,
 16,
 17,
 22,
 25,
 26,
 29,
 37,
 42,
 50,
 51,
 52,
 53,
 55,
 56,
 57,
 60,
 61,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 73,
 75,
 76,
 77,
 78,
 79,
 82,
 84,
 85,
 86,
 87,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 99,
 100,
 103,
 104,
 105,
 106,
 108,
 110,
 111,
 112,
 115,
 117,
 118,
 119,
 125,
 126,
 127,
 129,
 131,
 134,
 135,
 136,
 138,
 139,
 144,
 146,
 147,
 149,
 150,
 151,
 168,
 169,
 213,
 215,
 217,
 230,
 239,
 243,
 254,
 256,
 257,
 272,
 334,
 401,
 404,
 405,
 406,
 409,
 410,
 411,
 414,
 415,
 416,
 417,
 419,
 421,
 422,
 425,
 427,
 428,
 430,
 432,
 434,
 438,
 447,
 450,
 451,
 453,
 454,
 455,
 456,
 457,
 458,
 459,
 460,
 461,
 463,
 465,
 466,
 467,
 468,
 469,
 471,
 472,
 473,
 474,
 475,
 476,
 477,
 478,
 479,
 480,
 481,
 482,
 483,
 484,
 486,
 487,
 488,
 490,
 491,
 493,
 494,
 495,
 496,
 497,
 498,
 499,
 503,
 504,
 520,
 522,
 524,
 526,
 528,
 531,
 532,
 535,
 537,
 538,
 542,
 543,
 544,
 548,
 554,
 556,
 557,
 565,
 566,

## Phrase Queries
* Which documents contain a given phrase in the same order?


1. We're looking for the texts belonging to the intersection of sing_woq for every word in the query text.
2. The we check whether they are in correct order or not.

In [55]:
def phrase_query(query, index):
    list_words = cleanText(query) #cleaning query
    res = set()
    final_result = []

    l=[set(sing_woq(word,index)) for word in list_words]
    ids=list(set.intersection(*l)) 
        
    #we use set because list does't recognize intersection as method
    #ids=list(set.instersection(*l)) #gets id of documents containing all words in the query
    
    
    #Check whether terms are in correct order
    if len(ids) > 0: # If there is at least one document conatins all words in the query
        
        for id_doc in ids: # getting positions of words in documents
            posMinusOne = []
            list_pos=[index[word][id_doc]["positions"] for word in list_words] 
#             for word in list_words:
#                 list_pos=[item[id_doc]["positions"] for item in index[word]] 
            for i, lpos in enumerate(list_pos) : #Check whether terms are in correct order
                posMinusOne.append( set([x-i for x in lpos]) )
                
            if (len(set.intersection(*posMinusOne)) > 0):
                final_result.append(id_doc)
                
    return final_result


In [56]:
query3 = phrase_query("who will head the enlarged group.\nAdvance Bank shareholders would be offered a combination of A$2.10 in cash, a 20 cent special cash dividend and new St George shares up to a value of A$5.00", inv_index)
query3

[533]

In [58]:
list_text[533]

'Australian regional banks St George Bank Ltd and Advance Bank Australia Ltd on Monday unveiled a merger plan to create the nation\'s fifth largest bank with a market value of A$4.5 billion (US$3.55 billion).\n"We have that golden opportunity, we are not going to miss it," St George Bank managing director Jim Sweeney told reporters at a joint news conference. This is the fifth merger St George has tried with different banks in the past two years.\n"We are going to build a special new different bank here that will take advantage of all of things that many of us always want to do," said Sweeney, who will head the enlarged group.\nAdvance Bank shareholders would be offered a combination of A$2.10 in cash, a 20 cent special cash dividend and new St George shares up to a value of A$5.00 per Advance Bank share.\nThe total value of the offer was A$7.30 per Advance share, which values it at A$2.65 billion. St George would fund the offer by a mixture of cash and scrip and a new capital raising.

<h1 style="color:red; text-align:center">Ordering Queries</h1>

### Ordering single word queries by term frequency in the documents

In [105]:
def sing_wq_TF(query, index):
    query = cleanText(query)
    if len(query)>0 and len(list(index[query[0]].keys())): ##Check if the query is contained in the index
        tmp = list(index[query[0]].items()) ###Retrieve list of tuples (doc_Id, {docsize, occurencies, positions})
        ##sort this list of tuple by descending occurencies of the word in the doc
        tmp = sorted( list(map(lambda x:(x[0], x[1]["occurencies"]), tmp)), key=lambda a:a[1], reverse=True)
        ##Finally return the list of ordered doc_id's
        return list(map(lambda x:x[0], tmp))
    else :
        print("No result for specified key!!!")
        return None
sing_wq_TF("french", inv_index)

[1025,
 1002,
 1030,
 1040,
 1044,
 1429,
 1434,
 1003,
 1008,
 1014,
 1017,
 1026,
 1031,
 1041,
 1433,
 1022,
 1032,
 1033,
 1036,
 1037,
 140,
 1000,
 1012,
 1013,
 1015,
 1018,
 1027,
 1048,
 1049,
 1411,
 1417,
 1425,
 1449,
 557,
 573,
 1028,
 1038,
 1046,
 1410,
 1413,
 1418,
 1421,
 1423,
 1430,
 1431,
 1443,
 577,
 599,
 1005,
 1009,
 1010,
 1020,
 1034,
 1400,
 1406,
 1407,
 1415,
 1420,
 1424,
 1432,
 1439,
 1445,
 1447,
 133,
 143,
 148,
 565,
 584,
 1011,
 1016,
 1019,
 1021,
 1023,
 1024,
 1029,
 1039,
 1045,
 1047,
 1402,
 1404,
 1408,
 1412,
 1422,
 1428,
 1442,
 1444,
 1446,
 1448,
 1450,
 1459,
 2095,
 2256,
 2278,
 2287,
 72,
 144,
 216,
 221,
 462,
 465,
 490,
 495,
 496,
 536,
 556,
 572,
 576,
 580,
 597,
 660,
 672,
 673,
 685,
 699,
 765,
 812,
 955,
 1004,
 1006,
 1035,
 1042,
 1043,
 1054,
 1161,
 1181,
 1196,
 1341,
 1347,
 1375,
 1389,
 1401,
 1403,
 1405,
 1416,
 1426,
 1427,
 1435,
 1436,
 1437,
 1438,
 1440,
 1463,
 1480,
 1564,
 1578,
 1590,
 1592,
 1681

In [107]:
t1=[]
for i in range(100):
    s=time.time()
    sing_wq_TF("french", inv_index)
    e=time.time()
    t1.append(e-s)
t1=sum(t1)/100
t1

9.291410446166992e-05

In [106]:
def sing_woq_TF(ind,query):
    query = cleanText(query)
    if len(query)>0 and query[0] in ind.keys():
        word=query[0]
        d={doc:ind[word][doc]["occurencies"] for doc in ind[word].keys()} # A dict containing the document id as key and the frequency of the word in the document
        
        sorted_d = sorted(d.items(), key=operator.itemgetter(1),reverse=True) #Sorted list of tuples (doc_ids,frequency) in descending order
        return sorted_d
    
    else:
        return []

sing_woq_TF(inv_index,"french")

[(1025, 13),
 (1002, 11),
 (1030, 10),
 (1040, 10),
 (1044, 9),
 (1429, 8),
 (1434, 8),
 (1003, 7),
 (1008, 7),
 (1014, 7),
 (1017, 7),
 (1026, 7),
 (1031, 7),
 (1041, 7),
 (1433, 7),
 (1022, 6),
 (1032, 6),
 (1033, 6),
 (1036, 6),
 (1037, 6),
 (140, 5),
 (1000, 5),
 (1012, 5),
 (1013, 5),
 (1015, 5),
 (1018, 5),
 (1027, 5),
 (1048, 5),
 (1049, 5),
 (1411, 5),
 (1417, 5),
 (1425, 5),
 (1449, 5),
 (557, 4),
 (573, 4),
 (1028, 4),
 (1038, 4),
 (1046, 4),
 (1410, 4),
 (1413, 4),
 (1418, 4),
 (1421, 4),
 (1423, 4),
 (1430, 4),
 (1431, 4),
 (1443, 4),
 (577, 3),
 (599, 3),
 (1005, 3),
 (1009, 3),
 (1010, 3),
 (1020, 3),
 (1034, 3),
 (1400, 3),
 (1406, 3),
 (1407, 3),
 (1415, 3),
 (1420, 3),
 (1424, 3),
 (1432, 3),
 (1439, 3),
 (1445, 3),
 (1447, 3),
 (133, 2),
 (143, 2),
 (148, 2),
 (565, 2),
 (584, 2),
 (1011, 2),
 (1016, 2),
 (1019, 2),
 (1021, 2),
 (1023, 2),
 (1024, 2),
 (1029, 2),
 (1039, 2),
 (1045, 2),
 (1047, 2),
 (1402, 2),
 (1404, 2),
 (1408, 2),
 (1412, 2),
 (1422, 2),
 (1428, 2)

In [102]:
t2=[]
for i in range(100):
    s=time.time()
    sing_woq_TF(inv_index,"french")
    e=time.time()
    t2.append(e-s)
t2=sum(t2)/100

In [104]:
t1/t2

1.1847635157114047

In [80]:
l=sing_woq_TF(inv_index,"french")
[(x,y/20) for (x,y) in l]

True

In [86]:
####Check the validity of the query output
list_text[1407].count("China"), list_text[774].count("China"), list_text[2342].count("China")

(26, 24, 21)

### Ordering single word queries by weighted term frequency (normalized by the doc size)

In [110]:
def sing_wq_TF_weighted(query, index):
    query = cleanText(query) ##Pre-process the query
    
    if len(query)>0 and len(list(index[query[0]].keys())): ##Check if the query is contained in the index
        tmp = list(index[query[0]].items())
        tmp = sorted( list(map(lambda x:(x[0], x[1]["occurencies"]/x[1]["doc_size"]), tmp)),
                     key=lambda a:a[1], reverse=True) ###The same processing as in the absolute frequency query
                                                    ###but here we just add the division by the doc_size to normalize
        return list(map(lambda x:x[0], tmp))
    
    else:
        print("No result for specified key!!!")
        return None

In [111]:
def sing_woq_TF_weighted(ind,query):
    query = cleanText(query)
    if len(query)>0 and query[0] in ind.keys():
        word=query[0]
        d={doc:ind[word][doc]["occurencies"]/ind[word][doc]["doc_size"] for doc in ind[word].keys()} # A dict containing the document id as key and the frequency of the word in the document
        import operator 
        sorted_d = sorted(d.items(), key=operator.itemgetter(1),reverse=True) #Sorted list of tuples (doc_ids,frequency) in descending order
        return sorted_d
    
    else:
        print("No result for specified key!!!")
        return None

sing_woq_TF_weighted(inv_index,"french")

[(1025, 0.036312849162011177),
 (1002, 0.03571428571428571),
 (1030, 0.033112582781456956),
 (1040, 0.028735632183908046),
 (1022, 0.02702702702702703),
 (1044, 0.023872679045092837),
 (1434, 0.023255813953488372),
 (1033, 0.022813688212927757),
 (1014, 0.022653721682847898),
 (1429, 0.022284122562674095),
 (1008, 0.021604938271604937),
 (1026, 0.020588235294117647),
 (1041, 0.02005730659025788),
 (1037, 0.019672131147540985),
 (1028, 0.01904761904761905),
 (1433, 0.019021739130434784),
 (1017, 0.018867924528301886),
 (1031, 0.018666666666666668),
 (1003, 0.018617021276595744),
 (1027, 0.017123287671232876),
 (1013, 0.017064846416382253),
 (1000, 0.016835016835016835),
 (140, 0.01644736842105263),
 (1032, 0.01643835616438356),
 (1036, 0.015915119363395226),
 (1411, 0.015197568389057751),
 (1417, 0.015060240963855422),
 (1048, 0.015015015015015015),
 (1425, 0.014450867052023121),
 (1046, 0.014388489208633094),
 (1049, 0.014367816091954023),
 (1015, 0.014124293785310734),
 (1038, 0.01360

In [112]:
((list_processed_texts[1025].count("french")/len(list_processed_texts[1025])),
(list_processed_texts[1002].count("french")/len(list_processed_texts[1002])),
((list_processed_texts[1030].count("french")/len(list_processed_texts[1030]))))

(0.036312849162011177, 0.03571428571428571, 0.033112582781456956)

### ordered queries weighted by tf_idf 

In [115]:
def sing_wq_tf_idf(query, index):
    query = cleanText(query)
    
    if len(query)>0 and len(list(index[query[0]].keys())): ##Check if the query is contained in the index
        tmp = list(index[query[0]].items())
        N = len(tmp) ### Number of documents in which the query appears
        D = len(index) ##size of our index
        tmp = sorted( list(map(lambda x:(x[0], 
                                         (x[1]["occurencies"]/x[1]["doc_size"])*np.log(D/(1+N)),
                                        ), tmp)),
                     key=lambda a:a[1], reverse=True)
        return list(tmp)
    
    else :
        print("No result for specified key!!!")
        return None

In [116]:
sing_wq_tf_idf("economy", inv_index)

[(427, 0.092865052631146416),
 (756, 0.071893983647055223),
 (1785, 0.070586820308017856),
 (1153, 0.069274797625712689),
 (1166, 0.067911518080600267),
 (421, 0.067193223177824696),
 (404, 0.065309301032652034),
 (657, 0.065157064899708794),
 (401, 0.063993545883642561),
 (158, 0.063818221100399711),
 (159, 0.062449465688058696),
 (175, 0.062449465688058696),
 (244, 0.060112646971989404),
 (883, 0.060112646971989404),
 (2400, 0.059536487416347331),
 (2487, 0.054808589886225632),
 (2465, 0.054488071816715541),
 (868, 0.050638371090534554),
 (289, 0.049737332459030378),
 (2454, 0.047376916681313677),
 (1144, 0.047216859530363302),
 (418, 0.046899967855662868),
 (2429, 0.042740643489258524),
 (284, 0.042480821948290379),
 (687, 0.040985895662720054),
 (1110, 0.040161466726975681),
 (1928, 0.039547794060519344),
 (294, 0.038985189458821574),
 (764, 0.038715208922403148),
 (2470, 0.038608260831457283),
 (2460, 0.038501901986191564),
 (2490, 0.038290932660239821),
 (2005, 0.0378758547994242

<h1 style="color:red; text-align:center">Ordering for free Text queries</h1>

In [149]:
from scipy.sparse import csr_matrix
corpus=sorted(inv_index.keys()) #get words from texts in order 
corpus[-23:] #the last 23
corpus.index("zyrtec")
n,m=len(list_rep),len(corpus) #dimensions of the matrix

In [190]:
data,row,col=[],[],[]
for i in range(n): #loop through the rows (doc id)
    for word in index[i].keys():
        j=corpus.index(word)
        if j> m:
            print(word)
        data.append(len(index[i][word]))
        row.append(i)
        col.append(j)
   

In [360]:
def pre_sparse(doc_id=None,text=None):
    data = []; row = []; col = []
    
    if doc_id and not text:
        for word in index[doc_id].keys():
            j=corpus.index(word)
            data.append(len(index[doc_id][word]))
            row.append(doc_id)
            col.append(j)
        return [data, row, col]
    if text and not doc_id and len(cleanText(text))>0:
        text=cleanText(text)
        for word in text:
            if word in corpus:
                j=corpus.index(word)
                data.append(text.count(word))
                col.append(j)
                row.append(0)   
        return [data, row, col]
    if (doc_id and text):
        warnings.warn("This function requires one argument")
        return None
    if not doc_id and not text:
        warnings.warn("This function requires one argument")
        return None

In [376]:
def pre_sparse_doc(doc_id):
    data = []; row = []; col = []
    for word in index[doc_id].keys():
        j=corpus.index(word)
        data.append(len(index[doc_id][word]))
        row.append(doc_id)
        col.append(j)
    return [data, row, col]



def pre_sparse_text(text):
    data = []; row = []; col = []
    
    if text and len(cleanText(text))>0:
        text=cleanText(text)
        for word in text:
            if word in corpus:
                j=corpus.index(word)
                data.append(text.count(word))
                col.append(j)
                row.append(0)   
        return [data, row, col]
    else:
        warnings.warn("Review your text")
        return None


In [382]:
# csr_matrix(([1,2],([0,0],[2,3]))).todense()
pre_sparse_doc(doc_id=245)
pre_sparse_text("I'm british")

[[1], [0], [4345]]

In [434]:
def sparse(text=None,nrow=len(list_rep),ncol=len(corpus)): 
    #if thext is empty the function return sparse matrix for all documents and the corpus
    #nrow=0 if sparsing vector else number of documents
    if text:
        pre_sps=pre_sparse_text(text=text)
        res=csr_matrix((pre_sps[0], (pre_sps[1], pre_sps[2])),shape=(1,ncol))
        return res, normalize(res,norm="l1",axis=1) #normalize is from sklearn
    
    else:
        with Pool(8) as pool:
             data_and_pos= pool.map(pre_sparse_doc,list(range(nrow)))
        add = lambda x,y:x+y
        data= reduce(add,[item[0] for item in data_and_pos],[])
        rows= reduce(add,[item[1] for item in data_and_pos],[])
        cols= reduce(add,[item[2] for item in data_and_pos],[])
        res= csr_matrix((data, (rows, col)),shape=(nrow,ncol))
        return res, normalize(res,norm="l1",axis=1) #normalize is from sklearn

In [435]:
s1  =time.time()
spm,spm_normalized=sparse()
e1 = time.time()
e1-s1
# list(zip(list(range(21)),repeat(None)))

85.85158467292786

In [388]:
inv_index["american"]

{16: {'doc_size': 282, 'occurencies': 1, 'positions': [180]},
 17: {'doc_size': 222, 'occurencies': 2, 'positions': [125, 170]},
 38: {'doc_size': 174, 'occurencies': 1, 'positions': [126]},
 42: {'doc_size': 292, 'occurencies': 1, 'positions': [194]},
 83: {'doc_size': 357, 'occurencies': 1, 'positions': [345]},
 98: {'doc_size': 364, 'occurencies': 1, 'positions': [354]},
 106: {'doc_size': 247, 'occurencies': 1, 'positions': [27]},
 110: {'doc_size': 242, 'occurencies': 1, 'positions': [121]},
 144: {'doc_size': 215, 'occurencies': 1, 'positions': [124]},
 146: {'doc_size': 213, 'occurencies': 1, 'positions': [122]},
 153: {'doc_size': 375, 'occurencies': 1, 'positions': [320]},
 167: {'doc_size': 350, 'occurencies': 3, 'positions': [84, 215, 224]},
 178: {'doc_size': 315, 'occurencies': 4, 'positions': [34, 150, 159, 246]},
 179: {'doc_size': 404, 'occurencies': 1, 'positions': [219]},
 211: {'doc_size': 284, 'occurencies': 1, 'positions': [141]},
 230: {'doc_size': 318, 'occurenci

In [445]:
list_text[55][25:200]

'eutsche Morgan Grenfell of any managers found to bear responsibility for failing to spot irregular dealings by former fund manager Peter Young is expected next week, banking s'

In [449]:
spmvec=sparse(text=list_text[55][25:200])[1]
spmvec.todense()[0,[corpus.index(word) for word in list_text[55][25:200].split() if word in corpus]]

matrix([[ 0.04761905,  0.04761905,  0.04761905,  0.04761905,  0.04761905,
          0.04761905,  0.04761905,  0.04761905,  0.19047619,  0.04761905,
          0.04761905,  0.04761905]])

In [201]:
spm.todense()[140,corpus.index("french")]

5

In [421]:
spm_normalized.todense()[702,corpus.index("american")]

0.026156941649899398

In [423]:
spm_normalized.todense()[:,0].sum()

0.99999999999999989

# Serialize the sparse matrces

In [425]:
#store the object
with gzip.open("spm.pklz", "wb") as fout:
    pickle.dump(spm,fout)
with gzip.open("spm_normalized.pklz", "wb") as fout:
    pickle.dump(spm_normalized,fout)
    

#restore the object
with gzip.open("spm_normalized.pklz", "rb") as fin:
    spm_norm_pkl= pickle.load(fin)

In [426]:
spm_norm_pkl.todense()[:,0].sum()

0.99999999999999989

# Dot product ordering documents

In [483]:
dotp=spm_normalized.dot(spmvec.transpose()).toarray()


In [484]:
d={k:dotp[k] for k in range(dotp.shape[0])}

In [485]:
sorted_d = sorted(d.items(), key=operator.itemgetter(1),reverse=True)
sorted_d[:10]

[(436, array([ 0.00835623])),
 (55, array([ 0.00824774])),
 (113, array([ 0.00771159])),
 (444, array([ 0.00613497])),
 (268, array([ 0.0058072])),
 (431, array([ 0.00568586])),
 (446, array([ 0.00551572])),
 (432, array([ 0.00532387])),
 (768, array([ 0.0052381])),
 (770, array([ 0.0052381]))]

In [490]:
list_text[55][25:200]
# lem.lemmatize("Management")

'eutsche Morgan Grenfell of any managers found to bear responsibility for failing to spot irregular dealings by former fund manager Peter Young is expected next week, banking s'

In [486]:
list_text[436]

'Hong Kong funds are expected to erect "Chinese walls" between asset management and traders after revelations of unsanctioned trades at the colony\'s biggest fund manager, Jardine Fleming Investment Management.\nCentral dealing, a system that prevents fund managers from executing their own trades, is gaining favour in Asia after last week\'s shocking disclosure of late allocation of trades by one of Hong Kong\'s most prominent fund managers, Colin Armstrong.\n"There is an awareness in the pension (fund) community that this is an issue in Hong Kong," said Gregory Neumann, executive director at Scudder Stevens &amp; Clark Asia Ltd. "I think this could turn out to be a positive for the investment management industry -- no-one will get hired going forward without central dealing."\nRegulators unveiled severe punishment last week for Armstrong\'s actions, which involved delaying the allocation of his trades until the price had changed. Some of the deals involved his own personal trading acc

In [None]:
def search_v2_sorted(query, index):
    #item_list = re.findall("[a-z0-9]+", query)
    item_list = list(map(str.lower, re.findall("[a-z0-9]+", query)))
    res = []
    for item in item_list:
        tmp = search_v1(item, index)
        if tmp:
            res.extend(tmp)
    return list(set(res))

<h1> Bag of Words</h1>

In [None]:
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, OrderedDict
Corpus = list_vocab
v = DictVectorizer()

v.fit([OrderedDict.fromkeys(Corpus, 1)])
X = v.transform(Counter(f) for f in (clean_Docs))

#print(type(X))
#print(X.A)


In [123]:
row = np.array([0, 0, 1, 2, 2, 2])
col = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
spm=csr_matrix((data, (row, col)), shape=(3, 3)).toarray()
spm

array([[1, 0, 2],
       [0, 0, 3],
       [4, 5, 6]], dtype=int64)

In [124]:
spm[0,2]

2

In [None]:
print(X.A)

In [None]:
np.where(X.A[0]==1)

In [118]:
import numpy as np
from scipy.sparse import csr_matrix

docs = [["hello", "world", "hello", "alka"], ["goodbye", "cruel", "world"]]
indptr = [0]
indices = []
data = []
vocabulary = {}
for d in docs:
    for term in d:
        index = vocabulary.setdefault(term, len(vocabulary))
        indices.append(index)
        data.append(1)
    indptr.append(len(indices))

#csr_matrix((data, indices, indptr), dtype=int).toarray()

In [None]:
vocabulary, data, indptr